diff options
Diffstat (limited to 'net/ipv6')
-rw-r--r-- | net/ipv6/addrconf.c | 19 | ||||
-rw-r--r-- | net/ipv6/addrconf_core.c | 6 | ||||
-rw-r--r-- | net/ipv6/af_inet6.c | 46 | ||||
-rw-r--r-- | net/ipv6/ah6.c | 4 | ||||
-rw-r--r-- | net/ipv6/esp6.c | 23 | ||||
-rw-r--r-- | net/ipv6/esp6_offload.c | 4 | ||||
-rw-r--r-- | net/ipv6/fib6_rules.c | 12 | ||||
-rw-r--r-- | net/ipv6/icmp.c | 7 | ||||
-rw-r--r-- | net/ipv6/inet6_hashtables.c | 2 | ||||
-rw-r--r-- | net/ipv6/ip6_fib.c | 214 | ||||
-rw-r--r-- | net/ipv6/ip6_flowlabel.c | 27 | ||||
-rw-r--r-- | net/ipv6/ip6_output.c | 340 | ||||
-rw-r--r-- | net/ipv6/ipcomp6.c | 3 | ||||
-rw-r--r-- | net/ipv6/mip6.c | 6 | ||||
-rw-r--r-- | net/ipv6/ndisc.c | 11 | ||||
-rw-r--r-- | net/ipv6/netfilter.c | 129 | ||||
-rw-r--r-- | net/ipv6/netfilter/ip6t_SYNPROXY.c | 420 | ||||
-rw-r--r-- | net/ipv6/netfilter/ip6table_raw.c | 2 | ||||
-rw-r--r-- | net/ipv6/netfilter/nf_conntrack_reasm.c | 53 | ||||
-rw-r--r-- | net/ipv6/proc.c | 4 | ||||
-rw-r--r-- | net/ipv6/raw.c | 4 | ||||
-rw-r--r-- | net/ipv6/reassembly.c | 52 | ||||
-rw-r--r-- | net/ipv6/route.c | 1471 | ||||
-rw-r--r-- | net/ipv6/sysctl_net_ipv6.c | 5 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 31 | ||||
-rw-r--r-- | net/ipv6/udp.c | 33 | ||||
-rw-r--r-- | net/ipv6/xfrm6_state.c | 137 |
27 files changed, 1830 insertions, 1235 deletions
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 081bb517e40d..521e3203e83a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2417,9 +2417,13 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != dev->ifindex) + /* prefix routes only use builtin fib6_nh */ + if (rt->nh) continue; - if (no_gw && rt->fib6_nh.fib_nh_gw_family) + + if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex) + continue; + if (no_gw && rt->fib6_nh->fib_nh_gw_family) continue; if ((rt->fib6_flags & flags) != flags) continue; @@ -3123,11 +3127,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev) struct in_device *in_dev = __in_dev_get_rtnl(dev); if (in_dev && (dev->flags & IFF_UP)) { struct in_ifaddr *ifa; - int flag = scope; - for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { - + in_dev_for_each_ifa_rtnl(ifa, in_dev) { addr.s6_addr32[3] = ifa->ifa_local; if (ifa->ifa_scope == RT_SCOPE_LINK) @@ -6350,16 +6352,17 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) list_for_each_entry(ifa, &idev->addr_list, if_list) { spin_lock(&ifa->lock); if (ifa->rt) { - struct fib6_info *rt = ifa->rt; + /* host routes only use builtin fib6_nh */ + struct fib6_nh *nh = ifa->rt->fib6_nh; int cpu; rcu_read_lock(); ifa->rt->dst_nopolicy = val ? true : false; - if (rt->rt6i_pcpu) { + if (nh->rt6i_pcpu) { for_each_possible_cpu(cpu) { struct rt6_info **rtp; - rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu); + rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu); addrconf_set_nopolicy(*rtp, val); } } diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 5b1246635e02..783f3c1466da 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -183,6 +183,11 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +{ + return -EAFNOSUPPORT; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, .ipv6_route_input = eafnosupport_ipv6_route_input, @@ -192,6 +197,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .fib6_select_path = eafnosupport_fib6_select_path, .ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6, .fib6_nh_init = eafnosupport_fib6_nh_init, + .ip6_del_rt = eafnosupport_ip6_del_rt, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5352708b7b2d..ef37e0574f54 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -208,7 +208,7 @@ lookup_protocol: np->mc_loop = 1; np->mc_all = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->repflow = net->ipv6.sysctl.flowlabel_reflect; + np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets @@ -564,6 +564,39 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } EXPORT_SYMBOL(inet6_ioctl); +INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, + size_t)); +int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + + if (unlikely(inet_send_prepare(sk))) + return -EAGAIN; + + return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udpv6_sendmsg, + sk, msg, size); +} + +INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *, + size_t, int, int, int *)); +int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + if (likely(!(flags & MSG_ERRQUEUE))) + sock_rps_record_flow(sk); + + err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, + sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} + const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, @@ -580,8 +613,8 @@ const struct proto_ops inet6_stream_ops = { .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = inet_recvmsg, /* ok */ + .sendmsg = inet6_sendmsg, /* retpoline's sake */ + .recvmsg = inet6_recvmsg, /* retpoline's sake */ #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif @@ -614,8 +647,8 @@ const struct proto_ops inet6_dgram_ops = { .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ - .sendmsg = inet_sendmsg, /* ok */ - .recvmsg = inet_recvmsg, /* ok */ + .sendmsg = inet6_sendmsg, /* retpoline's sake */ + .recvmsg = inet6_recvmsg, /* retpoline's sake */ .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, @@ -922,6 +955,9 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_mtu_from_fib6 = ip6_mtu_from_fib6, .fib6_nh_init = fib6_nh_init, .fib6_nh_release = fib6_nh_release, + .fib6_update_sernum = fib6_update_sernum_stub, + .fib6_rt_update = fib6_rt_update, + .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 68b9e92e469e..25e1172fd1c3 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -793,9 +793,7 @@ static void __exit ah6_fini(void) if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); - + xfrm_unregister_type(&ah6_type, AF_INET6); } module_init(ah6_init); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index ae6a739c5f52..a3b403ba8f8f 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -41,8 +41,6 @@ struct esp_skb_cb { #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) -static u32 esp6_get_mtu(struct xfrm_state *x, int mtu); - /* * Allocate an AEAD request structure with extra space for SG and IV. * @@ -447,7 +445,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } @@ -687,21 +685,6 @@ out: return ret; } -static u32 esp6_get_mtu(struct xfrm_state *x, int mtu) -{ - struct crypto_aead *aead = x->data; - u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4); - unsigned int net_adj; - - if (x->props.mode != XFRM_MODE_TUNNEL) - net_adj = sizeof(struct ipv6hdr); - else - net_adj = 0; - - return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - - net_adj) & ~(blksize - 1)) + net_adj - 2; -} - static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { @@ -919,7 +902,6 @@ static const struct xfrm_type esp6_type = { .flags = XFRM_TYPE_REPLAY_PROT, .init_state = esp6_init_state, .destructor = esp6_destroy, - .get_mtu = esp6_get_mtu, .input = esp6_input, .output = esp6_output, .hdr_offset = xfrm6_find_1stfragopt, @@ -951,8 +933,7 @@ static void __exit esp6_fini(void) { if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&esp6_type, AF_INET6); } module_init(esp6_init); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index d0d8528b294a..e31626ffccd1 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -336,9 +336,7 @@ static int __init esp6_offload_init(void) static void __exit esp6_offload_exit(void) { - if (xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type offload\n", __func__); - + xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6); inet6_del_offload(&esp6_offload, IPPROTO_ESP); } diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index bcfae13409b5..d22b6c140f23 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -113,14 +113,15 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); } - dst_hold(&net->ipv6.ip6_null_entry->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } @@ -237,13 +238,14 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto out; } again: - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); err = -EAGAIN; rt = NULL; goto out; discard_pkt: - dst_hold(&rt->dst); + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); out: res->rt6 = rt; return err; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 375b4b4f9bf5..62c997201970 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -75,9 +75,9 @@ * * On SMP we have one ICMP socket per-cpu. */ -static inline struct sock *icmpv6_sk(struct net *net) +static struct sock *icmpv6_sk(struct net *net) { - return *this_cpu_ptr(net->ipv6.icmp_sk); + return this_cpu_read(*net->ipv6.icmp_sk); } static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, @@ -703,6 +703,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; memset(&fl6, 0, sizeof(fl6)); + if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) + fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb)); + fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b2a55f300318..cf60fae9533b 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -174,7 +174,7 @@ struct sock *inet6_lookup_listener(struct net *net, saddr, sport, &in6addr_any, hnum, dif, sdif); done: - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9180c8b6f764..49884f96232b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -143,20 +143,19 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; + size_t sz = sizeof(*f6i); - f6i = kzalloc(sizeof(*f6i), gfp_flags); + if (with_fib6_nh) + sz += sizeof(struct fib6_nh); + + f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; - f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); - if (!f6i->rt6i_pcpu) { - kfree(f6i); - return NULL; - } - + /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); @@ -166,36 +165,15 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); - struct rt6_exception_bucket *bucket; WARN_ON(f6i->fib6_node); - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1); - kfree(bucket); - - if (f6i->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **ppcpu_rt; - struct rt6_info *pcpu_rt; - - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); - pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { - dst_dev_put(&pcpu_rt->dst); - dst_release(&pcpu_rt->dst); - *ppcpu_rt = NULL; - } - } - - free_percpu(f6i->rt6i_pcpu); - } - - fib6_nh_release(&f6i->fib6_nh); + if (f6i->nh) + nexthop_put(f6i->nh); + else + fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); - kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); @@ -338,9 +316,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; - dst_hold(&rt->dst); + if (!(flags | RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); } return &rt->dst; @@ -389,14 +368,30 @@ static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net, return call_fib6_notifier(nb, net, event_type, &info.info); } -static int call_fib6_entry_notifiers(struct net *net, - enum fib_event_type event_type, - struct fib6_info *rt, - struct netlink_ext_ack *extack) +int call_fib6_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + }; + + rt->fib6_table->fib_seq++; + return call_fib6_notifiers(net, event_type, &info.info); +} + +int call_fib6_multipath_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + unsigned int nsiblings, + struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, + .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; @@ -469,12 +464,19 @@ static int fib6_dump_node(struct fib6_walker *w) struct fib6_info *rt; for_each_fib6_walker_rt(w) { - res = rt6_dump_route(rt, w->args); - if (res < 0) { + res = rt6_dump_route(rt, w->args, w->skip_in_node); + if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; + + /* We'll restart from this node, so if some routes were + * already dumped, skip them next time. + */ + w->skip_in_node += res; + return 1; } + w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the @@ -526,6 +528,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, if (cb->args[4] == 0) { w->count = 0; w->skip = 0; + w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); @@ -541,6 +544,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->state = FWS_INIT; w->node = w->root; w->skip = w->count; + w->skip_in_node = 0; } else w->skip = 0; @@ -558,9 +562,10 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, + .filter.dump_routes = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct rt6_rtnl_dump_arg arg = {}; unsigned int h, s_h; unsigned int e = 0, s_e; struct fib6_walker *w; @@ -577,13 +582,10 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); - arg.filter.flags = rtm->rtm_flags & (RTM_F_PREFIX|RTM_F_CLONED); + if (rtm->rtm_flags & RTM_F_PREFIX) + arg.filter.flags = RTM_F_PREFIX; } - /* fib entries are never clones */ - if (arg.filter.flags & RTM_F_CLONED) - goto out; - w = (void *)cb->args[2]; if (!w) { /* New dump: @@ -895,16 +897,14 @@ insert_above: return ln; } -static void fib6_drop_pcpu_from(struct fib6_info *f6i, - const struct fib6_table *table) +static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, + const struct fib6_info *match, + const struct fib6_table *table) { int cpu; - /* Make sure rt6_make_pcpu_route() wont add other percpu routes - * while we are cleaning them here. - */ - f6i->fib6_destroying = 1; - mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + if (!fib6_nh->rt6i_pcpu) + return; /* release the reference to this fib entry from * all of its cached pcpu routes @@ -913,9 +913,15 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; - ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu); + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; - if (pcpu_rt) { + + /* only dropping the 'from' reference if the cached route + * is using 'match'. The cached pcpu_rt->from only changes + * from a fib6_info to NULL (ip6_dst_destroy); it can never + * change from one fib6_info reference to another + */ + if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); @@ -924,13 +930,53 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, } } +struct fib6_nh_pcpu_arg { + struct fib6_info *from; + const struct fib6_table *table; +}; + +static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_pcpu_arg *arg = _arg; + + __fib6_drop_pcpu_from(nh, arg->from, arg->table); + return 0; +} + +static void fib6_drop_pcpu_from(struct fib6_info *f6i, + const struct fib6_table *table) +{ + /* Make sure rt6_make_pcpu_route() wont add other percpu routes + * while we are cleaning them here. + */ + f6i->fib6_destroying = 1; + mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + + if (f6i->nh) { + struct fib6_nh_pcpu_arg arg = { + .from = f6i, + .table = table + }; + + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, + &arg); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = f6i->fib6_nh; + __fib6_drop_pcpu_from(fib6_nh, f6i, table); + } +} + static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; - if (rt->rt6i_pcpu) - fib6_drop_pcpu_from(rt, table); + fib6_drop_pcpu_from(rt, table); + + if (rt->nh && !list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1101,11 +1147,13 @@ next_iter: add: nlflags |= NLM_F_CREATE; - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_ADD, - rt, extack); - if (err) - return err; + if (!info->skip_notify_kernel) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_ADD, + rt, extack); + if (err) + return err; + } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); @@ -1130,11 +1178,13 @@ add: return -ENOENT; } - err = call_fib6_entry_notifiers(info->nl_net, - FIB_EVENT_ENTRY_REPLACE, - rt, extack); - if (err) - return err; + if (!info->skip_notify_kernel) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_REPLACE, + rt, extack); + if (err) + return err; + } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); @@ -1218,6 +1268,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } +/* allow ipv4 to update sernum via ipv6_stub */ +void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_update_sernum_upto_root(net, f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + /* * Add routing information to the routing tree. * <destination addr>/<source addr> @@ -1331,6 +1389,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { + if (rt->nh) + list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); } @@ -1536,7 +1596,8 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root, if (plen == fn->fn_bit) return fn; - prev = fn; + if (fn->fn_flags & RTN_RTINFO) + prev = fn; next: /* @@ -1807,9 +1868,11 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, fib6_purge_rt(rt, fn, net); - call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); + if (!info->skip_notify_kernel) + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); + fib6_info_release(rt); } @@ -2041,6 +2104,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root, c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; + c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; @@ -2292,9 +2356,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; + struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; + if (rt->nh) + fib6_nh = nexthop_fib6_nh(rt->nh); + seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES @@ -2302,14 +2370,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v) #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif - if (rt->fib6_nh.fib_nh_gw_family) { + if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; - seq_printf(seq, "%pi6", &rt->fib6_nh.fib_nh_gw6); + seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } - dev = rt->fib6_nh.fib_nh_dev; + dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 545e339b8c4f..ad284b1fd308 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/pid_namespace.h> +#include <linux/jump_label_ratelimit.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -53,6 +54,9 @@ static DEFINE_SPINLOCK(ip6_fl_lock); static DEFINE_SPINLOCK(ip6_sk_fl_lock); +DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ); +EXPORT_SYMBOL(ipv6_flowlabel_exclusive); + #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ fl != NULL; \ @@ -90,6 +94,13 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label) return fl; } +static bool fl_shared_exclusive(struct ip6_flowlabel *fl) +{ + return fl->share == IPV6_FL_S_EXCL || + fl->share == IPV6_FL_S_PROCESS || + fl->share == IPV6_FL_S_USER; +} + static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); @@ -103,8 +114,13 @@ static void fl_free_rcu(struct rcu_head *head) static void fl_free(struct ip6_flowlabel *fl) { - if (fl) - call_rcu(&fl->rcu, fl_free_rcu); + if (!fl) + return; + + if (fl_shared_exclusive(fl) || fl->opt) + static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive); + + call_rcu(&fl->rcu, fl_free_rcu); } static void fl_release(struct ip6_flowlabel *fl) @@ -240,7 +256,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, /* Socket flowlabel lists */ -struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) +struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; struct ipv6_pinfo *np = inet6_sk(sk); @@ -260,7 +276,7 @@ struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) rcu_read_unlock_bh(); return NULL; } -EXPORT_SYMBOL_GPL(fl6_sock_lookup); +EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { @@ -419,6 +435,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); + if (fl_shared_exclusive(fl) || fl->opt) + static_branch_deferred_inc(&ipv6_flowlabel_exclusive); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: @@ -854,6 +872,7 @@ int ip6_flowlabel_init(void) void ip6_flowlabel_cleanup(void) { + static_key_deferred_flush(&ipv6_flowlabel_exclusive); del_timer(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 21efcd02f337..8e49fd62eea9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -124,16 +124,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * return -EINVAL; } -static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - int ret; - - ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); - if (ret) { - kfree_skb(skb); - return ret; - } - #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { @@ -150,6 +142,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s return ip6_finish_output2(net, sk, skb); } +static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + int ret; + + ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); + switch (ret) { + case NET_XMIT_SUCCESS: + return __ip6_finish_output(net, sk, skb); + case NET_XMIT_CN: + return __ip6_finish_output(net, sk, skb) ? : ret; + default: + kfree_skb(skb); + return ret; + } +} + int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; @@ -588,6 +596,169 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } +int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, + struct ip6_fraglist_iter *iter) +{ + unsigned int first_len; + struct frag_hdr *fh; + + /* BUILD HEADER */ + *prevhdr = NEXTHDR_FRAGMENT; + iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); + if (!iter->tmp_hdr) + return -ENOMEM; + + iter->frag = skb_shinfo(skb)->frag_list; + skb_frag_list_init(skb); + + iter->offset = 0; + iter->hlen = hlen; + iter->frag_id = frag_id; + iter->nexthdr = nexthdr; + + __skb_pull(skb, hlen); + fh = __skb_push(skb, sizeof(struct frag_hdr)); + __skb_push(skb, hlen); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); + + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + fh->identification = frag_id; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); + + return 0; +} +EXPORT_SYMBOL(ip6_fraglist_init); + +void ip6_fraglist_prepare(struct sk_buff *skb, + struct ip6_fraglist_iter *iter) +{ + struct sk_buff *frag = iter->frag; + unsigned int hlen = iter->hlen; + struct frag_hdr *fh; + + frag->ip_summed = CHECKSUM_NONE; + skb_reset_transport_header(frag); + fh = __skb_push(frag, sizeof(struct frag_hdr)); + __skb_push(frag, hlen); + skb_reset_network_header(frag); + memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); + iter->offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = iter->nexthdr; + fh->reserved = 0; + fh->frag_off = htons(iter->offset); + if (frag->next) + fh->frag_off |= htons(IP6_MF); + fh->identification = iter->frag_id; + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); +} +EXPORT_SYMBOL(ip6_fraglist_prepare); + +void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, + unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, + u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) +{ + state->prevhdr = prevhdr; + state->nexthdr = nexthdr; + state->frag_id = frag_id; + + state->hlen = hlen; + state->mtu = mtu; + + state->left = skb->len - hlen; /* Space per frame */ + state->ptr = hlen; /* Where to start from */ + + state->hroom = hdr_room; + state->troom = needed_tailroom; + + state->offset = 0; +} +EXPORT_SYMBOL(ip6_frag_init); + +struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) +{ + u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; + struct sk_buff *frag; + struct frag_hdr *fh; + unsigned int len; + + len = state->left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > state->mtu) + len = state->mtu; + /* IF: we are not sending up to and including the packet end + then align the next start on an eight byte boundary */ + if (len < state->left) + len &= ~7; + + /* Allocate buffer */ + frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + + state->hroom + state->troom, GFP_ATOMIC); + if (!frag) + return ERR_PTR(-ENOMEM); + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, state->hroom); + skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); + skb_reset_network_header(frag); + fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); + frag->transport_header = (frag->network_header + state->hlen + + sizeof(struct frag_hdr)); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); + + fragnexthdr_offset = skb_network_header(frag); + fragnexthdr_offset += prevhdr - skb_network_header(skb); + *fragnexthdr_offset = NEXTHDR_FRAGMENT; + + /* + * Build fragment header. + */ + fh->nexthdr = state->nexthdr; + fh->reserved = 0; + fh->identification = state->frag_id; + + /* + * Copy a block of the IP datagram. + */ + BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), + len)); + state->left -= len; + + fh->frag_off = htons(state->offset); + if (state->left > 0) + fh->frag_off |= htons(IP6_MF); + ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + + state->ptr += len; + state->offset += len; + + return frag; +} +EXPORT_SYMBOL(ip6_frag_next); + int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { @@ -595,12 +766,10 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; - struct ipv6hdr *tmp_hdr; - struct frag_hdr *fh; - unsigned int mtu, hlen, left, len, nexthdr_offset; - int hroom, troom; + struct ip6_frag_state state; + unsigned int mtu, hlen, nexthdr_offset; + int hroom, err = 0; __be32 frag_id; - int ptr, offset = 0, err = 0; u8 *prevhdr, nexthdr = 0; err = ip6_find_1stfragopt(skb, &prevhdr); @@ -647,6 +816,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, hroom = LL_RESERVED_SPACE(rt->dst.dev); if (skb_has_frag_list(skb)) { unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; struct sk_buff *frag2; if (first_len - hlen > mtu || @@ -674,74 +844,29 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, skb->truesize -= frag->truesize; } - err = 0; - offset = 0; - /* BUILD HEADER */ - - *prevhdr = NEXTHDR_FRAGMENT; - tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); - if (!tmp_hdr) { - err = -ENOMEM; + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) goto fail; - } - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - - __skb_pull(skb, hlen); - fh = __skb_push(skb, sizeof(struct frag_hdr)); - __skb_push(skb, hlen); - skb_reset_network_header(skb); - memcpy(skb_network_header(skb), tmp_hdr, hlen); - - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(IP6_MF); - fh->identification = frag_id; - - first_len = skb_pagelen(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - ipv6_hdr(skb)->payload_len = htons(first_len - - sizeof(struct ipv6hdr)); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - fh = __skb_push(frag, sizeof(struct frag_hdr)); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), tmp_hdr, - hlen); - offset += skb->len - hlen - sizeof(struct frag_hdr); - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(offset); - if (frag->next) - fh->frag_off |= htons(IP6_MF); - fh->identification = frag_id; - ipv6_hdr(frag)->payload_len = - htons(frag->len - - sizeof(struct ipv6hdr)); - ip6_copy_metadata(frag, skb); - } + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGCREATES); - if (err || !frag) + if (err || !iter.frag) break; - skb = frag; - frag = skb->next; - skb_mark_not_on_list(skb); + skb = ip6_fraglist_next(&iter); } - kfree(tmp_hdr); + kfree(iter.tmp_hdr); if (err == 0) { IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -749,7 +874,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, return 0; } - kfree_skb_list(frag); + kfree_skb_list(iter.frag); IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IPSTATS_MIB_FRAGFAILS); @@ -766,91 +891,26 @@ slow_path_clean: } slow_path: - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - /* * Fragment the datagram. */ - troom = rt->dst.dev->needed_tailroom; + ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, + LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, + &state); /* * Keep copying data until we run out. */ - while (left > 0) { - u8 *fragnexthdr_offset; - - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } - /* Allocate buffer */ - frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + - hroom + troom, GFP_ATOMIC); - if (!frag) { - err = -ENOMEM; + while (state.left > 0) { + frag = ip6_frag_next(skb, &state); + if (IS_ERR(frag)) { + err = PTR_ERR(frag); goto fail; } /* - * Set up data on packet - */ - - ip6_copy_metadata(frag, skb); - skb_reserve(frag, hroom); - skb_put(frag, len + hlen + sizeof(struct frag_hdr)); - skb_reset_network_header(frag); - fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); - frag->transport_header = (frag->network_header + hlen + - sizeof(struct frag_hdr)); - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - if (skb->sk) - skb_set_owner_w(frag, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); - - fragnexthdr_offset = skb_network_header(frag); - fragnexthdr_offset += prevhdr - skb_network_header(skb); - *fragnexthdr_offset = NEXTHDR_FRAGMENT; - - /* - * Build fragment header. - */ - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->identification = frag_id; - - /* - * Copy a block of the IP datagram. - */ - BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), - len)); - left -= len; - - fh->frag_off = htons(offset); - if (left > 0) - fh->frag_off |= htons(IP6_MF); - ipv6_hdr(frag)->payload_len = htons(frag->len - - sizeof(struct ipv6hdr)); - - ptr += len; - offset += len; - - /* * Put this fragment into the sending queue. */ err = output(net, sk, frag); diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 51fd33294c7c..3752bd3e92ce 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -206,8 +206,7 @@ static void __exit ipcomp6_fini(void) { if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); - if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type\n", __func__); + xfrm_unregister_type(&ipcomp6_type, AF_INET6); } module_init(ipcomp6_init); diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 91801432878c..878fcec14949 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -499,10 +499,8 @@ static void __exit mip6_fini(void) { if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0) pr_info("%s: can't remove rawv6 mh filter\n", __func__); - if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type(rthdr)\n", __func__); - if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0) - pr_info("%s: can't remove xfrm type(destopt)\n", __func__); + xfrm_unregister_type(&mip6_rthdr_type, AF_INET6); + xfrm_unregister_type(&mip6_destopt_type, AF_INET6); } module_init(mip6_init); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 09dd2edfb868..083cc1c94cd3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1285,12 +1285,11 @@ static void ndisc_router_discovery(struct sk_buff *skb) !in6_dev->cnf.accept_ra_rtr_pref) pref = ICMPV6_ROUTER_PREF_MEDIUM; #endif - + /* routes added from RAs do not use nexthop objects */ rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev); - if (rt) { - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, @@ -1319,8 +1318,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - neigh = ip6_neigh_lookup(&rt->fib6_nh.fib_nh_gw6, - rt->fib6_nh.fib_nh_dev, NULL, + neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6, + rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { ND_PRINTK(0, err, diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 1240ccd57f39..61819ed858b1 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -16,6 +16,9 @@ #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/netfilter/nf_queue.h> +#include <net/netfilter/nf_conntrack_bridge.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include "../bridge/br_private.h" int ip6_route_me_harder(struct net *net, struct sk_buff *skb) { @@ -109,16 +112,142 @@ int __nf_ip6_route(struct net *net, struct dst_entry **dst, } EXPORT_SYMBOL_GPL(__nf_ip6_route); +int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, + struct nf_ct_bridge_frag_data *data, + int (*output)(struct net *, struct sock *sk, + const struct nf_ct_bridge_frag_data *data, + struct sk_buff *)) +{ + int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + struct ip6_frag_state state; + u8 *prevhdr, nexthdr = 0; + unsigned int mtu, hlen; + int hroom, err = 0; + __be32 frag_id; + + err = ip6_find_1stfragopt(skb, &prevhdr); + if (err < 0) + goto blackhole; + hlen = err; + nexthdr = *prevhdr; + + mtu = skb->dev->mtu; + if (frag_max_size > mtu || + frag_max_size < IPV6_MIN_MTU) + goto blackhole; + + mtu = frag_max_size; + if (mtu < hlen + sizeof(struct frag_hdr) + 8) + goto blackhole; + mtu -= hlen + sizeof(struct frag_hdr); + + frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, + &ipv6_hdr(skb)->saddr); + + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto blackhole; + + hroom = LL_RESERVED_SPACE(skb->dev); + if (skb_has_frag_list(skb)) { + unsigned int first_len = skb_pagelen(skb); + struct ip6_fraglist_iter iter; + struct sk_buff *frag2; + + if (first_len - hlen > mtu || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + goto blackhole; + + if (skb_cloned(skb)) + goto slow_path; + + skb_walk_frags(skb, frag2) { + if (frag2->len > mtu || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + goto blackhole; + + /* Partially cloned skb? */ + if (skb_shared(frag2)) + goto slow_path; + } + + err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, + &iter); + if (err < 0) + goto blackhole; + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. + */ + if (iter.frag) + ip6_fraglist_prepare(skb, &iter); + + err = output(net, sk, data, skb); + if (err || !iter.frag) + break; + + skb = ip6_fraglist_next(&iter); + } + + kfree(iter.tmp_hdr); + if (!err) + return 0; + + kfree_skb_list(iter.frag); + return err; + } +slow_path: + /* This is a linearized skbuff, the original geometry is lost for us. + * This may also be a clone skbuff, we could preserve the geometry for + * the copies but probably not worth the effort. + */ + ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom, + LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id, + &state); + + while (state.left > 0) { + struct sk_buff *skb2; + + skb2 = ip6_frag_next(skb, &state); + if (IS_ERR(skb2)) { + err = PTR_ERR(skb2); + goto blackhole; + } + + err = output(net, sk, data, skb2); + if (err) + goto blackhole; + } + consume_skb(skb); + return err; + +blackhole: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL_GPL(br_ip6_fragment); + static const struct nf_ipv6_ops ipv6ops = { #if IS_MODULE(CONFIG_IPV6) .chk_addr = ipv6_chk_addr, .route_me_harder = ip6_route_me_harder, .dev_get_saddr = ipv6_dev_get_saddr, .route = __nf_ip6_route, +#if IS_ENABLED(CONFIG_SYN_COOKIES) + .cookie_init_sequence = __cookie_v6_init_sequence, + .cookie_v6_check = __cookie_v6_check, +#endif #endif .route_input = ip6_route_input, .fragment = ip6_fragment, .reroute = nf_ip6_reroute, +#if IS_MODULE(CONFIG_IPV6) && IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + .br_defrag = nf_ct_frag6_gather, +#endif +#if IS_MODULE(CONFIG_IPV6) + .br_fragment = br_ip6_fragment, +#endif }; int __init ipv6_netfilter_init(void) diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 41325d517478..e77ea1ed5edd 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -3,272 +3,11 @@ * Copyright (c) 2013 Patrick McHardy <kaber@trash.net> */ -#include <linux/module.h> -#include <linux/skbuff.h> -#include <net/ip6_checksum.h> -#include <net/ip6_route.h> -#include <net/tcp.h> - #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_SYNPROXY.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_synproxy.h> -#include <net/netfilter/nf_conntrack_ecache.h> - -static struct ipv6hdr * -synproxy_build_ip(struct net *net, struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - struct ipv6hdr *iph; - - skb_reset_network_header(skb); - iph = skb_put(skb, sizeof(*iph)); - ip6_flow_hdr(iph, 0, 0); - iph->hop_limit = net->ipv6.devconf_all->hop_limit; - iph->nexthdr = IPPROTO_TCP; - iph->saddr = *saddr; - iph->daddr = *daddr; - - return iph; -} - -static void -synproxy_send_tcp(struct net *net, - const struct sk_buff *skb, struct sk_buff *nskb, - struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, - struct ipv6hdr *niph, struct tcphdr *nth, - unsigned int tcp_hdr_size) -{ - struct dst_entry *dst; - struct flowi6 fl6; - - nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0); - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = (unsigned char *)nth - nskb->head; - nskb->csum_offset = offsetof(struct tcphdr, check); - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_TCP; - fl6.saddr = niph->saddr; - fl6.daddr = niph->daddr; - fl6.fl6_sport = nth->source; - fl6.fl6_dport = nth->dest; - security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - dst_release(dst); - goto free_nskb; - } - dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); - if (IS_ERR(dst)) - goto free_nskb; - - skb_dst_set(nskb, dst); - - if (nfct) { - nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo); - nf_conntrack_get(nfct); - } - - ip6_local_out(net, nskb->sk, nskb); - return; - -free_nskb: - kfree_skb(nskb); -} - -static void -synproxy_send_client_synack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - u16 mss = opts->mss; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE; - nth->doff = tcp_hdr_size / 4; - nth->window = 0; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} -static void -synproxy_send_server_syn(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(recv_seq - 1); - /* ack_seq is used to relay our ISN to the synproxy hook to initialize - * sequence number translation once a connection tracking entry exists. - */ - nth->ack_seq = htonl(ntohl(th->ack_seq) - 1); - tcp_flag_word(nth) = TCP_FLAG_SYN; - if (opts->options & XT_SYNPROXY_OPT_ECN) - tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR; - nth->doff = tcp_hdr_size / 4; - nth->window = th->window; - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, - niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_server_ack(struct net *net, - const struct ip_ct_tcp *state, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->dest; - nth->dest = th->source; - nth->seq = htonl(ntohl(th->ack_seq)); - nth->ack_seq = htonl(ntohl(th->seq) + 1); - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); -} - -static void -synproxy_send_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - const struct synproxy_options *opts) -{ - struct sk_buff *nskb; - struct ipv6hdr *iph, *niph; - struct tcphdr *nth; - unsigned int tcp_hdr_size; - - iph = ipv6_hdr(skb); - - tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts); - nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER, - GFP_ATOMIC); - if (nskb == NULL) - return; - skb_reserve(nskb, MAX_TCP_HEADER); - - niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); - - skb_reset_transport_header(nskb); - nth = skb_put(nskb, tcp_hdr_size); - nth->source = th->source; - nth->dest = th->dest; - nth->seq = htonl(ntohl(th->seq) + 1); - nth->ack_seq = th->ack_seq; - tcp_flag_word(nth) = TCP_FLAG_ACK; - nth->doff = tcp_hdr_size / 4; - nth->window = htons(ntohs(th->window) >> opts->wscale); - nth->check = 0; - nth->urg_ptr = 0; - - synproxy_build_options(nth, opts); - - synproxy_send_tcp(net, skb, nskb, skb_nfct(skb), - IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); -} - -static bool -synproxy_recv_client_ack(struct net *net, - const struct sk_buff *skb, const struct tcphdr *th, - struct synproxy_options *opts, u32 recv_seq) -{ - struct synproxy_net *snet = synproxy_pernet(net); - int mss; - - mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); - if (mss == 0) { - this_cpu_inc(snet->stats->cookie_invalid); - return false; - } - - this_cpu_inc(snet->stats->cookie_valid); - opts->mss = mss; - opts->options |= XT_SYNPROXY_OPT_MSS; - - if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy_check_timestamp_cookie(opts); - - synproxy_send_server_syn(net, skb, th, opts, recv_seq); - return true; -} +#include <net/netfilter/nf_synproxy.h> static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) @@ -304,13 +43,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(net, skb, th, &opts); + synproxy_send_client_synack_ipv6(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { + if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts, + ntohl(th->seq))) { consume_skb(skb); return NF_STOLEN; } else { @@ -321,141 +61,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static unsigned int ipv6_synproxy_hook(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *nhs) -{ - struct net *net = nhs->net; - struct synproxy_net *snet = synproxy_pernet(net); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - struct nf_conn_synproxy *synproxy; - struct synproxy_options opts = {}; - const struct ip_ct_tcp *state; - struct tcphdr *th, _th; - __be16 frag_off; - u8 nexthdr; - int thoff; - - ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) - return NF_ACCEPT; - - synproxy = nfct_synproxy(ct); - if (synproxy == NULL) - return NF_ACCEPT; - - if (nf_is_loopback_packet(skb)) - return NF_ACCEPT; - - nexthdr = ipv6_hdr(skb)->nexthdr; - thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (thoff < 0 || nexthdr != IPPROTO_TCP) - return NF_ACCEPT; - - th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); - if (th == NULL) - return NF_DROP; - - state = &ct->proto.tcp; - switch (state->state) { - case TCP_CONNTRACK_CLOSE: - if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - - ntohl(th->seq) + 1); - break; - } - - if (!th->syn || th->ack || - CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - break; - - /* Reopened connection - reset the sequence number and timestamp - * adjustments, they will get initialized once the connection is - * reestablished. - */ - nf_ct_seqadj_init(ct, ctinfo, 0); - synproxy->tsoff = 0; - this_cpu_inc(snet->stats->conn_reopened); - - /* fall through */ - case TCP_CONNTRACK_SYN_SENT: - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (!th->syn && th->ack && - CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1, - * therefore we need to add 1 to make the SYN sequence - * number match the one of first SYN. - */ - if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) { - this_cpu_inc(snet->stats->cookie_retrans); - consume_skb(skb); - return NF_STOLEN; - } else { - return NF_DROP; - } - } - - synproxy->isn = ntohl(th->ack_seq); - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) - synproxy->its = opts.tsecr; - - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - break; - case TCP_CONNTRACK_SYN_RECV: - if (!th->syn || !th->ack) - break; - - if (!synproxy_parse_options(skb, thoff, th, &opts)) - return NF_DROP; - - if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) { - synproxy->tsoff = opts.tsval - synproxy->its; - nf_conntrack_event_cache(IPCT_SYNPROXY, ct); - } - - opts.options &= ~(XT_SYNPROXY_OPT_MSS | - XT_SYNPROXY_OPT_WSCALE | - XT_SYNPROXY_OPT_SACK_PERM); - - swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(net, state, skb, th, &opts); - - nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); - nf_conntrack_event_cache(IPCT_SEQADJ, ct); - - swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(net, skb, th, &opts); - - consume_skb(skb); - return NF_STOLEN; - default: - break; - } - - synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy); - return NF_ACCEPT; -} - -static const struct nf_hook_ops ipv6_synproxy_ops[] = { - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int synproxy_tg6_check(const struct xt_tgchk_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); @@ -471,16 +76,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par) if (err) return err; - if (snet->hook_ref6 == 0) { - err = nf_register_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); - if (err) { - nf_ct_netns_put(par->net, par->family); - return err; - } + err = nf_synproxy_ipv6_init(snet, par->net); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; } - snet->hook_ref6++; return err; } @@ -488,10 +89,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { struct synproxy_net *snet = synproxy_pernet(par->net); - snet->hook_ref6--; - if (snet->hook_ref6 == 0) - nf_unregister_net_hooks(par->net, ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); + nf_synproxy_ipv6_fini(snet, par->net); nf_ct_netns_put(par->net, par->family); } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 3f7d4691c423..a22100b1cf2c 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -2,7 +2,7 @@ /* * IPv6 raw table, a port of the IPv4 raw table to IPv6 * - * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 84322ce81d70..398e1df41406 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -54,26 +54,21 @@ static struct inet_frags nf_frags; static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", - .data = &init_net.nf_frag.frags.timeout, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_frag6_low_thresh", - .data = &init_net.nf_frag.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.nf_frag.frags.high_thresh }, { .procname = "nf_conntrack_frag6_high_thresh", - .data = &init_net.nf_frag.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.nf_frag.frags.low_thresh }, { } }; @@ -89,15 +84,15 @@ static int nf_ct_frag6_sysctl_register(struct net *net) GFP_KERNEL); if (table == NULL) goto err_alloc; - - table[0].data = &net->nf_frag.frags.timeout; - table[1].data = &net->nf_frag.frags.low_thresh; - table[1].extra2 = &net->nf_frag.frags.high_thresh; - table[2].data = &net->nf_frag.frags.high_thresh; - table[2].extra1 = &net->nf_frag.frags.low_thresh; - table[2].extra2 = &init_net.nf_frag.frags.high_thresh; } + table[0].data = &net->nf_frag.fqdir->timeout; + table[1].data = &net->nf_frag.fqdir->low_thresh; + table[1].extra2 = &net->nf_frag.fqdir->high_thresh; + table[2].data = &net->nf_frag.fqdir->high_thresh; + table[2].extra1 = &net->nf_frag.fqdir->low_thresh; + table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh; + hdr = register_net_sysctl(net, "net/netfilter", table); if (hdr == NULL) goto err_reg; @@ -144,12 +139,10 @@ static void nf_ct_frag6_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } /* Creation primitives. */ @@ -165,7 +158,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; - q = inet_frag_find(&net->nf_frag.frags, &key); + q = inet_frag_find(net->nf_frag.fqdir, &key); if (!q) return NULL; @@ -278,7 +271,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->ecn |= ecn; if (payload_len > fq->q.max_size) fq->q.max_size = payload_len; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); /* The first fragment. * nhoffset is obtained from the first fragment, of course. @@ -494,29 +487,35 @@ static int nf_ct_net_init(struct net *net) { int res; - net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - net->nf_frag.frags.f = &nf_frags; - - res = inet_frags_init_net(&net->nf_frag.frags); + res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net); if (res < 0) return res; + + net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = nf_ct_frag6_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); return res; } +static void nf_ct_net_pre_exit(struct net *net) +{ + fqdir_pre_exit(net->nf_frag.fqdir); +} + static void nf_ct_net_exit(struct net *net) { nf_ct_frags6_sysctl_unregister(net); - inet_frags_exit_net(&net->nf_frag.frags); + fqdir_exit(net->nf_frag.fqdir); } static struct pernet_operations nf_ct_net_ops = { - .init = nf_ct_net_init, - .exit = nf_ct_net_exit, + .init = nf_ct_net_init, + .pre_exit = nf_ct_net_pre_exit, + .exit = nf_ct_net_exit, }; static const struct rhashtable_params nfct_rhash_params = { diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 4a8da679866e..bbff3e02e302 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -44,8 +44,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "RAW6: inuse %d\n", sock_prot_inuse_get(net, &rawv6_prot)); seq_printf(seq, "FRAG6: inuse %u memory %lu\n", - atomic_read(&net->ipv6.frags.rhashtable.nelems), - frag_mem_limit(&net->ipv6.frags)); + atomic_read(&net->ipv6.fqdir->rhashtable.nelems), + frag_mem_limit(net->ipv6.fqdir)); return 0; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 70693bc7ad9d..8a6131991e38 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -834,7 +834,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -876,7 +876,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index b2b2c0c38b87..ca05b16f1bb9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -72,12 +72,10 @@ static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; - struct net *net; fq = container_of(frag, struct frag_queue, q); - net = container_of(fq->q.net, struct net, ipv6.frags); - ip6frag_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(fq->q.fqdir->net, fq); } static struct frag_queue * @@ -96,7 +94,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) IPV6_ADDR_LINKLOCAL))) key.iif = 0; - q = inet_frag_find(&net->ipv6.frags, &key); + q = inet_frag_find(net->ipv6.fqdir, &key); if (!q) return NULL; @@ -196,7 +194,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; - add_frag_mem_limit(fq->q.net, skb->truesize); + add_frag_mem_limit(fq->q.fqdir, skb->truesize); fragsize = -skb_network_offset(skb) + skb->len; if (fragsize > fq->q.max_size) @@ -250,7 +248,7 @@ err: static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) { - struct net *net = container_of(fq->q.net, struct net, ipv6.frags); + struct net *net = fq->q.fqdir->net; unsigned int nhoff; void *reasm_data; int payload_len; @@ -397,23 +395,18 @@ static const struct inet6_protocol frag_protocol = { static struct ctl_table ip6_frags_ns_ctl_table[] = { { .procname = "ip6frag_high_thresh", - .data = &init_net.ipv6.frags.high_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &init_net.ipv6.frags.low_thresh }, { .procname = "ip6frag_low_thresh", - .data = &init_net.ipv6.frags.low_thresh, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra2 = &init_net.ipv6.frags.high_thresh }, { .procname = "ip6frag_time", - .data = &init_net.ipv6.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -445,12 +438,12 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net) if (!table) goto err_alloc; - table[0].data = &net->ipv6.frags.high_thresh; - table[0].extra1 = &net->ipv6.frags.low_thresh; - table[1].data = &net->ipv6.frags.low_thresh; - table[1].extra2 = &net->ipv6.frags.high_thresh; - table[2].data = &net->ipv6.frags.timeout; } + table[0].data = &net->ipv6.fqdir->high_thresh; + table[0].extra1 = &net->ipv6.fqdir->low_thresh; + table[1].data = &net->ipv6.fqdir->low_thresh; + table[1].extra2 = &net->ipv6.fqdir->high_thresh; + table[2].data = &net->ipv6.fqdir->timeout; hdr = register_net_sysctl(net, "net/ipv6", table); if (!hdr) @@ -513,30 +506,35 @@ static int __net_init ipv6_frags_init_net(struct net *net) { int res; - net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; - net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - net->ipv6.frags.f = &ip6_frags; - - res = inet_frags_init_net(&net->ipv6.frags); + res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net); if (res < 0) return res; + net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT; + res = ip6_frags_ns_sysctl_register(net); if (res < 0) - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); return res; } +static void __net_exit ipv6_frags_pre_exit_net(struct net *net) +{ + fqdir_pre_exit(net->ipv6.fqdir); +} + static void __net_exit ipv6_frags_exit_net(struct net *net) { ip6_frags_ns_sysctl_unregister(net); - inet_frags_exit_net(&net->ipv6.frags); + fqdir_exit(net->ipv6.fqdir); } static struct pernet_operations ip6_frags_ops = { - .init = ipv6_frags_init_net, - .exit = ipv6_frags_exit_net, + .init = ipv6_frags_init_net, + .pre_exit = ipv6_frags_pre_exit_net, + .exit = ipv6_frags_exit_net, }; static const struct rhashtable_params ip6_rhash_params = { @@ -587,8 +585,8 @@ err_protocol: void ipv6_frag_exit(void) { - inet_frags_fini(&ip6_frags); ip6_frags_sysctl_unregister(); unregister_pernet_subsys(&ip6_frags_ops); inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT); + inet_frags_fini(&ip6_frags); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 97a843cf164c..4d2e6b31a8d6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -100,7 +100,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict); -static size_t rt6_nlmsg_size(struct fib6_info *rt); +static size_t rt6_nlmsg_size(struct fib6_info *f6i); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } if (rt_dev == dev) { - rt->dst.dev = loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(rt_dev); } @@ -429,21 +429,27 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, struct fib6_info *sibling, *next_sibling; struct fib6_info *match = res->f6i; - if (!match->fib6_nsiblings || have_oif_match) + if ((!match->fib6_nsiblings && !match->nh) || have_oif_match) goto out; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. */ - if (!fl6->mp_hash) + if (!fl6->mp_hash && + (!match->nh || nexthop_is_multipath(match->nh))) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); - if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound)) + if (unlikely(match->nh)) { + nexthop_path_fib6_result(res, fl6->mp_hash); + return; + } + + if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) goto out; list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, fib6_siblings) { - const struct fib6_nh *nh = &sibling->fib6_nh; + const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); @@ -457,7 +463,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, out: res->f6i = match; - res->nh = &match->fib6_nh; + res->nh = match->fib6_nh; } /* @@ -485,6 +491,45 @@ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, return false; } +struct fib6_nh_dm_arg { + struct net *net; + const struct in6_addr *saddr; + int oif; + int flags; + struct fib6_nh *nh; +}; + +static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_dm_arg *arg = _arg; + + arg->nh = nh; + return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, + arg->flags); +} + +/* returns fib6_nh from nexthop or NULL */ +static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, + struct fib6_result *res, + const struct in6_addr *saddr, + int oif, int flags) +{ + struct fib6_nh_dm_arg arg = { + .net = net, + .saddr = saddr, + .oif = oif, + .flags = flags, + }; + + if (nexthop_is_blackhole(nh)) + return NULL; + + if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) + return arg.nh; + + return NULL; +} + static void rt6_device_match(struct net *net, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { @@ -493,14 +538,31 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { - nh = &f6i->fib6_nh; + if (unlikely(f6i->nh)) { + nh = nexthop_fib6_nh(f6i->nh); + if (nexthop_is_blackhole(f6i->nh)) + goto out_blackhole; + } else { + nh = f6i->fib6_nh; + } if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { - nh = &spf6i->fib6_nh; - if (__rt6_device_match(net, nh, saddr, oif, flags)) { + bool matched = false; + + if (unlikely(spf6i->nh)) { + nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, + oif, flags); + if (nh) + matched = true; + } else { + nh = spf6i->fib6_nh; + if (__rt6_device_match(net, nh, saddr, oif, flags)) + matched = true; + } + if (matched) { res->f6i = spf6i; goto out; } @@ -508,19 +570,32 @@ static void rt6_device_match(struct net *net, struct fib6_result *res, if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; goto out; } - nh = &f6i->fib6_nh; + if (unlikely(f6i->nh)) { + nh = nexthop_fib6_nh(f6i->nh); + if (nexthop_is_blackhole(f6i->nh)) + goto out_blackhole; + } else { + nh = f6i->fib6_nh; + } + if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; - nh = &res->f6i->fib6_nh; + nh = res->f6i->fib6_nh; } out: res->nh = nh; res->fib6_type = res->f6i->fib6_type; res->fib6_flags = res->f6i->fib6_flags; + return; + +out_blackhole: + res->fib6_flags |= RTF_REJECT; + res->fib6_type = RTN_BLACKHOLE; + res->nh = nh; } #ifdef CONFIG_IPV6_ROUTER_PREF @@ -691,6 +766,24 @@ out: return rc; } +struct fib6_nh_frl_arg { + u32 flags; + int oif; + int strict; + int *mpri; + bool *do_rr; + struct fib6_nh *nh; +}; + +static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_frl_arg *arg = _arg; + + arg->nh = nh; + return find_match(nh, arg->flags, arg->oif, arg->strict, + arg->mpri, arg->do_rr); +} + static void __find_rr_leaf(struct fib6_info *f6i_start, struct fib6_info *nomatch, u32 metric, struct fib6_result *res, struct fib6_info **cont, @@ -701,6 +794,7 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, for (f6i = f6i_start; f6i && f6i != nomatch; f6i = rcu_dereference(f6i->fib6_next)) { + bool matched = false; struct fib6_nh *nh; if (cont && f6i->fib6_metric != metric) { @@ -711,8 +805,34 @@ static void __find_rr_leaf(struct fib6_info *f6i_start, if (fib6_check_expired(f6i)) continue; - nh = &f6i->fib6_nh; - if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) { + if (unlikely(f6i->nh)) { + struct fib6_nh_frl_arg arg = { + .flags = f6i->fib6_flags, + .oif = oif, + .strict = strict, + .mpri = mpri, + .do_rr = do_rr + }; + + if (nexthop_is_blackhole(f6i->nh)) { + res->fib6_flags = RTF_REJECT; + res->fib6_type = RTN_BLACKHOLE; + res->f6i = f6i; + res->nh = nexthop_fib6_nh(f6i->nh); + return; + } + if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, + &arg)) { + matched = true; + nh = arg.nh; + } + } else { + nh = f6i->fib6_nh; + if (find_match(nh, f6i->fib6_flags, oif, strict, + mpri, do_rr)) + matched = true; + } + if (matched) { res->f6i = f6i; res->nh = nh; res->fib6_flags = f6i->fib6_flags; @@ -793,7 +913,7 @@ static void rt6_select(struct net *net, struct fib6_node *fn, int oif, out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; - res->nh = &res->f6i->fib6_nh; + res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } @@ -1114,6 +1234,8 @@ restart: rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); goto out; + } else if (res.fib6_flags & RTF_REJECT) { + goto do_create; } fib6_select_path(net, &res, fl6, fl6->flowi6_oif, @@ -1125,6 +1247,7 @@ restart: if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); } else { +do_create: rt = ip6_create_rt_rcu(&res); } @@ -1265,13 +1388,9 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) /* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { - struct rt6_info *pcpu_rt, **p; - - p = this_cpu_ptr(res->f6i->rt6i_pcpu); - pcpu_rt = *p; + struct rt6_info *pcpu_rt; - if (pcpu_rt) - ip6_hold_safe(NULL, &pcpu_rt); + pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); return pcpu_rt; } @@ -1282,13 +1401,10 @@ static struct rt6_info *rt6_make_pcpu_route(struct net *net, struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(res); - if (!pcpu_rt) { - dst_hold(&net->ipv6.ip6_null_entry->dst); - return net->ipv6.ip6_null_entry; - } + if (!pcpu_rt) + return NULL; - dst_hold(&pcpu_rt->dst); - p = this_cpu_ptr(res->f6i->rt6i_pcpu); + p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); @@ -1458,25 +1574,74 @@ static unsigned int fib6_mtu(const struct fib6_result *res) return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } +#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL + +/* used when the flushed bit is not relevant, only access to the bucket + * (ie., all bucket users except rt6_insert_exception); + * + * called under rcu lock; sometimes called with rt6_exception_lock held + */ +static +struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + + if (lock) + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + else + bucket = rcu_dereference(nh->rt6i_exception_bucket); + + /* remove bucket flushed bit if set */ + if (bucket) { + unsigned long p = (unsigned long)bucket; + + p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + } + + return bucket; +} + +static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) +{ + unsigned long p = (unsigned long)bucket; + + return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); +} + +/* called with rt6_exception_lock held */ +static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, + spinlock_t *lock) +{ + struct rt6_exception_bucket *bucket; + unsigned long p; + + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(lock)); + + p = (unsigned long)bucket; + p |= FIB6_EXCEPTION_BUCKET_FLUSHED; + bucket = (struct rt6_exception_bucket *)p; + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); +} + static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; + struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *f6i = res->f6i; + struct fib6_nh *nh = res->nh; int err = 0; spin_lock_bh(&rt6_exception_lock); - if (f6i->exception_bucket_flushed) { - err = -EINVAL; - goto out; - } - - bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), GFP_ATOMIC); @@ -1484,7 +1649,10 @@ static int rt6_insert_exception(struct rt6_info *nrt, err = -ENOMEM; goto out; } - rcu_assign_pointer(f6i->rt6i_exception_bucket, bucket); + rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); + } else if (fib6_nh_excptn_bucket_flushed(bucket)) { + err = -EINVAL; + goto out; } #ifdef CONFIG_IPV6_SUBTREES @@ -1539,7 +1707,7 @@ out: return err; } -void rt6_flush_exceptions(struct fib6_info *rt) +static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; @@ -1547,25 +1715,46 @@ void rt6_flush_exceptions(struct fib6_info *rt) int i; spin_lock_bh(&rt6_exception_lock); - /* Prevent rt6_insert_exception() to recreate the bucket list */ - rt->exception_bucket_flushed = 1; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; + /* Prevent rt6_insert_exception() to recreate the bucket list */ + if (!from) + fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { - hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) - rt6_remove_exception(bucket, rt6_ex); - WARN_ON_ONCE(bucket->depth); + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { + if (!from || + rcu_access_pointer(rt6_ex->rt6i->from) == from) + rt6_remove_exception(bucket, rt6_ex); + } + WARN_ON_ONCE(!from && bucket->depth); bucket++; } - out: spin_unlock_bh(&rt6_exception_lock); } +static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) +{ + struct fib6_info *f6i = arg; + + fib6_nh_flush_exceptions(nh, f6i); + + return 0; +} + +void rt6_flush_exceptions(struct fib6_info *f6i) +{ + if (f6i->nh) + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, + f6i); + else + fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); +} + /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ @@ -1594,7 +1783,7 @@ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, src_key = saddr; find_ex: #endif - bucket = rcu_dereference(res->f6i->rt6i_exception_bucket); + bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) @@ -1612,25 +1801,20 @@ find_ex: } /* Remove the passed in cached rt from the hash table that contains it */ -static int rt6_remove_exception_rt(struct rt6_info *rt) +static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) { + const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - struct fib6_info *from; int err; - from = rcu_dereference(rt->from); - if (!from || - !(rt->rt6i_flags & RTF_CACHE)) - return -EINVAL; - - if (!rcu_access_pointer(from->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(from->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); + #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1638,7 +1822,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, @@ -1655,23 +1839,60 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) return err; } -/* Find rt6_ex which contains the passed in rt cache and - * refresh its stamp - */ -static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +struct fib6_nh_excptn_arg { + struct rt6_info *rt; + int plen; +}; + +static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_excptn_arg *arg = _arg; + int err; + + err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); + if (err == 0) + return 1; + + return 0; +} + +static int rt6_remove_exception_rt(struct rt6_info *rt) { - struct rt6_exception_bucket *bucket; - struct in6_addr *src_key = NULL; - struct rt6_exception *rt6_ex; struct fib6_info *from; - rcu_read_lock(); from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) - goto unlock; + return -EINVAL; - bucket = rcu_dereference(from->rt6i_exception_bucket); + if (from->nh) { + struct fib6_nh_excptn_arg arg = { + .rt = rt, + .plen = from->fib6_src.plen + }; + int rc; + + /* rc = 1 means an entry was found */ + rc = nexthop_for_each_fib6_nh(from->nh, + rt6_nh_remove_exception_rt, + &arg); + return rc ? 0 : -ENOENT; + } + return fib6_nh_remove_exception(from->fib6_nh, + from->fib6_src.plen, rt); +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, + const struct rt6_info *rt) +{ + const struct in6_addr *src_key = NULL; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of @@ -1679,15 +1900,63 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ - if (from->fib6_src.plen) + if (plen) src_key = &rt->rt6i_src.addr; #endif - rt6_ex = __rt6_find_exception_rcu(&bucket, - &rt->rt6i_dst.addr, - src_key); + rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; +} + +struct fib6_nh_match_arg { + const struct net_device *dev; + const struct in6_addr *gw; + struct fib6_nh *match; +}; + +/* determine if fib6_nh has given device and gateway */ +static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_match_arg *arg = _arg; + + if (arg->dev != nh->fib_nh_dev || + (arg->gw && !nh->fib_nh_gw_family) || + (!arg->gw && nh->fib_nh_gw_family) || + (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) + return 0; + + arg->match = nh; + + /* found a match, break the loop */ + return 1; +} + +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct fib6_info *from; + struct fib6_nh *fib6_nh; + + rcu_read_lock(); + + from = rcu_dereference(rt->from); + if (!from || !(rt->rt6i_flags & RTF_CACHE)) + goto unlock; + if (from->nh) { + struct fib6_nh_match_arg arg = { + .dev = rt->dst.dev, + .gw = &rt->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); + + if (!arg.match) + return; + fib6_nh = arg.match; + } else { + fib6_nh = from->fib6_nh; + } + fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } @@ -1715,15 +1984,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, - struct fib6_info *rt, int mtu) + const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; @@ -1745,21 +2012,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) -static void rt6_exceptions_clean_tohost(struct fib6_info *rt, - struct in6_addr *gateway) +static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, + const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1824,23 +2089,21 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, gc_args->more++; } -void rt6_age_exceptions(struct fib6_info *rt, - struct fib6_gc_args *gc_args, - unsigned long now) +static void fib6_nh_age_exceptions(const struct fib6_nh *nh, + struct fib6_gc_args *gc_args, + unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; - if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); - bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, - lockdep_is_held(&rt6_exception_lock)); - + bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, @@ -1855,6 +2118,36 @@ void rt6_age_exceptions(struct fib6_info *rt, rcu_read_unlock_bh(); } +struct fib6_nh_age_excptn_arg { + struct fib6_gc_args *gc_args; + unsigned long now; +}; + +static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_age_excptn_arg *arg = _arg; + + fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); + return 0; +} + +void rt6_age_exceptions(struct fib6_info *f6i, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + if (f6i->nh) { + struct fib6_nh_age_excptn_arg arg = { + .gc_args = gc_args, + .now = now + }; + + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, + &arg); + } else { + fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); + } +} + /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) @@ -1891,9 +2184,12 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; - struct rt6_info *rt; + struct rt6_info *rt = NULL; int strict = 0; + WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && + !rcu_read_lock_held()); + strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (net->ipv6.devconf_all->forwarding == 0) @@ -1902,23 +2198,15 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); - if (res.f6i == net->ipv6.fib6_null_entry) { - rt = net->ipv6.ip6_null_entry; - rcu_read_unlock(); - dst_hold(&rt->dst); - return rt; - } + if (res.f6i == net->ipv6.fib6_null_entry) + goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { - if (ip6_hold_safe(net, &rt)) - dst_use_noref(&rt->dst, jiffies); - - rcu_read_unlock(); - return rt; + goto out; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be @@ -1926,40 +2214,38 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ - struct rt6_info *uncached_rt; - - uncached_rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); + rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); - rcu_read_unlock(); - - if (uncached_rt) { - /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc() - * No need for another dst_hold() + if (rt) { + /* 1 refcnt is taken during ip6_rt_cache_alloc(). + * As rt6_uncached_list_add() does not consume refcnt, + * this refcnt is always returned to the caller even + * if caller sets RT6_LOOKUP_F_DST_NOREF flag. */ - rt6_uncached_list_add(uncached_rt); + rt6_uncached_list_add(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); - } else { - uncached_rt = net->ipv6.ip6_null_entry; - dst_hold(&uncached_rt->dst); - } + rcu_read_unlock(); - return uncached_rt; + return rt; + } } else { /* Get a percpu copy */ - - struct rt6_info *pcpu_rt; - local_bh_disable(); - pcpu_rt = rt6_get_pcpu_route(&res); + rt = rt6_get_pcpu_route(&res); - if (!pcpu_rt) - pcpu_rt = rt6_make_pcpu_route(net, &res); + if (!rt) + rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); - rcu_read_unlock(); - - return pcpu_rt; } +out: + if (!rt) + rt = net->ipv6.ip6_null_entry; + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + ip6_hold_safe(net, &rt); + rcu_read_unlock(); + + return rt; } EXPORT_SYMBOL_GPL(ip6_pol_route); @@ -2084,17 +2370,54 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.basic.ip_proto = fl6->flowi6_proto; } break; + case 2: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + if (skb) { + struct flow_keys keys; + + if (!flkeys) { + skb_flow_dissect_flow_keys(skb, &keys, 0); + flkeys = &keys; + } + + /* Inner can be v4 or v6 */ + if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; + } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; + hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; + hash_keys.tags.flow_label = flkeys->tags.flow_label; + hash_keys.basic.ip_proto = flkeys->basic.ip_proto; + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + ip6_multipath_l3_keys(skb, &hash_keys, flkeys); + } + } else { + /* Same as case 0 */ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + hash_keys.addrs.v6addrs.src = fl6->saddr; + hash_keys.addrs.v6addrs.dst = fl6->daddr; + hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); + hash_keys.basic.ip_proto = fl6->flowi6_proto; + } + break; } mhash = flow_hash_from_keys(&hash_keys); return mhash >> 1; } +/* Called with rcu held */ void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); - int flags = RT6_LOOKUP_F_HAS_SADDR; + int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, @@ -2116,8 +2439,8 @@ void ip6_route_input(struct sk_buff *skb) if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); - skb_dst_set(skb, - ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); + skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, + &fl6, skb, flags)); } static struct rt6_info *ip6_pol_route_output(struct net *net, @@ -2129,8 +2452,9 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } -struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, - struct flowi6 *fl6, int flags) +struct dst_entry *ip6_route_output_flags_noref(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, int flags) { bool any_src; @@ -2138,6 +2462,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; + /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; @@ -2145,6 +2470,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, fl6->flowi6_iif = LOOPBACK_IFINDEX; + flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) @@ -2157,6 +2483,28 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } +EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref); + +struct dst_entry *ip6_route_output_flags(struct net *net, + const struct sock *sk, + struct flowi6 *fl6, + int flags) +{ + struct dst_entry *dst; + struct rt6_info *rt6; + + rcu_read_lock(); + dst = ip6_route_output_flags_noref(net, sk, fl6, flags); + rt6 = (struct rt6_info *)dst; + /* For dst cached in uncached_list, refcnt is already taken. */ + if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) { + dst = &net->ipv6.ip6_null_entry->dst; + dst_hold(dst); + } + rcu_read_unlock(); + + return dst; +} EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) @@ -2381,10 +2729,31 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, rcu_read_unlock(); return; } - res.nh = &res.f6i->fib6_nh; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; + if (res.f6i->nh) { + struct fib6_nh_match_arg arg = { + .dev = dst->dev, + .gw = &rt6->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(res.f6i->nh, + fib6_nh_find_match, &arg); + + /* fib6_info uses a nexthop that does not have fib6_nh + * using the dst->dev + gw. Should be impossible. + */ + if (!arg.match) { + rcu_read_unlock(); + return; + } + + res.nh = arg.match; + } else { + res.nh = res.f6i->fib6_nh; + } + nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); @@ -2491,6 +2860,21 @@ static bool ip6_redirect_nh_match(const struct fib6_result *res, return true; } +struct fib6_nh_rd_arg { + struct fib6_result *res; + struct flowi6 *fl6; + const struct in6_addr *gw; + struct rt6_info **ret; +}; + +static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_rd_arg *arg = _arg; + + arg->res->nh = nh; + return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); +} + /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; @@ -2506,6 +2890,12 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL; struct fib6_result res = {}; + struct fib6_nh_rd_arg arg = { + .res = &res, + .fl6 = fl6, + .gw = &rdfl->gateway, + .ret = &ret + }; struct fib6_info *rt; struct fib6_node *fn; @@ -2530,14 +2920,24 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; - res.nh = &rt->fib6_nh; - if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; - if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) - goto out; + if (unlikely(rt->nh)) { + if (nexthop_is_blackhole(rt->nh)) + continue; + /* on match, res->nh is filled in and potentially ret */ + if (nexthop_for_each_fib6_nh(rt->nh, + fib6_nh_redirect_match, + &arg)) + goto out; + } else { + res.nh = rt->fib6_nh; + if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, + &ret)) + goto out; + } } if (!rt) @@ -2554,7 +2954,7 @@ restart: } res.f6i = rt; - res.nh = &rt->fib6_nh; + res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); @@ -2781,10 +3181,9 @@ out: return entries > rt_max_size; } -static struct rt6_info *ip6_nh_lookup_table(struct net *net, - struct fib6_config *cfg, - const struct in6_addr *gw_addr, - u32 tbid, int flags) +static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, + const struct in6_addr *gw_addr, u32 tbid, + int flags, struct fib6_result *res) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, @@ -2792,25 +3191,23 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net, .saddr = cfg->fc_prefsrc, }; struct fib6_table *table; - struct rt6_info *rt; + int err; table = fib6_get_table(net, tbid); if (!table) - return NULL; + return -EINVAL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; - rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags); - /* if table lookup failed, fall back to full lookup */ - if (rt == net->ipv6.ip6_null_entry) { - ip6_rt_put(rt); - rt = NULL; - } + err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); + if (!err && res->f6i != net->ipv6.fib6_null_entry) + fib6_select_path(net, res, &fl6, cfg->fc_ifindex, + cfg->fc_ifindex != 0, NULL, flags); - return rt; + return err; } static int ip6_route_check_nh_onlink(struct net *net, @@ -2818,29 +3215,19 @@ static int ip6_route_check_nh_onlink(struct net *net, const struct net_device *dev, struct netlink_ext_ack *extack) { - u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; - u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; - struct fib6_info *from; - struct rt6_info *grt; + struct fib6_result res = {}; int err; - err = 0; - grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); - if (grt) { - rcu_read_lock(); - from = rcu_dereference(grt->from); - if (!grt->dst.error && - /* ignore match if it is the default route */ - from && !ipv6_addr_any(&from->fib6_dst.addr) && - (grt->rt6i_flags & flags || dev != grt->dst.dev)) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway or device mismatch"); - err = -EINVAL; - } - rcu_read_unlock(); - - ip6_rt_put(grt); + err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); + if (!err && !(res.fib6_flags & RTF_REJECT) && + /* ignore match if it is the default route */ + !ipv6_addr_any(&res.f6i->fib6_dst.addr) && + (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { + NL_SET_ERR_MSG(extack, + "Nexthop has invalid gateway or device mismatch"); + err = -EINVAL; } return err; @@ -2853,47 +3240,50 @@ static int ip6_route_check_nh(struct net *net, { const struct in6_addr *gw_addr = &cfg->fc_gateway; struct net_device *dev = _dev ? *_dev : NULL; - struct rt6_info *grt = NULL; + int flags = RT6_LOOKUP_F_IFACE; + struct fib6_result res = {}; int err = -EHOSTUNREACH; if (cfg->fc_table) { - int flags = RT6_LOOKUP_F_IFACE; - - grt = ip6_nh_lookup_table(net, cfg, gw_addr, - cfg->fc_table, flags); - if (grt) { - if (grt->rt6i_flags & RTF_GATEWAY || - (dev && dev != grt->dst.dev)) { - ip6_rt_put(grt); - grt = NULL; - } - } + err = ip6_nh_lookup_table(net, cfg, gw_addr, + cfg->fc_table, flags, &res); + /* gw_addr can not require a gateway or resolve to a reject + * route. If a device is given, it must match the result. + */ + if (err || res.fib6_flags & RTF_REJECT || + res.nh->fib_nh_gw_family || + (dev && dev != res.nh->fib_nh_dev)) + err = -EHOSTUNREACH; } - if (!grt) - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1); + if (err < 0) { + struct flowi6 fl6 = { + .flowi6_oif = cfg->fc_ifindex, + .daddr = *gw_addr, + }; - if (!grt) - goto out; + err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); + if (err || res.fib6_flags & RTF_REJECT || + res.nh->fib_nh_gw_family) + err = -EHOSTUNREACH; + if (err) + return err; + + fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, + cfg->fc_ifindex != 0, NULL, flags); + } + + err = 0; if (dev) { - if (dev != grt->dst.dev) { - ip6_rt_put(grt); - goto out; - } + if (dev != res.nh->fib_nh_dev) + err = -EHOSTUNREACH; } else { - *_dev = dev = grt->dst.dev; - *idev = grt->rt6i_idev; + *_dev = dev = res.nh->fib_nh_dev; dev_hold(dev); - in6_dev_hold(grt->rt6i_idev); + *idev = in6_dev_get(dev); } - if (!(grt->rt6i_flags & RTF_GATEWAY)) - err = 0; - - ip6_rt_put(grt); - -out: return err; } @@ -2934,11 +3324,15 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, goto out; } + rcu_read_lock(); + if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else err = ip6_route_check_nh(net, cfg, _dev, idev); + rcu_read_unlock(); + if (err) goto out; } @@ -3039,7 +3433,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, goto out; } } - goto set_dev; + goto pcpu_alloc; } if (cfg->fc_flags & RTF_GATEWAY) { @@ -3075,7 +3469,14 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; -set_dev: + +pcpu_alloc: + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!fib6_nh->rt6i_pcpu) { + err = -ENOMEM; + goto out; + } + fib6_nh->fib_nh_dev = dev; fib6_nh->fib_nh_oif = dev->ifindex; err = 0; @@ -3095,6 +3496,38 @@ out: void fib6_nh_release(struct fib6_nh *fib6_nh) { + struct rt6_exception_bucket *bucket; + + rcu_read_lock(); + + fib6_nh_flush_exceptions(fib6_nh, NULL); + bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); + if (bucket) { + rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); + kfree(bucket); + } + + rcu_read_unlock(); + + if (fib6_nh->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); + pcpu_rt = *ppcpu_rt; + if (pcpu_rt) { + dst_dev_put(&pcpu_rt->dst); + dst_release(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } + + free_percpu(fib6_nh->rt6i_pcpu); + } + fib_nh_common_release(&fib6_nh->nh_common); } @@ -3104,7 +3537,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_info *rt = NULL; + struct nexthop *nh = NULL; struct fib6_table *table; + struct fib6_nh *fib6_nh; int err = -EINVAL; int addr_type; @@ -3140,6 +3575,16 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } #endif + if (cfg->fc_nh_id) { + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto out; + } + err = fib6_check_nexthop(nh, cfg, extack); + if (err) + goto out; + } err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && @@ -3157,7 +3602,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags); + rt = fib6_info_alloc(gfp_flags, !nh); if (!rt) goto out; @@ -3197,19 +3642,35 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack); - if (err) - goto out; + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + goto out; + } + if (rt->fib6_src.plen) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto out; + } + rt->nh = nh; + fib6_nh = nexthop_fib6_nh(rt->nh); + } else { + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); + if (err) + goto out; - /* We cannot add true routes via loopback here, - * they would result in kernel looping; promote them to reject routes - */ - addr_type = ipv6_addr_type(&cfg->fc_dst); - if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type)) - rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; + fib6_nh = rt->fib6_nh; + + /* We cannot add true routes via loopback here, they would + * result in kernel looping; promote them to reject routes + */ + addr_type = ipv6_addr_type(&cfg->fc_dst); + if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, + addr_type)) + rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; + } if (!ipv6_addr_any(&cfg->fc_prefsrc)) { - struct net_device *dev = fib6_info_nh_dev(rt); + struct net_device *dev = fib6_nh->fib_nh_dev; if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); @@ -3301,6 +3762,12 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) info->skip_notify = 1; } + info->skip_notify_kernel = 1; + call_fib6_multipath_entry_notifiers(net, + FIB_EVENT_ENTRY_DEL, + rt, + rt->fib6_nsiblings, + NULL); list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { @@ -3323,7 +3790,7 @@ out_put: return err; } -static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) +static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; @@ -3339,10 +3806,49 @@ out: return rc; } +static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, + struct fib6_nh *nh) +{ + struct fib6_result res = { + .f6i = rt, + .nh = nh, + }; + struct rt6_info *rt_cache; + + rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); + if (rt_cache) + return __ip6_del_cached_rt(rt_cache, cfg); + + return 0; +} + +struct fib6_nh_del_cached_rt_arg { + struct fib6_config *cfg; + struct fib6_info *f6i; +}; + +static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) +{ + struct fib6_nh_del_cached_rt_arg *arg = _arg; + int rc; + + rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); + return rc != -ESRCH ? rc : 0; +} + +static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) +{ + struct fib6_nh_del_cached_rt_arg arg = { + .cfg = cfg, + .f6i = f6i + }; + + return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); +} + static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rt6_info *rt_cache; struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; @@ -3365,26 +3871,45 @@ static int ip6_route_del(struct fib6_config *cfg, for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; + if (rt->nh && cfg->fc_nh_id && + rt->nh->id != cfg->fc_nh_id) + continue; + if (cfg->fc_flags & RTF_CACHE) { - struct fib6_result res = { - .f6i = rt, - }; - int rc; - - rt_cache = rt6_find_cached_rt(&res, - &cfg->fc_dst, - &cfg->fc_src); - if (rt_cache) { - rc = ip6_del_cached_rt(rt_cache, cfg); - if (rc != -ESRCH) { - rcu_read_unlock(); - return rc; - } + int rc = 0; + + if (rt->nh) { + rc = ip6_del_cached_rt_nh(cfg, rt); + } else if (cfg->fc_nh_id) { + continue; + } else { + nh = rt->fib6_nh; + rc = ip6_del_cached_rt(cfg, rt, nh); + } + if (rc != -ESRCH) { + rcu_read_unlock(); + return rc; } continue; } - nh = &rt->fib6_nh; + if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) + continue; + if (cfg->fc_protocol && + cfg->fc_protocol != rt->fib6_protocol) + continue; + + if (rt->nh) { + if (!fib6_info_hold_safe(rt)) + continue; + rcu_read_unlock(); + + return __ip6_del_rt(rt, &cfg->fc_nlinfo); + } + if (cfg->fc_nh_id) + continue; + + nh = rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) @@ -3392,10 +3917,6 @@ static int ip6_route_del(struct fib6_config *cfg, if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; - if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) - continue; - if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) - continue; if (!fib6_info_hold_safe(rt)) continue; rcu_read_unlock(); @@ -3506,7 +4027,25 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (!res.f6i) goto out; - res.nh = &res.f6i->fib6_nh; + if (res.f6i->nh) { + struct fib6_nh_match_arg arg = { + .dev = dst->dev, + .gw = &rt->rt6i_gateway, + }; + + nexthop_for_each_fib6_nh(res.f6i->nh, + fib6_nh_find_match, &arg); + + /* fib6_info uses a nexthop that does not have fib6_nh + * using the dst->dev. Should be impossible + */ + if (!arg.match) + goto out; + res.nh = arg.match; + } else { + res.nh = res.f6i->fib6_nh; + } + res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); @@ -3558,12 +4097,15 @@ static struct fib6_info *rt6_get_route_info(struct net *net, goto out; for_each_fib6_node_rt_rcu(fn) { - if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex) + /* these routes do not use nexthops */ + if (rt->nh) + continue; + if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || - !rt->fib6_nh.fib_nh_gw_family) + !rt->fib6_nh->fib_nh_gw_family) continue; - if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr)) + if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; @@ -3621,8 +4163,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net, rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { - struct fib6_nh *nh = &rt->fib6_nh; + struct fib6_nh *nh; + + /* RA routes do not use nexthops */ + if (rt->nh) + continue; + nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&nh->fib_nh_gw6, addr)) @@ -3873,7 +4420,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; - if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) && + if (!rt->nh && + ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) { spin_lock_bh(&rt6_exception_lock); @@ -3901,18 +4449,22 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; + struct fib6_nh *nh; + /* RA routes do not use nexthops */ + if (rt->nh) + return 0; + + nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && - rt->fib6_nh.fib_nh_gw_family && - ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) { + nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; - } /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. */ - rt6_exceptions_clean_tohost(rt, gateway); + fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } @@ -3950,11 +4502,12 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) return NULL; } +/* only called for fib entries with builtin fib6_nh */ static bool rt6_is_dead(const struct fib6_info *rt) { - if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD || - (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN && - ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev))) + if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || + (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && + ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; @@ -3966,11 +4519,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt) int total = 0; if (!rt6_is_dead(rt)) - total += rt->fib6_nh.fib_nh_weight; + total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) - total += iter->fib6_nh.fib_nh_weight; + total += iter->fib6_nh->fib_nh_weight; } return total; @@ -3981,11 +4534,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) int upper_bound = -1; if (!rt6_is_dead(rt)) { - *weight += rt->fib6_nh.fib_nh_weight; + *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } - atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound); + atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) @@ -4028,9 +4581,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg) const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); - if (rt != net->ipv6.fib6_null_entry && - rt->fib6_nh.fib_nh_dev == arg->dev) { - rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags; + if (rt != net->ipv6.fib6_null_entry && !rt->nh && + rt->fib6_nh->fib_nh_dev == arg->dev) { + rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } @@ -4053,15 +4606,16 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } +/* only called for fib entries with inline fib6_nh */ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) + if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) + if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; @@ -4082,12 +4636,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, struct fib6_info *iter; unsigned int dead = 0; - if (rt->fib6_nh.fib_nh_dev == down_dev || - rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (rt->fib6_nh->fib_nh_dev == down_dev || + rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == down_dev || - iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD) + if (iter->fib6_nh->fib_nh_dev == down_dev || + iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; @@ -4099,11 +4653,11 @@ static void rt6_multipath_nh_flags_set(struct fib6_info *rt, { struct fib6_info *iter; - if (rt->fib6_nh.fib_nh_dev == dev) - rt->fib6_nh.fib_nh_flags |= nh_flags; + if (rt->fib6_nh->fib_nh_dev == dev) + rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) - if (iter->fib6_nh.fib_nh_dev == dev) - iter->fib6_nh.fib_nh_flags |= nh_flags; + if (iter->fib6_nh->fib_nh_dev == dev) + iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ @@ -4113,17 +4667,17 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); - if (rt == net->ipv6.fib6_null_entry) + if (rt == net->ipv6.fib6_null_entry || rt->nh) return 0; switch (arg->event) { case NETDEV_UNREGISTER: - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) - return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0; + return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; @@ -4139,10 +4693,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) } return -2; case NETDEV_CHANGE: - if (rt->fib6_nh.fib_nh_dev != dev || + if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; - rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN; + rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } @@ -4176,9 +4730,36 @@ void rt6_disable_ip(struct net_device *dev, unsigned long event) struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; + struct fib6_info *f6i; }; -static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) +static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; + struct fib6_info *f6i = arg->f6i; + + /* For administrative MTU increase, there is no way to discover + * IPv6 PMTU increase, so PMTU increase should be updated here. + * Since RFC 1981 doesn't include administrative MTU increase + * update PMTU increase is a MUST. (i.e. jumbo frame) + */ + if (nh->fib_nh_dev == arg->dev) { + struct inet6_dev *idev = __in6_dev_get(arg->dev); + u32 mtu = f6i->fib6_pmtu; + + if (mtu >= arg->mtu || + (mtu < arg->mtu && mtu == idev->cnf.mtu6)) + fib6_metric_set(f6i, RTAX_MTU, arg->mtu); + + spin_lock_bh(&rt6_exception_lock); + rt6_exceptions_update_pmtu(idev, nh, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); + } + + return 0; +} + +static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; @@ -4193,24 +4774,17 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg) if (!idev) return 0; - /* For administrative MTU increase, there is no way to discover - IPv6 PMTU increase, so PMTU increase should be updated here. - Since RFC 1981 doesn't include administrative MTU increase - update PMTU increase is a MUST. (i.e. jumbo frame) - */ - if (rt->fib6_nh.fib_nh_dev == arg->dev && - !fib6_metric_locked(rt, RTAX_MTU)) { - u32 mtu = rt->fib6_pmtu; - - if (mtu >= arg->mtu || - (mtu < arg->mtu && mtu == idev->cnf.mtu6)) - fib6_metric_set(rt, RTAX_MTU, arg->mtu); + if (fib6_metric_locked(f6i, RTAX_MTU)) + return 0; - spin_lock_bh(&rt6_exception_lock); - rt6_exceptions_update_pmtu(idev, rt, arg->mtu); - spin_unlock_bh(&rt6_exception_lock); + arg->f6i = f6i; + if (f6i->nh) { + /* fib6_nh_mtu_change only returns 0, so this is safe */ + return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, + arg); } - return 0; + + return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) @@ -4224,6 +4798,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu) } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { + [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, @@ -4241,6 +4816,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, + [RTA_NH_ID] = { .type = NLA_U32 }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -4287,6 +4863,16 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); + if (tb[RTA_NH_ID]) { + if (tb[RTA_GATEWAY] || tb[RTA_OIF] || + tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { + NL_SET_ERR_MSG(extack, + "Nexthop specification and nexthop id are mutually exclusive"); + goto errout; + } + cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); + } + if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; @@ -4430,6 +5016,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; + enum fib_event_type event_type; struct fib6_config r_cfg; struct rtnexthop *rtnh; struct fib6_info *rt; @@ -4489,7 +5076,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1; + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, rt, &r_cfg); @@ -4501,12 +5088,23 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } + if (list_empty(&rt6_nh_list)) { + NL_SET_ERR_MSG(extack, + "Invalid nexthop configuration - no valid nexthops"); + return -EINVAL; + } + /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done */ info->skip_notify = 1; + /* For add and replace, send one notification with all nexthops. For + * append, send one notification with all appended nexthops. + */ + info->skip_notify_kernel = 1; + err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, next) { err = __ip6_ins_rt(nh->fib6_info, info, extack); @@ -4543,6 +5141,15 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nhn++; } + event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD; + err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type, + rt_notif, nhn - 1, extack); + if (err) { + /* Delete all the siblings that were just added */ + err_nh = NULL; + goto add_errout; + } + /* success ... tell user about new route */ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); goto cleanup; @@ -4621,6 +5228,12 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + if (cfg.fc_nh_id && + !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + return -EINVAL; + } + if (cfg.fc_mp) return ip6_route_multipath_del(&cfg, extack); else { @@ -4648,17 +5261,46 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, return ip6_route_add(&cfg, GFP_KERNEL, extack); } -static size_t rt6_nlmsg_size(struct fib6_info *rt) +/* add the overhead of this fib6_nh to nexthop_len */ +static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) { - int nexthop_len = 0; + int *nexthop_len = arg; - if (rt->fib6_nsiblings) { - nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ - + NLA_ALIGN(sizeof(struct rtnexthop)) - + nla_total_size(16) /* RTA_GATEWAY */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws); + *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16); /* RTA_GATEWAY */ - nexthop_len *= rt->fib6_nsiblings; + if (nh->fib_nh_lws) { + /* RTA_ENCAP_TYPE */ + *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); + /* RTA_ENCAP */ + *nexthop_len += nla_total_size(2); + } + + return 0; +} + +static size_t rt6_nlmsg_size(struct fib6_info *f6i) +{ + int nexthop_len; + + if (f6i->nh) { + nexthop_len = nla_total_size(4); /* RTA_NH_ID */ + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, + &nexthop_len); + } else { + struct fib6_nh *nh = f6i->fib6_nh; + + nexthop_len = 0; + if (f6i->fib6_nsiblings) { + nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ + + NLA_ALIGN(sizeof(struct rtnexthop)) + + nla_total_size(16) /* RTA_GATEWAY */ + + lwtunnel_get_encap_size(nh->fib_nh_lws); + + nexthop_len *= f6i->fib6_nsiblings; + } + nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } return NLMSG_ALIGN(sizeof(struct rtmsg)) @@ -4674,10 +5316,38 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws) + nexthop_len; } +static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, + unsigned char *flags) +{ + if (nexthop_is_multipath(nh)) { + struct nlattr *mp; + + mp = nla_nest_start(skb, RTA_MULTIPATH); + if (!mp) + goto nla_put_failure; + + if (nexthop_mpath_fill_node(skb, nh)) + goto nla_put_failure; + + nla_nest_end(skb, mp); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = nexthop_fib6_nh(nh); + if (fib_nexthop_info(skb, &fib6_nh->nh_common, + flags, false) < 0) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, @@ -4687,6 +5357,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct rt6_info *rt6 = (struct rt6_info *)dst; struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; + unsigned char nh_flags = 0; struct nlmsghdr *nlh; struct rtmsg *rtm; long expires = 0; @@ -4794,22 +5465,31 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (!mp) goto nla_put_failure; - if (fib_add_nexthop(skb, &rt->fib6_nh.nh_common, - rt->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, + rt->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { - if (fib_add_nexthop(skb, &sibling->fib6_nh.nh_common, - sibling->fib6_nh.fib_nh_weight) < 0) + if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, + sibling->fib6_nh->fib_nh_weight) < 0) goto nla_put_failure; } nla_nest_end(skb, mp); - } else { - unsigned char nh_flags = 0; + } else if (rt->nh) { + if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) + goto nla_put_failure; - if (fib_nexthop_info(skb, &rt->fib6_nh.nh_common, + if (nexthop_is_blackhole(rt->nh)) + rtm->rtm_type = RTN_BLACKHOLE; + + if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + goto nla_put_failure; + + rtm->rtm_flags |= nh_flags; + } else { + if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, &nh_flags, false) < 0) goto nla_put_failure; @@ -4836,10 +5516,28 @@ nla_put_failure: return -EMSGSIZE; } +static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) +{ + const struct net_device *dev = arg; + + if (nh->fib_nh_dev == dev) + return 1; + + return 0; +} + static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { - if (f6i->fib6_nh.fib_nh_dev == dev) + if (f6i->nh) { + struct net_device *_dev = (struct net_device *)dev; + + return !!nexthop_for_each_fib6_nh(f6i->nh, + fib6_info_nh_uses_dev, + _dev); + } + + if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (f6i->fib6_nsiblings) { @@ -4847,7 +5545,7 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, list_for_each_entry_safe(sibling, next_sibling, &f6i->fib6_siblings, fib6_siblings) { - if (sibling->fib6_nh.fib_nh_dev == dev) + if (sibling->fib6_nh->fib_nh_dev == dev) return true; } } @@ -4855,33 +5553,131 @@ static bool fib6_info_uses_dev(const struct fib6_info *f6i, return false; } -int rt6_dump_route(struct fib6_info *rt, void *p_arg) +struct fib6_nh_exception_dump_walker { + struct rt6_rtnl_dump_arg *dump; + struct fib6_info *rt; + unsigned int flags; + unsigned int skip; + unsigned int count; +}; + +static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) +{ + struct fib6_nh_exception_dump_walker *w = arg; + struct rt6_rtnl_dump_arg *dump = w->dump; + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i, err; + + bucket = fib6_nh_get_excptn_bucket(nh, NULL); + if (!bucket) + return 0; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (w->skip) { + w->skip--; + continue; + } + + /* Expiration of entries doesn't bump sernum, insertion + * does. Removal is triggered by insertion, so we can + * rely on the fact that if entries change between two + * partial dumps, this node is scanned again completely, + * see rt6_insert_exception() and fib6_dump_table(). + * + * Count expired entries we go through as handled + * entries that we'll skip next time, in case of partial + * node dump. Otherwise, if entries expire meanwhile, + * we'll skip the wrong amount. + */ + if (rt6_check_expired(rt6_ex->rt6i)) { + w->count++; + continue; + } + + err = rt6_fill_node(dump->net, dump->skb, w->rt, + &rt6_ex->rt6i->dst, NULL, NULL, 0, + RTM_NEWROUTE, + NETLINK_CB(dump->cb->skb).portid, + dump->cb->nlh->nlmsg_seq, w->flags); + if (err) + return err; + + w->count++; + } + bucket++; + } + + return 0; +} + +/* Return -1 if done with node, number of handled routes on partial dump */ +int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct fib_dump_filter *filter = &arg->filter; unsigned int flags = NLM_F_MULTI; struct net *net = arg->net; + int count = 0; if (rt == net->ipv6.fib6_null_entry) - return 0; + return -1; if ((filter->flags & RTM_F_PREFIX) && !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ - return 1; + return -1; } - if (filter->filter_set) { - if ((filter->rt_type && rt->fib6_type != filter->rt_type) || - (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || - (filter->protocol && rt->fib6_protocol != filter->protocol)) { - return 1; - } + if (filter->filter_set && + ((filter->rt_type && rt->fib6_type != filter->rt_type) || + (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || + (filter->protocol && rt->fib6_protocol != filter->protocol))) { + return -1; + } + + if (filter->filter_set || + !filter->dump_routes || !filter->dump_exceptions) { flags |= NLM_F_DUMP_FILTERED; } - return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, - RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, - arg->cb->nlh->nlmsg_seq, flags); + if (filter->dump_routes) { + if (skip) { + skip--; + } else { + if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, + 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).portid, + arg->cb->nlh->nlmsg_seq, flags)) { + return 0; + } + count++; + } + } + + if (filter->dump_exceptions) { + struct fib6_nh_exception_dump_walker w = { .dump = arg, + .rt = rt, + .flags = flags, + .skip = skip, + .count = 0 }; + int err; + + rcu_read_lock(); + if (rt->nh) { + err = nexthop_for_each_fib6_nh(rt->nh, + rt6_nh_dump_exceptions, + &w); + } else { + err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); + } + rcu_read_unlock(); + + if (err) + return count += w.count; + } + + return -1; } static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, @@ -5126,6 +5922,38 @@ errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } +void fib6_rt_update(struct net *net, struct fib6_info *rt, + struct nl_info *info) +{ + u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + /* call_fib6_entry_notifiers will be removed when in-kernel notifier + * is implemented and supported for nexthop objects + */ + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL); + + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + if (!skb) + goto errout; + + err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, + RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); + if (err < 0) { + /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, + info->nlh, gfp_any()); + return; +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); +} + static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { @@ -5136,7 +5964,7 @@ static int ip6_route_dev_notify(struct notifier_block *this, return NOTIFY_OK; if (event == NETDEV_REGISTER) { - net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev; + net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -5330,11 +6158,11 @@ static int __net_init ip6_route_net_init(struct net *net) if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; - net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template, - sizeof(*net->ipv6.fib6_null_entry), - GFP_KERNEL); + net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; + memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, + sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), @@ -5344,6 +6172,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; @@ -5355,6 +6184,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), @@ -5364,6 +6194,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); + INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached); #endif net->ipv6.sysctl.flush_delay = 0; @@ -5471,7 +6302,7 @@ void __init ip6_route_init_special_entries(void) /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ - init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev; + init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index e15cd37024fd..dc4c91e0bfb8 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -23,6 +23,7 @@ static int zero; static int one = 1; +static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; @@ -113,7 +114,9 @@ static struct ctl_table ipv6_table_template[] = { .data = &init_net.ipv6.sysctl.flowlabel_reflect, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &flowlabel_reflect_max, }, { .procname = "max_dst_opts_number", diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7a14ea37d2df..d56a9019a0fe 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -171,7 +171,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } @@ -883,9 +883,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_oif = oif; } - if (sk) - mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) { + mark = inet_twsk(sk)->tw_mark; + /* autoflowlabel relies on buff->hash */ + skb_set_hash(buff, inet_twsk(sk)->tw_txhash, + PKT_HASH_TYPE_L4); + } else { + mark = sk->sk_mark; + } + buff->tstamp = tcp_transmit_time(sk); + } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; @@ -912,15 +920,17 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); + struct ipv6hdr *ipv6h = ipv6_hdr(skb); u32 seq = 0, ack_seq = 0; struct tcp_md5sig_key *key = NULL; #ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; - struct ipv6hdr *ipv6h = ipv6_hdr(skb); unsigned char newhash[16]; int genhash; struct sock *sk1 = NULL; #endif + __be32 label = 0; + struct net *net; int oif = 0; if (th->rst) @@ -932,6 +942,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (!sk && !ipv6_unicast_destination(skb)) return; + net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); @@ -945,7 +956,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), + sk1 = inet6_lookup_listener(net, &tcp_hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, @@ -975,9 +986,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) trace_tcp_send_reset(sk, skb); + if (sk->sk_state == TCP_TIME_WAIT) + label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); + } else { + if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) + label = ip6_flowlabel(ipv6h); } - tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, + label); #ifdef CONFIG_TCP_MD5SIG out: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 70b01bd95022..827fe7385078 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -54,16 +54,6 @@ #include <trace/events/skb.h> #include "udp_impl.h" -static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if defined(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_udp_l3mdev_accept && - skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) - return true; -#endif - return false; -} - static u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -111,7 +101,7 @@ void udp_v6_rehash(struct sock *sk) static int compute_score(struct sock *sk, struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, - int dif, int sdif, bool exact_dif) + int dif, int sdif) { int score; struct inet_sock *inet; @@ -155,8 +145,8 @@ static int compute_score(struct sock *sk, struct net *net, static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, - int dif, int sdif, bool exact_dif, - struct udp_hslot *hslot2, struct sk_buff *skb) + int dif, int sdif, struct udp_hslot *hslot2, + struct sk_buff *skb) { struct sock *sk, *result; int score, badness; @@ -166,7 +156,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, badness = -1; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif); + daddr, hnum, dif, sdif); if (score > badness) { if (sk->sk_reuseport) { hash = udp6_ehashfn(net, daddr, hnum, @@ -195,14 +185,13 @@ struct sock *__udp6_lib_lookup(struct net *net, unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result; - bool exact_dif = udp6_lib_exact_dif_match(net, skb); hash2 = ipv6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; result = udp6_lib_lookup2(net, saddr, sport, - daddr, hnum, dif, sdif, exact_dif, + daddr, hnum, dif, sdif, hslot2, skb); if (!result) { hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); @@ -212,10 +201,9 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, - exact_dif, hslot2, - skb); + hslot2, skb); } - if (unlikely(IS_ERR(result))) + if (IS_ERR(result)) return NULL; return result; } @@ -838,8 +826,7 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, int ret; if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) - skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, - ip6_compute_pseudo); + skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo); ret = udpv6_queue_rcv_skb(sk, skb); @@ -1332,7 +1319,7 @@ do_udp_sendmsg: fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -1384,7 +1371,7 @@ do_udp_sendmsg: } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 5bdca3d5d6b7..78daadecbdef 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -21,137 +21,6 @@ #include <net/ipv6.h> #include <net/addrconf.h> -static void -__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) -{ - const struct flowi6 *fl6 = &fl->u.ip6; - - /* Initialize temporary selector matching only - * to current session. */ - *(struct in6_addr *)&sel->daddr = fl6->daddr; - *(struct in6_addr *)&sel->saddr = fl6->saddr; - sel->dport = xfrm_flowi_dport(fl, &fl6->uli); - sel->dport_mask = htons(0xffff); - sel->sport = xfrm_flowi_sport(fl, &fl6->uli); - sel->sport_mask = htons(0xffff); - sel->family = AF_INET6; - sel->prefixlen_d = 128; - sel->prefixlen_s = 128; - sel->proto = fl6->flowi6_proto; - sel->ifindex = fl6->flowi6_oif; -} - -static void -xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, - const xfrm_address_t *daddr, const xfrm_address_t *saddr) -{ - x->id = tmpl->id; - if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) - memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); - memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); - if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) - memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); - x->props.mode = tmpl->mode; - x->props.reqid = tmpl->reqid; - x->props.family = AF_INET6; -} - -/* distribution counting sort function for xfrm_state and xfrm_tmpl */ -static int -__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass) -{ - int count[XFRM_MAX_DEPTH] = { }; - int class[XFRM_MAX_DEPTH]; - int i; - - for (i = 0; i < n; i++) { - int c; - class[i] = c = cmp(src[i]); - count[c]++; - } - - for (i = 2; i < maxclass; i++) - count[i] += count[i - 1]; - - for (i = 0; i < n; i++) { - dst[count[class[i] - 1]++] = src[i]; - src[i] = NULL; - } - - return 0; -} - -/* - * Rule for xfrm_state: - * - * rule 1: select IPsec transport except AH - * rule 2: select MIPv6 RO or inbound trigger - * rule 3: select IPsec transport AH - * rule 4: select IPsec tunnel - * rule 5: others - */ -static int __xfrm6_state_sort_cmp(void *p) -{ - struct xfrm_state *v = p; - - switch (v->props.mode) { - case XFRM_MODE_TRANSPORT: - if (v->id.proto != IPPROTO_AH) - return 1; - else - return 3; -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case XFRM_MODE_ROUTEOPTIMIZATION: - case XFRM_MODE_IN_TRIGGER: - return 2; -#endif - case XFRM_MODE_TUNNEL: - case XFRM_MODE_BEET: - return 4; - } - return 5; -} - -static int -__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n) -{ - return __xfrm6_sort((void **)dst, (void **)src, n, - __xfrm6_state_sort_cmp, 6); -} - -/* - * Rule for xfrm_tmpl: - * - * rule 1: select IPsec transport - * rule 2: select MIPv6 RO or inbound trigger - * rule 3: select IPsec tunnel - * rule 4: others - */ -static int __xfrm6_tmpl_sort_cmp(void *p) -{ - struct xfrm_tmpl *v = p; - switch (v->mode) { - case XFRM_MODE_TRANSPORT: - return 1; -#if IS_ENABLED(CONFIG_IPV6_MIP6) - case XFRM_MODE_ROUTEOPTIMIZATION: - case XFRM_MODE_IN_TRIGGER: - return 2; -#endif - case XFRM_MODE_TUNNEL: - case XFRM_MODE_BEET: - return 3; - } - return 4; -} - -static int -__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n) -{ - return __xfrm6_sort((void **)dst, (void **)src, n, - __xfrm6_tmpl_sort_cmp, 5); -} - int xfrm6_extract_header(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); @@ -171,12 +40,6 @@ int xfrm6_extract_header(struct sk_buff *skb) static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, .proto = IPPROTO_IPV6, - .eth_proto = htons(ETH_P_IPV6), - .owner = THIS_MODULE, - .init_tempsel = __xfrm6_init_tempsel, - .init_temprop = xfrm6_init_temprop, - .tmpl_sort = __xfrm6_tmpl_sort, - .state_sort = __xfrm6_state_sort, .output = xfrm6_output, .output_finish = xfrm6_output_finish, .extract_input = xfrm6_extract_input, |