From 6454743bc13e7dfd4f2720758ca3fcdea76b82a4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:19 -0700 Subject: net/ipv6: Rename fib6_lookup to fib6_node_lookup Rename fib6_lookup to fib6_node_lookup to better reflect what it returns. The fib6_lookup name will be used in a later patch for an IPv6 equivalent to IPv4's fib_lookup. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index a3ec08d05756..43ab545e64ea 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -376,9 +376,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); -struct fib6_node *fib6_lookup(struct fib6_node *root, - const struct in6_addr *daddr, - const struct in6_addr *saddr); +struct fib6_node *fib6_node_lookup(struct fib6_node *root, + const struct in6_addr *daddr, + const struct in6_addr *saddr); struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, -- cgit v1.2.3 From 3b290a31bbc5969f9193f73d547a6dc8a25c6f9e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:20 -0700 Subject: net/ipv6: Rename rt6_multipath_select Rename rt6_multipath_select to fib6_multipath_select and export it. A later patch wants access to it similar to IPv4's fib_select_path. Signed-off-by: David Ahern Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 5 +++++ net/ipv6/route.c | 17 +++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 43ab545e64ea..2597d8fdd92f 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -376,6 +376,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); +struct fib6_info *fib6_multipath_select(const struct net *net, + struct fib6_info *match, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict); + struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 443d2a0bc150..6a10608d9025 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -419,11 +419,11 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } -static struct fib6_info *rt6_multipath_select(const struct net *net, - struct fib6_info *match, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, - int strict) +struct fib6_info *fib6_multipath_select(const struct net *net, + struct fib6_info *match, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, + int strict) { struct fib6_info *sibling, *next_sibling; @@ -1068,8 +1068,9 @@ restart: f6i = rt6_device_match(net, f6i, &fl6->saddr, fl6->flowi6_oif, flags); if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0) - f6i = rt6_multipath_select(net, f6i, fl6, - fl6->flowi6_oif, skb, flags); + f6i = fib6_multipath_select(net, f6i, fl6, + fl6->flowi6_oif, skb, + flags); } if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); @@ -1824,7 +1825,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: f6i = rt6_select(net, fn, oif, strict); if (f6i->fib6_nsiblings) - f6i = rt6_multipath_select(net, f6i, fl6, 
oif, skb, strict); + f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) -- cgit v1.2.3 From 1d053da910947afccec96d90892c0f5488c7a9cf Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:21 -0700 Subject: net/ipv6: Extract table lookup from ip6_pol_route ip6_pol_route is used for ingress and egress FIB lookups. Refactor it moving the table lookup into a separate fib6_table_lookup that can be invoked separately and export the new function. ip6_pol_route now calls fib6_table_lookup and uses the result to generate a dst based rt6_info. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 4 ++++ net/ipv6/route.c | 39 +++++++++++++++++++++++++-------------- 2 files changed, 29 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 2597d8fdd92f..c70705f2647a 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -376,6 +376,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); +/* called with rcu lock held; caller needs to select path */ +struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int strict); + struct fib6_info *fib6_multipath_select(const struct net *net, struct fib6_info *match, struct flowi6 *fl6, int oif, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6a10608d9025..019d8ba9021e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1800,21 +1800,12 @@ void rt6_age_exceptions(struct fib6_info *rt, rcu_read_unlock_bh(); } -struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, - const struct sk_buff *skb, int flags) +/* must be called with rcu lock held */ +struct fib6_info *fib6_table_lookup(struct net *net, struct 
fib6_table *table, + int oif, struct flowi6 *fl6, int strict) { struct fib6_node *fn, *saved_fn; struct fib6_info *f6i; - struct rt6_info *rt; - int strict = 0; - - strict |= flags & RT6_LOOKUP_F_IFACE; - strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; - if (net->ipv6.devconf_all->forwarding == 0) - strict |= RT6_LOOKUP_F_REACHABLE; - - rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1824,8 +1815,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, redo_rt6_select: f6i = rt6_select(net, fn, oif, strict); - if (f6i->fib6_nsiblings) - f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); if (f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) @@ -1838,6 +1827,28 @@ redo_rt6_select: } } + return f6i; +} + +struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, + const struct sk_buff *skb, int flags) +{ + struct fib6_info *f6i; + struct rt6_info *rt; + int strict = 0; + + strict |= flags & RT6_LOOKUP_F_IFACE; + strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; + if (net->ipv6.devconf_all->forwarding == 0) + strict |= RT6_LOOKUP_F_REACHABLE; + + rcu_read_lock(); + + f6i = fib6_table_lookup(net, table, oif, fl6, strict); + if (f6i->fib6_nsiblings) + f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict); + if (f6i == net->ipv6.fib6_null_entry) { rt = net->ipv6.ip6_null_entry; rcu_read_unlock(); -- cgit v1.2.3 From 138118ec96cbfc303c1d7cc05fbb2caf8382c95b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:23 -0700 Subject: net/ipv6: Add fib6_lookup Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules, but returns a FIB entry, fib6_info, rather than a dst based rt6_info. 
fib6_lookup is anywhere from 140% (MULTIPLE_TABLES config disabled) to 60% faster than any of the dst based lookup methods (without custom rules) and 25% faster with custom rules (e.g., l3mdev rule). Since the lookup function has a completely different signature, fib6_rule_action is split into 2 paths: the existing one is renamed __fib6_rule_action and a new one for the fib6_info path is added. fib6_rule_action decides which to call based on the lookup_ptr. If it is fib6_table_lookup then the new path is taken. Caller must hold rcu lock as no reference is taken on the returned fib entry. Signed-off-by: David Ahern Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/net/ip6_fib.h | 6 ++++ net/ipv6/fib6_rules.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++-- net/ipv6/ip6_fib.c | 7 +++++ 3 files changed, 97 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c70705f2647a..cc70f6da8462 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -376,6 +376,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); +/* called with rcu lock held; can return error pointer + * caller needs to select path + */ +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags); + /* called with rcu lock held; caller needs to select path */ struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int strict); diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index d040c4bff3a0..f590446595d8 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net) return fib_rules_seq_read(net, AF_INET6); } +/* called with rcu lock held; no reference taken on fib6_info */ +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags) +{
+ struct fib6_info *f6i; + int err; + + if (net->ipv6.fib6_has_custom_rules) { + struct fib_lookup_arg arg = { + .lookup_ptr = fib6_table_lookup, + .lookup_data = &oif, + .flags = FIB_LOOKUP_NOREF, + }; + + l3mdev_update_flow(net, flowi6_to_flowi(fl6)); + + err = fib_rules_lookup(net->ipv6.fib6_rules_ops, + flowi6_to_flowi(fl6), flags, &arg); + if (err) + return ERR_PTR(err); + + f6i = arg.result ? : net->ipv6.fib6_null_entry; + } else { + f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, + oif, fl6, flags); + if (!f6i || f6i == net->ipv6.fib6_null_entry) + f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, + oif, fl6, flags); + } + + return f6i; +} + struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) @@ -121,8 +154,48 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags, return 0; } -static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, - int flags, struct fib_lookup_arg *arg) +static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + struct flowi6 *flp6 = &flp->u.ip6; + struct net *net = rule->fr_net; + struct fib6_table *table; + struct fib6_info *f6i; + int err = -EAGAIN, *oif; + u32 tb_id; + + switch (rule->action) { + case FR_ACT_TO_TBL: + break; + case FR_ACT_UNREACHABLE: + return -ENETUNREACH; + case FR_ACT_PROHIBIT: + return -EACCES; + case FR_ACT_BLACKHOLE: + default: + return -EINVAL; + } + + tb_id = fib_rule_get_table(rule, arg); + table = fib6_get_table(net, tb_id); + if (!table) + return -EAGAIN; + + oif = (int *)arg->lookup_data; + f6i = fib6_table_lookup(net, table, *oif, flp6, flags); + if (f6i != net->ipv6.fib6_null_entry) { + err = fib6_rule_saddr(net, rule, flags, flp6, + fib6_info_nh_dev(f6i)); + + if (likely(!err)) + arg->result = f6i; + } + + return err; +} + +static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct 
fib_lookup_arg *arg) { struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; @@ -182,6 +255,15 @@ out: return err; } +static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, + int flags, struct fib_lookup_arg *arg) +{ + if (arg->lookup_ptr == fib6_table_lookup) + return fib6_rule_action_alt(rule, flp, flags, arg); + + return __fib6_rule_action(rule, flp, flags, arg); +} + static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg) { struct rt6_info *rt = (struct rt6_info *) arg->result; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 487faffeae28..d1dc6017f5a6 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -354,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, return &rt->dst; } +/* called with rcu lock held; no reference taken on fib6_info */ +struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags) +{ + return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags); +} + static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); -- cgit v1.2.3 From d4bea421f7322400d804c2284739e42e61f78349 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:24 -0700 Subject: net/ipv6: Update fib6 tracepoint to take fib6_info Similar to IPv4, IPv6 should use the FIB lookup result in the tracepoint. Signed-off-by: David Ahern Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- include/trace/events/fib6.h | 14 +++++++------- net/ipv6/route.c | 14 ++++++-------- 2 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index 7e8d48a81b91..1b8d951e3c12 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -12,10 +12,10 @@ TRACE_EVENT(fib6_table_lookup, - TP_PROTO(const struct net *net, const struct rt6_info *rt, + TP_PROTO(const struct net *net, const struct fib6_info *f6i, struct fib6_table *table, const struct flowi6 *flp), - TP_ARGS(net, rt, table, flp), + TP_ARGS(net, f6i, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) @@ -48,20 +48,20 @@ TRACE_EVENT(fib6_table_lookup, in6 = (struct in6_addr *)__entry->dst; *in6 = flp->daddr; - if (rt->rt6i_idev) { - __assign_str(name, rt->rt6i_idev->dev->name); + if (f6i->fib6_nh.nh_dev) { + __assign_str(name, f6i->fib6_nh.nh_dev); } else { __assign_str(name, ""); } - if (rt == net->ipv6.ip6_null_entry) { + if (f6i == net->ipv6.fib6_null_entry) { struct in6_addr in6_zero = {}; in6 = (struct in6_addr *)__entry->gw; *in6 = in6_zero; - } else if (rt) { + } else if (f6i) { in6 = (struct in6_addr *)__entry->gw; - *in6 = rt->rt6i_gateway; + *in6 = f6i->fib6_nh.nh_gw; } ), diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 019d8ba9021e..73f9c29a5878 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1078,6 +1078,8 @@ restart: goto restart; } + trace_fib6_table_lookup(net, f6i, table, fl6); + /* Search through exception table */ rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr); if (rt) { @@ -1096,8 +1098,6 @@ restart: rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table, fl6); - return rt; } @@ -1827,6 +1827,8 @@ redo_rt6_select: } } + trace_fib6_table_lookup(net, f6i, table, fl6); + return f6i; } @@ -1853,7 +1855,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, rt = net->ipv6.ip6_null_entry; 
rcu_read_unlock(); dst_hold(&rt->dst); - trace_fib6_table_lookup(net, rt, table, fl6); return rt; } @@ -1864,7 +1865,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, dst_use_noref(&rt->dst, jiffies); rcu_read_unlock(); - trace_fib6_table_lookup(net, rt, table, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !(f6i->fib6_flags & RTF_GATEWAY))) { @@ -1890,9 +1890,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, dst_hold(&uncached_rt->dst); } - trace_fib6_table_lookup(net, uncached_rt, table, fl6); return uncached_rt; - } else { /* Get a percpu copy */ @@ -1906,7 +1904,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, local_bh_enable(); rcu_read_unlock(); - trace_fib6_table_lookup(net, pcpu_rt, table, fl6); + return pcpu_rt; } } @@ -2491,7 +2489,7 @@ out: rcu_read_unlock(); - trace_fib6_table_lookup(net, ret, table, fl6); + trace_fib6_table_lookup(net, rt, table, fl6); return ret; }; -- cgit v1.2.3 From 65a2022e89a4760f9702837e2d9d15a39a9c68a3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:25 -0700 Subject: net/ipv6: Add fib lookup stubs for use in bpf helper Add stubs to retrieve a handle to an IPv6 FIB table, fib6_get_table, a stub to do a lookup in a specific table, fib6_table_lookup, and a stub for a full route lookup. The stubs are needed for core bpf code to handle the case when the IPv6 module is not builtin. Signed-off-by: David Ahern Acked-by: David S. 
Miller Signed-off-by: Daniel Borkmann --- include/net/addrconf.h | 14 ++++++++++++++ net/ipv6/addrconf_core.c | 33 ++++++++++++++++++++++++++++++++- net/ipv6/af_inet6.c | 6 +++++- 3 files changed, 51 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 8312cc25a3af..ff766ab207e0 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -223,6 +223,20 @@ struct ipv6_stub { const struct in6_addr *addr); int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); + + struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); + struct fib6_info *(*fib6_lookup)(struct net *net, int oif, + struct flowi6 *fl6, int flags); + struct fib6_info *(*fib6_table_lookup)(struct net *net, + struct fib6_table *table, + int oif, struct flowi6 *fl6, + int flags); + struct fib6_info *(*fib6_multipath_select)(const struct net *net, + struct fib6_info *f6i, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, + int strict); + void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 32b564dfd02a..2fe754fd4f5e 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -134,8 +134,39 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1, return -EAFNOSUPPORT; } +static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id) +{ + return NULL; +} + +static struct fib6_info * +eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, int flags) +{ + return NULL; +} + +static struct fib6_info * +eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + int flags) +{ + return NULL; +} + +static struct fib6_info * +eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i, + 
struct flowi6 *fl6, int oif, + const struct sk_buff *skb, int strict) +{ + return f6i; +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { - .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, + .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, + .fib6_get_table = eafnosupport_fib6_get_table, + .fib6_table_lookup = eafnosupport_fib6_table_lookup, + .fib6_lookup = eafnosupport_fib6_lookup, + .fib6_multipath_select = eafnosupport_fib6_multipath_select, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d0af96e0d109..50de8b0d4f70 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -889,7 +889,11 @@ static struct pernet_operations inet6_net_ops = { static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_join = ipv6_sock_mc_join, .ipv6_sock_mc_drop = ipv6_sock_mc_drop, - .ipv6_dst_lookup = ip6_dst_lookup, + .ipv6_dst_lookup = ip6_dst_lookup, + .fib6_get_table = fib6_get_table, + .fib6_table_lookup = fib6_table_lookup, + .fib6_lookup = fib6_lookup, + .fib6_multipath_select = fib6_multipath_select, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, .nd_tbl = &nd_tbl, -- cgit v1.2.3 From 87f5fc7e48dd3175b30dd03b41564e1a8e136323 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 May 2018 20:34:26 -0700 Subject: bpf: Provide helper to do forwarding lookups in kernel FIB table Provide a helper for doing a FIB and neighbor lookup in the kernel tables from an XDP program. The helper provides a fastpath for forwarding packets. If the packet is a local delivery or for any reason is not a simple lookup and forward, the packet continues up the stack. If it is to be forwarded, the forwarding can be done directly if the neighbor is already known. If the neighbor does not exist, the first few packets go up the stack for neighbor resolution. Once resolved, the xdp program provides the fast path. 
On successful lookup the nexthop dmac, current device smac and egress device index are returned. The API supports IPv4, IPv6 and MPLS protocols, but only IPv4 and IPv6 are implemented in this patch. The API includes layer 4 parameters if the XDP program chooses to do deep packet inspection to allow comparison against ACLs implemented as FIB rules. Header rewrite is left to the XDP program. The lookup takes 2 flags: - BPF_FIB_LOOKUP_DIRECT to do a lookup that bypasses FIB rules and goes straight to the table associated with the device (expert setting for those looking to maximize throughput) - BPF_FIB_LOOKUP_OUTPUT to do a lookup from the egress perspective. Default is an ingress lookup. Initial performance numbers collected by Jesper, forwarded packets/sec: Full stack XDP FIB lookup XDP Direct lookup IPv4 1,947,969 7,074,156 7,415,333 IPv6 1,728,000 6,165,504 7,262,720 These numbers are single CPU core forwarding on a Broadwell E5-1650 v4 @ 3.60GHz. Signed-off-by: David Ahern Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 81 +++++++++++++- net/core/filter.c | 267 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 347 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d615c777b573..02e4112510f8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1828,6 +1828,33 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * + * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * Description + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop.
+ * If successful (i.e., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst, + * ipv6_dst or mpls_out based on family, smac is set to mac + * address of egress device, dmac is set to nexthop mac address, + * rt_metric is set to metric from route. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: + * + * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs + * full lookup using FIB rules + * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress + * perspective (default is ingress) + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * Return + * Egress device index on success, 0 if packet needs to continue + * up the stack for further processing or a negative error in case + * of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -1898,7 +1925,8 @@ union bpf_attr { FN(xdp_adjust_tail), \ FN(skb_get_xfrm_state), \ FN(get_stack), \ - FN(skb_load_bytes_relative), + FN(skb_load_bytes_relative), \ + FN(fib_lookup), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2321,4 +2349,55 @@ struct bpf_raw_tracepoint_args { __u64 args[0]; }; +/* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +#define BPF_FIB_LOOKUP_DIRECT BIT(0) +#define BPF_FIB_LOOKUP_OUTPUT BIT(1) + +struct bpf_fib_lookup { + /* input */ + __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */ + + /* set if lookup is to consider L4 data - e.g., FIB rules */ + __u8 l4_protocol; + __be16 sport; + __be16 dport; + + /* total length of packet from network header - used for MTU check */ + __u16 tot_len; + __u32 ifindex; /* L3 device index for lookup */ + + union { + /* inputs to lookup */ + __u8 tos; /* AF_INET */ + __be32 flowlabel; /* AF_INET6 */
+ + /* output: metric of fib result */ + __u32 rt_metric; + }; + + union { + __be32 mpls_in; + __be32 ipv4_src; + __u32 ipv6_src[4]; /* in6_addr; network order */ + }; + + /* input to bpf_fib_lookup, *dst is destination address. + * output: bpf_fib_lookup sets to gateway address + */ + union { + /* return for MPLS lookups */ + __be32 mpls_out[4]; /* support up to 4 labels */ + __be32 ipv4_dst; + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index 0baa715e4699..ca60d2872da4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -60,6 +60,10 @@ #include #include #include +#include +#include +#include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -4032,6 +4036,265 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { }; #endif +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, + const struct neighbour *neigh, + const struct net_device *dev) +{ + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); + params->h_vlan_TCI = 0; + params->h_vlan_proto = 0; + + return dev->ifindex; +} +#endif + +#if IS_ENABLED(CONFIG_INET) +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags) +{ + struct in_device *in_dev; + struct neighbour *neigh; + struct net_device *dev; + struct fib_result res; + struct fib_nh *nh; + struct flowi4 fl4; + int err; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + /* verify forwarding is enabled on this interface */ + in_dev = __in_dev_get_rcu(dev); + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl4.flowi4_iif = 1; + fl4.flowi4_oif = 
params->ifindex; + } else { + fl4.flowi4_iif = params->ifindex; + fl4.flowi4_oif = 0; + } + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; + + fl4.flowi4_proto = params->l4_protocol; + fl4.daddr = params->ipv4_dst; + fl4.saddr = params->ipv4_src; + fl4.fl4_sport = params->sport; + fl4.fl4_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib_table *tb; + + tb = fib_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); + } else { + fl4.flowi4_mark = 0; + fl4.flowi4_secid = 0; + fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); + + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); + } + + if (err || res.type != RTN_UNICAST) + return 0; + + if (res.fi->fib_nhs > 1) + fib_select_path(net, &res, &fl4, NULL); + + nh = &res.fi->fib_nh[res.nh_sel]; + + /* do not handle lwt encaps right now */ + if (nh->nh_lwtstate) + return 0; + + dev = nh->nh_dev; + if (unlikely(!dev)) + return 0; + + if (nh->nh_gw) + params->ipv4_dst = nh->nh_gw; + + params->rt_metric = res.fi->fib_priority; + + /* xdp and cls_bpf programs are run in RCU-bh so + * rcu_read_lock_bh is not needed here + */ + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags) +{ + struct in6_addr *src = (struct in6_addr *) params->ipv6_src; + struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; + struct neighbour *neigh; + struct net_device *dev; + struct inet6_dev *idev; + struct fib6_info *f6i; + struct flowi6 fl6; + int strict = 0; + int oif; + + /* link local addresses are never forwarded */ + if (rt6_need_strict(dst) || rt6_need_strict(src)) + return 
0; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + idev = __in6_dev_get_safely(dev); + if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl6.flowi6_iif = 1; + oif = fl6.flowi6_oif = params->ifindex; + } else { + oif = fl6.flowi6_iif = params->ifindex; + fl6.flowi6_oif = 0; + strict = RT6_LOOKUP_F_HAS_SADDR; + } + fl6.flowlabel = params->flowlabel; + fl6.flowi6_scope = 0; + fl6.flowi6_flags = 0; + fl6.mp_hash = 0; + + fl6.flowi6_proto = params->l4_protocol; + fl6.daddr = *dst; + fl6.saddr = *src; + fl6.fl6_sport = params->sport; + fl6.fl6_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib6_table *tb; + + tb = ipv6_stub->fib6_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); + } else { + fl6.flowi6_mark = 0; + fl6.flowi6_secid = 0; + fl6.flowi6_tun_key.tun_id = 0; + fl6.flowi6_uid = sock_net_uid(net, NULL); + + f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); + } + + if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) + return 0; + + if (unlikely(f6i->fib6_flags & RTF_REJECT || + f6i->fib6_type != RTN_UNICAST)) + return 0; + + if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) + f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, + fl6.flowi6_oif, NULL, + strict); + + if (f6i->fib6_nh.nh_lwtstate) + return 0; + + if (f6i->fib6_flags & RTF_GATEWAY) + *dst = f6i->fib6_nh.nh_gw; + + dev = f6i->fib6_nh.nh_dev; + params->rt_metric = f6i->fib6_metric; + + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is + * not needed here. 
Can not use __ipv6_neigh_lookup_noref here + * because we need to get nd_tbl via the stub + */ + neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, + ndisc_hashfn, dst, dev); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, + flags); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, + flags); +#endif + } + return 0; +} + +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { + .func = bpf_xdp_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags); +#endif + } + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { + .func = bpf_skb_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -4181,6 +4444,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_get_xfrm_state: return 
&bpf_skb_get_xfrm_state_proto; #endif + case BPF_FUNC_fib_lookup: + return &bpf_skb_fib_lookup_proto; default: return bpf_base_func_proto(func_id); } @@ -4206,6 +4471,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; + case BPF_FUNC_fib_lookup: + return &bpf_xdp_fib_lookup_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3