summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-09-28 11:48:20 +0300
committerDavid S. Miller <davem@davemloft.net>2016-09-28 11:48:20 +0300
commit9c5982fe260a28e84d167e894123dc342e76c39f (patch)
treebdd2565cbf373e53c32a47ff30c11e4de0028ce7 /net
parenteb523f42d77a43f80bb9c57a34fbdc8406c7b075 (diff)
parentfd41b0eaa06a8a0516f9e0b0a5889035bf423784 (diff)
downloadlinux-9c5982fe260a28e84d167e894123dc342e76c39f.tar.xz
Merge branch 'fib-offload-notifications'
Jiri Pirko says: ==================== fib offload: switch to notifier The goal of this patchset is to allow driver to propagate all prefixes configured in kernel down HW. This is necessary for routing to work as expected. If we don't do that HW might forward prefixes known to kernel incorrectly. Take an example when default route is set in switch HW and there is an IP address set on a management (non-switch) port. Currently, only FIB entries related to the switch port netdev are offloaded using switchdev ops. This model is not extendable so the first patch introduces a replacement: notifier to propagate FIB entry additions and removals to whoever is interested. The second patch introduces couple of helpers to deal with RTNH_F_OFFLOAD flags. Currently it is set in switchdev core. There the assumption is that only one offload device exists. But for FIB notifier, we assume multiple offload devices. So the patch introduces a per FIB entry reference counter and helpers use it in order to achieve this: 0 means RTNH_F_OFFLOAD is not set, no device offloads this entry n means RTNH_F_OFFLOAD is set and the entry is offloaded by n devices Patches 3 and 4 convert mlxsw and rocker to adopt this new way, registering one notifier block for each asic instance. Both of these patches also implement internal "abort" mechanism. Using switchdev ops, "abort" is called by switchdev core whenever there is an error during FIB entry add offload. This leads to removal of all offloaded entries on system by fib_trie code. Now the new notifier assumes the driver takes care of the abort action. Here's why: 1) The fact that one HW cannot offload an entry does not mean that the others can't do it. So let only one entity to abort and leave the rest to work happily. 2) The driver knows what to in order to properly abort. For example, currently abort is broken for mlxsw, as for Spectrum there is a need to set 0.0.0.0/0 trap in RALUE register. The fifth patch removes the old, no longer used FIB offload infrastructure. The last patch reflects the changes into switchdev documentation file. --- v2->v3: -patch 3/6 -fixed offload inc/dec to be done in fib4_entry_init/fini and only in case !trap as suggested by Ido v1->v2: -patch 3/6: -fixed lpm tree setup and binding for abort and pointed out by Ido -do nexthop checks as suggested by Ido -fix use after free during abort -patch 6/6: -fixed texts as suggested by Ido ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/ipv4/fib_frontend.c29
-rw-r--r--net/ipv4/fib_rules.c12
-rw-r--r--net/ipv4/fib_trie.c166
-rw-r--r--net/switchdev/switchdev.c181
4 files changed, 78 insertions, 310 deletions
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4e56a4c20a3c..c3b80478226e 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -182,26 +182,13 @@ static void fib_flush(struct net *net)
struct fib_table *tb;
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
- flushed += fib_table_flush(tb);
+ flushed += fib_table_flush(net, tb);
}
if (flushed)
rt_cache_flush(net);
}
-void fib_flush_external(struct net *net)
-{
- struct fib_table *tb;
- struct hlist_head *head;
- unsigned int h;
-
- for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
- head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry(tb, head, tb_hlist)
- fib_table_flush_external(tb);
- }
-}
-
/*
* Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration.
@@ -590,13 +577,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (cmd == SIOCDELRT) {
tb = fib_get_table(net, cfg.fc_table);
if (tb)
- err = fib_table_delete(tb, &cfg);
+ err = fib_table_delete(net, tb, &cfg);
else
err = -ESRCH;
} else {
tb = fib_new_table(net, cfg.fc_table);
if (tb)
- err = fib_table_insert(tb, &cfg);
+ err = fib_table_insert(net, tb, &cfg);
else
err = -ENOBUFS;
}
@@ -719,7 +706,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
goto errout;
}
- err = fib_table_delete(tb, &cfg);
+ err = fib_table_delete(net, tb, &cfg);
errout:
return err;
}
@@ -741,7 +728,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
goto errout;
}
- err = fib_table_insert(tb, &cfg);
+ err = fib_table_insert(net, tb, &cfg);
errout:
return err;
}
@@ -828,9 +815,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
cfg.fc_scope = RT_SCOPE_HOST;
if (cmd == RTM_NEWROUTE)
- fib_table_insert(tb, &cfg);
+ fib_table_insert(net, tb, &cfg);
else
- fib_table_delete(tb, &cfg);
+ fib_table_delete(net, tb, &cfg);
}
void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -1254,7 +1241,7 @@ static void ip_fib_net_exit(struct net *net)
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
hlist_del(&tb->tb_hlist);
- fib_table_flush(tb);
+ fib_table_flush(net, tb);
fib_free_table(tb);
}
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 770bebed6b28..2e50062f642d 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -164,6 +164,14 @@ static struct fib_table *fib_empty_table(struct net *net)
return NULL;
}
+static int call_fib_rule_notifiers(struct net *net,
+ enum fib_event_type event_type)
+{
+ struct fib_notifier_info info;
+
+ return call_fib_notifiers(net, event_type, &info);
+}
+
static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
FRA_GENERIC_POLICY,
[FRA_FLOW] = { .type = NLA_U32 },
@@ -220,7 +228,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
- fib_flush_external(rule->fr_net);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD);
err = 0;
errout:
@@ -242,7 +250,7 @@ static int fib4_rule_delete(struct fib_rule *rule)
net->ipv4.fib_num_tclassid_users--;
#endif
net->ipv4.fib_has_custom_rules = true;
- fib_flush_external(rule->fr_net);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL);
errout:
return err;
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 241f27bbd7ad..31cef3602585 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -73,6 +73,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
+#include <linux/notifier.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -80,10 +81,47 @@
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
-#include <net/switchdev.h>
#include <trace/events/fib.h>
#include "fib_lookup.h"
+static BLOCKING_NOTIFIER_HEAD(fib_chain);
+
+int register_fib_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&fib_chain, nb);
+}
+EXPORT_SYMBOL(register_fib_notifier);
+
+int unregister_fib_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_unregister(&fib_chain, nb);
+}
+EXPORT_SYMBOL(unregister_fib_notifier);
+
+int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
+ struct fib_notifier_info *info)
+{
+ info->net = net;
+ return blocking_notifier_call_chain(&fib_chain, event_type, info);
+}
+
+static int call_fib_entry_notifiers(struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ .nlflags = nlflags,
+ };
+ return call_fib_notifiers(net, event_type, &info.info);
+}
+
#define MAX_STAT_DEPTH 32
#define KEYLENGTH (8*sizeof(t_key))
@@ -1076,7 +1114,8 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
}
/* Caller must hold RTNL. */
-int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg)
{
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
@@ -1175,17 +1214,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
- err = switchdev_fib_ipv4_add(key, plen, fi,
- new_fa->fa_tos,
- cfg->fc_type,
- cfg->fc_nlflags,
- tb->tb_id);
- if (err) {
- switchdev_fib_ipv4_abort(fi);
- kmem_cache_free(fn_alias_kmem, new_fa);
- goto out;
- }
-
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa);
@@ -1193,6 +1221,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
fib_release_info(fi_drop);
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
+ key, plen, fi,
+ new_fa->fa_tos, cfg->fc_type,
+ tb->tb_id, cfg->fc_nlflags);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, nlflags);
@@ -1228,30 +1261,22 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
- /* (Optionally) offload fib entry to switch hardware. */
- err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type,
- cfg->fc_nlflags, tb->tb_id);
- if (err) {
- switchdev_fib_ipv4_abort(fi);
- goto out_free_new_fa;
- }
-
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err)
- goto out_sw_fib_del;
+ goto out_free_new_fa;
if (!plen)
tb->tb_num_default++;
rt_cache_flush(cfg->fc_nlinfo.nl_net);
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, key, plen, fi, tos,
+ cfg->fc_type, tb->tb_id, cfg->fc_nlflags);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
&cfg->fc_nlinfo, nlflags);
succeeded:
return 0;
-out_sw_fib_del:
- switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1490,7 +1515,8 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
}
/* Caller must hold RTNL. */
-int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct net *net, struct fib_table *tb,
+ struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *fa_to_delete;
@@ -1543,9 +1569,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete)
return -ESRCH;
- switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
- cfg->fc_type, tb->tb_id);
-
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
+ fa_to_delete->fa_info, tos, cfg->fc_type,
+ tb->tb_id, 0);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1734,82 +1760,8 @@ out:
return NULL;
}
-/* Caller must hold RTNL */
-void fib_table_flush_external(struct fib_table *tb)
-{
- struct trie *t = (struct trie *)tb->tb_data;
- struct key_vector *pn = t->kv;
- unsigned long cindex = 1;
- struct hlist_node *tmp;
- struct fib_alias *fa;
-
- /* walk trie in reverse order */
- for (;;) {
- unsigned char slen = 0;
- struct key_vector *n;
-
- if (!(cindex--)) {
- t_key pkey = pn->key;
-
- /* cannot resize the trie vector */
- if (IS_TRIE(pn))
- break;
-
- /* resize completed node */
- pn = resize(t, pn);
- cindex = get_index(pkey, pn);
-
- continue;
- }
-
- /* grab the next available node */
- n = get_child(pn, cindex);
- if (!n)
- continue;
-
- if (IS_TNODE(n)) {
- /* record pn and cindex for leaf walking */
- pn = n;
- cindex = 1ul << n->bits;
-
- continue;
- }
-
- hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
- struct fib_info *fi = fa->fa_info;
-
- /* if alias was cloned to local then we just
- * need to remove the local copy from main
- */
- if (tb->tb_id != fa->tb_id) {
- hlist_del_rcu(&fa->fa_list);
- alias_free_mem_rcu(fa);
- continue;
- }
-
- /* record local slen */
- slen = fa->fa_slen;
-
- if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
- continue;
-
- switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
- }
-
- /* update leaf slen */
- n->slen = slen;
-
- if (hlist_empty(&n->leaf)) {
- put_child_root(pn, n->key, NULL);
- node_free(n);
- }
- }
-}
-
/* Caller must hold RTNL. */
-int fib_table_flush(struct fib_table *tb)
+int fib_table_flush(struct net *net, struct fib_table *tb)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *pn = t->kv;
@@ -1858,9 +1810,11 @@ int fib_table_flush(struct fib_table *tb)
continue;
}
- switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen,
- fi, fa->fa_tos, fa->fa_type,
- tb->tb_id);
+ call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
+ n->key,
+ KEYLENGTH - fa->fa_slen,
+ fi, fa->fa_tos, fa->fa_type,
+ tb->tb_id, 0);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 10b819308439..02beb35f577f 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -21,7 +21,6 @@
#include <linux/workqueue.h>
#include <linux/if_vlan.h>
#include <linux/rtnetlink.h>
-#include <net/ip_fib.h>
#include <net/switchdev.h>
/**
@@ -344,8 +343,6 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj)
switch (obj->id) {
case SWITCHDEV_OBJ_ID_PORT_VLAN:
return sizeof(struct switchdev_obj_port_vlan);
- case SWITCHDEV_OBJ_ID_IPV4_FIB:
- return sizeof(struct switchdev_obj_ipv4_fib);
case SWITCHDEV_OBJ_ID_PORT_FDB:
return sizeof(struct switchdev_obj_port_fdb);
case SWITCHDEV_OBJ_ID_PORT_MDB:
@@ -1108,184 +1105,6 @@ int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
}
EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
-static struct net_device *switchdev_get_lowest_dev(struct net_device *dev)
-{
- const struct switchdev_ops *ops = dev->switchdev_ops;
- struct net_device *lower_dev;
- struct net_device *port_dev;
- struct list_head *iter;
-
- /* Recusively search down until we find a sw port dev.
- * (A sw port dev supports switchdev_port_attr_get).
- */
-
- if (ops && ops->switchdev_port_attr_get)
- return dev;
-
- netdev_for_each_lower_dev(dev, lower_dev, iter) {
- port_dev = switchdev_get_lowest_dev(lower_dev);
- if (port_dev)
- return port_dev;
- }
-
- return NULL;
-}
-
-static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
-{
- struct switchdev_attr attr = {
- .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
- };
- struct switchdev_attr prev_attr;
- struct net_device *dev = NULL;
- int nhsel;
-
- ASSERT_RTNL();
-
- /* For this route, all nexthop devs must be on the same switch. */
-
- for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
- const struct fib_nh *nh = &fi->fib_nh[nhsel];
-
- if (!nh->nh_dev)
- return NULL;
-
- dev = switchdev_get_lowest_dev(nh->nh_dev);
- if (!dev)
- return NULL;
-
- attr.orig_dev = dev;
- if (switchdev_port_attr_get(dev, &attr))
- return NULL;
-
- if (nhsel > 0 &&
- !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid))
- return NULL;
-
- prev_attr = attr;
- }
-
- return dev;
-}
-
-/**
- * switchdev_fib_ipv4_add - Add/modify switch IPv4 route entry
- *
- * @dst: route's IPv4 destination address
- * @dst_len: destination address length (prefix length)
- * @fi: route FIB info structure
- * @tos: route TOS
- * @type: route type
- * @nlflags: netlink flags passed in (NLM_F_*)
- * @tb_id: route table ID
- *
- * Add/modify switch IPv4 route entry.
- */
-int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 nlflags, u32 tb_id)
-{
- struct switchdev_obj_ipv4_fib ipv4_fib = {
- .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB,
- .dst = dst,
- .dst_len = dst_len,
- .fi = fi,
- .tos = tos,
- .type = type,
- .nlflags = nlflags,
- .tb_id = tb_id,
- };
- struct net_device *dev;
- int err = 0;
-
- /* Don't offload route if using custom ip rules or if
- * IPv4 FIB offloading has been disabled completely.
- */
-
-#ifdef CONFIG_IP_MULTIPLE_TABLES
- if (fi->fib_net->ipv4.fib_has_custom_rules)
- return 0;
-#endif
-
- if (fi->fib_net->ipv4.fib_offload_disabled)
- return 0;
-
- dev = switchdev_get_dev_by_nhs(fi);
- if (!dev)
- return 0;
-
- ipv4_fib.obj.orig_dev = dev;
- err = switchdev_port_obj_add(dev, &ipv4_fib.obj);
- if (!err)
- fi->fib_flags |= RTNH_F_OFFLOAD;
-
- return err == -EOPNOTSUPP ? 0 : err;
-}
-EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add);
-
-/**
- * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch
- *
- * @dst: route's IPv4 destination address
- * @dst_len: destination address length (prefix length)
- * @fi: route FIB info structure
- * @tos: route TOS
- * @type: route type
- * @tb_id: route table ID
- *
- * Delete IPv4 route entry from switch device.
- */
-int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
- u8 tos, u8 type, u32 tb_id)
-{
- struct switchdev_obj_ipv4_fib ipv4_fib = {
- .obj.id = SWITCHDEV_OBJ_ID_IPV4_FIB,
- .dst = dst,
- .dst_len = dst_len,
- .fi = fi,
- .tos = tos,
- .type = type,
- .nlflags = 0,
- .tb_id = tb_id,
- };
- struct net_device *dev;
- int err = 0;
-
- if (!(fi->fib_flags & RTNH_F_OFFLOAD))
- return 0;
-
- dev = switchdev_get_dev_by_nhs(fi);
- if (!dev)
- return 0;
-
- ipv4_fib.obj.orig_dev = dev;
- err = switchdev_port_obj_del(dev, &ipv4_fib.obj);
- if (!err)
- fi->fib_flags &= ~RTNH_F_OFFLOAD;
-
- return err == -EOPNOTSUPP ? 0 : err;
-}
-EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del);
-
-/**
- * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation
- *
- * @fi: route FIB info structure
- */
-void switchdev_fib_ipv4_abort(struct fib_info *fi)
-{
- /* There was a problem installing this route to the offload
- * device. For now, until we come up with more refined
- * policy handling, abruptly end IPv4 fib offloading for
- * for entire net by flushing offload device(s) of all
- * IPv4 routes, and mark IPv4 fib offloading broken from
- * this point forward.
- */
-
- fib_flush_external(fi->fib_net);
- fi->fib_net->ipv4.fib_offload_disabled = true;
-}
-EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort);
-
bool switchdev_port_same_parent_id(struct net_device *a,
struct net_device *b)
{