diff options
-rw-r--r-- | drivers/net/ethernet/rocker/rocker.c | 483 | ||||
-rw-r--r-- | include/linux/netdevice.h | 22 | ||||
-rw-r--r-- | include/net/ip_fib.h | 2 | ||||
-rw-r--r-- | include/net/netns/ipv4.h | 1 | ||||
-rw-r--r-- | include/net/switchdev.h | 24 | ||||
-rw-r--r-- | include/uapi/linux/rtnetlink.h | 1 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 13 | ||||
-rw-r--r-- | net/ipv4/fib_rules.c | 3 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 92 | ||||
-rw-r--r-- | net/switchdev/switchdev.c | 161 |
10 files changed, 754 insertions, 48 deletions
diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index a5d1e6ea7d58..d04d3b374e31 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -32,6 +32,9 @@ #include <linux/bitops.h> #include <net/switchdev.h> #include <net/rtnetlink.h> +#include <net/ip_fib.h> +#include <net/netevent.h> +#include <net/arp.h> #include <asm-generic/io-64-nonatomic-lo-hi.h> #include <generated/utsrelease.h> @@ -111,9 +114,10 @@ struct rocker_flow_tbl_key { struct rocker_flow_tbl_entry { struct hlist_node entry; - u32 ref_count; + u32 cmd; u64 cookie; struct rocker_flow_tbl_key key; + size_t key_len; u32 key_crc32; /* key */ }; @@ -161,6 +165,16 @@ struct rocker_internal_vlan_tbl_entry { __be16 vlan_id; }; +struct rocker_neigh_tbl_entry { + struct hlist_node entry; + __be32 ip_addr; /* key */ + struct net_device *dev; + u32 ref_count; + u32 index; + u8 eth_dst[ETH_ALEN]; + bool ttl_check; +}; + struct rocker_desc_info { char *data; /* mapped */ size_t data_size; @@ -234,6 +248,9 @@ struct rocker { unsigned long internal_vlan_bitmap[ROCKER_INTERNAL_VLAN_BITMAP_LEN]; DECLARE_HASHTABLE(internal_vlan_tbl, 8); spinlock_t internal_vlan_tbl_lock; + DECLARE_HASHTABLE(neigh_tbl, 16); + spinlock_t neigh_tbl_lock; + u32 neigh_tbl_next_index; }; static const u8 zero_mac[ETH_ALEN] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; @@ -256,7 +273,6 @@ enum { ROCKER_PRIORITY_VLAN = 1, ROCKER_PRIORITY_TERM_MAC_UCAST = 0, ROCKER_PRIORITY_TERM_MAC_MCAST = 1, - ROCKER_PRIORITY_UNICAST_ROUTING = 1, ROCKER_PRIORITY_BRIDGING_VLAN_DFLT_EXACT = 1, ROCKER_PRIORITY_BRIDGING_VLAN_DFLT_WILD = 2, ROCKER_PRIORITY_BRIDGING_VLAN = 3, @@ -1940,8 +1956,7 @@ static int rocker_cmd_flow_tbl_add(struct rocker *rocker, struct rocker_tlv *cmd_info; int err = 0; - if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE, - ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_ADD)) + if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE, entry->cmd)) return -EMSGSIZE; cmd_info = 
rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO); if (!cmd_info) @@ -1998,8 +2013,7 @@ static int rocker_cmd_flow_tbl_del(struct rocker *rocker, const struct rocker_flow_tbl_entry *entry = priv; struct rocker_tlv *cmd_info; - if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE, - ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_DEL)) + if (rocker_tlv_put_u16(desc_info, ROCKER_TLV_CMD_TYPE, entry->cmd)) return -EMSGSIZE; cmd_info = rocker_tlv_nest_start(desc_info, ROCKER_TLV_CMD_INFO); if (!cmd_info) @@ -2168,9 +2182,9 @@ static int rocker_cmd_group_tbl_del(struct rocker *rocker, return 0; } -/***************************************** - * Flow, group, FDB, internal VLAN tables - *****************************************/ +/*************************************************** + * Flow, group, FDB, internal VLAN and neigh tables + ***************************************************/ static int rocker_init_tbls(struct rocker *rocker) { @@ -2186,6 +2200,9 @@ static int rocker_init_tbls(struct rocker *rocker) hash_init(rocker->internal_vlan_tbl); spin_lock_init(&rocker->internal_vlan_tbl_lock); + hash_init(rocker->neigh_tbl); + spin_lock_init(&rocker->neigh_tbl_lock); + return 0; } @@ -2196,6 +2213,7 @@ static void rocker_free_tbls(struct rocker *rocker) struct rocker_group_tbl_entry *group_entry; struct rocker_fdb_tbl_entry *fdb_entry; struct rocker_internal_vlan_tbl_entry *internal_vlan_entry; + struct rocker_neigh_tbl_entry *neigh_entry; struct hlist_node *tmp; int bkt; @@ -2219,16 +2237,22 @@ static void rocker_free_tbls(struct rocker *rocker) tmp, internal_vlan_entry, entry) hash_del(&internal_vlan_entry->entry); spin_unlock_irqrestore(&rocker->internal_vlan_tbl_lock, flags); + + spin_lock_irqsave(&rocker->neigh_tbl_lock, flags); + hash_for_each_safe(rocker->neigh_tbl, bkt, tmp, neigh_entry, entry) + hash_del(&neigh_entry->entry); + spin_unlock_irqrestore(&rocker->neigh_tbl_lock, flags); } static struct rocker_flow_tbl_entry * rocker_flow_tbl_find(struct rocker *rocker, struct 
rocker_flow_tbl_entry *match) { struct rocker_flow_tbl_entry *found; + size_t key_len = match->key_len ? match->key_len : sizeof(found->key); hash_for_each_possible(rocker->flow_tbl, found, entry, match->key_crc32) { - if (memcmp(&found->key, &match->key, sizeof(found->key)) == 0) + if (memcmp(&found->key, &match->key, key_len) == 0) return found; } @@ -2241,42 +2265,34 @@ static int rocker_flow_tbl_add(struct rocker_port *rocker_port, { struct rocker *rocker = rocker_port->rocker; struct rocker_flow_tbl_entry *found; + size_t key_len = match->key_len ? match->key_len : sizeof(found->key); unsigned long flags; - bool add_to_hw = false; - int err = 0; - match->key_crc32 = crc32(~0, &match->key, sizeof(match->key)); + match->key_crc32 = crc32(~0, &match->key, key_len); spin_lock_irqsave(&rocker->flow_tbl_lock, flags); found = rocker_flow_tbl_find(rocker, match); if (found) { - kfree(match); + match->cookie = found->cookie; + hash_del(&found->entry); + kfree(found); + found = match; + found->cmd = ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_MOD; } else { found = match; found->cookie = rocker->flow_tbl_next_cookie++; - hash_add(rocker->flow_tbl, &found->entry, found->key_crc32); - add_to_hw = true; + found->cmd = ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_ADD; } - found->ref_count++; + hash_add(rocker->flow_tbl, &found->entry, found->key_crc32); spin_unlock_irqrestore(&rocker->flow_tbl_lock, flags); - if (add_to_hw) { - err = rocker_cmd_exec(rocker, rocker_port, - rocker_cmd_flow_tbl_add, - found, NULL, NULL, nowait); - if (err) { - spin_lock_irqsave(&rocker->flow_tbl_lock, flags); - hash_del(&found->entry); - spin_unlock_irqrestore(&rocker->flow_tbl_lock, flags); - kfree(found); - } - } - - return err; + return rocker_cmd_exec(rocker, rocker_port, + rocker_cmd_flow_tbl_add, + found, NULL, NULL, nowait); } static int rocker_flow_tbl_del(struct rocker_port *rocker_port, @@ -2285,29 +2301,26 @@ static int rocker_flow_tbl_del(struct rocker_port *rocker_port, { struct rocker *rocker = 
rocker_port->rocker; struct rocker_flow_tbl_entry *found; + size_t key_len = match->key_len ? match->key_len : sizeof(found->key); unsigned long flags; - bool del_from_hw = false; int err = 0; - match->key_crc32 = crc32(~0, &match->key, sizeof(match->key)); + match->key_crc32 = crc32(~0, &match->key, key_len); spin_lock_irqsave(&rocker->flow_tbl_lock, flags); found = rocker_flow_tbl_find(rocker, match); if (found) { - found->ref_count--; - if (found->ref_count == 0) { - hash_del(&found->entry); - del_from_hw = true; - } + hash_del(&found->entry); + found->cmd = ROCKER_TLV_CMD_TYPE_OF_DPA_FLOW_DEL; } spin_unlock_irqrestore(&rocker->flow_tbl_lock, flags); kfree(match); - if (del_from_hw) { + if (found) { err = rocker_cmd_exec(rocker, rocker_port, rocker_cmd_flow_tbl_del, found, NULL, NULL, nowait); @@ -2467,6 +2480,31 @@ static int rocker_flow_tbl_bridge(struct rocker_port *rocker_port, return rocker_flow_tbl_do(rocker_port, flags, entry); } +static int rocker_flow_tbl_ucast4_routing(struct rocker_port *rocker_port, + __be16 eth_type, __be32 dst, + __be32 dst_mask, u32 priority, + enum rocker_of_dpa_table_id goto_tbl, + u32 group_id, int flags) +{ + struct rocker_flow_tbl_entry *entry; + + entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags)); + if (!entry) + return -ENOMEM; + + entry->key.tbl_id = ROCKER_OF_DPA_TABLE_ID_UNICAST_ROUTING; + entry->key.priority = priority; + entry->key.ucast_routing.eth_type = eth_type; + entry->key.ucast_routing.dst4 = dst; + entry->key.ucast_routing.dst4_mask = dst_mask; + entry->key.ucast_routing.goto_tbl = goto_tbl; + entry->key.ucast_routing.group_id = group_id; + entry->key_len = offsetof(struct rocker_flow_tbl_key, + ucast_routing.group_id); + + return rocker_flow_tbl_do(rocker_port, flags, entry); +} + static int rocker_flow_tbl_acl(struct rocker_port *rocker_port, int flags, u32 in_pport, u32 in_pport_mask, @@ -2554,7 +2592,6 @@ static int rocker_group_tbl_add(struct rocker_port *rocker_port, struct rocker *rocker = 
rocker_port->rocker; struct rocker_group_tbl_entry *found; unsigned long flags; - int err = 0; spin_lock_irqsave(&rocker->group_tbl_lock, flags); @@ -2574,12 +2611,9 @@ static int rocker_group_tbl_add(struct rocker_port *rocker_port, spin_unlock_irqrestore(&rocker->group_tbl_lock, flags); - if (found->cmd) - err = rocker_cmd_exec(rocker, rocker_port, - rocker_cmd_group_tbl_add, - found, NULL, NULL, nowait); - - return err; + return rocker_cmd_exec(rocker, rocker_port, + rocker_cmd_group_tbl_add, + found, NULL, NULL, nowait); } static int rocker_group_tbl_del(struct rocker_port *rocker_port, @@ -2675,6 +2709,244 @@ static int rocker_group_l2_flood(struct rocker_port *rocker_port, group_id); } +static int rocker_group_l3_unicast(struct rocker_port *rocker_port, + int flags, u32 index, u8 *src_mac, + u8 *dst_mac, __be16 vlan_id, + bool ttl_check, u32 pport) +{ + struct rocker_group_tbl_entry *entry; + + entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags)); + if (!entry) + return -ENOMEM; + + entry->group_id = ROCKER_GROUP_L3_UNICAST(index); + if (src_mac) + ether_addr_copy(entry->l3_unicast.eth_src, src_mac); + if (dst_mac) + ether_addr_copy(entry->l3_unicast.eth_dst, dst_mac); + entry->l3_unicast.vlan_id = vlan_id; + entry->l3_unicast.ttl_check = ttl_check; + entry->l3_unicast.group_id = ROCKER_GROUP_L2_INTERFACE(vlan_id, pport); + + return rocker_group_tbl_do(rocker_port, flags, entry); +} + +static struct rocker_neigh_tbl_entry * + rocker_neigh_tbl_find(struct rocker *rocker, __be32 ip_addr) +{ + struct rocker_neigh_tbl_entry *found; + + hash_for_each_possible(rocker->neigh_tbl, found, entry, ip_addr) + if (found->ip_addr == ip_addr) + return found; + + return NULL; +} + +static void _rocker_neigh_add(struct rocker *rocker, + struct rocker_neigh_tbl_entry *entry) +{ + entry->index = rocker->neigh_tbl_next_index++; + entry->ref_count++; + hash_add(rocker->neigh_tbl, &entry->entry, entry->ip_addr); +} + +static void _rocker_neigh_del(struct rocker *rocker, + 
struct rocker_neigh_tbl_entry *entry) +{ + if (--entry->ref_count == 0) { + hash_del(&entry->entry); + kfree(entry); + } +} + +static void _rocker_neigh_update(struct rocker *rocker, + struct rocker_neigh_tbl_entry *entry, + u8 *eth_dst, bool ttl_check) +{ + if (eth_dst) { + ether_addr_copy(entry->eth_dst, eth_dst); + entry->ttl_check = ttl_check; + } else { + entry->ref_count++; + } +} + +static int rocker_port_ipv4_neigh(struct rocker_port *rocker_port, + int flags, __be32 ip_addr, u8 *eth_dst) +{ + struct rocker *rocker = rocker_port->rocker; + struct rocker_neigh_tbl_entry *entry; + struct rocker_neigh_tbl_entry *found; + unsigned long lock_flags; + __be16 eth_type = htons(ETH_P_IP); + enum rocker_of_dpa_table_id goto_tbl = + ROCKER_OF_DPA_TABLE_ID_ACL_POLICY; + u32 group_id; + u32 priority = 0; + bool adding = !(flags & ROCKER_OP_FLAG_REMOVE); + bool updating; + bool removing; + int err = 0; + + entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags)); + if (!entry) + return -ENOMEM; + + spin_lock_irqsave(&rocker->neigh_tbl_lock, lock_flags); + + found = rocker_neigh_tbl_find(rocker, ip_addr); + + updating = found && adding; + removing = found && !adding; + adding = !found && adding; + + if (adding) { + entry->ip_addr = ip_addr; + entry->dev = rocker_port->dev; + ether_addr_copy(entry->eth_dst, eth_dst); + entry->ttl_check = true; + _rocker_neigh_add(rocker, entry); + } else if (removing) { + memcpy(entry, found, sizeof(*entry)); + _rocker_neigh_del(rocker, found); + } else if (updating) { + _rocker_neigh_update(rocker, found, eth_dst, true); + memcpy(entry, found, sizeof(*entry)); + } else { + err = -ENOENT; + } + + spin_unlock_irqrestore(&rocker->neigh_tbl_lock, lock_flags); + + if (err) + goto err_out; + + /* For each active neighbor, we have an L3 unicast group and + * a /32 route to the neighbor, which uses the L3 unicast + * group. The L3 unicast group can also be referred to by + * other routes' nexthops. 
+ */ + + err = rocker_group_l3_unicast(rocker_port, flags, + entry->index, + rocker_port->dev->dev_addr, + entry->eth_dst, + rocker_port->internal_vlan_id, + entry->ttl_check, + rocker_port->pport); + if (err) { + netdev_err(rocker_port->dev, + "Error (%d) L3 unicast group index %d\n", + err, entry->index); + goto err_out; + } + + if (adding || removing) { + group_id = ROCKER_GROUP_L3_UNICAST(entry->index); + err = rocker_flow_tbl_ucast4_routing(rocker_port, + eth_type, ip_addr, + inet_make_mask(32), + priority, goto_tbl, + group_id, flags); + + if (err) + netdev_err(rocker_port->dev, + "Error (%d) /32 unicast route %pI4 group 0x%08x\n", + err, &entry->ip_addr, group_id); + } + +err_out: + if (!adding) + kfree(entry); + + return err; +} + +static int rocker_port_ipv4_resolve(struct rocker_port *rocker_port, + __be32 ip_addr) +{ + struct net_device *dev = rocker_port->dev; + struct neighbour *n = __ipv4_neigh_lookup(dev, ip_addr); + int err = 0; + + if (!n) + n = neigh_create(&arp_tbl, &ip_addr, dev); + if (!n) + return -ENOMEM; + + /* If the neigh is already resolved, then go ahead and + * install the entry, otherwise start the ARP process to + * resolve the neigh. 
+ */ + + if (n->nud_state & NUD_VALID) + err = rocker_port_ipv4_neigh(rocker_port, 0, ip_addr, n->ha); + else + neigh_event_send(n, NULL); + + return err; +} + +static int rocker_port_ipv4_nh(struct rocker_port *rocker_port, int flags, + __be32 ip_addr, u32 *index) +{ + struct rocker *rocker = rocker_port->rocker; + struct rocker_neigh_tbl_entry *entry; + struct rocker_neigh_tbl_entry *found; + unsigned long lock_flags; + bool adding = !(flags & ROCKER_OP_FLAG_REMOVE); + bool updating; + bool removing; + bool resolved = true; + int err = 0; + + entry = kzalloc(sizeof(*entry), rocker_op_flags_gfp(flags)); + if (!entry) + return -ENOMEM; + + spin_lock_irqsave(&rocker->neigh_tbl_lock, lock_flags); + + found = rocker_neigh_tbl_find(rocker, ip_addr); + if (found) + *index = found->index; + + updating = found && adding; + removing = found && !adding; + adding = !found && adding; + + if (adding) { + entry->ip_addr = ip_addr; + entry->dev = rocker_port->dev; + _rocker_neigh_add(rocker, entry); + *index = entry->index; + resolved = false; + } else if (removing) { + _rocker_neigh_del(rocker, found); + } else if (updating) { + _rocker_neigh_update(rocker, found, NULL, false); + resolved = !is_zero_ether_addr(found->eth_dst); + } else { + err = -ENOENT; + } + + spin_unlock_irqrestore(&rocker->neigh_tbl_lock, lock_flags); + + if (!adding) + kfree(entry); + + if (err) + return err; + + /* Resolved means neigh ip_addr is resolved to neigh mac. 
*/ + + if (!resolved) + err = rocker_port_ipv4_resolve(rocker_port, ip_addr); + + return err; +} + static int rocker_port_vlan_flood_group(struct rocker_port *rocker_port, int flags, __be16 vlan_id) { @@ -3429,6 +3701,51 @@ not_found: spin_unlock_irqrestore(&rocker->internal_vlan_tbl_lock, lock_flags); } +static int rocker_port_fib_ipv4(struct rocker_port *rocker_port, __be32 dst, + int dst_len, struct fib_info *fi, u32 tb_id, + int flags) +{ + struct fib_nh *nh; + __be16 eth_type = htons(ETH_P_IP); + __be32 dst_mask = inet_make_mask(dst_len); + __be16 internal_vlan_id = rocker_port->internal_vlan_id; + u32 priority = fi->fib_priority; + enum rocker_of_dpa_table_id goto_tbl = + ROCKER_OF_DPA_TABLE_ID_ACL_POLICY; + u32 group_id; + bool nh_on_port; + bool has_gw; + u32 index; + int err; + + /* XXX support ECMP */ + + nh = fi->fib_nh; + nh_on_port = (fi->fib_dev == rocker_port->dev); + has_gw = !!nh->nh_gw; + + if (has_gw && nh_on_port) { + err = rocker_port_ipv4_nh(rocker_port, flags, + nh->nh_gw, &index); + if (err) + return err; + + group_id = ROCKER_GROUP_L3_UNICAST(index); + } else { + /* Send to CPU for processing */ + group_id = ROCKER_GROUP_L2_INTERFACE(internal_vlan_id, 0); + } + + err = rocker_flow_tbl_ucast4_routing(rocker_port, eth_type, dst, + dst_mask, priority, goto_tbl, + group_id, flags); + if (err) + netdev_err(rocker_port->dev, "Error (%d) IPv4 route %pI4\n", + err, &dst); + + return err; +} + /***************** * Net device ops *****************/ @@ -3830,6 +4147,30 @@ static int rocker_port_switch_port_stp_update(struct net_device *dev, u8 state) return rocker_port_stp_update(rocker_port, state); } +static int rocker_port_switch_fib_ipv4_add(struct net_device *dev, + __be32 dst, int dst_len, + struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) +{ + struct rocker_port *rocker_port = netdev_priv(dev); + int flags = 0; + + return rocker_port_fib_ipv4(rocker_port, dst, dst_len, + fi, tb_id, flags); +} + +static int 
rocker_port_switch_fib_ipv4_del(struct net_device *dev, + __be32 dst, int dst_len, + struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) +{ + struct rocker_port *rocker_port = netdev_priv(dev); + int flags = ROCKER_OP_FLAG_REMOVE; + + return rocker_port_fib_ipv4(rocker_port, dst, dst_len, + fi, tb_id, flags); +} + static const struct net_device_ops rocker_port_netdev_ops = { .ndo_open = rocker_port_open, .ndo_stop = rocker_port_stop, @@ -3844,6 +4185,8 @@ static const struct net_device_ops rocker_port_netdev_ops = { .ndo_bridge_getlink = rocker_port_bridge_getlink, .ndo_switch_parent_id_get = rocker_port_switch_parent_id_get, .ndo_switch_port_stp_update = rocker_port_switch_port_stp_update, + .ndo_switch_fib_ipv4_add = rocker_port_switch_fib_ipv4_add, + .ndo_switch_fib_ipv4_del = rocker_port_switch_fib_ipv4_del, }; /******************** @@ -4204,8 +4547,9 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number) NAPI_POLL_WEIGHT); rocker_carrier_init(rocker_port); - dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | - NETIF_F_HW_SWITCH_OFFLOAD; + dev->features |= NETIF_F_NETNS_LOCAL | + NETIF_F_HW_VLAN_CTAG_FILTER | + NETIF_F_HW_SWITCH_OFFLOAD; err = register_netdev(dev); if (err) { @@ -4546,6 +4890,48 @@ static struct notifier_block rocker_netdevice_nb __read_mostly = { .notifier_call = rocker_netdevice_event, }; +/************************************ + * Net event notifier event handler + ************************************/ + +static int rocker_neigh_update(struct net_device *dev, struct neighbour *n) +{ + struct rocker_port *rocker_port = netdev_priv(dev); + int flags = (n->nud_state & NUD_VALID) ? 
0 : ROCKER_OP_FLAG_REMOVE; + __be32 ip_addr = *(__be32 *)n->primary_key; + + return rocker_port_ipv4_neigh(rocker_port, flags, ip_addr, n->ha); +} + +static int rocker_netevent_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev; + struct neighbour *n = ptr; + int err; + + switch (event) { + case NETEVENT_NEIGH_UPDATE: + if (n->tbl != &arp_tbl) + return NOTIFY_DONE; + dev = n->dev; + if (!rocker_port_dev_check(dev)) + return NOTIFY_DONE; + err = rocker_neigh_update(dev, n); + if (err) + netdev_warn(dev, + "failed to handle neigh update (err %d)\n", + err); + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block rocker_netevent_nb __read_mostly = { + .notifier_call = rocker_netevent_event, +}; + /*********************** * Module init and exit ***********************/ @@ -4555,18 +4941,21 @@ static int __init rocker_module_init(void) int err; register_netdevice_notifier(&rocker_netdevice_nb); + register_netevent_notifier(&rocker_netevent_nb); err = pci_register_driver(&rocker_pci_driver); if (err) goto err_pci_register_driver; return 0; err_pci_register_driver: + unregister_netdevice_notifier(&rocker_netevent_nb); unregister_netdevice_notifier(&rocker_netdevice_nb); return err; } static void __exit rocker_module_exit(void) { + unregister_netevent_notifier(&rocker_netevent_nb); unregister_netdevice_notifier(&rocker_netdevice_nb); pci_unregister_driver(&rocker_pci_driver); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 625c8d71511b..45413784a3b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -768,6 +768,8 @@ struct netdev_phys_item_id { typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); +struct fib_info; + /* * This structure defines the management hooks for network devices. 
* The following hooks can be defined; unless noted otherwise, they are @@ -1031,6 +1033,14 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state); * Called to notify switch device port of bridge port STP * state change. + * int (*ndo_switch_fib_ipv4_add)(struct net_device *dev, __be32 dst, + * int dst_len, struct fib_info *fi, + * u8 tos, u8 type, u32 tb_id); + * Called to add/modify IPv4 route to switch device. + * int (*ndo_switch_fib_ipv4_del)(struct net_device *dev, __be32 dst, + * int dst_len, struct fib_info *fi, + * u8 tos, u8 type, u32 tb_id); + * Called to delete IPv4 route from switch device. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1192,6 +1202,18 @@ struct net_device_ops { struct netdev_phys_item_id *psid); int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state); + int (*ndo_switch_fib_ipv4_add)(struct net_device *dev, + __be32 dst, + int dst_len, + struct fib_info *fi, + u8 tos, u8 type, + u32 tb_id); + int (*ndo_switch_fib_ipv4_del)(struct net_device *dev, + __be32 dst, + int dst_len, + struct fib_info *fi, + u8 tos, u8 type, + u32 tb_id); #endif }; diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 825cb2800908..1657604c5dd3 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -196,6 +196,7 @@ int fib_table_delete(struct fib_table *, struct fib_config *); int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb); int fib_table_flush(struct fib_table *table); +void fib_table_flush_external(struct fib_table *table); void fib_free_table(struct fib_table *tb); @@ -308,6 +309,7 @@ static inline int fib_num_tclassid_users(struct net *net) return 0; } #endif +void fib_flush_external(struct net *net); /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 
db1db158a00e..1085e12f940f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,6 +47,7 @@ struct netns_ipv4 { int fib_num_tclassid_users; #endif struct hlist_head *fib_table_hash; + bool fib_offload_disabled; struct sock *fibnl; struct sock * __percpu *icmp_sk; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index cfcdac2e5d25..dc0a5cc7c2c5 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -51,6 +51,12 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags); int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags); +int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id); +int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id); +void netdev_switch_fib_ipv4_abort(struct fib_info *fi); + #else static inline int netdev_switch_parent_id_get(struct net_device *dev, @@ -109,6 +115,24 @@ static inline int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device * return 0; } +static inline int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, + struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) +{ + return 0; +} + +static inline int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, + struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) +{ + return 0; +} + +void netdev_switch_fib_ipv4_abort(struct fib_info *fi) +{ +} + #endif #endif /* _LINUX_SWITCHDEV_H_ */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 06f75a407f74..c3722b024e73 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -334,6 +334,7 @@ struct rtnexthop { #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ +#define RTNH_F_EXTERNAL 8 /* Route installed externally */ /* 
Macros to handle hexthops */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 220c4b4af4cf..e067770235bf 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -144,6 +144,19 @@ static void fib_flush(struct net *net) rt_cache_flush(net); } +void fib_flush_external(struct net *net) +{ + struct fib_table *tb; + struct hlist_head *head; + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry(tb, head, tb_hlist) + fib_table_flush_external(tb); + } +} + /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index d3db718be51d..190d0d00d744 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -209,6 +209,8 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->tos = frh->tos; net->ipv4.fib_has_custom_rules = true; + fib_flush_external(rule->fr_net); + err = 0; errout: return err; @@ -224,6 +226,7 @@ static void fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; + fib_flush_external(rule->fr_net); } static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index fae34ad4bb1a..6544f1a0cfa1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -79,6 +79,7 @@ #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/switchdev.h> #include "fib_lookup.h" #define MAX_STAT_DEPTH 32 @@ -1135,7 +1136,18 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; + err = netdev_switch_fib_ipv4_add(key, plen, fi, + new_fa->fa_tos, + cfg->fc_type, + tb->tb_id); + if (err) { + netdev_switch_fib_ipv4_abort(fi); + kmem_cache_free(fn_alias_kmem, 
new_fa); + goto out; + } + hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); + alias_free_mem_rcu(fa); fib_release_info(fi_drop); @@ -1171,10 +1183,18 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_state = 0; new_fa->fa_slen = slen; + /* (Optionally) offload fib entry to switch hardware. */ + err = netdev_switch_fib_ipv4_add(key, plen, fi, tos, + cfg->fc_type, tb->tb_id); + if (err) { + netdev_switch_fib_ipv4_abort(fi); + goto out_free_new_fa; + } + /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) - goto out_free_new_fa; + goto out_sw_fib_del; if (!plen) tb->tb_num_default++; @@ -1185,6 +1205,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) succeeded: return 0; +out_sw_fib_del: + netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1456,6 +1478,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!fa_to_delete) return -ESRCH; + netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, + cfg->fc_type, tb->tb_id); + rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1536,6 +1561,67 @@ found: return n; } +/* Caller must hold RTNL */ +void fib_table_flush_external(struct fib_table *tb) +{ + struct trie *t = (struct trie *)tb->tb_data; + struct fib_alias *fa; + struct tnode *n, *pn; + unsigned long cindex; + unsigned char slen; + int found = 0; + + n = rcu_dereference(t->trie); + if (!n) + return; + + pn = NULL; + cindex = 0; + + while (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; +backtrace: + /* walk trie in reverse order */ + do { + while (!(cindex--)) { + t_key pkey = pn->key; + + n = pn; + pn = node_parent(n); + + /* resize completed node */ + resize(t, n); + + /* if we got the root we are done */ + if (!pn) + return; + + cindex = get_index(pkey, pn); + } 
+ + /* grab the next available node */ + n = tnode_get_child(pn, cindex); + } while (!n); + } + + hlist_for_each_entry(fa, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (fi && (fi->fib_flags & RTNH_F_EXTERNAL)) { + netdev_switch_fib_ipv4_del(n->key, + KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, + fa->fa_type, tb->tb_id); + } + } + + /* if trie is leaf only loop is completed */ + if (pn) + goto backtrace; +} + /* Caller must hold RTNL. */ int fib_table_flush(struct fib_table *tb) { @@ -1589,6 +1675,10 @@ backtrace: struct fib_info *fi = fa->fa_info; if (fi && (fi->fib_flags & RTNH_F_DEAD)) { + netdev_switch_fib_ipv4_del(n->key, + KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, + fa->fa_type, tb->tb_id); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 8c1e558db118..f4fd575aa2a3 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -14,6 +14,7 @@ #include <linux/mutex.h> #include <linux/notifier.h> #include <linux/netdevice.h> +#include <net/ip_fib.h> #include <net/switchdev.h> /** @@ -225,3 +226,163 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, return ret; } EXPORT_SYMBOL(ndo_dflt_netdev_switch_port_bridge_dellink); + +static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct net_device *lower_dev; + struct net_device *port_dev; + struct list_head *iter; + + /* Recursively search down until we find a sw port dev. + * (A sw port dev supports ndo_switch_parent_id_get). 
+ */ + + if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD && + ops->ndo_switch_parent_id_get) + return dev; + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + port_dev = netdev_switch_get_lowest_dev(lower_dev); + if (port_dev) + return port_dev; + } + + return NULL; +} + +static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) +{ + struct netdev_phys_item_id psid; + struct netdev_phys_item_id prev_psid; + struct net_device *dev = NULL; + int nhsel; + + /* For this route, all nexthop devs must be on the same switch. */ + + for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + const struct fib_nh *nh = &fi->fib_nh[nhsel]; + + if (!nh->nh_dev) + return NULL; + + dev = netdev_switch_get_lowest_dev(nh->nh_dev); + if (!dev) + return NULL; + + if (netdev_switch_parent_id_get(dev, &psid)) + return NULL; + + if (nhsel > 0) { + if (prev_psid.id_len != psid.id_len) + return NULL; + if (memcmp(prev_psid.id, psid.id, psid.id_len)) + return NULL; + } + + prev_psid = psid; + } + + return dev; +} + +/** + * netdev_switch_fib_ipv4_add - Add IPv4 route entry to switch + * + * @dst: route's IPv4 destination address + * @dst_len: destination address length (prefix length) + * @fi: route FIB info structure + * @tos: route TOS + * @type: route type + * @tb_id: route table ID + * + * Add IPv4 route entry to switch device. + */ +int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) +{ + struct net_device *dev; + const struct net_device_ops *ops; + int err = 0; + + /* Don't offload route if using custom ip rules or if + * IPv4 FIB offloading has been disabled completely. 
+ */ + + if (fi->fib_net->ipv4.fib_has_custom_rules | + fi->fib_net->ipv4.fib_offload_disabled) + return 0; + + dev = netdev_switch_get_dev_by_nhs(fi); + if (!dev) + return 0; + ops = dev->netdev_ops; + + if (ops->ndo_switch_fib_ipv4_add) { + err = ops->ndo_switch_fib_ipv4_add(dev, htonl(dst), dst_len, + fi, tos, type, tb_id); + if (!err) + fi->fib_flags |= RTNH_F_EXTERNAL; + } + + return err; +} +EXPORT_SYMBOL(netdev_switch_fib_ipv4_add); + +/** + * netdev_switch_fib_ipv4_del - Delete IPv4 route entry from switch + * + * @dst: route's IPv4 destination address + * @dst_len: destination address length (prefix length) + * @fi: route FIB info structure + * @tos: route TOS + * @type: route type + * @tb_id: route table ID + * + * Delete IPv4 route entry from switch device. + */ +int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) +{ + struct net_device *dev; + const struct net_device_ops *ops; + int err = 0; + + if (!(fi->fib_flags & RTNH_F_EXTERNAL)) + return 0; + + dev = netdev_switch_get_dev_by_nhs(fi); + if (!dev) + return 0; + ops = dev->netdev_ops; + + if (ops->ndo_switch_fib_ipv4_del) { + err = ops->ndo_switch_fib_ipv4_del(dev, htonl(dst), dst_len, + fi, tos, type, tb_id); + if (!err) + fi->fib_flags &= ~RTNH_F_EXTERNAL; + } + + return err; +} +EXPORT_SYMBOL(netdev_switch_fib_ipv4_del); + +/** + * netdev_switch_fib_ipv4_abort - Abort an IPv4 FIB operation + * + * @fi: route FIB info structure + */ +void netdev_switch_fib_ipv4_abort(struct fib_info *fi) +{ + /* There was a problem installing this route to the offload + * device. For now, until we come up with more refined + * policy handling, abruptly end IPv4 fib offloading for + * the entire net by flushing offload device(s) of all + * IPv4 routes, and mark IPv4 fib offloading broken from + * this point forward. + */ + + fib_flush_external(fi->fib_net); + fi->fib_net->ipv4.fib_offload_disabled = true; +} +EXPORT_SYMBOL(netdev_switch_fib_ipv4_abort); |