From 88d0895d0ea9d4431507d576c963f2ff9918144d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 31 Aug 2018 15:08:44 +0200 Subject: batman-adv: Avoid probe ELP information leak The probe ELPs for WiFi interfaces are expanded to contain at least BATADV_ELP_MIN_PROBE_SIZE bytes. This is usually a lot more than the number of bytes which the template ELP packet requires. These extra padding bytes were not initialized and thus could contain data which were previously stored at the same location. It is therefore required to set it to some predefined or random values to avoid leaking private information from the system transmitting these kind of packets. Fixes: e4623c913508 ("batman-adv: Avoid probe ELP information leak") Signed-off-by: Sven Eckelmann Acked-by: Antonio Quartulli Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 71c20c1d4002..e103c759b7ab 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -241,7 +241,7 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) * the packet to be exactly of that size to make the link * throughput estimation effective. */ - skb_put(skb, probe_len - hard_iface->bat_v.elp_skb->len); + skb_put_zero(skb, probe_len - hard_iface->bat_v.elp_skb->len); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Sending unicast (probe) ELP packet on interface %s to %pM\n", -- cgit v1.2.3 From b9fd14c20871e6189f635e49b32d7789e430b3c8 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 31 Aug 2018 16:46:47 +0200 Subject: batman-adv: Fix segfault when writing to throughput_override The per hardif sysfs file "batman_adv/throughput_override" prints the resulting change as info text when the users writes to this file. It uses the helper function batadv_info to add it at the same time to the kernel ring buffer and to the batman-adv debug log (when CONFIG_BATMAN_ADV_DEBUG is enabled). The function batadv_info requires as first parameter the batman-adv softif net_device. This parameter is then used to find the private buffer which contains the debug log for this batman-adv interface. But batadv_store_throughput_override used as first argument the slave net_device. This slave device doesn't have the batadv_priv private data which is access by batadv_info. Writing to this file with CONFIG_BATMAN_ADV_DEBUG enabled can either lead to a segfault or to memory corruption. Fixes: 0b5ecc6811bd ("batman-adv: add throughput override attribute to hard_ifaces") Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/sysfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index f2eef43bd2ec..3a76e8970c02 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1090,8 +1090,9 @@ static ssize_t batadv_store_throughput_override(struct kobject *kobj, if (old_tp_override == tp_override) goto out; - batadv_info(net_dev, "%s: Changing from: %u.%u MBit to: %u.%u MBit\n", - "throughput_override", + batadv_info(hard_iface->soft_iface, + "%s: %s: Changing from: %u.%u MBit to: %u.%u MBit\n", + "throughput_override", net_dev->name, old_tp_override / 10, old_tp_override % 10, tp_override / 10, tp_override % 10); -- cgit v1.2.3 From a25bab9d723a08bd0bdafb1529faf9094c690b70 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 31 Aug 2018 16:56:29 +0200 Subject: batman-adv: Fix segfault when writing to sysfs elp_interval The per hardif sysfs file "batman_adv/elp_interval" is using the generic functions to store/show uint values. The helper __batadv_store_uint_attr requires the softif net_device as parameter to print the resulting change as info text when the users writes to this file. It uses the helper function batadv_info to add it at the same time to the kernel ring buffer and to the batman-adv debug log (when CONFIG_BATMAN_ADV_DEBUG is enabled). The function batadv_info requires as first parameter the batman-adv softif net_device. This parameter is then used to find the private buffer which contains the debug log for this batman-adv interface. But batadv_store_throughput_override used as first argument the slave net_device. This slave device doesn't have the batadv_priv private data which is access by batadv_info. Writing to this file with CONFIG_BATMAN_ADV_DEBUG enabled can either lead to a segfault or to memory corruption. Fixes: 0744ff8fa8fa ("batman-adv: Add hard_iface specific sysfs wrapper macros for UINT") Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/sysfs.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 3a76e8970c02..09427fc6494a 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -188,7 +188,8 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ \ return __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ - &bat_priv->_var, net_dev); \ + &bat_priv->_var, net_dev, \ + NULL); \ } #define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \ @@ -262,7 +263,9 @@ ssize_t batadv_store_##_name(struct kobject *kobj, \ \ length = __batadv_store_uint_attr(buff, count, _min, _max, \ _post_func, attr, \ - &hard_iface->_var, net_dev); \ + &hard_iface->_var, \ + hard_iface->soft_iface, \ + net_dev); \ \ batadv_hardif_put(hard_iface); \ return length; \ @@ -356,10 +359,12 @@ __batadv_store_bool_attr(char *buff, size_t count, static int batadv_store_uint_attr(const char *buff, size_t count, struct net_device *net_dev, + struct net_device *slave_dev, const char *attr_name, unsigned int min, unsigned int max, atomic_t *attr) { + char ifname[IFNAMSIZ + 3] = ""; unsigned long uint_val; int ret; @@ -385,8 +390,11 @@ static int batadv_store_uint_attr(const char *buff, size_t count, if (atomic_read(attr) == uint_val) return count; - batadv_info(net_dev, "%s: Changing from: %i to: %lu\n", - attr_name, atomic_read(attr), uint_val); + if (slave_dev) + snprintf(ifname, sizeof(ifname), "%s: ", slave_dev->name); + + batadv_info(net_dev, "%s: %sChanging from: %i to: %lu\n", + attr_name, ifname, atomic_read(attr), uint_val); atomic_set(attr, uint_val); return count; @@ -397,12 +405,13 @@ static ssize_t __batadv_store_uint_attr(const char *buff, size_t count, void (*post_func)(struct net_device *), const struct attribute *attr, atomic_t *attr_store, - struct net_device *net_dev) + struct net_device *net_dev, + struct net_device *slave_dev) { int ret; - ret = batadv_store_uint_attr(buff, count, net_dev, attr->name, min, max, - attr_store); + ret = batadv_store_uint_attr(buff, count, net_dev, slave_dev, + attr->name, min, max, attr_store); if (post_func && ret) post_func(net_dev); @@ -571,7 +580,7 @@ static ssize_t batadv_store_gw_sel_class(struct kobject *kobj, return __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE, batadv_post_gw_reselect, attr, &bat_priv->gw.sel_class, - bat_priv->soft_iface); + bat_priv->soft_iface, NULL); } static ssize_t batadv_show_gw_bwidth(struct kobject *kobj, -- cgit v1.2.3 From dff9bc42ab0b2d38c5e90ddd79b238fed5b4c7ad Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Aug 2018 21:04:41 +0200 Subject: batman-adv: Prevent duplicated gateway_node entry The function batadv_gw_node_add is responsible for adding new gw_node to the gateway_list. It is expecting that the caller already checked that there is not already an entry with the same key or not. But the lock for the list is only held when the list is really modified. This could lead to duplicated entries because another context could create an entry with the same key between the check and the list manipulation. The check and the manipulation of the list must therefore be in the same locked code section. Fixes: c6c8fea29769 ("net: Add batman-adv meshing protocol") Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/gateway_client.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 8b198ee798c9..140c61a3f1ec 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -348,6 +349,9 @@ out: * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information + * + * Has to be called with the appropriate locks being acquired + * (gw.list_lock). */ static void batadv_gw_node_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, @@ -355,6 +359,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, { struct batadv_gw_node *gw_node; + lockdep_assert_held(&bat_priv->gw.list_lock); + if (gateway->bandwidth_down == 0) return; @@ -369,10 +375,8 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - spin_lock_bh(&bat_priv->gw.list_lock); kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list); - spin_unlock_bh(&bat_priv->gw.list_lock); batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n", @@ -428,11 +432,14 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, { struct batadv_gw_node *gw_node, *curr_gw = NULL; + spin_lock_bh(&bat_priv->gw.list_lock); gw_node = batadv_gw_node_get(bat_priv, orig_node); if (!gw_node) { batadv_gw_node_add(bat_priv, orig_node, gateway); + spin_unlock_bh(&bat_priv->gw.list_lock); goto out; } + spin_unlock_bh(&bat_priv->gw.list_lock); if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) && gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)) -- cgit v1.2.3 From fa122fec8640eb7186ce5a41b83a4c1744ceef8f Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Aug 2018 21:04:42 +0200 Subject: batman-adv: Prevent duplicated nc_node entry The function batadv_nc_get_nc_node is responsible for adding new nc_nodes to the in_coding_list and out_coding_list. It first checks whether the entry already is in the list or not. If it is, then the creation of a new entry is aborted. But the lock for the list is only held when the list is really modified. This could lead to duplicated entries because another context could create an entry with the same key between the check and the list manipulation. The check and the manipulation of the list must therefore be in the same locked code section. Fixes: d56b1705e28c ("batman-adv: network coding - detect coding nodes and remove these after timeout") Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- net/batman-adv/network-coding.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index c3578444f3cb..34caf129a9bf 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -854,16 +854,27 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ struct list_head *list; + /* Select ingoing or outgoing coding node */ + if (in_coding) { + lock = &orig_neigh_node->in_coding_list_lock; + list = &orig_neigh_node->in_coding_list; + } else { + lock = &orig_neigh_node->out_coding_list_lock; + list = &orig_neigh_node->out_coding_list; + } + + spin_lock_bh(lock); + /* Check if nc_node is already added */ nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); /* Node found */ if (nc_node) - return nc_node; + goto unlock; nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); if (!nc_node) - return NULL; + goto unlock; /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); @@ -872,22 +883,14 @@ batadv_nc_get_nc_node(struct batadv_priv *bat_priv, kref_get(&orig_neigh_node->refcount); nc_node->orig_node = orig_neigh_node; - /* Select ingoing or outgoing coding node */ - if (in_coding) { - lock = &orig_neigh_node->in_coding_list_lock; - list = &orig_neigh_node->in_coding_list; - } else { - lock = &orig_neigh_node->out_coding_list_lock; - list = &orig_neigh_node->out_coding_list; - } - batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n", nc_node->addr, nc_node->orig_node->orig); /* Add nc_node to orig_node */ - spin_lock_bh(lock); kref_get(&nc_node->refcount); list_add_tail_rcu(&nc_node->list, list); + +unlock: spin_unlock_bh(lock); return nc_node; -- cgit v1.2.3 From 94cb82f594ed86be303398d6dfc7640a6f1d45d4 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Aug 2018 21:04:43 +0200 Subject: batman-adv: Prevent duplicated softif_vlan entry The function batadv_softif_vlan_get is responsible for adding new softif_vlan to the softif_vlan_list. It first checks whether the entry already is in the list or not. If it is, then the creation of a new entry is aborted. But the lock for the list is only held when the list is really modified. This could lead to duplicated entries because another context could create an entry with the same key between the check and the list manipulation. The check and the manipulation of the list must therefore be in the same locked code section. Fixes: 5d2c05b21337 ("batman-adv: add per VLAN interface attribute framework") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/soft-interface.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 1485263a348b..626ddca332db 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -574,15 +574,20 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) struct batadv_softif_vlan *vlan; int err; + spin_lock_bh(&bat_priv->softif_vlan_list_lock); + vlan = batadv_softif_vlan_get(bat_priv, vid); if (vlan) { batadv_softif_vlan_put(vlan); + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); - if (!vlan) + if (!vlan) { + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); return -ENOMEM; + } vlan->bat_priv = bat_priv; vlan->vid = vid; @@ -590,17 +595,23 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) atomic_set(&vlan->ap_isolation, 0); + kref_get(&vlan->refcount); + hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); + spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + + /* batadv_sysfs_add_vlan cannot be in the spinlock section due to the + * sleeping behavior of the sysfs functions and the fs_reclaim lock + */ err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan); if (err) { - kfree(vlan); + /* ref for the function */ + batadv_softif_vlan_put(vlan); + + /* ref for the list */ + batadv_softif_vlan_put(vlan); return err; } - spin_lock_bh(&bat_priv->softif_vlan_list_lock); - kref_get(&vlan->refcount); - hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); - /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ -- cgit v1.2.3 From e7136e48ffdfb9f37b0820f619380485eb407361 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Aug 2018 21:04:44 +0200 Subject: batman-adv: Prevent duplicated global TT entry The function batadv_tt_global_orig_entry_add is responsible for adding new tt_orig_list_entry to the orig_list. It first checks whether the entry already is in the list or not. If it is, then the creation of a new entry is aborted. But the lock for the list is only held when the list is really modified. This could lead to duplicated entries because another context could create an entry with the same key between the check and the list manipulation. The check and the manipulation of the list must therefore be in the same locked code section. Fixes: d657e621a0f5 ("batman-adv: add reference counting for type batadv_tt_orig_list_entry") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 12a2b7d21376..d21624c44665 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1613,6 +1613,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, { struct batadv_tt_orig_list_entry *orig_entry; + spin_lock_bh(&tt_global->list_lock); + orig_entry = batadv_tt_global_orig_entry_find(tt_global, orig_node); if (orig_entry) { /* refresh the ttvn: the current value could be a bogus one that @@ -1635,11 +1637,9 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, orig_entry->flags = flags; kref_init(&orig_entry->refcount); - spin_lock_bh(&tt_global->list_lock); kref_get(&orig_entry->refcount); hlist_add_head_rcu(&orig_entry->list, &tt_global->orig_list); - spin_unlock_bh(&tt_global->list_lock); atomic_inc(&tt_global->orig_list_count); sync_flags: @@ -1647,6 +1647,8 @@ sync_flags: out: if (orig_entry) batadv_tt_orig_list_entry_put(orig_entry); + + spin_unlock_bh(&tt_global->list_lock); } /** -- cgit v1.2.3 From ae3cdc97dc10c7a3b31f297dab429bfb774c9ccb Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 12 Aug 2018 21:04:45 +0200 Subject: batman-adv: Prevent duplicated tvlv handler The function batadv_tvlv_handler_register is responsible for adding new tvlv_handler to the handler_list. It first checks whether the entry already is in the list or not. If it is, then the creation of a new entry is aborted. But the lock for the list is only held when the list is really modified. This could lead to duplicated entries because another context could create an entry with the same key between the check and the list manipulation. The check and the manipulation of the list must therefore be in the same locked code section. Fixes: ef26157747d4 ("batman-adv: tvlv - basic infrastructure") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/tvlv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index a637458205d1..40e69c9346d2 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -529,15 +529,20 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, { struct batadv_tvlv_handler *tvlv_handler; + spin_lock_bh(&bat_priv->tvlv.handler_list_lock); + tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (tvlv_handler) { + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); batadv_tvlv_handler_put(tvlv_handler); return; } tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); - if (!tvlv_handler) + if (!tvlv_handler) { + spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); return; + } tvlv_handler->ogm_handler = optr; tvlv_handler->unicast_handler = uptr; @@ -547,7 +552,6 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, kref_init(&tvlv_handler->refcount); INIT_HLIST_NODE(&tvlv_handler->list); - spin_lock_bh(&bat_priv->tvlv.handler_list_lock); kref_get(&tvlv_handler->refcount); hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); -- cgit v1.2.3 From 5af96b9c59c72fb2af2d19c5cc2f3cdcee391dff Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Fri, 7 Sep 2018 05:45:54 +0800 Subject: batman-adv: fix backbone_gw refcount on queue_work() failure The backbone_gw refcounter is to be decreased by the queued work and currently is never decreased if the queue_work() call fails. Fix by checking the queue_work() return value and decrease refcount if necessary. Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bridge_loop_avoidance.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ff9659af6b91..5f1aeeded0e3 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1772,6 +1772,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, { struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; + bool ret; ethhdr = eth_hdr(skb); @@ -1795,8 +1796,13 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, if (unlikely(!backbone_gw)) return true; - queue_work(batadv_event_workqueue, &backbone_gw->report_work); - /* backbone_gw is unreferenced in the report work function function */ + ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work); + + /* backbone_gw is unreferenced in the report work function function + * if queue_work() call was successful + */ + if (!ret) + batadv_backbone_gw_put(backbone_gw); return true; } -- cgit v1.2.3 From 4c4af6900844ab04c9434c972021d7b48610e06a Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Fri, 7 Sep 2018 05:45:55 +0800 Subject: batman-adv: fix hardif_neigh refcount on queue_work() failure The hardif_neigh refcounter is to be decreased by the queued work and currently is never decreased if the queue_work() call fails. Fix by checking the queue_work() return value and decrease refcount if necessary. Signed-off-by: Marek Lindner Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/bat_v_elp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index e103c759b7ab..9f481cfdf77d 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -268,6 +268,7 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) struct batadv_priv *bat_priv; struct sk_buff *skb; u32 elp_interval; + bool ret; bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); @@ -329,8 +330,11 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) * may sleep and that is not allowed in an rcu protected * context. Therefore schedule a task for that. */ - queue_work(batadv_event_workqueue, - &hardif_neigh->bat_v.metric_work); + ret = queue_work(batadv_event_workqueue, + &hardif_neigh->bat_v.metric_work); + + if (!ret) + batadv_hardif_neigh_put(hardif_neigh); } rcu_read_unlock(); -- cgit v1.2.3 From dabeb13eee81329338b1f8f330dfcc37a86714d7 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 14 Sep 2018 10:47:14 +0200 Subject: batman-adv: Increase version number to 2018.3 Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 8da3c9336111..3ccc75ee719c 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -25,7 +25,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2018.2" +#define BATADV_SOURCE_VERSION "2018.3" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3 From 250ae0d46d857ed61f3e0abf584901e46bd2c638 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 2 Sep 2018 12:01:53 +0300 Subject: net/mlx5: Fix read from coherent memory Use accessor function READ_ONCE to read from coherent memory modified by the device and read by the driver. This becomes most important in preemptive kernels where cond_resched implementation does not have the side effect which guaranteed the updated value. Fixes: 269d26f47f6f ("net/mlx5: Reduce command polling interval") Change-Id: Ie6deeb565ffaf76777b07448c7fbcce3510bbb8a Signed-off-by: Eli Cohen Reported-by: Jesper Dangaard Brouer Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 3ce14d42ddc8..a53736c26c0c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -206,7 +206,7 @@ static void poll_timeout(struct mlx5_cmd_work_ent *ent) u8 own; do { - own = ent->lay->status_own; + own = READ_ONCE(ent->lay->status_own); if (!(own & CMD_OWNER_HW)) { ent->ret = 0; return; -- cgit v1.2.3 From 6b359d5550a1ae7a1269c9dc1dd73dfdc4d6fe58 Mon Sep 17 00:00:00 2001 From: Alaa Hleihel Date: Mon, 3 Sep 2018 10:38:14 +0300 Subject: net/mlx5: Check for SQ and not RQ state when modifying hairpin SQ When modifying hairpin SQ, instead of checking if the next state equals to MLX5_SQC_STATE_RDY, we compare it against the MLX5_RQC_STATE_RDY enum value. The code worked since both of MLX5_RQC_STATE_RDY and MLX5_SQC_STATE_RDY have the same value today. This patch fixes this issue. Fixes: 18e568c390c6 ("net/mlx5: Hairpin pair core object setup") Change-Id: I6758aa7b4bd137966ae28206b70648c5bc223b46 Signed-off-by: Alaa Hleihel Reviewed-by: Or Gerlitz Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index dae1c5c5d27c..d2f76070ea7c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -509,7 +509,7 @@ static int mlx5_hairpin_modify_sq(struct mlx5_core_dev *peer_mdev, u32 sqn, sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); - if (next_state == MLX5_RQC_STATE_RDY) { + if (next_state == MLX5_SQC_STATE_RDY) { MLX5_SET(sqc, sqc, hairpin_peer_rq, peer_rq); MLX5_SET(sqc, sqc, hairpin_peer_vhca, peer_vhca); } -- cgit v1.2.3 From 8f92e35aff9692028279d3c03e88547df6d15020 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 15 Sep 2018 00:50:02 -0700 Subject: net/mlx5e: TLS, Read capabilities only when it is safe Read TLS caps from the core driver only when TLS is supported, i.e mlx5_accel_is_tls_device returns true. Fixes: 790af90c00d2 ("net/mlx5e: TLS, build TLS netdev from capabilities") Change-Id: I5f21ff4d684901af487e366a7e0cf032b54ee9cf Reported-by: Michal Kubecek Signed-off-by: Saeed Mahameed Reviewed-by: Boris Pismenny Reviewed-by: Tariq Toukan --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c index eddd7702680b..e88340e196f7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/tls.c @@ -183,12 +183,13 @@ static const struct tlsdev_ops mlx5e_tls_ops = { void mlx5e_tls_build_netdev(struct mlx5e_priv *priv) { - u32 caps = mlx5_accel_tls_device_caps(priv->mdev); struct net_device *netdev = priv->netdev; + u32 caps; if (!mlx5_accel_is_tls_device(priv->mdev)) return; + caps = mlx5_accel_tls_device_caps(priv->mdev); if (caps & MLX5_ACCEL_TLS_TX) { netdev->features |= NETIF_F_HW_TLS_TX; netdev->hw_features |= NETIF_F_HW_TLS_TX; -- cgit v1.2.3 From 8fce3331702316d4bcfeb0771c09ac75d2192bbc Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 17 Sep 2018 09:22:56 +0100 Subject: net: stmmac: Rework coalesce timer and fix multi-queue races This follows David Miller advice and tries to fix coalesce timer in multi-queue scenarios. We are now using per-queue coalesce values and per-queue TX timer. Coalesce timer default values was changed to 1ms and the coalesce frames to 25. Tested in B2B setup between XGMAC2 and GMAC5. Signed-off-by: Jose Abreu Fixes: ce736788e8a ("net: stmmac: adding multiple buffers for TX") Cc: Florian Fainelli Cc: Neil Armstrong Cc: Jerome Brunet Cc: Martin Blumenstingl Cc: David S. Miller Cc: Joao Pinto Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/common.h | 4 +- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 14 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 233 ++++++++++++---------- include/linux/stmmac.h | 1 + 4 files changed, 146 insertions(+), 106 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 1854f270ad66..b1b305f8f414 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -258,10 +258,10 @@ struct stmmac_safety_stats { #define MAX_DMA_RIWT 0xff #define MIN_DMA_RIWT 0x20 /* Tx coalesce parameters */ -#define STMMAC_COAL_TX_TIMER 40000 +#define STMMAC_COAL_TX_TIMER 1000 #define STMMAC_MAX_COAL_TX_TICK 100000 #define STMMAC_TX_MAX_FRAMES 256 -#define STMMAC_TX_FRAMES 64 +#define STMMAC_TX_FRAMES 25 /* Packets types */ enum packets_types { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index c0a855b7ab3b..63e1064b27a2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -48,6 +48,8 @@ struct stmmac_tx_info { /* Frequently used values are kept adjacent for cache effect */ struct stmmac_tx_queue { + u32 tx_count_frames; + struct timer_list txtimer; u32 queue_index; struct stmmac_priv *priv_data; struct dma_extended_desc *dma_etx ____cacheline_aligned_in_smp; @@ -73,7 +75,14 @@ struct stmmac_rx_queue { u32 rx_zeroc_thresh; dma_addr_t dma_rx_phy; u32 rx_tail_addr; +}; + +struct stmmac_channel { struct napi_struct napi ____cacheline_aligned_in_smp; + struct stmmac_priv *priv_data; + u32 index; + int has_rx; + int has_tx; }; struct stmmac_tc_entry { @@ -109,14 +118,12 @@ struct stmmac_pps_cfg { struct stmmac_priv { /* Frequently used values are kept adjacent for cache effect */ - u32 tx_count_frames; u32 tx_coal_frames; u32 tx_coal_timer; int tx_coalesce; int hwts_tx_en; bool tx_path_in_lpi_mode; - struct timer_list txtimer; bool tso; unsigned int dma_buf_sz; @@ -137,6 +144,9 @@ struct stmmac_priv { /* TX Queue */ struct stmmac_tx_queue tx_queue[MTL_MAX_TX_QUEUES]; + /* Generic channel for NAPI */ + struct stmmac_channel channel[STMMAC_CH_MAX]; + bool oldlink; int speed; int oldduplex; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 9f458bb16f2a..ab9cc0143ff2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -148,12 +148,14 @@ static void stmmac_verify_args(void) static void stmmac_disable_all_queues(struct stmmac_priv *priv) { u32 rx_queues_cnt = priv->plat->rx_queues_to_use; + u32 tx_queues_cnt = priv->plat->tx_queues_to_use; + u32 maxq = max(rx_queues_cnt, tx_queues_cnt); u32 queue; - for (queue = 0; queue < rx_queues_cnt; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + for (queue = 0; queue < maxq; queue++) { + struct stmmac_channel *ch = &priv->channel[queue]; - napi_disable(&rx_q->napi); + napi_disable(&ch->napi); } } @@ -164,12 +166,14 @@ static void stmmac_disable_all_queues(struct stmmac_priv *priv) static void stmmac_enable_all_queues(struct stmmac_priv *priv) { u32 rx_queues_cnt = priv->plat->rx_queues_to_use; + u32 tx_queues_cnt = priv->plat->tx_queues_to_use; + u32 maxq = max(rx_queues_cnt, tx_queues_cnt); u32 queue; - for (queue = 0; queue < rx_queues_cnt; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + for (queue = 0; queue < maxq; queue++) { + struct stmmac_channel *ch = &priv->channel[queue]; - napi_enable(&rx_q->napi); + napi_enable(&ch->napi); } } @@ -1843,18 +1847,18 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) * @queue: TX queue index * Description: it reclaims the transmit resources after transmission completes. */ -static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue) +static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue) { struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue]; unsigned int bytes_compl = 0, pkts_compl = 0; - unsigned int entry; + unsigned int entry, count = 0; - netif_tx_lock(priv->dev); + __netif_tx_lock_bh(netdev_get_tx_queue(priv->dev, queue)); priv->xstats.tx_clean++; entry = tx_q->dirty_tx; - while (entry != tx_q->cur_tx) { + while ((entry != tx_q->cur_tx) && (count < budget)) { struct sk_buff *skb = tx_q->tx_skbuff[entry]; struct dma_desc *p; int status; @@ -1870,6 +1874,8 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue) if (unlikely(status & tx_dma_own)) break; + count++; + /* Make sure descriptor fields are read after reading * the own bit. */ @@ -1937,7 +1943,10 @@ static void stmmac_tx_clean(struct stmmac_priv *priv, u32 queue) stmmac_enable_eee_mode(priv); mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(eee_timer)); } - netif_tx_unlock(priv->dev); + + __netif_tx_unlock_bh(netdev_get_tx_queue(priv->dev, queue)); + + return count; } /** @@ -2020,6 +2029,33 @@ static bool stmmac_safety_feat_interrupt(struct stmmac_priv *priv) return false; } +static int stmmac_napi_check(struct stmmac_priv *priv, u32 chan) +{ + int status = stmmac_dma_interrupt_status(priv, priv->ioaddr, + &priv->xstats, chan); + struct stmmac_channel *ch = &priv->channel[chan]; + bool needs_work = false; + + if ((status & handle_rx) && ch->has_rx) { + needs_work = true; + } else { + status &= ~handle_rx; + } + + if ((status & handle_tx) && ch->has_tx) { + needs_work = true; + } else { + status &= ~handle_tx; + } + + if (needs_work && napi_schedule_prep(&ch->napi)) { + stmmac_disable_dma_irq(priv, priv->ioaddr, chan); + __napi_schedule(&ch->napi); + } + + return status; +} + /** * stmmac_dma_interrupt - DMA ISR * @priv: driver private structure @@ -2034,57 +2070,14 @@ static void stmmac_dma_interrupt(struct stmmac_priv *priv) u32 channels_to_check = tx_channel_count > rx_channel_count ? tx_channel_count : rx_channel_count; u32 chan; - bool poll_scheduled = false; int status[max_t(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)]; /* Make sure we never check beyond our status buffer. */ if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status))) channels_to_check = ARRAY_SIZE(status); - /* Each DMA channel can be used for rx and tx simultaneously, yet - * napi_struct is embedded in struct stmmac_rx_queue rather than in a - * stmmac_channel struct. - * Because of this, stmmac_poll currently checks (and possibly wakes) - * all tx queues rather than just a single tx queue. - */ for (chan = 0; chan < channels_to_check; chan++) - status[chan] = stmmac_dma_interrupt_status(priv, priv->ioaddr, - &priv->xstats, chan); - - for (chan = 0; chan < rx_channel_count; chan++) { - if (likely(status[chan] & handle_rx)) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[chan]; - - if (likely(napi_schedule_prep(&rx_q->napi))) { - stmmac_disable_dma_irq(priv, priv->ioaddr, chan); - __napi_schedule(&rx_q->napi); - poll_scheduled = true; - } - } - } - - /* If we scheduled poll, we already know that tx queues will be checked. - * If we didn't schedule poll, see if any DMA channel (used by tx) has a - * completed transmission, if so, call stmmac_poll (once). - */ - if (!poll_scheduled) { - for (chan = 0; chan < tx_channel_count; chan++) { - if (status[chan] & handle_tx) { - /* It doesn't matter what rx queue we choose - * here. We use 0 since it always exists. - */ - struct stmmac_rx_queue *rx_q = - &priv->rx_queue[0]; - - if (likely(napi_schedule_prep(&rx_q->napi))) { - stmmac_disable_dma_irq(priv, - priv->ioaddr, chan); - __napi_schedule(&rx_q->napi); - } - break; - } - } - } + status[chan] = stmmac_napi_check(priv, chan); for (chan = 0; chan < tx_channel_count; chan++) { if (unlikely(status[chan] & tx_hard_error_bump_tc)) { @@ -2233,6 +2226,13 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) return ret; } +static void stmmac_tx_timer_arm(struct stmmac_priv *priv, u32 queue) +{ + struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue]; + + mod_timer(&tx_q->txtimer, STMMAC_COAL_TIMER(priv->tx_coal_timer)); +} + /** * stmmac_tx_timer - mitigation sw timer for tx. * @data: data pointer @@ -2241,13 +2241,14 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) */ static void stmmac_tx_timer(struct timer_list *t) { - struct stmmac_priv *priv = from_timer(priv, t, txtimer); - u32 tx_queues_count = priv->plat->tx_queues_to_use; - u32 queue; + struct stmmac_tx_queue *tx_q = from_timer(tx_q, t, txtimer); + struct stmmac_priv *priv = tx_q->priv_data; + struct stmmac_channel *ch; + + ch = &priv->channel[tx_q->queue_index]; - /* let's scan all the tx queues */ - for (queue = 0; queue < tx_queues_count; queue++) - stmmac_tx_clean(priv, queue); + if (likely(napi_schedule_prep(&ch->napi))) + __napi_schedule(&ch->napi); } /** @@ -2260,11 +2261,17 @@ static void stmmac_tx_timer(struct timer_list *t) */ static void stmmac_init_tx_coalesce(struct stmmac_priv *priv) { + u32 tx_channel_count = priv->plat->tx_queues_to_use; + u32 chan; + priv->tx_coal_frames = STMMAC_TX_FRAMES; priv->tx_coal_timer = STMMAC_COAL_TX_TIMER; - timer_setup(&priv->txtimer, stmmac_tx_timer, 0); - priv->txtimer.expires = STMMAC_COAL_TIMER(priv->tx_coal_timer); - add_timer(&priv->txtimer); + + for (chan = 0; chan < tx_channel_count; chan++) { + struct stmmac_tx_queue *tx_q = &priv->tx_queue[chan]; + + timer_setup(&tx_q->txtimer, stmmac_tx_timer, 0); + } } static void stmmac_set_rings_length(struct stmmac_priv *priv) @@ -2592,6 +2599,7 @@ static void stmmac_hw_teardown(struct net_device *dev) static int stmmac_open(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); + u32 chan; int ret; stmmac_check_ether_addr(priv); @@ -2688,7 +2696,9 @@ irq_error: if (dev->phydev) phy_stop(dev->phydev); - del_timer_sync(&priv->txtimer); + for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) + del_timer_sync(&priv->tx_queue[chan].txtimer); + stmmac_hw_teardown(dev); init_error: free_dma_desc_resources(priv); @@ -2708,6 +2718,7 @@ dma_desc_error: static int stmmac_release(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); + u32 chan; if (priv->eee_enabled) del_timer_sync(&priv->eee_ctrl_timer); @@ -2722,7 +2733,8 @@ static int stmmac_release(struct net_device *dev) stmmac_disable_all_queues(priv); - del_timer_sync(&priv->txtimer); + for (chan = 0; chan < priv->plat->tx_queues_to_use; chan++) + del_timer_sync(&priv->tx_queue[chan].txtimer); /* Free the IRQ lines */ free_irq(dev->irq, dev); @@ -2936,14 +2948,13 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) priv->xstats.tx_tso_nfrags += nfrags; /* Manage tx mitigation */ - priv->tx_count_frames += nfrags + 1; - if (likely(priv->tx_coal_frames > priv->tx_count_frames)) { - mod_timer(&priv->txtimer, - STMMAC_COAL_TIMER(priv->tx_coal_timer)); - } else { - priv->tx_count_frames = 0; + tx_q->tx_count_frames += nfrags + 1; + if (priv->tx_coal_frames <= tx_q->tx_count_frames) { stmmac_set_tx_ic(priv, desc); priv->xstats.tx_set_ic_bit++; + tx_q->tx_count_frames = 0; + } else { + stmmac_tx_timer_arm(priv, queue); } skb_tx_timestamp(skb); @@ -3146,14 +3157,13 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) * This approach takes care about the fragments: desc is the first * element in case of no SG. */ - priv->tx_count_frames += nfrags + 1; - if (likely(priv->tx_coal_frames > priv->tx_count_frames)) { - mod_timer(&priv->txtimer, - STMMAC_COAL_TIMER(priv->tx_coal_timer)); - } else { - priv->tx_count_frames = 0; + tx_q->tx_count_frames += nfrags + 1; + if (priv->tx_coal_frames <= tx_q->tx_count_frames) { stmmac_set_tx_ic(priv, desc); priv->xstats.tx_set_ic_bit++; + tx_q->tx_count_frames = 0; + } else { + stmmac_tx_timer_arm(priv, queue); } skb_tx_timestamp(skb); @@ -3199,6 +3209,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len); stmmac_enable_dma_transmission(priv, priv->ioaddr); + stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue); return NETDEV_TX_OK; @@ -3319,6 +3330,7 @@ static inline void stmmac_rx_refill(struct stmmac_priv *priv, u32 queue) static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) { struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + struct stmmac_channel *ch = &priv->channel[queue]; unsigned int entry = rx_q->cur_rx; int coe = priv->hw->rx_csum; unsigned int next_entry; @@ -3491,7 +3503,7 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) else skb->ip_summed = CHECKSUM_UNNECESSARY; - napi_gro_receive(&rx_q->napi, skb); + napi_gro_receive(&ch->napi, skb); priv->dev->stats.rx_packets++; priv->dev->stats.rx_bytes += frame_len; @@ -3514,27 +3526,33 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit, u32 queue) * Description : * To look at the incoming frames and clear the tx resources. */ -static int stmmac_poll(struct napi_struct *napi, int budget) +static int stmmac_napi_poll(struct napi_struct *napi, int budget) { - struct stmmac_rx_queue *rx_q = - container_of(napi, struct stmmac_rx_queue, napi); - struct stmmac_priv *priv = rx_q->priv_data; - u32 tx_count = priv->plat->tx_queues_to_use; - u32 chan = rx_q->queue_index; - int work_done = 0; - u32 queue; + struct stmmac_channel *ch = + container_of(napi, struct stmmac_channel, napi); + struct stmmac_priv *priv = ch->priv_data; + int work_done = 0, work_rem = budget; + u32 chan = ch->index; priv->xstats.napi_poll++; - /* check all the queues */ - for (queue = 0; queue < tx_count; queue++) - stmmac_tx_clean(priv, queue); + if (ch->has_tx) { + int done = stmmac_tx_clean(priv, work_rem, chan); - work_done = stmmac_rx(priv, budget, rx_q->queue_index); - if (work_done < budget) { - napi_complete_done(napi, work_done); - stmmac_enable_dma_irq(priv, priv->ioaddr, chan); + work_done += done; + work_rem -= done; + } + + if (ch->has_rx) { + int done = stmmac_rx(priv, work_rem, chan); + + work_done += done; + work_rem -= done; } + + if (work_done < budget && napi_complete_done(napi, work_done)) + stmmac_enable_dma_irq(priv, priv->ioaddr, chan); + return work_done; } @@ -4198,8 +4216,8 @@ int stmmac_dvr_probe(struct device *device, { struct net_device *ndev = NULL; struct stmmac_priv *priv; + u32 queue, maxq; int ret = 0; - u32 queue; ndev = alloc_etherdev_mqs(sizeof(struct stmmac_priv), MTL_MAX_TX_QUEUES, @@ -4322,11 +4340,22 @@ int stmmac_dvr_probe(struct device *device, "Enable RX Mitigation via HW Watchdog Timer\n"); } - for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + /* Setup channels NAPI */ + maxq = max(priv->plat->rx_queues_to_use, priv->plat->tx_queues_to_use); - netif_napi_add(ndev, &rx_q->napi, stmmac_poll, - (8 * priv->plat->rx_queues_to_use)); + for (queue = 0; queue < maxq; queue++) { + struct stmmac_channel *ch = &priv->channel[queue]; + + ch->priv_data = priv; + ch->index = queue; + + if (queue < priv->plat->rx_queues_to_use) + ch->has_rx = true; + if (queue < priv->plat->tx_queues_to_use) + ch->has_tx = true; + + netif_napi_add(ndev, &ch->napi, stmmac_napi_poll, + NAPI_POLL_WEIGHT); } mutex_init(&priv->lock); @@ -4372,10 +4401,10 @@ error_netdev_register: priv->hw->pcs != STMMAC_PCS_RTBI) stmmac_mdio_unregister(ndev); error_mdio_register: - for (queue = 0; queue < priv->plat->rx_queues_to_use; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; + for (queue = 0; queue < maxq; queue++) { + struct stmmac_channel *ch = &priv->channel[queue]; - netif_napi_del(&rx_q->napi); + netif_napi_del(&ch->napi); } error_hw_init: destroy_workqueue(priv->wq); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index c43e9a01b892..7ddfc65586b0 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -30,6 +30,7 @@ #define MTL_MAX_RX_QUEUES 8 #define MTL_MAX_TX_QUEUES 8 +#define STMMAC_CH_MAX 8 #define STMMAC_RX_COE_NONE 0 #define STMMAC_RX_COE_TYPE1 1 -- cgit v1.2.3 From 0431100b3d82c509729ece1ab22ada2484e209c1 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 17 Sep 2018 09:22:57 +0100 Subject: net: stmmac: Fixup the tail addr setting in xmit path Currently we are always setting the tail address of descriptor list to the end of the pre-allocated list. According to databook this is not correct. Tail address should point to the last available descriptor + 1, which means we have to update the tail address everytime we call the xmit function. This should make no impact in older versions of MAC but in newer versions there are some DMA features which allows the IP to fetch descriptors in advance and in a non sequential order so its critical that we set the tail address correctly. Signed-off-by: Jose Abreu Fixes: f748be531d70 ("stmmac: support new GMAC4") Cc: David S. Miller Cc: Joao Pinto Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index ab9cc0143ff2..75896d6ba6e2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2213,8 +2213,7 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv) stmmac_init_tx_chan(priv, priv->ioaddr, priv->plat->dma_cfg, tx_q->dma_tx_phy, chan); - tx_q->tx_tail_addr = tx_q->dma_tx_phy + - (DMA_TX_SIZE * sizeof(struct dma_desc)); + tx_q->tx_tail_addr = tx_q->dma_tx_phy; stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, chan); } @@ -3003,6 +3002,7 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev) netdev_tx_sent_queue(netdev_get_tx_queue(dev, queue), skb->len); + tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * sizeof(*desc)); stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue); return NETDEV_TX_OK; @@ -3210,6 +3210,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) stmmac_enable_dma_transmission(priv, priv->ioaddr); + tx_q->tx_tail_addr = tx_q->dma_tx_phy + (tx_q->cur_tx * sizeof(*desc)); stmmac_set_tx_tail_ptr(priv, priv->ioaddr, tx_q->tx_tail_addr, queue); return NETDEV_TX_OK; -- cgit v1.2.3 From 0a286afee5a1e8dca86d824209dbd3200294f86f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 17 Sep 2018 15:30:06 +0200 Subject: selftests: pmtu: properly redirect stderr to /dev/null The cleanup function uses "$CMD 2 > /dev/null", which doesn't actually send stderr to /dev/null, so when the netns doesn't exist, the error message is shown. Use "2> /dev/null" instead, so that those messages disappear, as was intended. Fixes: d1f1b9cbf34c ("selftests: net: Introduce first PMTU test") Signed-off-by: Sabrina Dubroca Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 32a194e3e07a..0ab9423d009f 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -178,8 +178,8 @@ setup() { cleanup() { [ ${cleanup_done} -eq 1 ] && return - ip netns del ${NS_A} 2 > /dev/null - ip netns del ${NS_B} 2 > /dev/null + ip netns del ${NS_A} 2> /dev/null + ip netns del ${NS_B} 2> /dev/null cleanup_done=1 } -- cgit v1.2.3 From 674d9de02aa7d521ebdf66c3958758bdd9c64e11 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 17 Sep 2018 15:51:40 +0200 Subject: NFC: Fix possible memory corruption when handling SHDLC I-Frame commands When handling SHDLC I-Frame commands "pipe" field used for indexing into an array should be checked before usage. If left unchecked it might access memory outside of the array of size NFC_HCI_MAX_PIPES(127). Malformed NFC HCI frames could be injected by a malicious NFC device communicating with the device being attacked (remote attack vector), or even by an attacker with physical access to the I2C bus such that they could influence the data transfers on that bus (local attack vector). skb->data is controlled by the attacker and has only been sanitized in the most trivial ways (CRC check), therefore we can consider the create_info struct and all of its members to tainted. 'create_info->pipe' with max value of 255 (uint8) is used to take an offset of the hdev->pipes array of 127 elements which can lead to OOB write. Cc: Samuel Ortiz Cc: Allen Pais Cc: "David S. Miller" Suggested-by: Kevin Deus Signed-off-by: Suren Baghdasaryan Acked-by: Kees Cook Cc: stable Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- net/nfc/hci/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index ac8030c4bcf8..19cb2e473ea6 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -209,6 +209,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } create_info = (struct hci_create_pipe_resp *)skb->data; + if (create_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we @@ -232,6 +237,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } delete_info = (struct hci_delete_pipe_noti *)skb->data; + if (delete_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE; hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST; break; -- cgit v1.2.3 From e285d5bfb7e9785d289663baef252dd315e171f8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 17 Sep 2018 15:51:41 +0200 Subject: NFC: Fix the number of pipes According to ETSI TS 102 622 specification chapter 4.4 pipe identifier is 7 bits long which allows for 128 unique pipe IDs. Because NFC_HCI_MAX_PIPES is used as the number of pipes supported and not as the max pipe ID, its value should be 128 instead of 127. nfc_hci_recv_from_llc extracts pipe ID from packet header using NFC_HCI_FRAGMENT(0x7F) mask which allows for pipe ID value of 127. Same happens when NCI_HCP_MSG_GET_PIPE() is being used. With pipes array having only 127 elements and pipe ID of 127 the OOB memory access will result. Cc: Samuel Ortiz Cc: Allen Pais Cc: "David S. Miller" Suggested-by: Dan Carpenter Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: stable Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- include/net/nfc/hci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/nfc/hci.h b/include/net/nfc/hci.h index 316694dafa5b..008f466d1da7 100644 --- a/include/net/nfc/hci.h +++ b/include/net/nfc/hci.h @@ -87,7 +87,7 @@ struct nfc_hci_pipe { * According to specification 102 622 chapter 4.4 Pipes, * the pipe identifier is 7 bits long. */ -#define NFC_HCI_MAX_PIPES 127 +#define NFC_HCI_MAX_PIPES 128 struct nfc_hci_init_data { u8 gate_count; struct nfc_hci_gate gates[NFC_HCI_MAX_CUSTOM_GATES]; -- cgit v1.2.3 From 08e39982ef64f800fd1f9b9b92968d14d5fafa82 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Mon, 17 Sep 2018 17:22:40 +0200 Subject: net: emac: fix fixed-link setup for the RTL8363SB switch On the Netgear WNDAP620, the emac ethernet isn't receiving nor xmitting any frames from/to the RTL8363SB (identifies itself as a RTL8367RB). This is caused by the emac hardware not knowing the forced link parameters for speed, duplex, pause, etc. This begs the question, how this was working on the original driver code, when it was necessary to set the phy_address and phy_map to 0xffffffff. But I guess without access to the old PPC405/440/460 hardware, it's not possible to know. Signed-off-by: Christian Lamparter Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/emac/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c index 372664686309..129f4e9f38da 100644 --- a/drivers/net/ethernet/ibm/emac/core.c +++ b/drivers/net/ethernet/ibm/emac/core.c @@ -2677,12 +2677,17 @@ static int emac_init_phy(struct emac_instance *dev) if (of_phy_is_fixed_link(np)) { int res = emac_dt_mdio_probe(dev); - if (!res) { - res = of_phy_register_fixed_link(np); - if (res) - mdiobus_unregister(dev->mii_bus); + if (res) + return res; + + res = of_phy_register_fixed_link(np); + dev->phy_dev = of_phy_find_device(np); + if (res || !dev->phy_dev) { + mdiobus_unregister(dev->mii_bus); + return res ? res : -EINVAL; } - return res; + emac_adjust_link(dev->ndev); + put_device(&dev->phy_dev->mdio.dev); } return 0; } -- cgit v1.2.3 From 65fac4fe9080714df80d430888834ce87c6716ba Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Tue, 18 Sep 2018 15:15:44 +0800 Subject: net: bnxt: Fix a uninitialized variable warning. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following compile warning: drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c:49:5: warning: ‘nvm_param.dir_type’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (nvm_param.dir_type == BNXT_NVM_PORT_CFG) Signed-off-by: zhong jiang Acked-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index f3b9fbcc705b..790c684f08ab 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -46,6 +46,9 @@ static int bnxt_hwrm_nvm_req(struct bnxt *bp, u32 param_id, void *msg, } } + if (i == ARRAY_SIZE(nvm_params)) + return -EOPNOTSUPP; + if (nvm_param.dir_type == BNXT_NVM_PORT_CFG) idx = bp->pf.port_id; else if (nvm_param.dir_type == BNXT_NVM_FUNC_CFG) -- cgit v1.2.3 From 2fe397a3959de8a472f165e6d152f64cb77fa2cc Mon Sep 17 00:00:00 2001 From: Kazuya Mizuguchi Date: Tue, 18 Sep 2018 12:22:26 +0200 Subject: ravb: do not write 1 to reserved bits EtherAVB hardware requires 0 to be written to status register bits in order to clear them, however, care must be taken not to: 1. Clear other bits, by writing zero to them 2. Write one to reserved bits This patch corrects the ravb driver with respect to the second point above. This is done by defining reserved bit masks for the affected registers and, after auditing the code, ensure all sites that may write a one to a reserved bit use are suitably masked. Signed-off-by: Kazuya Mizuguchi Signed-off-by: Simon Horman Reviewed-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/ravb.h | 5 +++++ drivers/net/ethernet/renesas/ravb_main.c | 11 ++++++----- drivers/net/ethernet/renesas/ravb_ptp.c | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/renesas/ravb.h b/drivers/net/ethernet/renesas/ravb.h index 1470fc12282b..9b6bf557a2f5 100644 --- a/drivers/net/ethernet/renesas/ravb.h +++ b/drivers/net/ethernet/renesas/ravb.h @@ -428,6 +428,7 @@ enum EIS_BIT { EIS_CULF1 = 0x00000080, EIS_TFFF = 0x00000100, EIS_QFS = 0x00010000, + EIS_RESERVED = (GENMASK(31, 17) | GENMASK(15, 11)), }; /* RIC0 */ @@ -472,6 +473,7 @@ enum RIS0_BIT { RIS0_FRF15 = 0x00008000, RIS0_FRF16 = 0x00010000, RIS0_FRF17 = 0x00020000, + RIS0_RESERVED = GENMASK(31, 18), }; /* RIC1 */ @@ -528,6 +530,7 @@ enum RIS2_BIT { RIS2_QFF16 = 0x00010000, RIS2_QFF17 = 0x00020000, RIS2_RFFF = 0x80000000, + RIS2_RESERVED = GENMASK(30, 18), }; /* TIC */ @@ -544,6 +547,7 @@ enum TIS_BIT { TIS_FTF1 = 0x00000002, /* Undocumented? */ TIS_TFUF = 0x00000100, TIS_TFWF = 0x00000200, + TIS_RESERVED = (GENMASK(31, 20) | GENMASK(15, 12) | GENMASK(7, 4)) }; /* ISS */ @@ -617,6 +621,7 @@ enum GIC_BIT { enum GIS_BIT { GIS_PTCF = 0x00000001, /* Undocumented? */ GIS_PTMF = 0x00000004, + GIS_RESERVED = GENMASK(15, 10), }; /* GIE (R-Car Gen3 only) */ diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index aff5516b781e..d6f753925352 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -739,10 +739,11 @@ static void ravb_error_interrupt(struct net_device *ndev) u32 eis, ris2; eis = ravb_read(ndev, EIS); - ravb_write(ndev, ~EIS_QFS, EIS); + ravb_write(ndev, ~(EIS_QFS | EIS_RESERVED), EIS); if (eis & EIS_QFS) { ris2 = ravb_read(ndev, RIS2); - ravb_write(ndev, ~(RIS2_QFF0 | RIS2_RFFF), RIS2); + ravb_write(ndev, ~(RIS2_QFF0 | RIS2_RFFF | RIS2_RESERVED), + RIS2); /* Receive Descriptor Empty int */ if (ris2 & RIS2_QFF0) @@ -795,7 +796,7 @@ static bool ravb_timestamp_interrupt(struct net_device *ndev) u32 tis = ravb_read(ndev, TIS); if (tis & TIS_TFUF) { - ravb_write(ndev, ~TIS_TFUF, TIS); + ravb_write(ndev, ~(TIS_TFUF | TIS_RESERVED), TIS); ravb_get_tx_tstamp(ndev); return true; } @@ -930,7 +931,7 @@ static int ravb_poll(struct napi_struct *napi, int budget) /* Processing RX Descriptor Ring */ if (ris0 & mask) { /* Clear RX interrupt */ - ravb_write(ndev, ~mask, RIS0); + ravb_write(ndev, ~(mask | RIS0_RESERVED), RIS0); if (ravb_rx(ndev, "a, q)) goto out; } @@ -938,7 +939,7 @@ static int ravb_poll(struct napi_struct *napi, int budget) if (tis & mask) { spin_lock_irqsave(&priv->lock, flags); /* Clear TX interrupt */ - ravb_write(ndev, ~mask, TIS); + ravb_write(ndev, ~(mask | TIS_RESERVED), TIS); ravb_tx_free(ndev, q, true); netif_wake_subqueue(ndev, q); mmiowb(); diff --git a/drivers/net/ethernet/renesas/ravb_ptp.c b/drivers/net/ethernet/renesas/ravb_ptp.c index 0721b5c35d91..dce2a40a31e3 100644 --- a/drivers/net/ethernet/renesas/ravb_ptp.c +++ b/drivers/net/ethernet/renesas/ravb_ptp.c @@ -315,7 +315,7 @@ void ravb_ptp_interrupt(struct net_device *ndev) } } - ravb_write(ndev, ~gis, GIS); + ravb_write(ndev, ~(gis | GIS_RESERVED), GIS); } void ravb_ptp_init(struct net_device *ndev, struct platform_device *pdev) -- cgit v1.2.3 From 648a5a7aed346c3b8fe7c32a835edfb0dfbf4451 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 18 Sep 2018 15:46:34 +0200 Subject: net/smc: fix non-blocking connect problem In state SMC_INIT smc_poll() delegates polling to the internal CLC socket. This means, once the connect worker has finished its kernel_connect() step, the poll wake-up may occur. This is not intended. The wake-up should occur from the wake up call in smc_connect_work() after __smc_connect() has finished. Thus in state SMC_INIT this patch now calls sock_poll_wait() on the main SMC socket. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 2d8a1e15e4f9..9c3976bcde46 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -742,7 +742,10 @@ static void smc_connect_work(struct work_struct *work) smc->sk.sk_err = -rc; out: - smc->sk.sk_state_change(&smc->sk); + if (smc->sk.sk_err) + smc->sk.sk_state_change(&smc->sk); + else + smc->sk.sk_write_space(&smc->sk); kfree(smc->connect_info); smc->connect_info = NULL; release_sock(&smc->sk); @@ -1529,7 +1532,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, return EPOLLNVAL; smc = smc_sk(sock->sk); - if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { + if (smc->use_fallback) { /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); sk->sk_err = smc->clcsock->sk->sk_err; -- cgit v1.2.3 From 1ca52fcfaca43665d525645348801a6f4a4b9e9a Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 18 Sep 2018 15:46:35 +0200 Subject: net/smc: remove duplicate mutex_unlock For a failing smc_listen_rdma_finish() smc_listen_decline() is called. If fallback is possible, the new socket is already enqueued to be accepted in smc_listen_decline(). Avoid enqueuing a second time afterwards in this case, otherwise the smc_create_lgr_pending lock is released twice: [ 373.463976] WARNING: bad unlock balance detected! [ 373.463978] 4.18.0-rc7+ #123 Tainted: G O [ 373.463979] ------------------------------------- [ 373.463980] kworker/1:1/30 is trying to release lock (smc_create_lgr_pending) at: [ 373.463990] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc] [ 373.463991] but there are no more locks to release! [ 373.463991] other info that might help us debug this: [ 373.463993] 2 locks held by kworker/1:1/30: [ 373.463994] #0: 00000000772cbaed ((wq_completion)"events"){+.+.}, at: process_one_work+0x1ec/0x6b0 [ 373.464000] #1: 000000003ad0894a ((work_completion)(&new_smc->smc_listen_work)){+.+.}, at: process_one_work+0x1ec/0x6b0 [ 373.464003] stack backtrace: [ 373.464005] CPU: 1 PID: 30 Comm: kworker/1:1 Kdump: loaded Tainted: G O 4.18.0-rc7uschi+ #123 [ 373.464007] Hardware name: IBM 2827 H43 738 (LPAR) [ 373.464010] Workqueue: events smc_listen_work [smc] [ 373.464011] Call Trace: [ 373.464015] ([<0000000000114100>] show_stack+0x60/0xd8) [ 373.464019] [<0000000000a8c9bc>] dump_stack+0x9c/0xd8 [ 373.464021] [<00000000001dcaf8>] print_unlock_imbalance_bug+0xf8/0x108 [ 373.464022] [<00000000001e045c>] lock_release+0x114/0x4f8 [ 373.464025] [<0000000000aa87fa>] __mutex_unlock_slowpath+0x4a/0x300 [ 373.464027] [<000003ff801205fc>] smc_listen_work+0x22c/0x5d0 [smc] [ 373.464029] [<0000000000197a68>] process_one_work+0x2a8/0x6b0 [ 373.464030] [<0000000000197ec2>] worker_thread+0x52/0x410 [ 373.464033] [<000000000019fd0e>] kthread+0x15e/0x178 [ 373.464035] [<0000000000aaf58a>] kernel_thread_starter+0x6/0xc [ 373.464052] [<0000000000aaf584>] kernel_thread_starter+0x0/0xc [ 373.464054] INFO: lockdep is turned off. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9c3976bcde46..5c6d30eb4a71 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1153,9 +1153,9 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) } /* listen worker: finish RDMA setup */ -static void smc_listen_rdma_finish(struct smc_sock *new_smc, - struct smc_clc_msg_accept_confirm *cclc, - int local_contact) +static int smc_listen_rdma_finish(struct smc_sock *new_smc, + struct smc_clc_msg_accept_confirm *cclc, + int local_contact) { struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; int reason_code = 0; @@ -1178,11 +1178,12 @@ static void smc_listen_rdma_finish(struct smc_sock *new_smc, if (reason_code) goto decline; } - return; + return 0; decline: mutex_unlock(&smc_create_lgr_pending); smc_listen_decline(new_smc, reason_code, local_contact); + return reason_code; } /* setup for RDMA connection of server */ @@ -1279,8 +1280,10 @@ static void smc_listen_work(struct work_struct *work) } /* finish worker */ - if (!ism_supported) - smc_listen_rdma_finish(new_smc, &cclc, local_contact); + if (!ism_supported) { + if (smc_listen_rdma_finish(new_smc, &cclc, local_contact)) + return; + } smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); smc_listen_out_connected(new_smc); -- cgit v1.2.3 From dd65d87a6abd537ae9bb3bdcee8905704b936884 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 18 Sep 2018 15:46:36 +0200 Subject: net/smc: enable fallback for connection abort in state INIT If a linkgroup is terminated abnormally already due to failing LLC CONFIRM LINK or LLC ADD LINK, fallback to TCP is still possible. In this case do not switch to state SMC_PEERABORTWAIT and do not set sk_err. Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_close.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index ac961dfb1ea1..ea2b87f29469 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -100,15 +100,14 @@ static void smc_close_active_abort(struct smc_sock *smc) struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - sk->sk_err = ECONNABORTED; - if (smc->clcsock && smc->clcsock->sk) { - smc->clcsock->sk->sk_err = ECONNABORTED; - smc->clcsock->sk->sk_state_change(smc->clcsock->sk); + if (sk->sk_state != SMC_INIT && smc->clcsock && smc->clcsock->sk) { + sk->sk_err = ECONNABORTED; + if (smc->clcsock && smc->clcsock->sk) { + smc->clcsock->sk->sk_err = ECONNABORTED; + smc->clcsock->sk->sk_state_change(smc->clcsock->sk); + } } switch (sk->sk_state) { - case SMC_INIT: - sk->sk_state = SMC_PEERABORTWAIT; - break; case SMC_ACTIVE: sk->sk_state = SMC_PEERABORTWAIT; release_sock(sk); @@ -143,6 +142,7 @@ static void smc_close_active_abort(struct smc_sock *smc) case SMC_PEERFINCLOSEWAIT: sock_put(sk); /* passive closing */ break; + case SMC_INIT: case SMC_PEERABORTWAIT: case SMC_CLOSED: break; -- cgit v1.2.3 From 71d117f527425e2d6a5029e8365d82a8d2d6916a Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 18 Sep 2018 15:46:37 +0200 Subject: net/smc: no urgent data check for listen sockets Don't check a listen socket for pending urgent data in smc_poll(). Signed-off-by: Karsten Graul Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5c6d30eb4a71..015231789ed2 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1566,9 +1566,9 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; if (sk->sk_state == SMC_APPCLOSEWAIT1) mask |= EPOLLIN; + if (smc->conn.urg_state == SMC_URG_VALID) + mask |= EPOLLPRI; } - if (smc->conn.urg_state == SMC_URG_VALID) - mask |= EPOLLPRI; } return mask; -- cgit v1.2.3 From 381897798a94065ffcad0772eecdc6b04a7ff23d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 18 Sep 2018 15:46:38 +0200 Subject: net/smc: fix sizeof to int comparison Comparing an int to a size, which is unsigned, causes the int to become unsigned, giving the wrong result. kernel_sendmsg can return a negative error code. Signed-off-by: YueHaibing Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 83aba9ade060..52241d679cc9 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -446,14 +446,12 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type, vec[i++].iov_len = sizeof(trl); /* due to the few bytes needed for clc-handshake this cannot block */ len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen); - if (len < sizeof(pclc)) { - if (len >= 0) { - reason_code = -ENETUNREACH; - smc->sk.sk_err = -reason_code; - } else { - smc->sk.sk_err = smc->clcsock->sk->sk_err; - reason_code = -smc->sk.sk_err; - } + if (len < 0) { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } else if (len < (int)sizeof(pclc)) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; } return reason_code; -- cgit v1.2.3 From 774268f3e51b53ed432a1ec516574fd5ba469398 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Tue, 18 Sep 2018 16:58:47 +0200 Subject: net: mvpp2: fix a txq_done race condition When no Tx IRQ is available, the txq_done() routine (called from tx_done()) shouldn't be called from the polling function, as in such case it is already called in the Tx path thanks to an hrtimer. This mostly occurred when using PPv2.1, as the engine then do not have Tx IRQs. Fixes: edc660fa09e2 ("net: mvpp2: replace TX coalescing interrupts with hrtimer") Reported-by: Stefan Chulski Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 702fec82d806..38cc01beea79 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3055,10 +3055,12 @@ static int mvpp2_poll(struct napi_struct *napi, int budget) cause_rx_tx & ~MVPP2_CAUSE_MISC_SUM_MASK); } - cause_tx = cause_rx_tx & MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK; - if (cause_tx) { - cause_tx >>= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET; - mvpp2_tx_done(port, cause_tx, qv->sw_thread_id); + if (port->has_tx_irqs) { + cause_tx = cause_rx_tx & MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_MASK; + if (cause_tx) { + cause_tx >>= MVPP2_CAUSE_TXQ_OCCUP_DESC_ALL_OFFSET; + mvpp2_tx_done(port, cause_tx, qv->sw_thread_id); + } } /* Process RX packets */ -- cgit v1.2.3 From 126d6848ef13958e1cb959e96c21d19bc498ade9 Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 18 Sep 2018 16:48:53 +0100 Subject: sfp: fix oops with ethtool -m If a network interface is created prior to the SFP socket being available, ethtool can request module information. This unfortunately leads to an oops: Unable to handle kernel NULL pointer dereference at virtual address 00000008 pgd = (ptrval) [00000008] *pgd=7c400831, *pte=00000000, *ppte=00000000 Internal error: Oops: 17 [#1] SMP ARM Modules linked in: CPU: 0 PID: 1480 Comm: ethtool Not tainted 4.19.0-rc3 #138 Hardware name: Broadcom Northstar Plus SoC PC is at sfp_get_module_info+0x8/0x10 LR is at dev_ethtool+0x218c/0x2afc Fix this by not filling in the network device's SFP bus pointer until SFP is fully bound, thereby avoiding the core calling into the SFP bus code. Fixes: ce0aa27ff3f6 ("sfp: add sfp-bus to bridge between network devices and sfp cages") Reported-by: Florian Fainelli Tested-by: Florian Fainelli Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/sfp-bus.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 740655261e5b..83060fb349f4 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -349,6 +349,7 @@ static int sfp_register_bus(struct sfp_bus *bus) } if (bus->started) bus->socket_ops->start(bus->sfp); + bus->netdev->sfp_bus = bus; bus->registered = true; return 0; } @@ -357,6 +358,7 @@ static void sfp_unregister_bus(struct sfp_bus *bus) { const struct sfp_upstream_ops *ops = bus->upstream_ops; + bus->netdev->sfp_bus = NULL; if (bus->registered) { if (bus->started) bus->socket_ops->stop(bus->sfp); @@ -438,7 +440,6 @@ static void sfp_upstream_clear(struct sfp_bus *bus) { bus->upstream_ops = NULL; bus->upstream = NULL; - bus->netdev->sfp_bus = NULL; bus->netdev = NULL; } @@ -467,7 +468,6 @@ struct sfp_bus *sfp_register_upstream(struct fwnode_handle *fwnode, bus->upstream_ops = ops; bus->upstream = upstream; bus->netdev = ndev; - ndev->sfp_bus = bus; if (bus->sfp) { ret = sfp_register_bus(bus); -- cgit v1.2.3 From 8675860592321a43bbb0b5aedc244a9b2321edc3 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 18 Sep 2018 13:44:59 -0700 Subject: Revert "ipv6: fix double refcount of fib6_metrics" This reverts commit e70a3aad44cc8b24986687ffc98c4a4f6ecf25ea. This change causes use-after-free on dst->_metrics. The crash trace looks like this: [ 97.763269] BUG: KASAN: use-after-free in ip6_mtu+0x116/0x140 [ 97.769038] Read of size 4 at addr ffff881781d2cf84 by task svw_NetThreadEv/8801 [ 97.777954] CPU: 76 PID: 8801 Comm: svw_NetThreadEv Not tainted 4.15.0-smp-DEV #11 [ 97.777956] Hardware name: Default string Default string/Indus_QC_02, BIOS 5.46.4 03/29/2018 [ 97.777957] Call Trace: [ 97.777971] [] dump_stack+0x4d/0x72 [ 97.777985] [] print_address_description+0x6f/0x260 [ 97.777997] [] kasan_report+0x257/0x370 [ 97.778001] [] ? ip6_mtu+0x116/0x140 [ 97.778004] [] __asan_report_load4_noabort+0x19/0x20 [ 97.778008] [] ip6_mtu+0x116/0x140 [ 97.778013] [] tcp_current_mss+0x12e/0x280 [ 97.778016] [] ? tcp_mtu_to_mss+0x2d0/0x2d0 [ 97.778022] [] ? depot_save_stack+0x138/0x4a0 [ 97.778037] [] ? __mmdrop+0x145/0x1f0 [ 97.778040] [] ? save_stack+0xb1/0xd0 [ 97.778046] [] tcp_send_mss+0x22/0x220 [ 97.778059] [] tcp_sendmsg_locked+0x4f9/0x39f0 [ 97.778062] [] ? kasan_check_write+0x14/0x20 [ 97.778066] [] ? tcp_sendpage+0x60/0x60 [ 97.778070] [] ? rw_copy_check_uvector+0x69/0x280 [ 97.778075] [] ? import_iovec+0x9f/0x430 [ 97.778078] [] ? kasan_slab_free+0x87/0xc0 [ 97.778082] [] ? memzero_page+0x140/0x140 [ 97.778085] [] ? kasan_check_write+0x14/0x20 [ 97.778088] [] tcp_sendmsg+0x2c/0x50 [ 97.778092] [] ? tcp_sendmsg+0x2c/0x50 [ 97.778098] [] inet_sendmsg+0x103/0x480 [ 97.778102] [] ? inet_gso_segment+0x15b0/0x15b0 [ 97.778105] [] sock_sendmsg+0xba/0xf0 [ 97.778108] [] ___sys_sendmsg+0x6ca/0x8e0 [ 97.778113] [] ? hrtimer_try_to_cancel+0x71/0x3b0 [ 97.778116] [] ? copy_msghdr_from_user+0x3d0/0x3d0 [ 97.778119] [] ? memset+0x31/0x40 [ 97.778123] [] ? schedule_hrtimeout_range_clock+0x165/0x380 [ 97.778127] [] ? hrtimer_nanosleep_restart+0x250/0x250 [ 97.778130] [] ? __hrtimer_init+0x180/0x180 [ 97.778133] [] ? ktime_get_ts64+0x172/0x200 [ 97.778137] [] ? __fget_light+0x8c/0x2f0 [ 97.778141] [] __sys_sendmsg+0xe6/0x190 [ 97.778144] [] ? __sys_sendmsg+0xe6/0x190 [ 97.778147] [] ? SyS_shutdown+0x20/0x20 [ 97.778152] [] ? wake_up_q+0xe0/0xe0 [ 97.778155] [] ? __sys_sendmsg+0x190/0x190 [ 97.778158] [] SyS_sendmsg+0x13/0x20 [ 97.778162] [] do_syscall_64+0x2ac/0x430 [ 97.778166] [] ? do_page_fault+0x35/0x3d0 [ 97.778171] [] ? page_fault+0x2f/0x50 [ 97.778174] [] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 97.778177] RIP: 0033:0x7f83fa36000d [ 97.778178] RSP: 002b:00007f83ef9229e0 EFLAGS: 00000293 ORIG_RAX: 000000000000002e [ 97.778180] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f83fa36000d [ 97.778182] RDX: 0000000000004000 RSI: 00007f83ef922f00 RDI: 0000000000000036 [ 97.778183] RBP: 00007f83ef923040 R08: 00007f83ef9231f8 R09: 00007f83ef923168 [ 97.778184] R10: 0000000000000000 R11: 0000000000000293 R12: 00007f83f69c5b40 [ 97.778185] R13: 000000000000001c R14: 0000000000000001 R15: 0000000000004000 [ 97.779684] Allocated by task 5919: [ 97.783185] save_stack+0x46/0xd0 [ 97.783187] kasan_kmalloc+0xad/0xe0 [ 97.783189] kmem_cache_alloc_trace+0xdf/0x580 [ 97.783190] ip6_convert_metrics.isra.79+0x7e/0x190 [ 97.783192] ip6_route_info_create+0x60a/0x2480 [ 97.783193] ip6_route_add+0x1d/0x80 [ 97.783195] inet6_rtm_newroute+0xdd/0xf0 [ 97.783198] rtnetlink_rcv_msg+0x641/0xb10 [ 97.783200] netlink_rcv_skb+0x27b/0x3e0 [ 97.783202] rtnetlink_rcv+0x15/0x20 [ 97.783203] netlink_unicast+0x4be/0x720 [ 97.783204] netlink_sendmsg+0x7bc/0xbf0 [ 97.783205] sock_sendmsg+0xba/0xf0 [ 97.783207] ___sys_sendmsg+0x6ca/0x8e0 [ 97.783208] __sys_sendmsg+0xe6/0x190 [ 97.783209] SyS_sendmsg+0x13/0x20 [ 97.783211] do_syscall_64+0x2ac/0x430 [ 97.783213] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 [ 97.784709] Freed by task 0: [ 97.785056] knetbase: Error: /proc/sys/net/core/txcs_enable does not exist [ 97.794497] save_stack+0x46/0xd0 [ 97.794499] kasan_slab_free+0x71/0xc0 [ 97.794500] kfree+0x7c/0xf0 [ 97.794501] fib6_info_destroy_rcu+0x24f/0x310 [ 97.794504] rcu_process_callbacks+0x38b/0x1730 [ 97.794506] __do_softirq+0x1c8/0x5d0 Reported-by: John Sperbeck Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 480a79f47c52..b5d3e6b294ab 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -976,6 +976,10 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) rt->rt6i_flags &= ~RTF_EXPIRES; rcu_assign_pointer(rt->from, from); dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true); + if (from->fib6_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + refcount_inc(&from->fib6_metrics->refcnt); + } } /* Caller must already hold reference to @ort */ -- cgit v1.2.3 From ce7ea4af0838ffd4667ecad4eb5eec7a25342f1e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 18 Sep 2018 13:45:00 -0700 Subject: ipv6: fix memory leak on dst->_metrics When dst->_metrics and f6i->fib6_metrics share the same memory, both take reference count on the dst_metrics structure. However, when dst is destroyed, ip6_dst_destroy() only invokes dst_destroy_metrics_generic() which does not take care of READONLY metrics and does not release refcnt. This causes memory leak. Similar to ipv4 logic, the fix is to properly release refcnt and free the memory space pointed by dst->_metrics if refcnt becomes 0. Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Reported-by: Sabrina Dubroca Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b5d3e6b294ab..826b14de7dbb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -364,11 +364,14 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rt6_info *rt = (struct rt6_info *)dst; struct fib6_info *from; struct inet6_dev *idev; - dst_destroy_metrics_generic(dst); + if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt)) + kfree(p); + rt6_uncached_list_del(rt); idev = rt->rt6i_idev; -- cgit v1.2.3 From 080220b687147fd9376878534aba7194f17f6ef5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 18 Sep 2018 10:13:59 -0700 Subject: tools: bpf: fix license for a compat header file libc_compat.h is used by libbpf so make sure it's licensed under LGPL or BSD license. The license change should be OK, I'm the only author of the file. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/include/tools/libc_compat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/tools/libc_compat.h b/tools/include/tools/libc_compat.h index 664ced8cb1b0..e907ba6f15e5 100644 --- a/tools/include/tools/libc_compat.h +++ b/tools/include/tools/libc_compat.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: (LGPL-2.0+ OR BSD-2-Clause) /* Copyright (C) 2018 Netronome Systems, Inc. */ #ifndef __TOOLS_LIBC_COMPAT_H -- cgit v1.2.3 From 76c0ddd8c3a683f6e2c6e60e11dc1a1558caf4bc Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 19 Sep 2018 15:02:07 +0200 Subject: ip6_tunnel: be careful when accessing the inner header the ip6 tunnel xmit ndo assumes that the processed skb always contains an ip[v6] header, but syzbot has found a way to send frames that fall short of this assumption, leading to the following splat: BUG: KMSAN: uninit-value in ip6ip6_tnl_xmit net/ipv6/ip6_tunnel.c:1307 [inline] BUG: KMSAN: uninit-value in ip6_tnl_start_xmit+0x7d2/0x1ef0 net/ipv6/ip6_tunnel.c:1390 CPU: 0 PID: 4504 Comm: syz-executor558 Not tainted 4.16.0+ #87 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683 ip6ip6_tnl_xmit net/ipv6/ip6_tunnel.c:1307 [inline] ip6_tnl_start_xmit+0x7d2/0x1ef0 net/ipv6/ip6_tunnel.c:1390 __netdev_start_xmit include/linux/netdevice.h:4066 [inline] netdev_start_xmit include/linux/netdevice.h:4075 [inline] xmit_one net/core/dev.c:3026 [inline] dev_hard_start_xmit+0x5f1/0xc70 net/core/dev.c:3042 __dev_queue_xmit+0x27ee/0x3520 net/core/dev.c:3557 dev_queue_xmit+0x4b/0x60 net/core/dev.c:3590 packet_snd net/packet/af_packet.c:2944 [inline] packet_sendmsg+0x7c70/0x8a30 net/packet/af_packet.c:2969 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmmsg+0x42d/0x800 net/socket.c:2136 SYSC_sendmmsg+0xc4/0x110 net/socket.c:2167 SyS_sendmmsg+0x63/0x90 net/socket.c:2162 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x441819 RSP: 002b:00007ffe58ee8268 EFLAGS: 00000213 ORIG_RAX: 0000000000000133 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000441819 RDX: 0000000000000002 RSI: 0000000020000100 RDI: 0000000000000003 RBP: 00000000006cd018 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000213 R12: 0000000000402510 R13: 00000000004025a0 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314 kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321 slab_post_alloc_hook mm/slab.h:445 [inline] slab_alloc_node mm/slub.c:2737 [inline] __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369 __kmalloc_reserve net/core/skbuff.c:138 [inline] __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206 alloc_skb include/linux/skbuff.h:984 [inline] alloc_skb_with_frags+0x1d4/0xb20 net/core/skbuff.c:5234 sock_alloc_send_pskb+0xb56/0x1190 net/core/sock.c:2085 packet_alloc_skb net/packet/af_packet.c:2803 [inline] packet_snd net/packet/af_packet.c:2894 [inline] packet_sendmsg+0x6454/0x8a30 net/packet/af_packet.c:2969 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmmsg+0x42d/0x800 net/socket.c:2136 SYSC_sendmmsg+0xc4/0x110 net/socket.c:2167 SyS_sendmmsg+0x63/0x90 net/socket.c:2162 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 This change addresses the issue adding the needed check before accessing the inner header. The ipv4 side of the issue is apparently there since the ipv4 over ipv6 initial support, and the ipv6 side predates git history. Fixes: c4d3efafcc93 ("[IPV6] IP6TUNNEL: Add support to IPv4 over IPv6 tunnel.") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+3fde91d4d394747d6db4@syzkaller.appspotmail.com Tested-by: Alexander Potapenko Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 419960b0ba16..a0b6932c3afd 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1234,7 +1234,7 @@ static inline int ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - const struct iphdr *iph = ip_hdr(skb); + const struct iphdr *iph; int encap_limit = -1; struct flowi6 fl6; __u8 dsfield; @@ -1242,6 +1242,11 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + /* ensure we can access the full inner ip header */ + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return -1; + + iph = ip_hdr(skb); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); tproto = READ_ONCE(t->parms.proto); @@ -1306,7 +1311,7 @@ static inline int ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ipv6hdr *ipv6h; int encap_limit = -1; __u16 offset; struct flowi6 fl6; @@ -1315,6 +1320,10 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) u8 tproto; int err; + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) + return -1; + + ipv6h = ipv6_hdr(skb); tproto = READ_ONCE(t->parms.proto); if ((tproto != IPPROTO_IPV6 && tproto != 0) || ip6_tnl_addr_conflict(t, ipv6h)) -- cgit v1.2.3 From cf5cca6e4cc4dea604d8c83e7676f10459d6809c Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 19 Sep 2018 15:29:06 +0200 Subject: net: mvneta: fix the Rx desc buffer DMA unmapping With CONFIG_DMA_API_DEBUG enabled we now get a warning when using the mvneta driver: mvneta d0030000.ethernet: DMA-API: device driver frees DMA memory with wrong function [device address=0x000000001165b000] [size=4096 bytes] [mapped as page] [unmapped as single] This is because when using the s/w buffer management, the Rx descriptor buffer is mapped with dma_map_page but unmapped with dma_unmap_single. This patch fixes this by using the right unmapping function. Fixes: 562e2f467e71 ("net: mvneta: Improve the buffer allocation method for SWBM") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index bc80a678abc3..2db9708f2e24 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2008,8 +2008,8 @@ static int mvneta_rx_swbm(struct napi_struct *napi, skb_add_rx_frag(rxq->skb, frag_num, page, frag_offset, frag_size, PAGE_SIZE); - dma_unmap_single(dev->dev.parent, phys_addr, - PAGE_SIZE, DMA_FROM_DEVICE); + dma_unmap_page(dev->dev.parent, phys_addr, + PAGE_SIZE, DMA_FROM_DEVICE); rxq->left_size -= frag_size; } } else { -- cgit v1.2.3 From 50fdf60181b01b7383b85d4b9acbb842263d96a2 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 19 Sep 2018 21:59:10 -0700 Subject: qed: Fix populating the invalid stag value in multi function mode. In multi-function mode, driver receives the stag value (outer vlan) for a PF from management FW (MFW). If the stag value is negotiated prior to the driver load, then the stag is not notified to the driver and hence driver will have the invalid stag value. The fix is to request the MFW for STAG value during the driver load time. Fixes: cac6f691 ("qed: Add support for Unified Fabric Port") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Tomer Tayar Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dev.c | 15 ++++++++++++++- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 4 ++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 016ca8a7ec8a..97f073fd3725 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -1706,7 +1706,7 @@ static int qed_vf_start(struct qed_hwfn *p_hwfn, int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params) { struct qed_load_req_params load_req_params; - u32 load_code, param, drv_mb_param; + u32 load_code, resp, param, drv_mb_param; bool b_default_mtu = true; struct qed_hwfn *p_hwfn; int rc = 0, mfw_rc, i; @@ -1852,6 +1852,19 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params) if (IS_PF(cdev)) { p_hwfn = QED_LEADING_HWFN(cdev); + + /* Get pre-negotiated values for stag, bandwidth etc. */ + DP_VERBOSE(p_hwfn, + QED_MSG_SPQ, + "Sending GET_OEM_UPDATES command to trigger stag/bandwidth attention handling\n"); + drv_mb_param = 1 << DRV_MB_PARAM_DUMMY_OEM_UPDATES_OFFSET; + rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt, + DRV_MSG_CODE_GET_OEM_UPDATES, + drv_mb_param, &resp, ¶m); + if (rc) + DP_NOTICE(p_hwfn, + "Failed to send GET_OEM_UPDATES attention request\n"); + drv_mb_param = STORM_FW_VERSION; rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt, DRV_MSG_CODE_OV_UPDATE_STORM_FW_VER, diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 8faceb691657..9b3ef00e5782 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -12414,6 +12414,7 @@ struct public_drv_mb { #define DRV_MSG_SET_RESOURCE_VALUE_MSG 0x35000000 #define DRV_MSG_CODE_OV_UPDATE_WOL 0x38000000 #define DRV_MSG_CODE_OV_UPDATE_ESWITCH_MODE 0x39000000 +#define DRV_MSG_CODE_GET_OEM_UPDATES 0x41000000 #define DRV_MSG_CODE_BW_UPDATE_ACK 0x32000000 #define DRV_MSG_CODE_NIG_DRAIN 0x30000000 @@ -12541,6 +12542,9 @@ struct public_drv_mb { #define DRV_MB_PARAM_ESWITCH_MODE_VEB 0x1 #define DRV_MB_PARAM_ESWITCH_MODE_VEPA 0x2 +#define DRV_MB_PARAM_DUMMY_OEM_UPDATES_MASK 0x1 +#define DRV_MB_PARAM_DUMMY_OEM_UPDATES_OFFSET 0 + #define DRV_MB_PARAM_SET_LED_MODE_OPER 0x0 #define DRV_MB_PARAM_SET_LED_MODE_ON 0x1 #define DRV_MB_PARAM_SET_LED_MODE_OFF 0x2 -- cgit v1.2.3 From 0216da9413afa546627a1b0d319dfd17fef34050 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 19 Sep 2018 21:59:11 -0700 Subject: qed: Do not add VLAN 0 tag to untagged frames in multi-function mode. In certain multi-function switch dependent modes, firmware adds vlan tag 0 to the untagged frames. This leads to double tagging for the traffic if the dcbx is enabled, which is not the desired behavior. To avoid this, driver needs to set "dcb_dont_add_vlan0" flag. Fixes: cac6f691 ("qed: Add support for Unified Fabric Port") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Tomer Tayar Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 9 ++++++++- drivers/net/ethernet/qlogic/qed/qed_dcbx.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index 6bb76e6d3c14..d53f33c6b1fc 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -190,6 +190,7 @@ qed_dcbx_dp_protocol(struct qed_hwfn *p_hwfn, struct qed_dcbx_results *p_data) static void qed_dcbx_set_params(struct qed_dcbx_results *p_data, + struct qed_hwfn *p_hwfn, struct qed_hw_info *p_info, bool enable, u8 prio, @@ -206,6 +207,11 @@ qed_dcbx_set_params(struct qed_dcbx_results *p_data, else p_data->arr[type].update = DONT_UPDATE_DCB_DSCP; + /* Do not add vlan tag 0 when DCB is enabled and port in UFP/OV mode */ + if ((test_bit(QED_MF_8021Q_TAGGING, &p_hwfn->cdev->mf_bits) || + test_bit(QED_MF_8021AD_TAGGING, &p_hwfn->cdev->mf_bits))) + p_data->arr[type].dont_add_vlan0 = true; + /* QM reconf data */ if (p_info->personality == personality) qed_hw_info_set_offload_tc(p_info, tc); @@ -231,7 +237,7 @@ qed_dcbx_update_app_info(struct qed_dcbx_results *p_data, personality = qed_dcbx_app_update[i].personality; - qed_dcbx_set_params(p_data, p_info, enable, + qed_dcbx_set_params(p_data, p_hwfn, p_info, enable, prio, tc, type, personality); } } @@ -954,6 +960,7 @@ static void qed_dcbx_update_protocol_data(struct protocol_dcb_data *p_data, p_data->dcb_enable_flag = p_src->arr[type].enable; p_data->dcb_priority = p_src->arr[type].priority; p_data->dcb_tc = p_src->arr[type].tc; + p_data->dcb_dont_add_vlan0 = p_src->arr[type].dont_add_vlan0; } /* Set pf update ramrod command params */ diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h index a4d688c04e18..01f253ea4b22 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.h +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.h @@ -55,6 +55,7 @@ struct qed_dcbx_app_data { u8 update; /* Update indication */ u8 priority; /* Priority */ u8 tc; /* Traffic Class */ + bool dont_add_vlan0; /* Do not insert a vlan tag with id 0 */ }; #define QED_DCBX_VERSION_DISABLED 0 -- cgit v1.2.3 From 7e3e375ceede8fba5218362d14db7de25fa83f12 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 19 Sep 2018 21:59:12 -0700 Subject: qed: Add missing device config for RoCE EDPM in UFP mode. This patch adds support to configure the DORQ to use vlan-id/priority for roce EDPM. Fixes: cac6f691 ("qed: Add support for Unified Fabric Port") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Tomer Tayar Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 40 ++++++++++++++------------ drivers/net/ethernet/qlogic/qed/qed_mcp.c | 24 +++++++++++++--- drivers/net/ethernet/qlogic/qed/qed_reg_addr.h | 6 ++++ 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index d53f33c6b1fc..f5459de6d60a 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -190,11 +190,8 @@ qed_dcbx_dp_protocol(struct qed_hwfn *p_hwfn, struct qed_dcbx_results *p_data) static void qed_dcbx_set_params(struct qed_dcbx_results *p_data, - struct qed_hwfn *p_hwfn, - struct qed_hw_info *p_info, - bool enable, - u8 prio, - u8 tc, + struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + bool enable, u8 prio, u8 tc, enum dcbx_protocol_type type, enum qed_pci_personality personality) { @@ -213,18 +210,24 @@ qed_dcbx_set_params(struct qed_dcbx_results *p_data, p_data->arr[type].dont_add_vlan0 = true; /* QM reconf data */ - if (p_info->personality == personality) - qed_hw_info_set_offload_tc(p_info, tc); + if (p_hwfn->hw_info.personality == personality) + qed_hw_info_set_offload_tc(&p_hwfn->hw_info, tc); + + /* Configure dcbx vlan priority in doorbell block for roce EDPM */ + if (test_bit(QED_MF_UFP_SPECIFIC, &p_hwfn->cdev->mf_bits) && + type == DCBX_PROTOCOL_ROCE) { + qed_wr(p_hwfn, p_ptt, DORQ_REG_TAG1_OVRD_MODE, 1); + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_PCP_BB_K2, prio << 1); + } } /* Update app protocol data and hw_info fields with the TLV info */ static void qed_dcbx_update_app_info(struct qed_dcbx_results *p_data, - struct qed_hwfn *p_hwfn, - bool enable, - u8 prio, u8 tc, enum dcbx_protocol_type type) + struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, + bool enable, u8 prio, u8 tc, + enum dcbx_protocol_type type) { - struct qed_hw_info *p_info = &p_hwfn->hw_info; enum qed_pci_personality personality; enum dcbx_protocol_type id; int i; @@ -237,7 +240,7 @@ qed_dcbx_update_app_info(struct qed_dcbx_results *p_data, personality = qed_dcbx_app_update[i].personality; - qed_dcbx_set_params(p_data, p_hwfn, p_info, enable, + qed_dcbx_set_params(p_data, p_hwfn, p_ptt, enable, prio, tc, type, personality); } } @@ -271,7 +274,7 @@ qed_dcbx_get_app_protocol_type(struct qed_hwfn *p_hwfn, * reconfiguring QM. Get protocol specific data for PF update ramrod command. */ static int -qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, +qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, struct qed_dcbx_results *p_data, struct dcbx_app_priority_entry *p_tbl, u32 pri_tc_tbl, int count, u8 dcbx_version) @@ -315,7 +318,7 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, enable = true; } - qed_dcbx_update_app_info(p_data, p_hwfn, enable, + qed_dcbx_update_app_info(p_data, p_hwfn, p_ptt, enable, priority, tc, type); } } @@ -337,7 +340,7 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, continue; enable = (type == DCBX_PROTOCOL_ETH) ? false : !!dcbx_version; - qed_dcbx_update_app_info(p_data, p_hwfn, enable, + qed_dcbx_update_app_info(p_data, p_hwfn, p_ptt, enable, priority, tc, type); } @@ -347,7 +350,8 @@ qed_dcbx_process_tlv(struct qed_hwfn *p_hwfn, /* Parse app TLV's to update TC information in hw_info structure for * reconfiguring QM. Get protocol specific data for PF update ramrod command. */ -static int qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn) +static int +qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) { struct dcbx_app_priority_feature *p_app; struct dcbx_app_priority_entry *p_tbl; @@ -371,7 +375,7 @@ static int qed_dcbx_process_mib_info(struct qed_hwfn *p_hwfn) p_info = &p_hwfn->hw_info; num_entries = QED_MFW_GET_FIELD(p_app->flags, DCBX_APP_NUM_ENTRIES); - rc = qed_dcbx_process_tlv(p_hwfn, &data, p_tbl, pri_tc_tbl, + rc = qed_dcbx_process_tlv(p_hwfn, p_ptt, &data, p_tbl, pri_tc_tbl, num_entries, dcbx_version); if (rc) return rc; @@ -897,7 +901,7 @@ qed_dcbx_mib_update_event(struct qed_hwfn *p_hwfn, return rc; if (type == QED_DCBX_OPERATIONAL_MIB) { - rc = qed_dcbx_process_mib_info(p_hwfn); + rc = qed_dcbx_process_mib_info(p_hwfn, p_ptt); if (!rc) { /* reconfigure tcs of QM queues according * to negotiation results diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 5d37ec7e9b0b..58c7eb9d8e1b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -1581,13 +1581,29 @@ static void qed_mcp_update_stag(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) p_hwfn->mcp_info->func_info.ovlan = (u16)shmem_info.ovlan_stag & FUNC_MF_CFG_OV_STAG_MASK; p_hwfn->hw_info.ovlan = p_hwfn->mcp_info->func_info.ovlan; - if ((p_hwfn->hw_info.hw_mode & BIT(MODE_MF_SD)) && - (p_hwfn->hw_info.ovlan != QED_MCP_VLAN_UNSET)) { - qed_wr(p_hwfn, p_ptt, - NIG_REG_LLH_FUNC_TAG_VALUE, p_hwfn->hw_info.ovlan); + if (test_bit(QED_MF_OVLAN_CLSS, &p_hwfn->cdev->mf_bits)) { + if (p_hwfn->hw_info.ovlan != QED_MCP_VLAN_UNSET) { + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_FUNC_TAG_VALUE, + p_hwfn->hw_info.ovlan); + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_FUNC_TAG_EN, 1); + + /* Configure DB to add external vlan to EDPM packets */ + qed_wr(p_hwfn, p_ptt, DORQ_REG_TAG1_OVRD_MODE, 1); + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_EXT_VID_BB_K2, + p_hwfn->hw_info.ovlan); + } else { + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_FUNC_TAG_EN, 0); + qed_wr(p_hwfn, p_ptt, NIG_REG_LLH_FUNC_TAG_VALUE, 0); + qed_wr(p_hwfn, p_ptt, DORQ_REG_TAG1_OVRD_MODE, 0); + qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_EXT_VID_BB_K2, 0); + } + qed_sp_pf_update_stag(p_hwfn); } + DP_VERBOSE(p_hwfn, QED_MSG_SP, "ovlan = %d hw_mode = 0x%x\n", + p_hwfn->mcp_info->func_info.ovlan, p_hwfn->hw_info.hw_mode); + /* Acknowledge the MFW */ qed_mcp_cmd(p_hwfn, p_ptt, DRV_MSG_CODE_S_TAG_UPDATE_ACK, 0, &resp, ¶m); diff --git a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h index f736f70956fd..2440970882c4 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h +++ b/drivers/net/ethernet/qlogic/qed/qed_reg_addr.h @@ -216,6 +216,12 @@ 0x00c000UL #define DORQ_REG_IFEN \ 0x100040UL +#define DORQ_REG_TAG1_OVRD_MODE \ + 0x1008b4UL +#define DORQ_REG_PF_PCP_BB_K2 \ + 0x1008c4UL +#define DORQ_REG_PF_EXT_VID_BB_K2 \ + 0x1008c8UL #define DORQ_REG_DB_DROP_REASON \ 0x100a2cUL #define DORQ_REG_DB_DROP_DETAILS \ -- cgit v1.2.3 From f9d5b1d50840320fb6c66fafd1a532a995fc3758 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 20 Sep 2018 09:31:45 +0300 Subject: mlxsw: spectrum: Bump required firmware version MC-aware mode was introduced to mlxsw in commit 7b8195306694 ("mlxsw: spectrum: Configure MC-aware mode on mlxsw ports") and fixed up later in commit 3a3539cd3632 ("mlxsw: spectrum_buffers: Set up a dedicated pool for BUM traffic"). As the final piece of puzzle, a firmware issue whereby a wrong priority was assigned to BUM traffic was corrected in FW version 13.1703.4. Therefore require this FW version in the driver. Fixes: 7b8195306694 ("mlxsw: spectrum: Configure MC-aware mode on mlxsw ports") Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 930700413b1d..b492152c8881 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -44,8 +44,8 @@ #define MLXSW_SP_FWREV_MINOR_TO_BRANCH(minor) ((minor) / 100) #define MLXSW_SP1_FWREV_MAJOR 13 -#define MLXSW_SP1_FWREV_MINOR 1702 -#define MLXSW_SP1_FWREV_SUBMINOR 6 +#define MLXSW_SP1_FWREV_MINOR 1703 +#define MLXSW_SP1_FWREV_SUBMINOR 4 #define MLXSW_SP1_FWREV_CAN_RESET_MINOR 1702 static const struct mlxsw_fw_rev mlxsw_sp1_fw_rev = { -- cgit v1.2.3 From 56ce3c5a50f4d8cc95361b1ec7f152006c6320d8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 20 Sep 2018 09:27:30 +0200 Subject: smc: generic netlink family should be __ro_after_init The generic netlink family is only initialized during module init, so it should be __ro_after_init like all other generic netlink families. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/smc/smc_pnet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 01c6ce042a1c..7cb3e4f07c10 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -461,7 +461,7 @@ static const struct genl_ops smc_pnet_ops[] = { }; /* SMC_PNETID family definition */ -static struct genl_family smc_pnet_nl_family = { +static struct genl_family smc_pnet_nl_family __ro_after_init = { .hdrsize = 0, .name = SMCR_GENL_FAMILY_NAME, .version = SMCR_GENL_FAMILY_VERSION, -- cgit v1.2.3 From 8c6ec3613e7b0aade20a3196169c0bab32ed3e3f Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 19 Sep 2018 19:01:37 +0200 Subject: bnxt_en: don't try to offload VLAN 'modify' action bnxt offload code currently supports only 'push' and 'pop' operation: let .ndo_setup_tc() return -EOPNOTSUPP if VLAN 'modify' action is configured. Fixes: 2ae7408fedfe ("bnxt_en: bnxt: add TC flower filter offload support") Signed-off-by: Davide Caratti Acked-by: Sathya Perla Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index 092c817f8f11..e1594c9df4c6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -75,17 +75,23 @@ static int bnxt_tc_parse_redir(struct bnxt *bp, return 0; } -static void bnxt_tc_parse_vlan(struct bnxt *bp, - struct bnxt_tc_actions *actions, - const struct tc_action *tc_act) +static int bnxt_tc_parse_vlan(struct bnxt *bp, + struct bnxt_tc_actions *actions, + const struct tc_action *tc_act) { - if (tcf_vlan_action(tc_act) == TCA_VLAN_ACT_POP) { + switch (tcf_vlan_action(tc_act)) { + case TCA_VLAN_ACT_POP: actions->flags |= BNXT_TC_ACTION_FLAG_POP_VLAN; - } else if (tcf_vlan_action(tc_act) == TCA_VLAN_ACT_PUSH) { + break; + case TCA_VLAN_ACT_PUSH: actions->flags |= BNXT_TC_ACTION_FLAG_PUSH_VLAN; actions->push_vlan_tci = htons(tcf_vlan_push_vid(tc_act)); actions->push_vlan_tpid = tcf_vlan_push_proto(tc_act); + break; + default: + return -EOPNOTSUPP; } + return 0; } static int bnxt_tc_parse_tunnel_set(struct bnxt *bp, @@ -134,7 +140,9 @@ static int bnxt_tc_parse_actions(struct bnxt *bp, /* Push/pop VLAN */ if (is_tcf_vlan(tc_act)) { - bnxt_tc_parse_vlan(bp, actions, tc_act); + rc = bnxt_tc_parse_vlan(bp, actions, tc_act); + if (rc) + return rc; continue; } -- cgit v1.2.3 From d7ab5cdce54da631f0c8c11e506c974536a3581e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 20 Sep 2018 17:27:28 +0800 Subject: sctp: update dst pmtu with the correct daddr When processing pmtu update from an icmp packet, it calls .update_pmtu with sk instead of skb in sctp_transport_update_pmtu. However for sctp, the daddr in the transport might be different from inet_sock->inet_daddr or sk->sk_v6_daddr, which is used to update or create the route cache. The incorrect daddr will cause a different route cache created for the path. So before calling .update_pmtu, inet_sock->inet_daddr/sk->sk_v6_daddr should be updated with the daddr in the transport, and update it back after it's done. The issue has existed since route exceptions introduction. Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.") Reported-by: ian.periam@dialogic.com Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/transport.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 12cac85da994..033696e6f74f 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -260,6 +260,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { struct dst_entry *dst = sctp_transport_dst_check(t); + struct sock *sk = t->asoc->base.sk; bool change = true; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { @@ -271,12 +272,19 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) pmtu = SCTP_TRUNC4(pmtu); if (dst) { - dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu); + struct sctp_pf *pf = sctp_get_pf_specific(dst->ops->family); + union sctp_addr addr; + + pf->af->from_sk(&addr, sk); + pf->to_sk_daddr(&t->ipaddr, sk); + dst->ops->update_pmtu(dst, sk, NULL, pmtu); + pf->to_sk_daddr(&addr, sk); + dst = sctp_transport_dst_check(t); } if (!dst) { - t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk); + t->af_specific->get_dst(t, &t->saddr, &t->fl, sk); dst = t->dst; } -- cgit v1.2.3 From 10bc6a6042c900934a097988b5d50e3cf3f91781 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 20 Sep 2018 22:47:09 +0200 Subject: r8169: fix autoneg issue on resume with RTL8168E It was reported that chip version 33 (RTL8168E) ends up with 10MBit/Half on a 1GBit link after resuming from S3 (with different link partners). For whatever reason the PHY on this chip doesn't properly start a renegotiation when soft-reset. Explicitly requesting a renegotiation fixes this. Fixes: a2965f12fde6 ("r8169: remove rtl8169_set_speed_xmii") Signed-off-by: Heiner Kallweit Reported-by: Neil MacLeod Tested-by: Neil MacLeod Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index bb529ff2ca81..ab30aaeac6d3 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -4071,6 +4071,15 @@ static void rtl8169_init_phy(struct net_device *dev, struct rtl8169_private *tp) phy_speed_up(dev->phydev); genphy_soft_reset(dev->phydev); + + /* It was reported that chip version 33 ends up with 10MBit/Half on a + * 1GBit link after resuming from S3. For whatever reason the PHY on + * this chip doesn't properly start a renegotiation when soft-reset. + * Explicitly requesting a renegotiation fixes this. + */ + if (tp->mac_version == RTL_GIGA_MAC_VER_33 && + dev->phydev->autoneg == AUTONEG_ENABLE) + phy_restart_aneg(dev->phydev); } static void rtl_rar_set(struct rtl8169_private *tp, u8 *addr) -- cgit v1.2.3 From 652ef42c134da1bbb03bd4c9b4291dfaf8d7febb Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 20 Sep 2018 12:08:54 +0200 Subject: net: mscc: fix the frame extraction into the skb When extracting frames from the Ocelot switch, the frame check sequence (FCS) is present at the end of the data extracted. The FCS was put into the sk buffer which introduced some issues (as length related ones), as the FCS shouldn't be part of an Rx sk buffer. This patch fixes the Ocelot switch extraction behaviour by discarding the FCS. Fixes: a556c76adc05 ("net: mscc: Add initial Ocelot switch support") Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_board.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_board.c b/drivers/net/ethernet/mscc/ocelot_board.c index 26bb3b18f3be..3cdf63e35b53 100644 --- a/drivers/net/ethernet/mscc/ocelot_board.c +++ b/drivers/net/ethernet/mscc/ocelot_board.c @@ -91,7 +91,7 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) struct sk_buff *skb; struct net_device *dev; u32 *buf; - int sz, len; + int sz, len, buf_len; u32 ifh[4]; u32 val; struct frame_info info; @@ -116,14 +116,20 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg) err = -ENOMEM; break; } - buf = (u32 *)skb_put(skb, info.len); + buf_len = info.len - ETH_FCS_LEN; + buf = (u32 *)skb_put(skb, buf_len); len = 0; do { sz = ocelot_rx_frame_word(ocelot, grp, false, &val); *buf++ = val; len += sz; - } while ((sz == 4) && (len < info.len)); + } while (len < buf_len); + + /* Read the FCS and discard it */ + sz = ocelot_rx_frame_word(ocelot, grp, false, &val); + /* Update the statistics if part of the FCS was read before */ + len -= ETH_FCS_LEN - sz; if (sz < 0) { err = sz; -- cgit v1.2.3 From 5607fff303636d48b88414c6be353d9fed700af2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 18 Sep 2018 09:01:44 -0700 Subject: bpf: sockmap only allow ESTABLISHED sock state After this patch we only allow socks that are in ESTABLISHED state or are being added via a sock_ops event that is transitioning into an ESTABLISHED state. By allowing sock_ops events we allow users to manage sockmaps directly from sock ops programs. The two supported sock_ops ops are BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB and BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB. Similar to TLS ULP this ensures sk_user_data is correct. Reported-by: Eric Dumazet Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 488ef9663c01..1f97b559892a 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2097,8 +2097,12 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ if (skops.sk->sk_type != SOCK_STREAM || - skops.sk->sk_protocol != IPPROTO_TCP) { + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { fput(socket->file); return -EOPNOTSUPP; } @@ -2453,6 +2457,16 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. + */ + if (skops.sk->sk_type != SOCK_STREAM || + skops.sk->sk_protocol != IPPROTO_TCP || + skops.sk->sk_state != TCP_ESTABLISHED) { + fput(socket->file); + return -EOPNOTSUPP; + } + lock_sock(skops.sk); preempt_disable(); rcu_read_lock(); @@ -2543,10 +2557,22 @@ const struct bpf_map_ops sock_hash_ops = { .map_check_btf = map_check_no_btf, }; +static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops) +{ + return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB || + ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB; +} BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + /* ULPs are currently supported only for TCP sockets in ESTABLISHED + * state. This checks that the sock ops triggering the update is + * one indicating we are (or will be soon) in an ESTABLISHED state. + */ + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_map_ctx_update_elem(bpf_sock, map, key, flags); } @@ -2565,6 +2591,9 @@ BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock, struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!bpf_is_valid_sock_op(bpf_sock)) + return -EOPNOTSUPP; return sock_hash_ctx_update_elem(bpf_sock, map, key, flags); } -- cgit v1.2.3 From b05545e15e1ff1d6a6a8593971275f9cc3e6b92b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 18 Sep 2018 09:01:49 -0700 Subject: bpf: sockmap, fix transition through disconnect without close It is possible (via shutdown()) for TCP socks to go trough TCP_CLOSE state via tcp_disconnect() without actually calling tcp_close which would then call our bpf_tcp_close() callback. Because of this a user could disconnect a socket then put it in a LISTEN state which would break our assumptions about sockets always being ESTABLISHED state. To resolve this rely on the unhash hook, which is called in the disconnect case, to remove the sock from the sockmap. Reported-by: Eric Dumazet Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 60 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 1f97b559892a..0a0f2ec75370 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -132,6 +132,7 @@ struct smap_psock { struct work_struct gc_work; struct proto *sk_proto; + void (*save_unhash)(struct sock *sk); void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); @@ -143,6 +144,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); +static void bpf_tcp_unhash(struct sock *sk); static void bpf_tcp_close(struct sock *sk, long timeout); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -184,6 +186,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], struct proto *base) { prot[SOCKMAP_BASE] = *base; + prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; prot[SOCKMAP_BASE].close = bpf_tcp_close; prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; @@ -217,6 +220,7 @@ static int bpf_tcp_init(struct sock *sk) return -EBUSY; } + psock->save_unhash = sk->sk_prot->unhash; psock->save_close = sk->sk_prot->close; psock->sk_proto = sk->sk_prot; @@ -305,30 +309,12 @@ static struct smap_psock_map_entry *psock_map_pop(struct sock *sk, return e; } -static void bpf_tcp_close(struct sock *sk, long timeout) +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) { - void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; struct sock *osk; - lock_sock(sk); - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - release_sock(sk); - return sk->sk_prot->close(sk, timeout); - } - - /* The psock may be destroyed anytime after exiting the RCU critial - * section so by the time we use close_fun the psock may no longer - * be valid. However, bpf_tcp_close is called with the sock lock - * held so the close hook and sk are still valid. - */ - close_fun = psock->save_close; - if (psock->cork) { free_start_sg(psock->sock, psock->cork, true); kfree(psock->cork); @@ -379,6 +365,42 @@ static void bpf_tcp_close(struct sock *sk, long timeout) kfree(e); e = psock_map_pop(sk, psock); } +} + +static void bpf_tcp_unhash(struct sock *sk) +{ + void (*unhash_fun)(struct sock *sk); + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + if (sk->sk_prot->unhash) + sk->sk_prot->unhash(sk); + return; + } + unhash_fun = psock->save_unhash; + bpf_tcp_remove(sk, psock); + rcu_read_unlock(); + unhash_fun(sk); +} + +static void bpf_tcp_close(struct sock *sk, long timeout) +{ + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock *psock; + + lock_sock(sk); + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + release_sock(sk); + return sk->sk_prot->close(sk, timeout); + } + close_fun = psock->save_close; + bpf_tcp_remove(sk, psock); rcu_read_unlock(); release_sock(sk); close_fun(sk, timeout); -- cgit v1.2.3 From 5028027844cfc6168e39650abecd817ba64c9d98 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 18 Sep 2018 09:01:54 -0700 Subject: bpf: test_maps, only support ESTABLISHED socks Ensure that sockets added to a sock{map|hash} that is not in the ESTABLISHED state is rejected. Fixes: 1aa12bdf1bfb ("bpf: sockmap, add sock close() hook to remove socks") Signed-off-by: John Fastabend Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_maps.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 6f54f84144a0..9b552c0fc47d 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -580,7 +580,11 @@ static void test_sockmap(int tasks, void *data) /* Test update without programs */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); - if (err) { + if (i < 2 && !err) { + printf("Allowed update sockmap '%i:%i' not in ESTABLISHED\n", + i, sfd[i]); + goto out_sockmap; + } else if (i >= 2 && err) { printf("Failed noprog update sockmap '%i:%i'\n", i, sfd[i]); goto out_sockmap; @@ -741,7 +745,7 @@ static void test_sockmap(int tasks, void *data) } /* Test map update elem afterwards fd lives in fd and map_fd */ - for (i = 0; i < 6; i++) { + for (i = 2; i < 6; i++) { err = bpf_map_update_elem(map_fd_rx, &i, &sfd[i], BPF_ANY); if (err) { printf("Failed map_fd_rx update sockmap %i '%i:%i'\n", @@ -845,7 +849,7 @@ static void test_sockmap(int tasks, void *data) } /* Delete the elems without programs */ - for (i = 0; i < 6; i++) { + for (i = 2; i < 6; i++) { err = bpf_map_delete_elem(fd, &i); if (err) { printf("Failed delete sockmap %i '%i:%i'\n", -- cgit v1.2.3 From f88b4c01b97e09535505cf3c327fdbce55c27f00 Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Thu, 20 Sep 2018 14:29:45 -0600 Subject: netlabel: check for IPV4MASK in addrinfo_get netlbl_unlabel_addrinfo_get() assumes that if it finds the NLBL_UNLABEL_A_IPV4ADDR attribute, it must also have the NLBL_UNLABEL_A_IPV4MASK attribute as well. However, this is not necessarily the case as the current checks in netlbl_unlabel_staticadd() and friends are not sufficent to enforce this. If passed a netlink message with NLBL_UNLABEL_A_IPV4ADDR, NLBL_UNLABEL_A_IPV6ADDR, and NLBL_UNLABEL_A_IPV6MASK attributes, these functions will all call netlbl_unlabel_addrinfo_get() which will then attempt dereference NULL when fetching the non-existent NLBL_UNLABEL_A_IPV4MASK attribute: Unable to handle kernel NULL pointer dereference at virtual address 0 Process unlab (pid: 31762, stack limit = 0xffffff80502d8000) Call trace: netlbl_unlabel_addrinfo_get+0x44/0xd8 netlbl_unlabel_staticremovedef+0x98/0xe0 genl_rcv_msg+0x354/0x388 netlink_rcv_skb+0xac/0x118 genl_rcv+0x34/0x48 netlink_unicast+0x158/0x1f0 netlink_sendmsg+0x32c/0x338 sock_sendmsg+0x44/0x60 ___sys_sendmsg+0x1d0/0x2a8 __sys_sendmsg+0x64/0xb4 SyS_sendmsg+0x34/0x4c el0_svc_naked+0x34/0x38 Code: 51001149 7100113f 540000a0 f9401508 (79400108) ---[ end trace f6438a488e737143 ]--- Kernel panic - not syncing: Fatal exception Signed-off-by: Sean Tranchetti Signed-off-by: David S. Miller --- net/netlabel/netlabel_unlabeled.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index c070dfc0190a..c92894c3e40a 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -781,7 +781,8 @@ static int netlbl_unlabel_addrinfo_get(struct genl_info *info, { u32 addr_len; - if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR]) { + if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] && + info->attrs[NLBL_UNLABEL_A_IPV4MASK]) { addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]); if (addr_len != sizeof(struct in_addr) && addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK])) -- cgit v1.2.3 From 86f9bd1ff61c413a2a251fa736463295e4e24733 Mon Sep 17 00:00:00 2001 From: Jeff Barnhill <0xeffeff@gmail.com> Date: Fri, 21 Sep 2018 00:45:27 +0000 Subject: net/ipv6: Display all addresses in output of /proc/net/if_inet6 The backend handling for /proc/net/if_inet6 in addrconf.c doesn't properly handle starting/stopping the iteration. The problem is that at some point during the iteration, an overflow is detected and the process is subsequently stopped. The item being shown via seq_printf() when the overflow occurs is not actually shown, though. When start() is subsequently called to resume iterating, it returns the next item, and thus the item that was being processed when the overflow occurred never gets printed. Alter the meaning of the private data member "offset". Currently, when it is not 0 (which only happens at the very beginning), "offset" represents the next hlist item to be printed. After this change, "offset" always represents the current item. This is also consistent with the private data member "bucket", which represents the current bucket, and also the use of "pos" as defined in seq_file.txt: The pos passed to start() will always be either zero, or the most recent pos used in the previous session. Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com> Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d51a8c0b3372..c63ccce6425f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4201,7 +4201,6 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) p++; continue; } - state->offset++; return ifa; } @@ -4225,13 +4224,12 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, return ifa; } + state->offset = 0; while (++state->bucket < IN6_ADDR_HSIZE) { - state->offset = 0; hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket], addr_lst) { if (!net_eq(dev_net(ifa->idev->dev), net)) continue; - state->offset++; return ifa; } } -- cgit v1.2.3 From 54be5b8ce33f8d1a05b258070c81ed98f935883d Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 21 Sep 2018 02:53:17 +0000 Subject: PCI: hv: Fix return value check in hv_pci_assign_slots() In case of error, the function pci_create_slot() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). Fixes: a15f2c08c708 ("PCI: hv: support reporting serial number as slot information") Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- drivers/pci/controller/pci-hyperv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index ee80e79db21a..9ba4d12c179c 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1484,8 +1484,10 @@ static void hv_pci_assign_slots(struct hv_pcibus_device *hbus) snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser); hpdev->pci_slot = pci_create_slot(hbus->pci_bus, slot_nr, name, NULL); - if (!hpdev->pci_slot) + if (IS_ERR(hpdev->pci_slot)) { pr_warn("pci_create slot %s failed\n", name); + hpdev->pci_slot = NULL; + } } } -- cgit v1.2.3 From 72b462798c2a258725dd741d1c6c139446e20d12 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 10:53:47 +0800 Subject: net: seeq: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/seeq/ether3.c | 5 +++-- drivers/net/ethernet/seeq/sgiseeq.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/seeq/ether3.c b/drivers/net/ethernet/seeq/ether3.c index c5bc124b41a9..d1bb73bf9914 100644 --- a/drivers/net/ethernet/seeq/ether3.c +++ b/drivers/net/ethernet/seeq/ether3.c @@ -77,7 +77,8 @@ static void ether3_setmulticastlist(struct net_device *dev); static int ether3_rx(struct net_device *dev, unsigned int maxcnt); static void ether3_tx(struct net_device *dev); static int ether3_open (struct net_device *dev); -static int ether3_sendpacket (struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t ether3_sendpacket(struct sk_buff *skb, + struct net_device *dev); static irqreturn_t ether3_interrupt (int irq, void *dev_id); static int ether3_close (struct net_device *dev); static void ether3_setmulticastlist (struct net_device *dev); @@ -481,7 +482,7 @@ static void ether3_timeout(struct net_device *dev) /* * Transmit a packet */ -static int +static netdev_tx_t ether3_sendpacket(struct sk_buff *skb, struct net_device *dev) { unsigned long flags; diff --git a/drivers/net/ethernet/seeq/sgiseeq.c b/drivers/net/ethernet/seeq/sgiseeq.c index 573691bc3b71..70cce63a6081 100644 --- a/drivers/net/ethernet/seeq/sgiseeq.c +++ b/drivers/net/ethernet/seeq/sgiseeq.c @@ -578,7 +578,8 @@ static inline int sgiseeq_reset(struct net_device *dev) return 0; } -static int sgiseeq_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t +sgiseeq_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct sgiseeq_private *sp = netdev_priv(dev); struct hpc3_ethregs *hregs = sp->hregs; -- cgit v1.2.3 From f3bf939f3d4552becb115d76e17b3e5957bab4a2 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 11:02:37 +0800 Subject: net: cirrus: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/cirrus/ep93xx_eth.c | 2 +- drivers/net/ethernet/cirrus/mac89x0.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cirrus/ep93xx_eth.c b/drivers/net/ethernet/cirrus/ep93xx_eth.c index e2a702996db4..13dfdfca49fc 100644 --- a/drivers/net/ethernet/cirrus/ep93xx_eth.c +++ b/drivers/net/ethernet/cirrus/ep93xx_eth.c @@ -332,7 +332,7 @@ static int ep93xx_poll(struct napi_struct *napi, int budget) return rx; } -static int ep93xx_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ep93xx_xmit(struct sk_buff *skb, struct net_device *dev) { struct ep93xx_priv *ep = netdev_priv(dev); struct ep93xx_tdesc *txd; diff --git a/drivers/net/ethernet/cirrus/mac89x0.c b/drivers/net/ethernet/cirrus/mac89x0.c index 3f8fe8fd79cc..6324e80960c3 100644 --- a/drivers/net/ethernet/cirrus/mac89x0.c +++ b/drivers/net/ethernet/cirrus/mac89x0.c @@ -113,7 +113,7 @@ struct net_local { /* Index to functions, as function prototypes. */ static int net_open(struct net_device *dev); -static int net_send_packet(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t net_send_packet(struct sk_buff *skb, struct net_device *dev); static irqreturn_t net_interrupt(int irq, void *dev_id); static void set_multicast_list(struct net_device *dev); static void net_rx(struct net_device *dev); @@ -324,7 +324,7 @@ net_open(struct net_device *dev) return 0; } -static int +static netdev_tx_t net_send_packet(struct sk_buff *skb, struct net_device *dev) { struct net_local *lp = netdev_priv(dev); -- cgit v1.2.3 From 28d304efb88f18aadfe9219f0e4b2b0ef9558a3b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 11:05:50 +0800 Subject: net: sgi: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/sgi/ioc3-eth.c | 4 ++-- drivers/net/ethernet/sgi/meth.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/sgi/ioc3-eth.c b/drivers/net/ethernet/sgi/ioc3-eth.c index 18d533fdf14c..3140999642ba 100644 --- a/drivers/net/ethernet/sgi/ioc3-eth.c +++ b/drivers/net/ethernet/sgi/ioc3-eth.c @@ -99,7 +99,7 @@ struct ioc3_private { static int ioc3_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); static void ioc3_set_multicast_list(struct net_device *dev); -static int ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev); static void ioc3_timeout(struct net_device *dev); static inline unsigned int ioc3_hash(const unsigned char *addr); static inline void ioc3_stop(struct ioc3_private *ip); @@ -1390,7 +1390,7 @@ static struct pci_driver ioc3_driver = { .remove = ioc3_remove_one, }; -static int ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t ioc3_start_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned long data; struct ioc3_private *ip = netdev_priv(dev); diff --git a/drivers/net/ethernet/sgi/meth.c b/drivers/net/ethernet/sgi/meth.c index ea55abd62ec7..703fbbefea44 100644 --- a/drivers/net/ethernet/sgi/meth.c +++ b/drivers/net/ethernet/sgi/meth.c @@ -697,7 +697,7 @@ static void meth_add_to_tx_ring(struct meth_private *priv, struct sk_buff *skb) /* * Transmit a packet (called by the kernel) */ -static int meth_tx(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t meth_tx(struct sk_buff *skb, struct net_device *dev) { struct meth_private *priv = netdev_priv(dev); unsigned long flags; -- cgit v1.2.3 From f0f25516e3b9efff2a34059347d13361d26e314a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 11:35:11 +0800 Subject: net: wiznet: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/wiznet/w5100.c | 2 +- drivers/net/ethernet/wiznet/w5300.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/wiznet/w5100.c b/drivers/net/ethernet/wiznet/w5100.c index 2bdfb39215e9..d8ba512f166a 100644 --- a/drivers/net/ethernet/wiznet/w5100.c +++ b/drivers/net/ethernet/wiznet/w5100.c @@ -835,7 +835,7 @@ static void w5100_tx_work(struct work_struct *work) w5100_tx_skb(priv->ndev, skb); } -static int w5100_start_tx(struct sk_buff *skb, struct net_device *ndev) +static netdev_tx_t w5100_start_tx(struct sk_buff *skb, struct net_device *ndev) { struct w5100_priv *priv = netdev_priv(ndev); diff --git a/drivers/net/ethernet/wiznet/w5300.c b/drivers/net/ethernet/wiznet/w5300.c index 56ae573001e8..80fdbff67d82 100644 --- a/drivers/net/ethernet/wiznet/w5300.c +++ b/drivers/net/ethernet/wiznet/w5300.c @@ -365,7 +365,7 @@ static void w5300_tx_timeout(struct net_device *ndev) netif_wake_queue(ndev); } -static int w5300_start_tx(struct sk_buff *skb, struct net_device *ndev) +static netdev_tx_t w5300_start_tx(struct sk_buff *skb, struct net_device *ndev) { struct w5300_priv *priv = netdev_priv(ndev); -- cgit v1.2.3 From 648c361a568dcd098a3496c6c66bca4c8473e52b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 11:44:05 +0800 Subject: net: i825xx: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/i825xx/ether1.c | 5 +++-- drivers/net/ethernet/i825xx/lib82596.c | 4 ++-- drivers/net/ethernet/i825xx/sun3_82586.c | 6 ++++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/i825xx/ether1.c b/drivers/net/ethernet/i825xx/ether1.c index dc983450354b..35f6291a3672 100644 --- a/drivers/net/ethernet/i825xx/ether1.c +++ b/drivers/net/ethernet/i825xx/ether1.c @@ -64,7 +64,8 @@ static unsigned int net_debug = NET_DEBUG; #define RX_AREA_END 0x0fc00 static int ether1_open(struct net_device *dev); -static int ether1_sendpacket(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t ether1_sendpacket(struct sk_buff *skb, + struct net_device *dev); static irqreturn_t ether1_interrupt(int irq, void *dev_id); static int ether1_close(struct net_device *dev); static void ether1_setmulticastlist(struct net_device *dev); @@ -667,7 +668,7 @@ ether1_timeout(struct net_device *dev) netif_wake_queue(dev); } -static int +static netdev_tx_t ether1_sendpacket (struct sk_buff *skb, struct net_device *dev) { int tmp, tst, nopaddr, txaddr, tbdaddr, dataddr; diff --git a/drivers/net/ethernet/i825xx/lib82596.c b/drivers/net/ethernet/i825xx/lib82596.c index f00a1dc2128c..2f7ae118217f 100644 --- a/drivers/net/ethernet/i825xx/lib82596.c +++ b/drivers/net/ethernet/i825xx/lib82596.c @@ -347,7 +347,7 @@ static const char init_setup[] = 0x7f /* *multi IA */ }; static int i596_open(struct net_device *dev); -static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t i596_start_xmit(struct sk_buff *skb, struct net_device *dev); static irqreturn_t i596_interrupt(int irq, void *dev_id); static int i596_close(struct net_device *dev); static void i596_add_cmd(struct net_device *dev, struct i596_cmd *cmd); @@ -966,7 +966,7 @@ static void i596_tx_timeout (struct net_device *dev) } -static int i596_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t i596_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct i596_private *lp = netdev_priv(dev); struct tx_cmd *tx_cmd; diff --git a/drivers/net/ethernet/i825xx/sun3_82586.c b/drivers/net/ethernet/i825xx/sun3_82586.c index 8bb15a8c2a40..1a86184d44c0 100644 --- a/drivers/net/ethernet/i825xx/sun3_82586.c +++ b/drivers/net/ethernet/i825xx/sun3_82586.c @@ -121,7 +121,8 @@ static int sun3_82586_probe1(struct net_device *dev,int ioaddr); static irqreturn_t sun3_82586_interrupt(int irq,void *dev_id); static int sun3_82586_open(struct net_device *dev); static int sun3_82586_close(struct net_device *dev); -static int sun3_82586_send_packet(struct sk_buff *,struct net_device *); +static netdev_tx_t sun3_82586_send_packet(struct sk_buff *, + struct net_device *); static struct net_device_stats *sun3_82586_get_stats(struct net_device *dev); static void set_multicast_list(struct net_device *dev); static void sun3_82586_timeout(struct net_device *dev); @@ -1002,7 +1003,8 @@ static void sun3_82586_timeout(struct net_device *dev) * send frame */ -static int sun3_82586_send_packet(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t +sun3_82586_send_packet(struct sk_buff *skb, struct net_device *dev) { int len,i; #ifndef NO_NOPCOMMANDS -- cgit v1.2.3 From e6ce3822a9f264b3f24c1acdc624131fb014f2f0 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 21 Sep 2018 11:46:37 +0800 Subject: net: apple: fix return type of ndo_start_xmit function The method ndo_start_xmit() is defined as returning an 'netdev_tx_t', which is a typedef for an enum type, so make sure the implementation in this driver has returns 'netdev_tx_t' value, and change the function return type to netdev_tx_t. Found by coccinelle. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/apple/bmac.c | 4 ++-- drivers/net/ethernet/apple/mace.c | 4 ++-- drivers/net/ethernet/apple/macmace.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/apple/bmac.c b/drivers/net/ethernet/apple/bmac.c index 024998d6d8c6..6a8e2567f2bd 100644 --- a/drivers/net/ethernet/apple/bmac.c +++ b/drivers/net/ethernet/apple/bmac.c @@ -154,7 +154,7 @@ static irqreturn_t bmac_txdma_intr(int irq, void *dev_id); static irqreturn_t bmac_rxdma_intr(int irq, void *dev_id); static void bmac_set_timeout(struct net_device *dev); static void bmac_tx_timeout(struct timer_list *t); -static int bmac_output(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t bmac_output(struct sk_buff *skb, struct net_device *dev); static void bmac_start(struct net_device *dev); #define DBDMA_SET(x) ( ((x) | (x) << 16) ) @@ -1456,7 +1456,7 @@ bmac_start(struct net_device *dev) spin_unlock_irqrestore(&bp->lock, flags); } -static int +static netdev_tx_t bmac_output(struct sk_buff *skb, struct net_device *dev) { struct bmac_data *bp = netdev_priv(dev); diff --git a/drivers/net/ethernet/apple/mace.c b/drivers/net/ethernet/apple/mace.c index 0b5429d76bcf..68b9ee489489 100644 --- a/drivers/net/ethernet/apple/mace.c +++ b/drivers/net/ethernet/apple/mace.c @@ -78,7 +78,7 @@ struct mace_data { static int mace_open(struct net_device *dev); static int mace_close(struct net_device *dev); -static int mace_xmit_start(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t mace_xmit_start(struct sk_buff *skb, struct net_device *dev); static void mace_set_multicast(struct net_device *dev); static void mace_reset(struct net_device *dev); static int mace_set_address(struct net_device *dev, void *addr); @@ -525,7 +525,7 @@ static inline void mace_set_timeout(struct net_device *dev) mp->timeout_active = 1; } -static int mace_xmit_start(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t mace_xmit_start(struct sk_buff *skb, struct net_device *dev) { struct mace_data *mp = netdev_priv(dev); volatile struct dbdma_regs __iomem *td = mp->tx_dma; diff --git a/drivers/net/ethernet/apple/macmace.c b/drivers/net/ethernet/apple/macmace.c index 137cbb470af2..376f2c2613e7 100644 --- a/drivers/net/ethernet/apple/macmace.c +++ b/drivers/net/ethernet/apple/macmace.c @@ -89,7 +89,7 @@ struct mace_frame { static int mace_open(struct net_device *dev); static int mace_close(struct net_device *dev); -static int mace_xmit_start(struct sk_buff *skb, struct net_device *dev); +static netdev_tx_t mace_xmit_start(struct sk_buff *skb, struct net_device *dev); static void mace_set_multicast(struct net_device *dev); static int mace_set_address(struct net_device *dev, void *addr); static void mace_reset(struct net_device *dev); @@ -444,7 +444,7 @@ static int mace_close(struct net_device *dev) * Transmit a frame */ -static int mace_xmit_start(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t mace_xmit_start(struct sk_buff *skb, struct net_device *dev) { struct mace_data *mp = netdev_priv(dev); unsigned long flags; -- cgit v1.2.3 From 83fe9a966111b51a34f10c35e568e45bff34de48 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 21 Sep 2018 11:07:55 +0300 Subject: devlink: double free in devlink_resource_fill() Smatch reports that devlink_dpipe_send_and_alloc_skb() frees the skb on error so this is a double free. We fixed a bunch of these bugs in commit 7fe4d6dcbcb4 ("devlink: Remove redundant free on error path") but we accidentally overlooked this one. Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/core/devlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index 65fc366a78a4..8c0ed225e280 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2592,7 +2592,7 @@ send_done: if (!nlh) { err = devlink_dpipe_send_and_alloc_skb(&skb, info); if (err) - goto err_skb_send_alloc; + return err; goto send_done; } return genlmsg_reply(skb, info); @@ -2600,7 +2600,6 @@ send_done: nla_put_failure: err = -EMSGSIZE; err_resource_put: -err_skb_send_alloc: nlmsg_free(skb); return err; } -- cgit v1.2.3 From 8ac1ee6f4d62e781e3b3fd8b9c42b70371427669 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 21 Sep 2018 02:44:12 -0700 Subject: net/mlx4: Use cpumask_available for eq->affinity_mask Clang warns that the address of a pointer will always evaluated as true in a boolean context: drivers/net/ethernet/mellanox/mlx4/eq.c:243:11: warning: address of array 'eq->affinity_mask' will always evaluate to 'true' [-Wpointer-bool-conversion] if (!eq->affinity_mask || cpumask_empty(eq->affinity_mask)) ~~~~~^~~~~~~~~~~~~ 1 warning generated. Use cpumask_available, introduced in commit f7e30f01a9e2 ("cpumask: Add helper cpumask_available()"), which does the proper checking and avoids this warning. Link: https://github.com/ClangBuiltLinux/linux/issues/86 Signed-off-by: Nathan Chancellor Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/eq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 1f3372c1802e..2df92dbd38e1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -240,7 +240,8 @@ static void mlx4_set_eq_affinity_hint(struct mlx4_priv *priv, int vec) struct mlx4_dev *dev = &priv->dev; struct mlx4_eq *eq = &priv->eq_table.eq[vec]; - if (!eq->affinity_mask || cpumask_empty(eq->affinity_mask)) + if (!cpumask_available(eq->affinity_mask) || + cpumask_empty(eq->affinity_mask)) return; hint_err = irq_set_affinity_hint(eq->irq, eq->affinity_mask); -- cgit v1.2.3 From 8360ed6745df6de2d8ddff8e2116789258fe5890 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 21 Sep 2018 11:04:51 -0700 Subject: RDS: IB: Use DEFINE_PER_CPU_SHARED_ALIGNED for rds_ib_stats Clang warns when two declarations' section attributes don't match. net/rds/ib_stats.c:40:1: warning: section does not match previous declaration [-Wsection] DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); ^ ./include/linux/percpu-defs.h:142:2: note: expanded from macro 'DEFINE_PER_CPU_SHARED_ALIGNED' DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ ^ ./include/linux/percpu-defs.h:93:9: note: expanded from macro 'DEFINE_PER_CPU_SECTION' extern __PCPU_ATTRS(sec) __typeof__(type) name; \ ^ ./include/linux/percpu-defs.h:49:26: note: expanded from macro '__PCPU_ATTRS' __percpu __attribute__((section(PER_CPU_BASE_SECTION sec))) \ ^ net/rds/ib.h:446:1: note: previous attribute is here DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); ^ ./include/linux/percpu-defs.h:111:2: note: expanded from macro 'DECLARE_PER_CPU' DECLARE_PER_CPU_SECTION(type, name, "") ^ ./include/linux/percpu-defs.h:87:9: note: expanded from macro 'DECLARE_PER_CPU_SECTION' extern __PCPU_ATTRS(sec) __typeof__(type) name ^ ./include/linux/percpu-defs.h:49:26: note: expanded from macro '__PCPU_ATTRS' __percpu __attribute__((section(PER_CPU_BASE_SECTION sec))) \ ^ 1 warning generated. The initial definition was added in commit ec16227e1414 ("RDS/IB: Infiniband transport") and the cache aligned definition was added in commit e6babe4cc4ce ("RDS/IB: Stats and sysctls") right after. The definition probably should have been updated in net/rds/ib.h, which is what this patch does. Link: https://github.com/ClangBuiltLinux/linux/issues/114 Signed-off-by: Nathan Chancellor Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/ib.h b/net/rds/ib.h index 73427ff439f9..fd483760c910 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -443,7 +443,7 @@ int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ -DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) #define rds_ib_stats_add(member, count) \ rds_stats_add_which(rds_ib_stats, member, count) -- cgit v1.2.3 From 474ff2600889e16280dbc6ada8bfecb216169a70 Mon Sep 17 00:00:00 2001 From: Maciej Żenczykowski Date: Sat, 22 Sep 2018 01:34:01 -0700 Subject: net-ethtool: ETHTOOL_GUFO did not and should not require CAP_NET_ADMIN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So it should not fail with EPERM even though it is no longer implemented... This is a fix for: (userns)$ egrep ^Cap /proc/self/status CapInh: 0000003fffffffff CapPrm: 0000003fffffffff CapEff: 0000003fffffffff CapBnd: 0000003fffffffff CapAmb: 0000003fffffffff (userns)$ tcpdump -i usb_rndis0 tcpdump: WARNING: usb_rndis0: SIOCETHTOOL(ETHTOOL_GUFO) ioctl failed: Operation not permitted Warning: Kernel filter failed: Bad file descriptor tcpdump: can't remove kernel filter: Bad file descriptor With this change it returns EOPNOTSUPP instead of EPERM. See also https://github.com/the-tcpdump-group/libpcap/issues/689 Fixes: 08a00fea6de2 "net: Remove references to NETIF_F_UFO from ethtool." Cc: David S. Miller Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c9993c6c2fd4..234a0ec2e932 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -2624,6 +2624,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: + case ETHTOOL_GUFO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: -- cgit v1.2.3 From 16fdf8ba98391650ce4bc4f3f71629d8a413bc21 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 23 Sep 2018 12:25:15 -0700 Subject: rds: Fix build regression. Use DECLARE_* not DEFINE_* Fixes: 8360ed6745df ("RDS: IB: Use DEFINE_PER_CPU_SHARED_ALIGNED for rds_ib_stats") Signed-off-by: David S. Miller --- net/rds/ib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/ib.h b/net/rds/ib.h index fd483760c910..71ff356ee702 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -443,7 +443,7 @@ int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); +DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); #define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member) #define rds_ib_stats_add(member, count) \ rds_stats_add_which(rds_ib_stats, member, count) -- cgit v1.2.3 From ac3d9dd034e565df2c034ab2ca71f0a9f69153c1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:38 -0700 Subject: netpoll: make ndo_poll_controller() optional As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. It seems that all networking drivers that do use NAPI for their TX completions, should not provide a ndo_poll_controller(). NAPI drivers have netpoll support already handled in core networking stack, since netpoll_poll_dev() uses poll_napi(dev) to iterate through registered NAPI contexts for a device. This patch allows netpoll_poll_dev() to process NAPI contexts even for drivers not providing ndo_poll_controller(), allowing for following patches in NAPI drivers. Also we export netpoll_poll_dev() so that it can be called by bonding/team drivers in following patches. Reported-by: Song Liu Signed-off-by: Eric Dumazet Tested-by: Song Liu Signed-off-by: David S. Miller --- include/linux/netpoll.h | 5 +++-- net/core/netpoll.c | 19 +++++++------------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 67662d01130a..3ef82d3a78db 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -49,8 +49,9 @@ struct netpoll_info { }; #ifdef CONFIG_NETPOLL -extern void netpoll_poll_disable(struct net_device *dev); -extern void netpoll_poll_enable(struct net_device *dev); +void netpoll_poll_dev(struct net_device *dev); +void netpoll_poll_disable(struct net_device *dev); +void netpoll_poll_enable(struct net_device *dev); #else static inline void netpoll_poll_disable(struct net_device *dev) { return; } static inline void netpoll_poll_enable(struct net_device *dev) { return; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 57557a6a950c..3219a2932463 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -187,16 +187,16 @@ static void poll_napi(struct net_device *dev) } } -static void netpoll_poll_dev(struct net_device *dev) +void netpoll_poll_dev(struct net_device *dev) { - const struct net_device_ops *ops; struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); + const struct net_device_ops *ops; /* Don't do any rx activity if the dev_lock mutex is held * the dev_open/close paths use this to block netpoll activity * while changing device state */ - if (down_trylock(&ni->dev_lock)) + if (!ni || down_trylock(&ni->dev_lock)) return; if (!netif_running(dev)) { @@ -205,13 +205,8 @@ static void netpoll_poll_dev(struct net_device *dev) } ops = dev->netdev_ops; - if (!ops->ndo_poll_controller) { - up(&ni->dev_lock); - return; - } - - /* Process pending work on NIC */ - ops->ndo_poll_controller(dev); + if (ops->ndo_poll_controller) + ops->ndo_poll_controller(dev); poll_napi(dev); @@ -219,6 +214,7 @@ static void netpoll_poll_dev(struct net_device *dev) zap_completion_queue(); } +EXPORT_SYMBOL(netpoll_poll_dev); void netpoll_poll_disable(struct net_device *dev) { @@ -613,8 +609,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) strlcpy(np->dev_name, ndev->name, IFNAMSIZ); INIT_WORK(&np->cleanup_work, netpoll_async_cleanup); - if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || - !ndev->netdev_ops->ndo_poll_controller) { + if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", np->dev_name); err = -ENOTSUPP; -- cgit v1.2.3 From 93f62ad5e83a13e0c224dfca5ef40f90c09aad51 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:39 -0700 Subject: bonding: use netpoll_poll_dev() helper We want to allow NAPI drivers to no longer provide ndo_poll_controller() method, as it has been proven problematic. team driver must not look at its presence, but instead call netpoll_poll_dev() which factorize the needed actions. Signed-off-by: Eric Dumazet Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index a764a83f99da..0d87e11e7f1d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -971,16 +971,13 @@ static void bond_poll_controller(struct net_device *bond_dev) struct slave *slave = NULL; struct list_head *iter; struct ad_info ad_info; - struct netpoll_info *ni; - const struct net_device_ops *ops; if (BOND_MODE(bond) == BOND_MODE_8023AD) if (bond_3ad_get_active_agg_info(bond, &ad_info)) return; bond_for_each_slave_rcu(bond, slave, iter) { - ops = slave->dev->netdev_ops; - if (!bond_slave_is_up(slave) || !ops->ndo_poll_controller) + if (!bond_slave_is_up(slave)) continue; if (BOND_MODE(bond) == BOND_MODE_8023AD) { @@ -992,11 +989,7 @@ static void bond_poll_controller(struct net_device *bond_dev) continue; } - ni = rcu_dereference_bh(slave->dev->npinfo); - if (down_trylock(&ni->dev_lock)) - continue; - ops->ndo_poll_controller(slave->dev); - up(&ni->dev_lock); + netpoll_poll_dev(slave->dev); } } -- cgit v1.2.3 From b80e71a986c2ab5677dc6b84923cd7030b690800 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:40 -0700 Subject: ixgbe: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. ixgbe uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Reported-by: Song Liu Signed-off-by: Eric Dumazet Tested-by: Song Liu Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9a23d33a47ed..f27d73a7bf16 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8768,28 +8768,6 @@ static int ixgbe_del_sanmac_netdev(struct net_device *dev) return err; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/* - * Polling 'interrupt' - used by things like netconsole to send skbs - * without having to re-enable interrupts. It's not called while - * the interrupt routine is executing. - */ -static void ixgbe_netpoll(struct net_device *netdev) -{ - struct ixgbe_adapter *adapter = netdev_priv(netdev); - int i; - - /* if interface is down do nothing */ - if (test_bit(__IXGBE_DOWN, &adapter->state)) - return; - - /* loop through and schedule all active queues */ - for (i = 0; i < adapter->num_q_vectors; i++) - ixgbe_msix_clean_rings(0, adapter->q_vector[i]); -} - -#endif - static void ixgbe_get_ring_stats64(struct rtnl_link_stats64 *stats, struct ixgbe_ring *ring) { @@ -10251,9 +10229,6 @@ static const struct net_device_ops ixgbe_netdev_ops = { .ndo_get_vf_config = ixgbe_ndo_get_vf_config, .ndo_get_stats64 = ixgbe_get_stats64, .ndo_setup_tc = __ixgbe_setup_tc, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ixgbe_netpoll, -#endif #ifdef IXGBE_FCOE .ndo_select_queue = ixgbe_select_queue, .ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get, -- cgit v1.2.3 From 6f5d941ebadeb1150530f205af0eb5cf5bfeb219 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:41 -0700 Subject: ixgbevf: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. ixgbevf uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index d86446d202d5..5a228582423b 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4233,24 +4233,6 @@ static int ixgbevf_change_mtu(struct net_device *netdev, int new_mtu) return 0; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/* Polling 'interrupt' - used by things like netconsole to send skbs - * without having to re-enable interrupts. It's not called while - * the interrupt routine is executing. - */ -static void ixgbevf_netpoll(struct net_device *netdev) -{ - struct ixgbevf_adapter *adapter = netdev_priv(netdev); - int i; - - /* if interface is down do nothing */ - if (test_bit(__IXGBEVF_DOWN, &adapter->state)) - return; - for (i = 0; i < adapter->num_rx_queues; i++) - ixgbevf_msix_clean_rings(0, adapter->q_vector[i]); -} -#endif /* CONFIG_NET_POLL_CONTROLLER */ - static int ixgbevf_suspend(struct pci_dev *pdev, pm_message_t state) { struct net_device *netdev = pci_get_drvdata(pdev); @@ -4482,9 +4464,6 @@ static const struct net_device_ops ixgbevf_netdev_ops = { .ndo_tx_timeout = ixgbevf_tx_timeout, .ndo_vlan_rx_add_vid = ixgbevf_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ixgbevf_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ixgbevf_netpoll, -#endif .ndo_features_check = ixgbevf_features_check, .ndo_bpf = ixgbevf_xdp, }; -- cgit v1.2.3 From dda9d57e2d4245c0d8e309ee7399bf2ebfca64ae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:42 -0700 Subject: fm10k: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture lasts for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. fm10k uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/fm10k/fm10k.h | 3 --- drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 3 --- drivers/net/ethernet/intel/fm10k/fm10k_pci.c | 22 ---------------------- 3 files changed, 28 deletions(-) diff --git a/drivers/net/ethernet/intel/fm10k/fm10k.h b/drivers/net/ethernet/intel/fm10k/fm10k.h index a903a0ba45e1..7d42582ed48d 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k.h +++ b/drivers/net/ethernet/intel/fm10k/fm10k.h @@ -504,9 +504,6 @@ void fm10k_update_stats(struct fm10k_intfc *interface); void fm10k_service_event_schedule(struct fm10k_intfc *interface); void fm10k_macvlan_schedule(struct fm10k_intfc *interface); void fm10k_update_rx_drop_en(struct fm10k_intfc *interface); -#ifdef CONFIG_NET_POLL_CONTROLLER -void fm10k_netpoll(struct net_device *netdev); -#endif /* Netdev */ struct net_device *fm10k_alloc_netdev(const struct fm10k_info *info); diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index 929f538d28bc..538a8467f434 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -1648,9 +1648,6 @@ static const struct net_device_ops fm10k_netdev_ops = { .ndo_udp_tunnel_del = fm10k_udp_tunnel_del, .ndo_dfwd_add_station = fm10k_dfwd_add_station, .ndo_dfwd_del_station = fm10k_dfwd_del_station, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = fm10k_netpoll, -#endif .ndo_features_check = fm10k_features_check, }; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c index 15071e4adb98..c859ababeed5 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c @@ -1210,28 +1210,6 @@ static irqreturn_t fm10k_msix_mbx_vf(int __always_unused irq, void *data) return IRQ_HANDLED; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/** - * fm10k_netpoll - A Polling 'interrupt' handler - * @netdev: network interface device structure - * - * This is used by netconsole to send skbs without having to re-enable - * interrupts. It's not called while the normal interrupt routine is executing. - **/ -void fm10k_netpoll(struct net_device *netdev) -{ - struct fm10k_intfc *interface = netdev_priv(netdev); - int i; - - /* if interface is down do nothing */ - if (test_bit(__FM10K_DOWN, interface->state)) - return; - - for (i = 0; i < interface->num_q_vectors; i++) - fm10k_msix_clean_rings(0, interface->q_vector[i]); -} - -#endif #define FM10K_ERR_MSG(type) case (type): error = #type; break static void fm10k_handle_fault(struct fm10k_intfc *interface, int type, struct fm10k_fault *fault) -- cgit v1.2.3 From 2753166e4be9df4677745a7f7d9d8daa94938a08 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:43 -0700 Subject: ixgb: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. ixgb uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. This also removes a problematic use of disable_irq() in a context it is forbidden, as explained in commit af3e0fcf7887 ("8139too: Use disable_irq_nosync() in rtl8139_poll_controller()") Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgb/ixgb_main.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c index d3e72d0f66ef..7722153c4ac2 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c @@ -81,11 +81,6 @@ static int ixgb_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid); static void ixgb_restore_vlan(struct ixgb_adapter *adapter); -#ifdef CONFIG_NET_POLL_CONTROLLER -/* for netdump / net console */ -static void ixgb_netpoll(struct net_device *dev); -#endif - static pci_ers_result_t ixgb_io_error_detected (struct pci_dev *pdev, enum pci_channel_state state); static pci_ers_result_t ixgb_io_slot_reset (struct pci_dev *pdev); @@ -348,9 +343,6 @@ static const struct net_device_ops ixgb_netdev_ops = { .ndo_tx_timeout = ixgb_tx_timeout, .ndo_vlan_rx_add_vid = ixgb_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ixgb_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ixgb_netpoll, -#endif .ndo_fix_features = ixgb_fix_features, .ndo_set_features = ixgb_set_features, }; @@ -2195,23 +2187,6 @@ ixgb_restore_vlan(struct ixgb_adapter *adapter) ixgb_vlan_rx_add_vid(adapter->netdev, htons(ETH_P_8021Q), vid); } -#ifdef CONFIG_NET_POLL_CONTROLLER -/* - * Polling 'interrupt' - used by things like netconsole to send skbs - * without having to re-enable interrupts. It's not called while - * the interrupt routine is executing. - */ - -static void ixgb_netpoll(struct net_device *dev) -{ - struct ixgb_adapter *adapter = netdev_priv(dev); - - disable_irq(adapter->pdev->irq); - ixgb_intr(adapter->pdev->irq, dev); - enable_irq(adapter->pdev->irq); -} -#endif - /** * ixgb_io_error_detected - called when PCI error is detected * @pdev: pointer to pci device with error -- cgit v1.2.3 From 0542997edece2d75e0bf3e173dfddf1b2b3ef756 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:44 -0700 Subject: igb: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. igb uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igb/igb_main.c | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index a32c576c1e65..0796cef96fa3 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -205,10 +205,6 @@ static struct notifier_block dca_notifier = { .priority = 0 }; #endif -#ifdef CONFIG_NET_POLL_CONTROLLER -/* for netdump / net console */ -static void igb_netpoll(struct net_device *); -#endif #ifdef CONFIG_PCI_IOV static unsigned int max_vfs; module_param(max_vfs, uint, 0); @@ -2881,9 +2877,6 @@ static const struct net_device_ops igb_netdev_ops = { .ndo_set_vf_spoofchk = igb_ndo_set_vf_spoofchk, .ndo_set_vf_trust = igb_ndo_set_vf_trust, .ndo_get_vf_config = igb_ndo_get_vf_config, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = igb_netpoll, -#endif .ndo_fix_features = igb_fix_features, .ndo_set_features = igb_set_features, .ndo_fdb_add = igb_ndo_fdb_add, @@ -9053,29 +9046,6 @@ static int igb_pci_sriov_configure(struct pci_dev *dev, int num_vfs) return 0; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/* Polling 'interrupt' - used by things like netconsole to send skbs - * without having to re-enable interrupts. It's not called while - * the interrupt routine is executing. - */ -static void igb_netpoll(struct net_device *netdev) -{ - struct igb_adapter *adapter = netdev_priv(netdev); - struct e1000_hw *hw = &adapter->hw; - struct igb_q_vector *q_vector; - int i; - - for (i = 0; i < adapter->num_q_vectors; i++) { - q_vector = adapter->q_vector[i]; - if (adapter->flags & IGB_FLAG_HAS_MSIX) - wr32(E1000_EIMC, q_vector->eims_value); - else - igb_irq_disable(adapter); - napi_schedule(&q_vector->napi); - } -} -#endif /* CONFIG_NET_POLL_CONTROLLER */ - /** * igb_io_error_detected - called when PCI error is detected * @pdev: Pointer to PCI device -- cgit v1.2.3 From 158a08a694c4ecf9574fe452db24a6b4943853aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:45 -0700 Subject: ice: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. ice uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ice/ice_main.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index f1e80eed2fd6..3f047bb43348 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4806,30 +4806,6 @@ void ice_get_stats64(struct net_device *netdev, struct rtnl_link_stats64 *stats) stats->rx_length_errors = vsi_stats->rx_length_errors; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/** - * ice_netpoll - polling "interrupt" handler - * @netdev: network interface device structure - * - * Used by netconsole to send skbs without having to re-enable interrupts. - * This is not called in the normal interrupt path. - */ -static void ice_netpoll(struct net_device *netdev) -{ - struct ice_netdev_priv *np = netdev_priv(netdev); - struct ice_vsi *vsi = np->vsi; - struct ice_pf *pf = vsi->back; - int i; - - if (test_bit(__ICE_DOWN, vsi->state) || - !test_bit(ICE_FLAG_MSIX_ENA, pf->flags)) - return; - - for (i = 0; i < vsi->num_q_vectors; i++) - ice_msix_clean_rings(0, vsi->q_vectors[i]); -} -#endif /* CONFIG_NET_POLL_CONTROLLER */ - /** * ice_napi_disable_all - Disable NAPI for all q_vectors in the VSI * @vsi: VSI having NAPI disabled @@ -5497,9 +5473,6 @@ static const struct net_device_ops ice_netdev_ops = { .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ice_change_mtu, .ndo_get_stats64 = ice_get_stats64, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = ice_netpoll, -#endif /* CONFIG_NET_POLL_CONTROLLER */ .ndo_vlan_rx_add_vid = ice_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = ice_vlan_rx_kill_vid, .ndo_set_features = ice_set_features, -- cgit v1.2.3 From 1aa28fb98368078bcaf527bf46c0e001db934414 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:46 -0700 Subject: i40evf: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. i40evf uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 26 ------------------------- 1 file changed, 26 deletions(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 5906c1c1d19d..fef6d892ed4c 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -396,29 +396,6 @@ static void i40evf_map_rings_to_vectors(struct i40evf_adapter *adapter) adapter->aq_required |= I40EVF_FLAG_AQ_MAP_VECTORS; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/** - * i40evf_netpoll - A Polling 'interrupt' handler - * @netdev: network interface device structure - * - * This is used by netconsole to send skbs without having to re-enable - * interrupts. It's not called while the normal interrupt routine is executing. - **/ -static void i40evf_netpoll(struct net_device *netdev) -{ - struct i40evf_adapter *adapter = netdev_priv(netdev); - int q_vectors = adapter->num_msix_vectors - NONQ_VECS; - int i; - - /* if interface is down do nothing */ - if (test_bit(__I40E_VSI_DOWN, adapter->vsi.state)) - return; - - for (i = 0; i < q_vectors; i++) - i40evf_msix_clean_rings(0, &adapter->q_vectors[i]); -} - -#endif /** * i40evf_irq_affinity_notify - Callback for affinity changes * @notify: context as to what irq was changed @@ -3229,9 +3206,6 @@ static const struct net_device_ops i40evf_netdev_ops = { .ndo_features_check = i40evf_features_check, .ndo_fix_features = i40evf_fix_features, .ndo_set_features = i40evf_set_features, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = i40evf_netpoll, -#endif .ndo_setup_tc = i40evf_setup_tc, }; -- cgit v1.2.3 From a24b66c249f71dde11da5ba3b4c397a42ece3fc9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:47 -0700 Subject: mlx4: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. mlx4 uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 6785661d1a72..fe49384eba48 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1286,20 +1286,6 @@ out: mutex_unlock(&mdev->state_lock); } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void mlx4_en_netpoll(struct net_device *dev) -{ - struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_cq *cq; - int i; - - for (i = 0; i < priv->tx_ring_num[TX]; i++) { - cq = priv->tx_cq[TX][i]; - napi_schedule(&cq->napi); - } -} -#endif - static int mlx4_en_set_rss_steer_rules(struct mlx4_en_priv *priv) { u64 reg_id; @@ -2946,9 +2932,6 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_tx_timeout = mlx4_en_tx_timeout, .ndo_vlan_rx_add_vid = mlx4_en_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = mlx4_en_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = mlx4_en_netpoll, -#endif .ndo_set_features = mlx4_en_set_features, .ndo_fix_features = mlx4_en_fix_features, .ndo_setup_tc = __mlx4_en_setup_tc, @@ -2983,9 +2966,6 @@ static const struct net_device_ops mlx4_netdev_ops_master = { .ndo_set_vf_link_state = mlx4_en_set_vf_link_state, .ndo_get_vf_stats = mlx4_en_get_vf_stats, .ndo_get_vf_config = mlx4_en_get_vf_config, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = mlx4_en_netpoll, -#endif .ndo_set_features = mlx4_en_set_features, .ndo_fix_features = mlx4_en_fix_features, .ndo_setup_tc = __mlx4_en_setup_tc, -- cgit v1.2.3 From 9c29bcd189f4ab1644b7125713602532d0aefdb7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:48 -0700 Subject: mlx5: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. mlx5 uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 5a7939e70190..54118b77dc1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4315,22 +4315,6 @@ static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } -#ifdef CONFIG_NET_POLL_CONTROLLER -/* Fake "interrupt" called by netpoll (eg netconsole) to send skbs without - * reenabling interrupts. - */ -static void mlx5e_netpoll(struct net_device *dev) -{ - struct mlx5e_priv *priv = netdev_priv(dev); - struct mlx5e_channels *chs = &priv->channels; - - int i; - - for (i = 0; i < chs->num; i++) - napi_schedule(&chs->c[i]->napi); -} -#endif - static const struct net_device_ops mlx5e_netdev_ops = { .ndo_open = mlx5e_open, .ndo_stop = mlx5e_close, @@ -4356,9 +4340,6 @@ static const struct net_device_ops mlx5e_netdev_ops = { #ifdef CONFIG_MLX5_EN_ARFS .ndo_rx_flow_steer = mlx5e_rx_flow_steer, #endif -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = mlx5e_netpoll, -#endif #ifdef CONFIG_MLX5_ESWITCH /* SRIOV E-Switch NDOs */ .ndo_set_vf_mac = mlx5e_set_vf_mac, -- cgit v1.2.3 From d8ea6a91ad70ae0881d76446df27d3f58fd478c3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:49 -0700 Subject: bnx2x: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. bnx2x uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Ariel Elior Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index 71362b7f6040..fcc2328bb0d9 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -12894,19 +12894,6 @@ static int bnx2x_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void poll_bnx2x(struct net_device *dev) -{ - struct bnx2x *bp = netdev_priv(dev); - int i; - - for_each_eth_queue(bp, i) { - struct bnx2x_fastpath *fp = &bp->fp[i]; - napi_schedule(&bnx2x_fp(bp, fp->index, napi)); - } -} -#endif - static int bnx2x_validate_addr(struct net_device *dev) { struct bnx2x *bp = netdev_priv(dev); @@ -13113,9 +13100,6 @@ static const struct net_device_ops bnx2x_netdev_ops = { .ndo_tx_timeout = bnx2x_tx_timeout, .ndo_vlan_rx_add_vid = bnx2x_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = bnx2x_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = poll_bnx2x, -#endif .ndo_setup_tc = __bnx2x_setup_tc, #ifdef CONFIG_BNX2X_SRIOV .ndo_set_vf_mac = bnx2x_set_vf_mac, -- cgit v1.2.3 From 58e0e22bff638055278ea73e34d0d07a95260790 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:50 -0700 Subject: bnxt: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. bnxt uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 177587f9c3f1..61957b0bbd8c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -7672,21 +7672,6 @@ static void bnxt_tx_timeout(struct net_device *dev) bnxt_queue_sp_work(bp); } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void bnxt_poll_controller(struct net_device *dev) -{ - struct bnxt *bp = netdev_priv(dev); - int i; - - /* Only process tx rings/combined rings in netpoll mode. */ - for (i = 0; i < bp->tx_nr_rings; i++) { - struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; - - napi_schedule(&txr->bnapi->napi); - } -} -#endif - static void bnxt_timer(struct timer_list *t) { struct bnxt *bp = from_timer(bp, t, timer); @@ -8519,9 +8504,6 @@ static const struct net_device_ops bnxt_netdev_ops = { .ndo_set_vf_link_state = bnxt_set_vf_link_state, .ndo_set_vf_spoofchk = bnxt_set_vf_spoofchk, .ndo_set_vf_trust = bnxt_set_vf_trust, -#endif -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = bnxt_poll_controller, #endif .ndo_setup_tc = bnxt_setup_tc, #ifdef CONFIG_RFS_ACCEL -- cgit v1.2.3 From 0825ce70318e9c576acbdd5ceb4a8d563263cf8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:51 -0700 Subject: nfp: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. nfp uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Cc: Jakub Kicinski Acked-by: Jakub Kicinski Tested-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 253bdaef1505..8ed38fd5a852 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3146,21 +3146,6 @@ nfp_net_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid) return nfp_net_reconfig_mbox(nn, NFP_NET_CFG_MBOX_CMD_CTAG_FILTER_KILL); } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void nfp_net_netpoll(struct net_device *netdev) -{ - struct nfp_net *nn = netdev_priv(netdev); - int i; - - /* nfp_net's NAPIs are statically allocated so even if there is a race - * with reconfig path this will simply try to schedule some disabled - * NAPI instances. - */ - for (i = 0; i < nn->dp.num_stack_tx_rings; i++) - napi_schedule_irqoff(&nn->r_vecs[i].napi); -} -#endif - static void nfp_net_stat64(struct net_device *netdev, struct rtnl_link_stats64 *stats) { @@ -3519,9 +3504,6 @@ const struct net_device_ops nfp_net_netdev_ops = { .ndo_get_stats64 = nfp_net_stat64, .ndo_vlan_rx_add_vid = nfp_net_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = nfp_net_vlan_rx_kill_vid, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = nfp_net_netpoll, -#endif .ndo_set_vf_mac = nfp_app_set_vf_mac, .ndo_set_vf_vlan = nfp_app_set_vf_vlan, .ndo_set_vf_spoofchk = nfp_app_set_vf_spoofchk, -- cgit v1.2.3 From 765cdc209cb89fb81215310a066cd0a3018ffef7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Sep 2018 15:27:52 -0700 Subject: tun: remove ndo_poll_controller As diagnosed by Song Liu, ndo_poll_controller() can be very dangerous on loaded hosts, since the cpu calling ndo_poll_controller() might steal all NAPI contexts (for all RX/TX queues of the NIC). This capture can last for unlimited amount of time, since one cpu is generally not able to drain all the queues under load. tun uses NAPI for TX completions, so we better let core networking stack call the napi->poll() to avoid the capture. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 43 ------------------------------------------- 1 file changed, 43 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ebd07ad82431..e2648b5a3861 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1153,43 +1153,6 @@ static netdev_features_t tun_net_fix_features(struct net_device *dev, return (features & tun->set_features) | (features & ~TUN_USER_FEATURES); } -#ifdef CONFIG_NET_POLL_CONTROLLER -static void tun_poll_controller(struct net_device *dev) -{ - /* - * Tun only receives frames when: - * 1) the char device endpoint gets data from user space - * 2) the tun socket gets a sendmsg call from user space - * If NAPI is not enabled, since both of those are synchronous - * operations, we are guaranteed never to have pending data when we poll - * for it so there is nothing to do here but return. - * We need this though so netpoll recognizes us as an interface that - * supports polling, which enables bridge devices in virt setups to - * still use netconsole - * If NAPI is enabled, however, we need to schedule polling for all - * queues unless we are using napi_gro_frags(), which we call in - * process context and not in NAPI context. - */ - struct tun_struct *tun = netdev_priv(dev); - - if (tun->flags & IFF_NAPI) { - struct tun_file *tfile; - int i; - - if (tun_napi_frags_enabled(tun)) - return; - - rcu_read_lock(); - for (i = 0; i < tun->numqueues; i++) { - tfile = rcu_dereference(tun->tfiles[i]); - if (tfile->napi_enabled) - napi_schedule(&tfile->napi); - } - rcu_read_unlock(); - } - return; -} -#endif static void tun_set_headroom(struct net_device *dev, int new_hr) { @@ -1283,9 +1246,6 @@ static const struct net_device_ops tun_netdev_ops = { .ndo_start_xmit = tun_net_xmit, .ndo_fix_features = tun_net_fix_features, .ndo_select_queue = tun_select_queue, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = tun_poll_controller, -#endif .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, }; @@ -1365,9 +1325,6 @@ static const struct net_device_ops tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_select_queue = tun_select_queue, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = tun_poll_controller, -#endif .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = tun_set_headroom, .ndo_get_stats64 = tun_net_get_stats64, -- cgit v1.2.3 From d26ed6b0e5e23190d43ab34bc69cbecdc464a2cf Mon Sep 17 00:00:00 2001 From: Friedemann Gerold Date: Sat, 15 Sep 2018 18:03:39 +0300 Subject: net: aquantia: memory corruption on jumbo frames This patch fixes skb_shared area, which will be corrupted upon reception of 4K jumbo packets. Originally build_skb usage purpose was to reuse page for skb to eliminate needs of extra fragments. But that logic does not take into account that skb_shared_info should be reserved at the end of skb data area. In case packet data consumes all the page (4K), skb_shinfo location overflows the page. As a consequence, __build_skb zeroed shinfo data above the allocated page, corrupting next page. The issue is rarely seen in real life because jumbo are normally larger than 4K and that causes another code path to trigger. But it 100% reproducible with simple scapy packet, like: sendp(IP(dst="192.168.100.3") / TCP(dport=443) \ / Raw(RandString(size=(4096-40))), iface="enp1s0") Fixes: 018423e90bee ("net: ethernet: aquantia: Add ring support code") Reported-by: Friedemann Gerold Reported-by: Michael Rauch Signed-off-by: Friedemann Gerold Tested-by: Nikita Danilov Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/aq_ring.c | 32 +++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index b5f1f62e8e25..d1e1a0ba8615 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -225,9 +225,10 @@ int aq_ring_rx_clean(struct aq_ring_s *self, } /* for single fragment packets use build_skb() */ - if (buff->is_eop) { + if (buff->is_eop && + buff->len <= AQ_CFG_RX_FRAME_MAX - AQ_SKB_ALIGN) { skb = build_skb(page_address(buff->page), - buff->len + AQ_SKB_ALIGN); + AQ_CFG_RX_FRAME_MAX); if (unlikely(!skb)) { err = -ENOMEM; goto err_exit; @@ -247,18 +248,21 @@ int aq_ring_rx_clean(struct aq_ring_s *self, buff->len - ETH_HLEN, SKB_TRUESIZE(buff->len - ETH_HLEN)); - for (i = 1U, next_ = buff->next, - buff_ = &self->buff_ring[next_]; true; - next_ = buff_->next, - buff_ = &self->buff_ring[next_], ++i) { - skb_add_rx_frag(skb, i, buff_->page, 0, - buff_->len, - SKB_TRUESIZE(buff->len - - ETH_HLEN)); - buff_->is_cleaned = 1; - - if (buff_->is_eop) - break; + if (!buff->is_eop) { + for (i = 1U, next_ = buff->next, + buff_ = &self->buff_ring[next_]; + true; next_ = buff_->next, + buff_ = &self->buff_ring[next_], ++i) { + skb_add_rx_frag(skb, i, + buff_->page, 0, + buff_->len, + SKB_TRUESIZE(buff->len - + ETH_HLEN)); + buff_->is_cleaned = 1; + + if (buff_->is_eop) + break; + } } } -- cgit v1.2.3 From d8e2262a5044c1a05b4da51e5d0976f10c487a9f Mon Sep 17 00:00:00 2001 From: Saif Hasan Date: Fri, 21 Sep 2018 14:30:05 -0700 Subject: mpls: allow routes on ip6gre devices Summary: This appears to be necessary and sufficient change to enable `MPLS` on `ip6gre` tunnels (RFC4023). This diff allows IP6GRE devices to be recognized by MPLS kernel module and hence user can configure interface to accept packets with mpls headers as well setup mpls routes on them. Test Plan: Test plan consists of multiple containers connected via GRE-V6 tunnel. Then carrying out testing steps as below. - Carry out necessary sysctl settings on all containers ``` sysctl -w net.mpls.platform_labels=65536 sysctl -w net.mpls.ip_ttl_propagate=1 sysctl -w net.mpls.conf.lo.input=1 ``` - Establish IP6GRE tunnels ``` ip -6 tunnel add name if_1_2_1 mode ip6gre \ local 2401:db00:21:6048:feed:0::1 \ remote 2401:db00:21:6048:feed:0::2 key 1 ip link set dev if_1_2_1 up sysctl -w net.mpls.conf.if_1_2_1.input=1 ip -4 addr add 169.254.0.2/31 dev if_1_2_1 scope link ip -6 tunnel add name if_1_3_1 mode ip6gre \ local 2401:db00:21:6048:feed:0::1 \ remote 2401:db00:21:6048:feed:0::3 key 1 ip link set dev if_1_3_1 up sysctl -w net.mpls.conf.if_1_3_1.input=1 ip -4 addr add 169.254.0.4/31 dev if_1_3_1 scope link ``` - Install MPLS encap rules on node-1 towards node-2 ``` ip route add 192.168.0.11/32 nexthop encap mpls 32/64 \ via inet 169.254.0.3 dev if_1_2_1 ``` - Install MPLS forwarding rules on node-2 and node-3 ``` // node2 ip -f mpls route add 32 via inet 169.254.0.7 dev if_2_4_1 // node3 ip -f mpls route add 64 via inet 169.254.0.12 dev if_4_3_1 ``` - Ping 192.168.0.11 (node4) from 192.168.0.1 (node1) (where routing towards 192.168.0.1 is via IP route directly towards node1 from node4) ``` ping 192.168.0.11 ``` - tcpdump on interface to capture ping packets wrapped within MPLS header which inturn wrapped within IP6GRE header ``` 16:43:41.121073 IP6 2401:db00:21:6048:feed::1 > 2401:db00:21:6048:feed::2: DSTOPT GREv0, key=0x1, length 100: MPLS (label 32, exp 0, ttl 255) (label 64, exp 0, [S], ttl 255) IP 192.168.0.1 > 192.168.0.11: ICMP echo request, id 1208, seq 45, length 64 0x0000: 6000 2cdb 006c 3c3f 2401 db00 0021 6048 `.,..l Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 7a4de6d618b1..8fbe6cdbe255 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1533,10 +1533,14 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, unsigned int flags; if (event == NETDEV_REGISTER) { - /* For now just support Ethernet, IPGRE, SIT and IPIP devices */ + + /* For now just support Ethernet, IPGRE, IP6GRE, SIT and + * IPIP devices + */ if (dev->type == ARPHRD_ETHER || dev->type == ARPHRD_LOOPBACK || dev->type == ARPHRD_IPGRE || + dev->type == ARPHRD_IP6GRE || dev->type == ARPHRD_SIT || dev->type == ARPHRD_TUNNEL) { mdev = mpls_add_dev(dev); -- cgit v1.2.3 From ccfec9e5cb2d48df5a955b7bf47f7782157d3bc2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 24 Sep 2018 15:48:19 +0200 Subject: ip_tunnel: be careful when accessing the inner header Cong noted that we need the same checks introduced by commit 76c0ddd8c3a6 ("ip6_tunnel: be careful when accessing the inner header") even for ipv4 tunnels. Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Suggested-by: Cong Wang Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index c4f5602308ed..284a22154b4e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -627,6 +627,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); + unsigned int inner_nhdr_len = 0; const struct iphdr *inner_iph; struct flowi4 fl4; u8 tos, ttl; @@ -636,6 +637,14 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, __be32 dst; bool connected; + /* ensure we can access the inner net header, for several users below */ + if (skb->protocol == htons(ETH_P_IP)) + inner_nhdr_len = sizeof(struct iphdr); + else if (skb->protocol == htons(ETH_P_IPV6)) + inner_nhdr_len = sizeof(struct ipv6hdr); + if (unlikely(!pskb_may_pull(skb, inner_nhdr_len))) + goto tx_error; + inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); -- cgit v1.2.3 From f4a518797b40e956b6e6220b42657e045dc24470 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 24 Sep 2018 16:56:13 +0200 Subject: net: mvneta: fix the remaining Rx descriptor unmapping issues With CONFIG_DMA_API_DEBUG enabled we get DMA unmapping warning in various places of the mvneta driver, for example when putting down an interface while traffic is passing through. The issue is when using s/w buffer management, the Rx buffers are mapped using dma_map_page but unmapped with dma_unmap_single. This patch fixes this by using the right unmapping function. Fixes: 562e2f467e71 ("net: mvneta: Improve the buffer allocation method for SWBM") Signed-off-by: Antoine Tenart Reviewed-by: Gregory CLEMENT Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 2db9708f2e24..b4ed7d394d07 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -1890,8 +1890,8 @@ static void mvneta_rxq_drop_pkts(struct mvneta_port *pp, if (!data || !(rx_desc->buf_phys_addr)) continue; - dma_unmap_single(pp->dev->dev.parent, rx_desc->buf_phys_addr, - MVNETA_RX_BUF_SIZE(pp->pkt_size), DMA_FROM_DEVICE); + dma_unmap_page(pp->dev->dev.parent, rx_desc->buf_phys_addr, + PAGE_SIZE, DMA_FROM_DEVICE); __free_page(data); } } @@ -2039,9 +2039,8 @@ static int mvneta_rx_swbm(struct napi_struct *napi, frag_offset, frag_size, PAGE_SIZE); - dma_unmap_single(dev->dev.parent, phys_addr, - PAGE_SIZE, - DMA_FROM_DEVICE); + dma_unmap_page(dev->dev.parent, phys_addr, + PAGE_SIZE, DMA_FROM_DEVICE); rxq->left_size -= frag_size; } -- cgit v1.2.3