summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Brice Goglin <brice@myri.com> 2009-08-07 14:44:22 +0400
committer: David S. Miller <davem@davemloft.net> 2009-08-13 08:54:59 +0400
commit: d02342151c51344034fbdeceff8effcb0a77c573 (patch)
tree: 62d215202b8afe9e04f857d60185d40aa1a6c229
parent: c9145a2df072f75d97592ddac1624baeb7bad195 (diff)
download: linux-d02342151c51344034fbdeceff8effcb0a77c573.tar.xz
myri10ge: improve parity error detection and recovery
Improve myri10ge parity error detection and recovery:

1) Don't restore PCI config space to a rebooted NIC until AFTER the host is quiescent.
2) Let myri10ge_close() know the NIC is dead, so it won't waste time waiting for a dead NIC to respond to MXGEFW_CMD_ETHERNET_DOWN.
3) When the NIC is quiet (link down, or otherwise idle link), use a PCI config space read to detect a rebooted NIC. Otherwise we might never notice that a NIC rebooted.

Signed-off-by: Andrew Gallatin <gallatin@myri.com>
Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- drivers/net/myri10ge/myri10ge.c 63
1 files changed, 46 insertions, 17 deletions
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index 1a34f7e11d98..75deef35b1e0 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -75,7 +75,7 @@
#include "myri10ge_mcp.h"
#include "myri10ge_mcp_gen_header.h"
-#define MYRI10GE_VERSION_STR "1.5.0-1.418"
+#define MYRI10GE_VERSION_STR "1.5.0-1.432"
MODULE_DESCRIPTION("Myricom 10G driver (10GbE)");
MODULE_AUTHOR("Maintainer: help@myri.com");
@@ -188,6 +188,7 @@ struct myri10ge_slice_state {
dma_addr_t fw_stats_bus;
int watchdog_tx_done;
int watchdog_tx_req;
+ int watchdog_rx_done;
#ifdef CONFIG_MYRI10GE_DCA
int cached_dca_tag;
int cpu;
@@ -256,6 +257,7 @@ struct myri10ge_priv {
u32 link_changes;
u32 msg_enable;
unsigned int board_number;
+ int rebooted;
};
static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat";
@@ -2552,17 +2554,22 @@ static int myri10ge_close(struct net_device *dev)
netif_carrier_off(dev);
netif_tx_stop_all_queues(dev);
- old_down_cnt = mgp->down_cnt;
- mb();
- status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
- if (status)
- printk(KERN_ERR "myri10ge: %s: Couldn't bring down link\n",
- dev->name);
-
- wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, HZ);
- if (old_down_cnt == mgp->down_cnt)
- printk(KERN_ERR "myri10ge: %s never got down irq\n", dev->name);
+ if (mgp->rebooted == 0) {
+ old_down_cnt = mgp->down_cnt;
+ mb();
+ status =
+ myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
+ if (status)
+ printk(KERN_ERR
+ "myri10ge: %s: Couldn't bring down link\n",
+ dev->name);
+ wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt,
+ HZ);
+ if (old_down_cnt == mgp->down_cnt)
+ printk(KERN_ERR "myri10ge: %s never got down irq\n",
+ dev->name);
+ }
netif_tx_disable(dev);
myri10ge_free_irq(mgp);
for (i = 0; i < mgp->num_slices; i++)
@@ -3427,12 +3434,13 @@ static void myri10ge_watchdog(struct work_struct *work)
container_of(work, struct myri10ge_priv, watchdog_work);
struct myri10ge_tx_buf *tx;
u32 reboot;
- int status;
+ int status, rebooted;
int i;
u16 cmd, vendor;
mgp->watchdog_resets++;
pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
+ rebooted = 0;
if ((cmd & PCI_COMMAND_MASTER) == 0) {
/* Bus master DMA disabled? Check to see
* if the card rebooted due to a parity error
@@ -3444,9 +3452,12 @@ static void myri10ge_watchdog(struct work_struct *work)
myri10ge_reset_recover ? " " : " not");
if (myri10ge_reset_recover == 0)
return;
-
+ rtnl_lock();
+ mgp->rebooted = 1;
+ rebooted = 1;
+ myri10ge_close(mgp->dev);
myri10ge_reset_recover--;
-
+ mgp->rebooted = 0;
/*
* A rebooted nic will come back with config space as
* it was after power was applied to PCIe bus.
@@ -3494,8 +3505,10 @@ static void myri10ge_watchdog(struct work_struct *work)
}
}
- rtnl_lock();
- myri10ge_close(mgp->dev);
+ if (!rebooted) {
+ rtnl_lock();
+ myri10ge_close(mgp->dev);
+ }
status = myri10ge_load_firmware(mgp, 1);
if (status != 0)
printk(KERN_ERR "myri10ge: %s: failed to load firmware\n",
@@ -3516,12 +3529,14 @@ static void myri10ge_watchdog_timer(unsigned long arg)
{
struct myri10ge_priv *mgp;
struct myri10ge_slice_state *ss;
- int i, reset_needed;
+ int i, reset_needed, busy_slice_cnt;
u32 rx_pause_cnt;
+ u16 cmd;
mgp = (struct myri10ge_priv *)arg;
rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
+ busy_slice_cnt = 0;
for (i = 0, reset_needed = 0;
i < mgp->num_slices && reset_needed == 0; ++i) {
@@ -3559,8 +3574,22 @@ static void myri10ge_watchdog_timer(unsigned long arg)
reset_needed = 1;
}
}
+ if (ss->watchdog_tx_done != ss->tx.done ||
+ ss->watchdog_rx_done != ss->rx_done.cnt) {
+ busy_slice_cnt++;
+ }
ss->watchdog_tx_done = ss->tx.done;
ss->watchdog_tx_req = ss->tx.req;
+ ss->watchdog_rx_done = ss->rx_done.cnt;
+ }
+ /* if we've sent or received no traffic, poll the NIC to
+ * ensure it is still there. Otherwise, we risk not noticing
+ * an error in a timely fashion */
+ if (busy_slice_cnt == 0) {
+ pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
+ if ((cmd & PCI_COMMAND_MASTER) == 0) {
+ reset_needed = 1;
+ }
}
mgp->watchdog_pause = rx_pause_cnt;