scsi: lpfc: Implement health checking when aborting I/O

Several errors have occurred where the adapter stops or fails but does not raise the register values for the driver to detect failure. Thus driver is unaware of the failure. The failure typically results in I/O timeouts, the I/O timeout handler failing (after several seconds), and the error handler escalating recovery policy and resulting in more errors. Eventually, the driver is in a position where things have spiraled and it can't do recovery because other recovery ops are still outstanding and it becomes unusable. Resolve the situation by having the I/O timeout handler (actually a els, SCSI I/O, NVMe ls, or NVMe I/O timeout), in addition to aborting the I/O, perform a mailbox command and look for a response from the hardware. If the mailbox command fails, it will mark the adapter offline and then invoke the adapter reset handler to clean up. The new I/O timeout test will be limited to a test every 5s. If there are multiple I/O timeouts concurrently, only the 1st I/O timeout will generate the mailbox command. Further testing will only occur once a timeout occurs after a 5s delay from the last mailbox command has expired. Link: https://lore.kernel.org/r/20210104180240.46824-14-jsmart2021@gmail.com Co-developed-by: Dick Kennedy <dick.kennedy@broadcom.com> Signed-off-by: Dick Kennedy <dick.kennedy@broadcom.com> Signed-off-by: James Smart <jsmart2021@gmail.com> Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
author: James Smart <jsmart2021@gmail.com> 2021-01-04 21:02:38 +0300
committer: Martin K. Petersen <martin.petersen@oracle.com> 2021-01-08 07:02:37 +0300
commit: a22d73b655a8ec6d41f08790e28ee19dc55d0d33 (patch)
tree: 3345859058948854b63779e22f2bd4ef555b8323 /drivers/scsi/lpfc/lpfc_sli.c
parent: 243156c0108d6f0a61c24547248fc68e5464b4bb (diff)
download: linux-a22d73b655a8ec6d41f08790e28ee19dc55d0d33.tar.xz
1 files changed, 41 insertions, 3 deletions
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 176706aaebf5..d51d86959bbc 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -4246,6 +4246,8 @@ lpfc_sli_abort_iocb_ring(struct lpfc_hba *phba, struct lpfc_sli_ring *pring)
 			lpfc_sli_issue_abort_iotag(phba, pring, iocb, NULL);
 		spin_unlock_irq(&phba->hbalock);
 	}
+	/* Make sure HBA is alive */
+	lpfc_issue_hb_tmo(phba);
 
 	/* Cancel all the IOCBs from the completions list */
 	lpfc_sli_cancel_iocbs(phba, &completions, IOSTAT_LOCAL_REJECT,
@@ -8044,7 +8046,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba)
 	/* Start heart beat timer */
 	mod_timer(&phba->hb_tmofunc,
 		  jiffies + msecs_to_jiffies(1000 * LPFC_HB_MBOX_INTERVAL));
-	phba->hb_outstanding = 0;
+	phba->hba_flag &= ~(HBA_HBEAT_INP | HBA_HBEAT_TMO);
 	phba->last_completion_time = jiffies;
 
 	/* start eq_delay heartbeat */
@@ -11218,6 +11220,9 @@ lpfc_sli_host_down(struct lpfc_vport *vport)
 	}
 	spin_unlock_irqrestore(&phba->hbalock, flags);
 
+	/* Make sure HBA is alive */
+	lpfc_issue_hb_tmo(phba);
+
 	/* Cancel all the IOCBs from the completions list */
 	lpfc_sli_cancel_iocbs(phba, &completions, IOSTAT_LOCAL_REJECT,
 			      IOERR_SLI_DOWN);
@@ -13029,7 +13034,21 @@ lpfc_sli_sp_intr_handler(int irq, void *dev_id)
 				spin_unlock_irqrestore(
 						&phba->pport->work_port_lock,
 						iflag);
-				lpfc_mbox_cmpl_put(phba, pmb);
+
+				/* Do NOT queue MBX_HEARTBEAT to the worker
+				 * thread for processing.
+				 */
+				if (pmbox->mbxCommand == MBX_HEARTBEAT) {
+					/* Process mbox now */
+					phba->sli.mbox_active = NULL;
+					phba->sli.sli_flag &=
+						~LPFC_SLI_MBOX_ACTIVE;
+					if (pmb->mbox_cmpl)
+						pmb->mbox_cmpl(phba, pmb);
+				} else {
+					/* Queue to worker thread to process */
+					lpfc_mbox_cmpl_put(phba, pmb);
+				}
 			}
 		} else
 			spin_unlock_irqrestore(&phba->hbalock, iflag);
@@ -13625,7 +13644,26 @@ lpfc_sli4_sp_handle_mbox_event(struct lpfc_hba *phba, struct lpfc_mcqe *mcqe)
 	phba->pport->work_port_events &= ~WORKER_MBOX_TMO;
 	spin_unlock_irqrestore(&phba->pport->work_port_lock, iflags);
 
-	/* There is mailbox completion work to do */
+	/* Do NOT queue MBX_HEARTBEAT to the worker thread for processing. */
+	if (pmbox->mbxCommand == MBX_HEARTBEAT) {
+		spin_lock_irqsave(&phba->hbalock, iflags);
+		/* Release the mailbox command posting token */
+		phba->sli.sli_flag &= ~LPFC_SLI_MBOX_ACTIVE;
+		phba->sli.mbox_active = NULL;
+		if (bf_get(lpfc_trailer_consumed, mcqe))
+			lpfc_sli4_mq_release(phba->sli4_hba.mbx_wq);
+		spin_unlock_irqrestore(&phba->hbalock, iflags);
+
+		/* Post the next mbox command, if there is one */
+		lpfc_sli4_post_async_mbox(phba);
+
+		/* Process cmpl now */
+		if (pmb->mbox_cmpl)
+			pmb->mbox_cmpl(phba, pmb);
+		return false;
+	}
+
+	/* There is mailbox completion work to queue to the worker thread */
 	spin_lock_irqsave(&phba->hbalock, iflags);
 	__lpfc_mbox_cmpl_put(phba, pmb);
 	phba->work_ha |= HA_MBATT;
author	James Smart <jsmart2021@gmail.com>	2021-01-04 21:02:38 +0300
committer	Martin K. Petersen <martin.petersen@oracle.com>	2021-01-08 07:02:37 +0300
commit	a22d73b655a8ec6d41f08790e28ee19dc55d0d33 (patch)
tree	3345859058948854b63779e22f2bd4ef555b8323 /drivers/scsi/lpfc/lpfc_sli.c
parent	243156c0108d6f0a61c24547248fc68e5464b4bb (diff)
download	linux-a22d73b655a8ec6d41f08790e28ee19dc55d0d33.tar.xz