summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorDipayaan Roy <dipayanroy@linux.microsoft.com>2026-02-27 11:15:02 +0300
committerPaolo Abeni <pabeni@redhat.com>2026-03-03 13:14:22 +0300
commit2b12ffb669553972f5cd017c69a2b81593b09106 (patch)
treef6acfc61c51075e8c6350a5ce9976b76ad5bfa18 /include
parentc69855ada28656fdd7e197b6e24cd40a04fe14d3 (diff)
downloadlinux-2b12ffb669553972f5cd017c69a2b81593b09106.tar.xz
net: mana: Trigger VF reset/recovery on health check failure due to HWC timeout
The GF stats periodic query is used as mechanism to monitor HWC health check. If this HWC command times out, it is a strong indication that the device/SoC is in a faulty state and requires recovery. Today, when a timeout is detected, the driver marks hwc_timeout_occurred, clears cached stats, and stops rescheduling the periodic work. However, the device itself is left in the same failing state. Extend the timeout handling path to trigger the existing MANA VF recovery service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item. This is expected to initiate the appropriate recovery flow by suspende resume first and if it fails then trigger a bus rescan. This change is intentionally limited to HWC command timeouts and does not trigger recovery for errors reported by the SoC as a normal command response. Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com> Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://patch.msgid.link/aaFShvKnwR5FY8dH@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Diffstat (limited to 'include')
-rw-r--r--include/net/mana/gdma.h16
1 files changed, 14 insertions, 2 deletions
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 766f4fb25e26..ec17004b10c0 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -215,6 +215,12 @@ enum gdma_page_type {
#define GDMA_INVALID_DMA_REGION 0
+struct mana_serv_work {
+ struct work_struct serv_work;
+ struct pci_dev *pdev;
+ enum gdma_eqe_type type;
+};
+
struct gdma_mem_info {
struct device *dev;
@@ -386,6 +392,7 @@ struct gdma_irq_context {
enum gdma_context_flags {
GC_PROBE_SUCCEEDED = 0,
+ GC_IN_SERVICE = 1,
};
struct gdma_context {
@@ -411,7 +418,6 @@ struct gdma_context {
u32 test_event_eq_id;
bool is_pf;
- bool in_service;
phys_addr_t bar0_pa;
void __iomem *bar0_va;
@@ -473,6 +479,8 @@ int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe);
void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit);
+int mana_schedule_serv_work(struct gdma_context *gc, enum gdma_eqe_type type);
+
struct gdma_wqe {
u32 reserved :24;
u32 last_vbytes :8;
@@ -615,6 +623,9 @@ enum {
/* Driver can handle hardware recovery events during probe */
#define GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY BIT(22)
+/* Driver supports self recovery on Hardware Channel timeouts */
+#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY BIT(25)
+
#define GDMA_DRV_CAP_FLAGS1 \
(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
@@ -628,7 +639,8 @@ enum {
GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY | \
GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \
GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY | \
- GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY)
+ GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY | \
+ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY)
#define GDMA_DRV_CAP_FLAGS2 0