diff options
author | Christophe Lombard <clombard@linux.vnet.ibm.com> | 2016-03-04 14:26:41 +0300 |
---|---|---|
committer | Michael Ellerman <mpe@ellerman.id.au> | 2016-03-09 15:40:00 +0300 |
commit | 0d400f77c19e8d2606f8194846bcf18ebdc9df2a (patch) | |
tree | 307436d00d93f55e2aa27d2233c91346c14e8961 /drivers/misc/cxl/guest.c | |
parent | d601ea918b878582e60b773f2f943d8d292b2abf (diff) | |
download | linux-0d400f77c19e8d2606f8194846bcf18ebdc9df2a.tar.xz |
cxl: Adapter failure handling
Check the AFU state whenever an API is called. The hypervisor may
issue a reset of the adapter when it detects a fault. When it happens,
it launches an error recovery which will either move the AFU to a
permanent failure state, or in the disabled state.
If the AFU is found to be disabled, detach all existing contexts from
it before issuing a AFU reset to re-enable it.
Before detaching contexts, notify any kernel driver through the EEH
callbacks of the AFU pci device.
Co-authored-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
Reviewed-by: Manoj Kumar <manoj@linux.vnet.ibm.com>
Acked-by: Ian Munsie <imunsie@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Diffstat (limited to 'drivers/misc/cxl/guest.c')
-rw-r--r-- | drivers/misc/cxl/guest.c | 167 |
1 files changed, 164 insertions, 3 deletions
diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 2b07ebd2b429..8213372de2b7 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -15,6 +15,46 @@ #include "hcalls.h" #include "trace.h" +#define CXL_ERROR_DETECTED_EVENT 1 +#define CXL_SLOT_RESET_EVENT 2 +#define CXL_RESUME_EVENT 3 + +static void pci_error_handlers(struct cxl_afu *afu, + int bus_error_event, + pci_channel_state_t state) +{ + struct pci_dev *afu_dev; + + if (afu->phb == NULL) + return; + + list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) { + if (!afu_dev->driver) + continue; + + switch (bus_error_event) { + case CXL_ERROR_DETECTED_EVENT: + afu_dev->error_state = state; + + if (afu_dev->driver->err_handler && + afu_dev->driver->err_handler->error_detected) + afu_dev->driver->err_handler->error_detected(afu_dev, state); + break; + case CXL_SLOT_RESET_EVENT: + afu_dev->error_state = state; + + if (afu_dev->driver->err_handler && + afu_dev->driver->err_handler->slot_reset) + afu_dev->driver->err_handler->slot_reset(afu_dev); + break; + case CXL_RESUME_EVENT: + if (afu_dev->driver->err_handler && + afu_dev->driver->err_handler->resume) + afu_dev->driver->err_handler->resume(afu_dev); + break; + } + } +} static irqreturn_t guest_handle_psl_slice_error(struct cxl_context *ctx, u64 dsisr, u64 errstat) @@ -133,6 +173,22 @@ static irqreturn_t guest_psl_irq(int irq, void *data) return rc; } +static int afu_read_error_state(struct cxl_afu *afu, int *state_out) +{ + u64 state; + int rc = 0; + + rc = cxl_h_read_error_state(afu->guest->handle, &state); + if (!rc) { + WARN_ON(state != H_STATE_NORMAL && + state != H_STATE_DISABLE && + state != H_STATE_TEMP_UNAVAILABLE && + state != H_STATE_PERM_UNAVAILABLE); + *state_out = state & 0xffffffff; + } + return rc; +} + static irqreturn_t guest_slice_irq_err(int irq, void *data) { struct cxl_afu *afu = data; @@ -201,10 +257,26 @@ static int irq_free_range(struct cxl *adapter, int irq, int len) static int guest_reset(struct cxl *adapter) { - int rc; + struct cxl_afu *afu = NULL; + int i, rc; pr_devel("Adapter reset request\n"); + for (i = 0; i < adapter->slices; i++) { + if ((afu = adapter->afu[i])) { + pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT, + pci_channel_io_frozen); + cxl_context_detach_all(afu); + } + } + rc = cxl_h_reset_adapter(adapter->guest->handle); + for (i = 0; i < adapter->slices; i++) { + if (!rc && (afu = adapter->afu[i])) { + pci_error_handlers(afu, CXL_SLOT_RESET_EVENT, + pci_channel_io_normal); + pci_error_handlers(afu, CXL_RESUME_EVENT, 0); + } + } return rc; } @@ -556,7 +628,7 @@ static int guest_detach_process(struct cxl_context *ctx) pr_devel("in %s\n", __func__); trace_cxl_detach(ctx); - if (!cxl_ops->link_ok(ctx->afu->adapter)) + if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu)) return -EIO; if (ctx->afu->current_mode == CXL_MODE_DIRECTED) @@ -730,8 +802,95 @@ static void guest_unmap_slice_regs(struct cxl_afu *afu) iounmap(afu->p2n_mmio); } -static bool guest_link_ok(struct cxl *cxl) +static int afu_update_state(struct cxl_afu *afu) { + int rc, cur_state; + + rc = afu_read_error_state(afu, &cur_state); + if (rc) + return rc; + + if (afu->guest->previous_state == cur_state) + return 0; + + pr_devel("AFU(%d) update state to %#x\n", afu->slice, cur_state); + + switch (cur_state) { + case H_STATE_NORMAL: + afu->guest->previous_state = cur_state; + rc = 1; + break; + + case H_STATE_DISABLE: + pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT, + pci_channel_io_frozen); + + cxl_context_detach_all(afu); + if ((rc = cxl_ops->afu_reset(afu))) + pr_devel("reset hcall failed %d\n", rc); + + rc = afu_read_error_state(afu, &cur_state); + if (!rc && cur_state == H_STATE_NORMAL) { + pci_error_handlers(afu, CXL_SLOT_RESET_EVENT, + pci_channel_io_normal); + pci_error_handlers(afu, CXL_RESUME_EVENT, 0); + rc = 1; + } + afu->guest->previous_state = 0; + break; + + case H_STATE_TEMP_UNAVAILABLE: + afu->guest->previous_state = cur_state; + break; + + case H_STATE_PERM_UNAVAILABLE: + dev_err(&afu->dev, "AFU is in permanent error state\n"); + pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT, + pci_channel_io_perm_failure); + afu->guest->previous_state = cur_state; + break; + + default: + pr_err("Unexpected AFU(%d) error state: %#x\n", + afu->slice, cur_state); + return -EINVAL; + } + + return rc; +} + +static int afu_do_recovery(struct cxl_afu *afu) +{ + int rc; + + /* many threads can arrive here, in case of detach_all for example. + * Only one needs to drive the recovery + */ + if (mutex_trylock(&afu->guest->recovery_lock)) { + rc = afu_update_state(afu); + mutex_unlock(&afu->guest->recovery_lock); + return rc; + } + return 0; +} + +static bool guest_link_ok(struct cxl *cxl, struct cxl_afu *afu) +{ + int state; + + if (afu) { + if (afu_read_error_state(afu, &state) || + state != H_STATE_NORMAL) { + if (afu_do_recovery(afu) > 0) { + /* check again in case we've just fixed it */ + if (!afu_read_error_state(afu, &state) && + state == H_STATE_NORMAL) + return true; + } + return false; + } + } + return true; } @@ -770,6 +929,8 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n return -ENOMEM; } + mutex_init(&afu->guest->recovery_lock); + if ((rc = dev_set_name(&afu->dev, "afu%i.%i", adapter->adapter_num, slice))) |