diff options
author | Stefan Weinhuber <wein@de.ibm.com> | 2009-12-07 14:51:51 +0300 |
---|---|---|
committer | Martin Schwidefsky <sky@mschwide.boeblingen.de.ibm.com> | 2009-12-07 14:51:34 +0300 |
commit | eb6e199bef288611157b8198c25d12b32bf058d0 (patch) | |
tree | 80737a2703a9f4d09cee2410342aeccb281413ae /drivers/s390/block/dasd.c | |
parent | 626350b63ef2cd447023d3dc2a34eaa7ca01bfff (diff) | |
download | linux-eb6e199bef288611157b8198c25d12b32bf058d0.tar.xz |
[S390] dasd: improve error recovery for internal I/O
Most of the error conditions reported by a FICON storage server
indicate situations that can be recovered from. Sometimes the host just
needs to retry an I/O request, but sometimes the recovery
is more complex and requires the device driver to wait, choose
a different path, etc.
The DASD device driver has fully featured error recovery
for normal block layer I/O, but not for internal I/O requests, which
are used, for example, during device bring-up.
This can lead to situations where the IPL of a system fails because
DASD devices are not properly recognized.
This patch will extend the internal I/O handling to use the existing
error recovery procedures.
Signed-off-by: Stefan Weinhuber <wein@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Diffstat (limited to 'drivers/s390/block/dasd.c')
-rw-r--r-- | drivers/s390/block/dasd.c | 207 |
1 files changed, 152 insertions, 55 deletions
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 329115a4d4b3..4f211c175b55 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -63,6 +63,7 @@ static void do_restore_device(struct work_struct *); static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *); static void dasd_device_timeout(unsigned long); static void dasd_block_timeout(unsigned long); +static void __dasd_process_erp(struct dasd_device *, struct dasd_ccw_req *); /* * SECTION: Operations on the device structure. @@ -959,7 +960,7 @@ static void dasd_device_timeout(unsigned long ptr) device = (struct dasd_device *) ptr; spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags); /* re-activate request queue */ - device->stopped &= ~DASD_STOPPED_PENDING; + dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING); spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); dasd_schedule_device_bh(device); } @@ -1022,7 +1023,7 @@ void dasd_generic_handle_state_change(struct dasd_device *device) /* First of all start sense subsystem status request. */ dasd_eer_snss(device); - device->stopped &= ~DASD_STOPPED_PENDING; + dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING); dasd_schedule_device_bh(device); if (device->block) dasd_schedule_block_bh(device->block); @@ -1404,6 +1405,20 @@ void dasd_schedule_device_bh(struct dasd_device *device) tasklet_hi_schedule(&device->tasklet); } +void dasd_device_set_stop_bits(struct dasd_device *device, int bits) +{ + device->stopped |= bits; +} +EXPORT_SYMBOL_GPL(dasd_device_set_stop_bits); + +void dasd_device_remove_stop_bits(struct dasd_device *device, int bits) +{ + device->stopped &= ~bits; + if (!device->stopped) + wake_up(&generic_waitq); +} +EXPORT_SYMBOL_GPL(dasd_device_remove_stop_bits); + /* * Queue a request to the head of the device ccw_queue. * Start the I/O if possible. 
@@ -1464,58 +1479,135 @@ static inline int _wait_for_wakeup(struct dasd_ccw_req *cqr) } /* - * Queue a request to the tail of the device ccw_queue and wait for - * it's completion. + * checks if error recovery is necessary, returns 1 if yes, 0 otherwise. */ -int dasd_sleep_on(struct dasd_ccw_req *cqr) +static int __dasd_sleep_on_erp(struct dasd_ccw_req *cqr) { struct dasd_device *device; - int rc; + dasd_erp_fn_t erp_fn; + if (cqr->status == DASD_CQR_FILLED) + return 0; device = cqr->startdev; + if (test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags)) { + if (cqr->status == DASD_CQR_TERMINATED) { + device->discipline->handle_terminated_request(cqr); + return 1; + } + if (cqr->status == DASD_CQR_NEED_ERP) { + erp_fn = device->discipline->erp_action(cqr); + erp_fn(cqr); + return 1; + } + if (cqr->status == DASD_CQR_FAILED) + dasd_log_sense(cqr, &cqr->irb); + if (cqr->refers) { + __dasd_process_erp(device, cqr); + return 1; + } + } + return 0; +} - cqr->callback = dasd_wakeup_cb; - cqr->callback_data = (void *) &generic_waitq; - dasd_add_request_tail(cqr); - wait_event(generic_waitq, _wait_for_wakeup(cqr)); +static int __dasd_sleep_on_loop_condition(struct dasd_ccw_req *cqr) +{ + if (test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags)) { + if (cqr->refers) /* erp is not done yet */ + return 1; + return ((cqr->status != DASD_CQR_DONE) && + (cqr->status != DASD_CQR_FAILED)); + } else + return (cqr->status == DASD_CQR_FILLED); +} - if (cqr->status == DASD_CQR_DONE) +static int _dasd_sleep_on(struct dasd_ccw_req *maincqr, int interruptible) +{ + struct dasd_device *device; + int rc; + struct list_head ccw_queue; + struct dasd_ccw_req *cqr; + + INIT_LIST_HEAD(&ccw_queue); + maincqr->status = DASD_CQR_FILLED; + device = maincqr->startdev; + list_add(&maincqr->blocklist, &ccw_queue); + for (cqr = maincqr; __dasd_sleep_on_loop_condition(cqr); + cqr = list_first_entry(&ccw_queue, + struct dasd_ccw_req, blocklist)) { + + if (__dasd_sleep_on_erp(cqr)) + continue; + if (cqr->status != 
DASD_CQR_FILLED) /* could be failed */ + continue; + + /* Non-temporary stop condition will trigger fail fast */ + if (device->stopped & ~DASD_STOPPED_PENDING && + test_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags) && + (!dasd_eer_enabled(device))) { + cqr->status = DASD_CQR_FAILED; + continue; + } + + /* Don't try to start requests if device is stopped */ + if (interruptible) { + rc = wait_event_interruptible( + generic_waitq, !(device->stopped)); + if (rc == -ERESTARTSYS) { + cqr->status = DASD_CQR_FAILED; + maincqr->intrc = rc; + continue; + } + } else + wait_event(generic_waitq, !(device->stopped)); + + cqr->callback = dasd_wakeup_cb; + cqr->callback_data = (void *) &generic_waitq; + dasd_add_request_tail(cqr); + if (interruptible) { + rc = wait_event_interruptible( + generic_waitq, _wait_for_wakeup(cqr)); + if (rc == -ERESTARTSYS) { + dasd_cancel_req(cqr); + /* wait (non-interruptible) for final status */ + wait_event(generic_waitq, + _wait_for_wakeup(cqr)); + cqr->status = DASD_CQR_FAILED; + maincqr->intrc = rc; + continue; + } + } else + wait_event(generic_waitq, _wait_for_wakeup(cqr)); + } + + maincqr->endclk = get_clock(); + if ((maincqr->status != DASD_CQR_DONE) && + (maincqr->intrc != -ERESTARTSYS)) + dasd_log_sense(maincqr, &maincqr->irb); + if (maincqr->status == DASD_CQR_DONE) rc = 0; - else if (cqr->intrc) - rc = cqr->intrc; + else if (maincqr->intrc) + rc = maincqr->intrc; else rc = -EIO; return rc; } /* + * Queue a request to the tail of the device ccw_queue and wait for + * it's completion. + */ +int dasd_sleep_on(struct dasd_ccw_req *cqr) +{ + return _dasd_sleep_on(cqr, 0); +} + +/* * Queue a request to the tail of the device ccw_queue and wait * interruptible for it's completion. 
*/ int dasd_sleep_on_interruptible(struct dasd_ccw_req *cqr) { - struct dasd_device *device; - int rc; - - device = cqr->startdev; - cqr->callback = dasd_wakeup_cb; - cqr->callback_data = (void *) &generic_waitq; - dasd_add_request_tail(cqr); - rc = wait_event_interruptible(generic_waitq, _wait_for_wakeup(cqr)); - if (rc == -ERESTARTSYS) { - dasd_cancel_req(cqr); - /* wait (non-interruptible) for final status */ - wait_event(generic_waitq, _wait_for_wakeup(cqr)); - cqr->intrc = rc; - } - - if (cqr->status == DASD_CQR_DONE) - rc = 0; - else if (cqr->intrc) - rc = cqr->intrc; - else - rc = -EIO; - return rc; + return _dasd_sleep_on(cqr, 1); } /* @@ -1629,7 +1721,7 @@ static void dasd_block_timeout(unsigned long ptr) block = (struct dasd_block *) ptr; spin_lock_irqsave(get_ccwdev_lock(block->base->cdev), flags); /* re-activate request queue */ - block->base->stopped &= ~DASD_STOPPED_PENDING; + dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING); spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags); dasd_schedule_block_bh(block); } @@ -1656,11 +1748,10 @@ void dasd_block_clear_timer(struct dasd_block *block) /* * Process finished error recovery ccw. 
*/ -static inline void __dasd_block_process_erp(struct dasd_block *block, - struct dasd_ccw_req *cqr) +static void __dasd_process_erp(struct dasd_device *device, + struct dasd_ccw_req *cqr) { dasd_erp_fn_t erp_fn; - struct dasd_device *device = block->base; if (cqr->status == DASD_CQR_DONE) DBF_DEV_EVENT(DBF_NOTICE, device, "%s", "ERP successful"); @@ -1724,9 +1815,12 @@ static void __dasd_process_request_queue(struct dasd_block *block) */ if (!list_empty(&block->ccw_queue)) break; - spin_lock_irqsave(get_ccwdev_lock(basedev->cdev), flags); - basedev->stopped |= DASD_STOPPED_PENDING; - spin_unlock_irqrestore(get_ccwdev_lock(basedev->cdev), flags); + spin_lock_irqsave( + get_ccwdev_lock(basedev->cdev), flags); + dasd_device_set_stop_bits(basedev, + DASD_STOPPED_PENDING); + spin_unlock_irqrestore( + get_ccwdev_lock(basedev->cdev), flags); dasd_block_set_timer(block, HZ/2); break; } @@ -1812,7 +1906,7 @@ restart: cqr->status = DASD_CQR_FILLED; cqr->retries = 255; spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags); - base->stopped |= DASD_STOPPED_QUIESCE; + dasd_device_set_stop_bits(base, DASD_STOPPED_QUIESCE); spin_unlock_irqrestore(get_ccwdev_lock(base->cdev), flags); goto restart; @@ -1820,7 +1914,7 @@ restart: /* Process finished ERP request. */ if (cqr->refers) { - __dasd_block_process_erp(block, cqr); + __dasd_process_erp(base, cqr); goto restart; } @@ -1951,7 +2045,7 @@ restart_cb: /* Process finished ERP request. 
*/ if (cqr->refers) { spin_lock_bh(&block->queue_lock); - __dasd_block_process_erp(block, cqr); + __dasd_process_erp(block->base, cqr); spin_unlock_bh(&block->queue_lock); /* restart list_for_xx loop since dasd_process_erp * might remove multiple elements */ @@ -2410,16 +2504,16 @@ int dasd_generic_notify(struct ccw_device *cdev, int event) cqr->status = DASD_CQR_QUEUED; cqr->retries++; } - device->stopped |= DASD_STOPPED_DC_WAIT; + dasd_device_set_stop_bits(device, DASD_STOPPED_DC_WAIT); dasd_device_clear_timer(device); dasd_schedule_device_bh(device); ret = 1; break; case CIO_OPER: /* FIXME: add a sanity check. */ - device->stopped &= ~DASD_STOPPED_DC_WAIT; + dasd_device_remove_stop_bits(device, DASD_STOPPED_DC_WAIT); if (device->stopped & DASD_UNRESUMED_PM) { - device->stopped &= ~DASD_UNRESUMED_PM; + dasd_device_remove_stop_bits(device, DASD_UNRESUMED_PM); dasd_restore_device(device); ret = 1; break; @@ -2444,7 +2538,7 @@ int dasd_generic_pm_freeze(struct ccw_device *cdev) if (IS_ERR(device)) return PTR_ERR(device); /* disallow new I/O */ - device->stopped |= DASD_STOPPED_PM; + dasd_device_set_stop_bits(device, DASD_STOPPED_PM); /* clear active requests */ INIT_LIST_HEAD(&freeze_queue); spin_lock_irq(get_ccwdev_lock(cdev)); @@ -2496,14 +2590,18 @@ int dasd_generic_restore_device(struct ccw_device *cdev) return PTR_ERR(device); /* allow new IO again */ - device->stopped &= ~DASD_STOPPED_PM; - device->stopped &= ~DASD_UNRESUMED_PM; + dasd_device_remove_stop_bits(device, + (DASD_STOPPED_PM | DASD_UNRESUMED_PM)); dasd_schedule_device_bh(device); - if (device->discipline->restore) + /* + * call discipline restore function + * if device is stopped do nothing e.g. 
for disconnected devices + */ + if (device->discipline->restore && !(device->stopped)) rc = device->discipline->restore(device); - if (rc) + if (rc || device->stopped) /* * if the resume failed for the DASD we put it in * an UNRESUMED stop state @@ -2553,8 +2651,7 @@ static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device, cqr->startdev = device; cqr->memdev = device; cqr->expires = 10*HZ; - clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); - cqr->retries = 2; + cqr->retries = 256; cqr->buildclk = get_clock(); cqr->status = DASD_CQR_FILLED; return cqr; |