Import 2.3.18pre1
[davej-history.git] / drivers / scsi / scsi_error.c
blob30b1d77bfd70ce900776f251e613717e7aad57e0
1 /*
2 * scsi_error.c Copyright (C) 1997 Eric Youngdale
4 * SCSI error/timeout handling
5 * Initial versions: Eric Youngdale. Based upon conversations with
6 * Leonard Zubkoff and David Miller at Linux Expo,
7 * ideas originating from all over the place.
9 */
11 #define __NO_VERSION__
12 #include <linux/module.h>
14 #include <linux/sched.h>
15 #include <linux/timer.h>
16 #include <linux/string.h>
17 #include <linux/malloc.h>
18 #include <linux/ioport.h>
19 #include <linux/kernel.h>
20 #include <linux/stat.h>
21 #include <linux/blk.h>
22 #include <linux/interrupt.h>
23 #include <linux/delay.h>
24 #include <linux/smp_lock.h>
26 #define __KERNEL_SYSCALLS__
28 #include <linux/unistd.h>
30 #include <asm/system.h>
31 #include <asm/irq.h>
32 #include <asm/dma.h>
34 #include "scsi.h"
35 #include "hosts.h"
36 #include "constants.h"
/*
 * SHUTDOWN_SIGS - signal mask built from SIGKILL, SIGINT and SIGTERM when
 * this file is compiled as part of a module, and an empty mask otherwise.
 * (Presumably the signals the error handler thread reacts to; the user of
 * this macro is outside this part of the file - confirm.)
 */
#ifndef MODULE
#define SHUTDOWN_SIGS	(0UL)
#else
#define SHUTDOWN_SIGS	(sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
#endif
/*
 * Timeouts (in jiffies) used for commands issued during error recovery.
 * A DEBUG build maps all three onto SCSI_TIMEOUT; a normal build uses
 * fixed multiples of HZ.
 */
#ifndef DEBUG
#define SENSE_TIMEOUT (10*HZ)
#define RESET_TIMEOUT (2*HZ)
#define ABORT_TIMEOUT (15*HZ)
#else
#define SENSE_TIMEOUT SCSI_TIMEOUT
#define ABORT_TIMEOUT SCSI_TIMEOUT
#define RESET_TIMEOUT SCSI_TIMEOUT
#endif

/*
 * STATIC expands to nothing, so functions declared with it keep external
 * linkage.
 */
#define STATIC
/*
 * Settle times (in jiffies) slept after a bus reset and a host reset.
 *
 * These should *probably* be handled by the host itself.
 * Since it is allowed to sleep, it probably should.
 *
 * The replacement lists are parenthesized so that the macros behave as a
 * single value inside any larger expression (e.g. x / BUS_RESET_SETTLE_TIME).
 */
#define BUS_RESET_SETTLE_TIME   (5*HZ)
#define HOST_RESET_SETTLE_TIME  (10*HZ)
64 static const char RCSid[] = "$Header: /mnt/ide/home/eric/CVSROOT/linux/drivers/scsi/scsi_error.c,v 1.10 1997/12/08 04:50:35 eric Exp $";
66 STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt);
67 STATIC int scsi_request_sense(Scsi_Cmnd *);
68 STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout);
69 STATIC int scsi_try_to_abort_command(Scsi_Cmnd *, int);
70 STATIC int scsi_test_unit_ready(Scsi_Cmnd *);
71 STATIC int scsi_try_bus_device_reset(Scsi_Cmnd *, int timeout);
72 STATIC int scsi_try_bus_reset(Scsi_Cmnd *);
73 STATIC int scsi_try_host_reset(Scsi_Cmnd *);
74 STATIC int scsi_unit_is_ready(Scsi_Cmnd *);
75 STATIC void scsi_eh_action_done(Scsi_Cmnd *, int);
76 STATIC int scsi_eh_retry_command(Scsi_Cmnd *);
77 STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt);
78 STATIC void scsi_restart_operations(struct Scsi_Host *);
79 STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt);
83 * Function: scsi_add_timer()
85 * Purpose: Start timeout timer for a single scsi command.
87 * Arguments: SCset - command that is about to start running.
88 * timeout - amount of time to allow this command to run.
89 * complete - timeout function to call if timer isn't
90 * canceled.
92 * Returns: Nothing
94 * Notes: This should be turned into an inline function.
96 * More Notes: Each scsi command has it's own timer, and as it is added to
97 * the queue, we set up the timer. When the command completes,
98 * we cancel the timer. Pretty simple, really, especially
99 * compared to the old way of handling this crap.
101 void scsi_add_timer(Scsi_Cmnd * SCset,
102 int timeout,
103 void (*complete) (Scsi_Cmnd *))
107 * If the clock was already running for this command, then
108 * first delete the timer. The timer handling code gets rather
109 * confused if we don't do this.
111 if (SCset->eh_timeout.function != NULL) {
112 del_timer(&SCset->eh_timeout);
114 SCset->eh_timeout.data = (unsigned long) SCset;
115 SCset->eh_timeout.expires = jiffies + timeout;
116 SCset->eh_timeout.function = (void (*)(unsigned long)) complete;
118 SCSI_LOG_ERROR_RECOVERY(5, printk("Adding timer for command %p at %d (%p)\n", SCset, timeout, complete));
120 add_timer(&SCset->eh_timeout);
125 * Function: scsi_delete_timer()
127 * Purpose: Delete/cancel timer for a given function.
129 * Arguments: SCset - command that we are canceling timer for.
131 * Returns: Amount of time remaining before command would have timed out.
133 * Notes: This should be turned into an inline function.
135 int scsi_delete_timer(Scsi_Cmnd * SCset)
137 int rtn;
139 rtn = jiffies - SCset->eh_timeout.expires;
140 del_timer(&SCset->eh_timeout);
142 SCSI_LOG_ERROR_RECOVERY(5, printk("Clearing timer for command %p\n", SCset));
144 SCset->eh_timeout.data = (unsigned long) NULL;
145 SCset->eh_timeout.function = NULL;
147 return rtn;
151 * Function: scsi_times_out()
153 * Purpose: Timeout function for normal scsi commands..
155 * Arguments: SCpnt - command that is timing out.
157 * Returns: Nothing.
159 * Notes:
161 static void do_scsi_times_out(Scsi_Cmnd * SCpnt)
165 * Notify the low-level code that this operation failed and we are
166 * reposessing the command.
168 #ifdef ERIC_neverdef
170 * FIXME(eric)
171 * Allow the host adapter to push a queue ordering tag
172 * out to the bus to force the command in question to complete.
173 * If the host wants to do this, then we just restart the timer
174 * for the command. Before we really do this, some real thought
175 * as to the optimum way to handle this should be done. We *do*
176 * need to force ordering every so often to ensure that all requests
177 * do eventually complete, but I am not sure if this is the best way
178 * to actually go about it.
180 * Better yet, force a sync here, but don't block since we are in an
181 * interrupt.
183 if (SCpnt->host->hostt->eh_ordered_queue_tag) {
184 if ((*SCpnt->host->hostt->eh_ordered_queue_tag) (SCpnt)) {
185 scsi_add_timer(SCpnt, SCpnt->internal_timeout,
186 scsi_times_out);
187 return;
191 * FIXME(eric) - add a second special interface to handle this
192 * case. Ideally that interface can also be used to request
193 * a queu
195 if (SCpnt->host->can_queue) {
196 SCpnt->host->hostt->queuecommand(SCpnt, NULL);
198 #endif
200 /* Set the serial_number_at_timeout to the current serial_number */
201 SCpnt->serial_number_at_timeout = SCpnt->serial_number;
203 SCpnt->eh_state = FAILED;
204 SCpnt->state = SCSI_STATE_TIMEOUT;
205 SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
207 SCpnt->host->in_recovery = 1;
208 SCpnt->host->host_failed++;
210 SCSI_LOG_TIMEOUT(3, printk("Command timed out active=%d busy=%d failed=%d\n",
211 atomic_read(&SCpnt->host->host_active),
212 SCpnt->host->host_busy,
213 SCpnt->host->host_failed));
216 * If the host is having troubles, then look to see if this was the last
217 * command that might have failed. If so, wake up the error handler.
219 if (SCpnt->host->host_busy == SCpnt->host->host_failed) {
220 up(SCpnt->host->eh_wait);
224 void scsi_times_out(Scsi_Cmnd * SCpnt)
226 unsigned long flags;
228 spin_lock_irqsave(&io_request_lock, flags);
229 do_scsi_times_out(SCpnt);
230 spin_unlock_irqrestore(&io_request_lock, flags);
234 * Function scsi_block_when_processing_errors
236 * Purpose: Prevent more commands from being queued while error recovery
237 * is taking place.
239 * Arguments: SDpnt - device on which we are performing recovery.
241 * Returns: FALSE The device was taken offline by error recovery.
242 * TRUE OK to proceed.
244 * Notes: We block until the host is out of error recovery, and then
245 * check to see whether the host or the device is offline.
247 int scsi_block_when_processing_errors(Scsi_Device * SDpnt)
250 SCSI_SLEEP(&SDpnt->host->host_wait, SDpnt->host->in_recovery);
252 SCSI_LOG_ERROR_RECOVERY(5, printk("Open returning %d\n", SDpnt->online));
254 return SDpnt->online;
258 * Function: scsi_eh_times_out()
260 * Purpose: Timeout function for error handling.
262 * Arguments: SCpnt - command that is timing out.
264 * Returns: Nothing.
266 * Notes: During error handling, the kernel thread will be sleeping
267 * waiting for some action to complete on the device. Our only
268 * job is to record that it timed out, and to wake up the
269 * thread.
271 STATIC
272 void scsi_eh_times_out(Scsi_Cmnd * SCpnt)
274 unsigned long flags;
275 int rtn = FAILED;
277 spin_lock_irqsave(&io_request_lock, flags);
279 SCpnt->eh_state = SCSI_STATE_TIMEOUT;
280 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
283 * As far as the low level driver is concerned, this command is still
284 * active, so we must give the low level driver a chance to abort it. (DB)
286 if (SCpnt->host->hostt->eh_abort_handler)
287 rtn = SCpnt->host->hostt->eh_abort_handler(SCpnt);
289 SCpnt->request.rq_status = RQ_SCSI_DONE;
290 SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
292 SCSI_LOG_ERROR_RECOVERY(5, printk("In scsi_eh_times_out %p\n", SCpnt));
294 if (SCpnt->host->eh_action != NULL)
295 up(SCpnt->host->eh_action);
296 else
297 printk("Missing scsi error handler thread\n");
298 spin_unlock_irqrestore(&io_request_lock, flags);
303 * Function: scsi_eh_done()
305 * Purpose: Completion function for error handling.
307 * Arguments: SCpnt - command that is timing out.
309 * Returns: Nothing.
311 * Notes: During error handling, the kernel thread will be sleeping
312 * waiting for some action to complete on the device. Our only
313 * job is to record that the action completed, and to wake up the
314 * thread.
316 STATIC
317 void scsi_eh_done(Scsi_Cmnd * SCpnt)
319 SCpnt->request.rq_status = RQ_SCSI_DONE;
321 SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
322 SCpnt->eh_state = SUCCESS;
324 SCSI_LOG_ERROR_RECOVERY(5, printk("In eh_done %p result:%x\n", SCpnt,
325 SCpnt->result));
327 if (SCpnt->host->eh_action != NULL)
328 up(SCpnt->host->eh_action);
332 * Function: scsi_eh_action_done()
334 * Purpose: Completion function for error handling.
336 * Arguments: SCpnt - command that is timing out.
337 * answer - boolean that indicates whether operation succeeded.
339 * Returns: Nothing.
341 * Notes: This callback is only used for abort and reset operations.
343 STATIC
344 void scsi_eh_action_done(Scsi_Cmnd * SCpnt, int answer)
346 SCpnt->request.rq_status = RQ_SCSI_DONE;
348 SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
349 SCpnt->eh_state = (answer ? SUCCESS : FAILED);
351 if (SCpnt->host->eh_action != NULL)
352 up(SCpnt->host->eh_action);
356 * Function: scsi_sense_valid()
358 * Purpose: Determine whether a host has automatically obtained sense
359 * information or not. If we have it, then give a recommendation
360 * as to what we should do next.
362 int scsi_sense_valid(Scsi_Cmnd * SCpnt)
364 if (((SCpnt->sense_buffer[0] & 0x70) >> 4) != 7) {
365 return FALSE;
367 return TRUE;
371 * Function: scsi_eh_retry_command()
373 * Purpose: Retry the original command
375 * Returns: SUCCESS - we were able to get the sense data.
376 * FAILED - we were not able to get the sense data.
378 * Notes: This function will *NOT* return until the command either
379 * times out, or it completes.
381 STATIC int scsi_eh_retry_command(Scsi_Cmnd * SCpnt)
383 memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
384 sizeof(SCpnt->data_cmnd));
385 SCpnt->request_buffer = SCpnt->buffer;
386 SCpnt->request_bufflen = SCpnt->bufflen;
387 SCpnt->use_sg = SCpnt->old_use_sg;
388 SCpnt->cmd_len = SCpnt->old_cmd_len;
390 scsi_send_eh_cmnd(SCpnt, SCpnt->timeout_per_command);
393 * Hey, we are done. Let's look to see what happened.
395 return SCpnt->eh_state;
399 * Function: scsi_request_sense()
401 * Purpose: Request sense data from a particular target.
403 * Returns: SUCCESS - we were able to get the sense data.
404 * FAILED - we were not able to get the sense data.
406 * Notes: Some hosts automatically obtain this information, others
407 * require that we obtain it on our own.
409 * This function will *NOT* return until the command either
410 * times out, or it completes.
412 STATIC int scsi_request_sense(Scsi_Cmnd * SCpnt)
414 static unsigned char generic_sense[6] =
415 {REQUEST_SENSE, 0, 0, 0, 255, 0};
416 unsigned char scsi_result0[256], *scsi_result = NULL;
419 memcpy((void *) SCpnt->cmnd, (void *) generic_sense,
420 sizeof(generic_sense));
422 SCpnt->cmnd[1] = SCpnt->lun << 5;
424 scsi_result = (!SCpnt->host->hostt->unchecked_isa_dma)
425 ? &scsi_result0[0] : scsi_init_malloc(512, GFP_ATOMIC | GFP_DMA);
427 if (scsi_result == NULL) {
428 printk("cannot allocate scsi_result in scsi_request_sense.\n");
429 return FAILED;
432 * Zero the sense buffer. Some host adapters automatically always request
433 * sense, so it is not a good idea that SCpnt->request_buffer and
434 * SCpnt->sense_buffer point to the same address (DB).
435 * 0 is not a valid sense code.
437 memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
438 memset((void *) scsi_result, 0, 256);
440 SCpnt->request_buffer = scsi_result;
441 SCpnt->request_bufflen = 256;
442 SCpnt->use_sg = 0;
443 SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
445 scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
447 /* Last chance to have valid sense data */
448 if (!scsi_sense_valid(SCpnt))
449 memcpy((void *) SCpnt->sense_buffer,
450 SCpnt->request_buffer,
451 sizeof(SCpnt->sense_buffer));
453 if (scsi_result != &scsi_result0[0] && scsi_result != NULL)
454 scsi_init_free(scsi_result, 512);
457 * When we eventually call scsi_finish, we really wish to complete
458 * the original request, so let's restore the original data. (DB)
460 memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
461 sizeof(SCpnt->data_cmnd));
462 SCpnt->request_buffer = SCpnt->buffer;
463 SCpnt->request_bufflen = SCpnt->bufflen;
464 SCpnt->use_sg = SCpnt->old_use_sg;
465 SCpnt->cmd_len = SCpnt->old_cmd_len;
468 * Hey, we are done. Let's look to see what happened.
470 return SCpnt->eh_state;
474 * Function: scsi_test_unit_ready()
476 * Purpose: Run test unit ready command to see if the device is talking to us or not.
479 STATIC int scsi_test_unit_ready(Scsi_Cmnd * SCpnt)
481 static unsigned char tur_command[6] =
482 {TEST_UNIT_READY, 0, 0, 0, 0, 0};
483 unsigned char scsi_result0[256], *scsi_result = NULL;
485 memcpy((void *) SCpnt->cmnd, (void *) tur_command,
486 sizeof(tur_command));
488 SCpnt->cmnd[1] = SCpnt->lun << 5;
490 scsi_result = (!SCpnt->host->hostt->unchecked_isa_dma)
491 ? &scsi_result0[0] : scsi_init_malloc(512, GFP_ATOMIC | GFP_DMA);
493 if (scsi_result == NULL) {
494 printk("cannot allocate scsi_result in scsi_test_unit_ready.\n");
495 return FAILED;
498 * Zero the sense buffer. Some host adapters automatically always request
499 * sense, so it is not a good idea that SCpnt->request_buffer and
500 * SCpnt->sense_buffer point to the same address (DB).
501 * 0 is not a valid sense code.
503 memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
504 memset((void *) scsi_result, 0, 256);
506 SCpnt->request_buffer = scsi_result;
507 SCpnt->request_bufflen = 256;
508 SCpnt->use_sg = 0;
509 SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
510 scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
512 /* Last chance to have valid sense data */
513 if (!scsi_sense_valid(SCpnt))
514 memcpy((void *) SCpnt->sense_buffer,
515 SCpnt->request_buffer,
516 sizeof(SCpnt->sense_buffer));
518 if (scsi_result != &scsi_result0[0] && scsi_result != NULL)
519 scsi_init_free(scsi_result, 512);
522 * When we eventually call scsi_finish, we really wish to complete
523 * the original request, so let's restore the original data. (DB)
525 memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
526 sizeof(SCpnt->data_cmnd));
527 SCpnt->request_buffer = SCpnt->buffer;
528 SCpnt->request_bufflen = SCpnt->bufflen;
529 SCpnt->use_sg = SCpnt->old_use_sg;
530 SCpnt->cmd_len = SCpnt->old_cmd_len;
533 * Hey, we are done. Let's look to see what happened.
535 return SCpnt->eh_state;
539 * This would normally need to get the IO request lock,
540 * but as it doesn't actually touch anything that needs
541 * to be locked we can avoid the lock here..
543 STATIC
544 void scsi_sleep_done(struct semaphore *sem)
546 if (sem != NULL) {
547 up(sem);
552 void scsi_sleep(int timeout)
554 DECLARE_MUTEX_LOCKED(sem);
555 struct timer_list timer;
557 init_timer(&timer);
558 timer.data = (unsigned long) &sem;
559 timer.expires = jiffies + timeout;
560 timer.function = (void (*)(unsigned long)) scsi_sleep_done;
562 SCSI_LOG_ERROR_RECOVERY(5, printk("Sleeping for timer tics %d\n", timeout));
564 add_timer(&timer);
566 spin_unlock_irq(&io_request_lock);
567 down(&sem);
568 spin_lock_irq(&io_request_lock);
570 del_timer(&timer);
574 * Function: scsi_send_eh_cmnd
576 * Purpose: Send a command out to a device as part of error recovery.
578 * Notes: The initialization of the structures is quite a bit different
579 * in this case, and furthermore, there is a different completion
580 * handler.
582 STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout)
584 struct Scsi_Host *host;
586 host = SCpnt->host;
588 retry:
590 * We will use a queued command if possible, otherwise we will emulate the
591 * queuing and calling of completion function ourselves.
593 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
595 if (host->can_queue) {
596 DECLARE_MUTEX_LOCKED(sem);
598 SCpnt->eh_state = SCSI_STATE_QUEUED;
600 scsi_add_timer(SCpnt, timeout, scsi_eh_times_out);
603 * Set up the semaphore so we wait for the command to complete.
605 SCpnt->host->eh_action = &sem;
606 SCpnt->request.rq_status = RQ_SCSI_BUSY;
608 host->hostt->queuecommand(SCpnt, scsi_eh_done);
609 spin_unlock_irq(&io_request_lock);
610 down(&sem);
611 spin_lock_irq(&io_request_lock);
613 SCpnt->host->eh_action = NULL;
615 del_timer(&SCpnt->eh_timeout);
618 * See if timeout. If so, tell the host to forget about it.
619 * In other words, we don't want a callback any more.
621 if (SCpnt->eh_state == SCSI_STATE_TIMEOUT) {
622 SCpnt->eh_state = FAILED;
624 SCSI_LOG_ERROR_RECOVERY(5, printk("send_eh_cmnd: %p eh_state:%x\n",
625 SCpnt, SCpnt->eh_state));
626 } else {
627 int temp;
630 * We damn well had better never use this code. There is no timeout
631 * protection here, since we would end up waiting in the actual low
632 * level driver, we don't know how to wake it up.
634 temp = host->hostt->command(SCpnt);
635 SCpnt->result = temp;
636 if (scsi_eh_completed_normally(SCpnt)) {
637 SCpnt->eh_state = SUCCESS;
638 } else {
639 SCpnt->eh_state = FAILED;
644 * Now examine the actual status codes to see whether the command actually
645 * did complete normally.
647 if (SCpnt->eh_state == SUCCESS) {
648 switch (scsi_eh_completed_normally(SCpnt)) {
649 case SUCCESS:
650 SCpnt->eh_state = SUCCESS;
651 break;
652 case NEEDS_RETRY:
653 goto retry;
654 case FAILED:
655 default:
656 SCpnt->eh_state = FAILED;
657 break;
659 } else {
660 SCpnt->eh_state = FAILED;
665 * Function: scsi_unit_is_ready()
667 * Purpose: Called after TEST_UNIT_READY is run, to test to see if
668 * the unit responded in a way that indicates it is ready.
670 STATIC int scsi_unit_is_ready(Scsi_Cmnd * SCpnt)
672 if (SCpnt->result) {
673 if (((driver_byte(SCpnt->result) & DRIVER_SENSE) ||
674 (status_byte(SCpnt->result) & CHECK_CONDITION)) &&
675 ((SCpnt->sense_buffer[0] & 0x70) >> 4) == 7) {
676 if (((SCpnt->sense_buffer[2] & 0xf) != NOT_READY) &&
677 ((SCpnt->sense_buffer[2] & 0xf) != UNIT_ATTENTION) &&
678 ((SCpnt->sense_buffer[2] & 0xf) != ILLEGAL_REQUEST)) {
679 return 0;
683 return 1;
687 * Function: scsi_eh_finish_command
689 * Purpose: Handle a command that we are finished with WRT error handling.
691 * Arguments: SClist - pointer to list into which we are putting completed commands.
692 * SCpnt - command that is completing
694 * Notes: We don't want to use the normal command completion while we are
695 * are still handling errors - it may cause other commands to be queued,
696 * and that would disturb what we are doing. Thus we really want to keep
697 * a list of pending commands for final completion, and once we
698 * are ready to leave error handling we handle completion for real.
700 STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt)
702 SCpnt->state = SCSI_STATE_BHQUEUE;
703 SCpnt->bh_next = *SClist;
705 * Set this back so that the upper level can correctly free up
706 * things.
708 SCpnt->use_sg = SCpnt->old_use_sg;
709 *SClist = SCpnt;
713 * Function: scsi_try_to_abort_command
715 * Purpose: Ask host adapter to abort a running command.
717 * Returns: FAILED Operation failed or not supported.
718 * SUCCESS Succeeded.
720 * Notes: This function will not return until the user's completion
721 * function has been called. There is no timeout on this
722 * operation. If the author of the low-level driver wishes
723 * this operation to be timed, they can provide this facility
724 * themselves. Helper functions in scsi_error.c can be supplied
725 * to make this easier to do.
727 * Notes: It may be possible to combine this with all of the reset
728 * handling to eliminate a lot of code duplication. I don't
729 * know what makes more sense at the moment - this is just a
730 * prototype.
732 STATIC int scsi_try_to_abort_command(Scsi_Cmnd * SCpnt, int timeout)
734 SCpnt->eh_state = FAILED; /* Until we come up with something better */
736 if (SCpnt->host->hostt->eh_abort_handler == NULL) {
737 return FAILED;
740 * scsi_done was called just after the command timed out and before
741 * we had a chance to process it. (DB)
743 if (SCpnt->serial_number == 0)
744 return SUCCESS;
746 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
748 return SCpnt->host->hostt->eh_abort_handler(SCpnt);
752 * Function: scsi_try_bus_device_reset
754 * Purpose: Ask host adapter to perform a bus device reset for a given
755 * device.
757 * Returns: FAILED Operation failed or not supported.
758 * SUCCESS Succeeded.
760 * Notes: There is no timeout for this operation. If this operation is
761 * unreliable for a given host, then the host itself needs to put a
762 * timer on it, and set the host back to a consistent state prior
763 * to returning.
765 STATIC int scsi_try_bus_device_reset(Scsi_Cmnd * SCpnt, int timeout)
767 int rtn;
769 SCpnt->eh_state = FAILED; /* Until we come up with something better */
771 if (SCpnt->host->hostt->eh_device_reset_handler == NULL) {
772 return FAILED;
774 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
776 rtn = SCpnt->host->hostt->eh_device_reset_handler(SCpnt);
778 if (rtn == SUCCESS)
779 SCpnt->eh_state = SUCCESS;
781 return SCpnt->eh_state;
785 * Function: scsi_try_bus_reset
787 * Purpose: Ask host adapter to perform a bus reset for a host.
789 * Returns: FAILED Operation failed or not supported.
790 * SUCCESS Succeeded.
792 * Notes:
794 STATIC int scsi_try_bus_reset(Scsi_Cmnd * SCpnt)
796 int rtn;
798 SCpnt->eh_state = FAILED; /* Until we come up with something better */
799 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
800 SCpnt->serial_number_at_timeout = SCpnt->serial_number;
802 if (SCpnt->host->hostt->eh_bus_reset_handler == NULL) {
803 return FAILED;
805 rtn = SCpnt->host->hostt->eh_bus_reset_handler(SCpnt);
807 if (rtn == SUCCESS)
808 SCpnt->eh_state = SUCCESS;
811 * If we had a successful bus reset, mark the command blocks to expect
812 * a condition code of unit attention.
814 scsi_sleep(BUS_RESET_SETTLE_TIME);
815 if (SCpnt->eh_state == SUCCESS) {
816 Scsi_Device *SDloop;
817 for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next) {
818 if (SCpnt->channel == SDloop->channel) {
819 SDloop->was_reset = 1;
820 SDloop->expecting_cc_ua = 1;
824 return SCpnt->eh_state;
828 * Function: scsi_try_host_reset
830 * Purpose: Ask host adapter to reset itself, and the bus.
832 * Returns: FAILED Operation failed or not supported.
833 * SUCCESS Succeeded.
835 * Notes:
837 STATIC int scsi_try_host_reset(Scsi_Cmnd * SCpnt)
839 int rtn;
841 SCpnt->eh_state = FAILED; /* Until we come up with something better */
842 SCpnt->owner = SCSI_OWNER_LOWLEVEL;
843 SCpnt->serial_number_at_timeout = SCpnt->serial_number;
845 if (SCpnt->host->hostt->eh_host_reset_handler == NULL) {
846 return FAILED;
848 rtn = SCpnt->host->hostt->eh_host_reset_handler(SCpnt);
850 if (rtn == SUCCESS)
851 SCpnt->eh_state = SUCCESS;
854 * If we had a successful host reset, mark the command blocks to expect
855 * a condition code of unit attention.
857 scsi_sleep(HOST_RESET_SETTLE_TIME);
858 if (SCpnt->eh_state == SUCCESS) {
859 Scsi_Device *SDloop;
860 for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next) {
861 SDloop->was_reset = 1;
862 SDloop->expecting_cc_ua = 1;
865 return SCpnt->eh_state;
869 * Function: scsi_decide_disposition
871 * Purpose: Examine a command block that has come back from the low-level
872 * and figure out what to do next.
874 * Returns: SUCCESS - pass on to upper level.
875 * FAILED - pass on to error handler thread.
876 * RETRY - command should be retried.
877 * SOFTERR - command succeeded, but we need to log
878 * a soft error.
880 * Notes: This is *ONLY* called when we are examining the status
881 * after sending out the actual data command. Any commands
882 * that are queued for error recovery (i.e. TEST_UNIT_READY)
883 * do *NOT* come through here.
885 * NOTE - When this routine returns FAILED, it means the error
886 * handler thread is woken. In cases where the error code
887 * indicates an error that doesn't require the error handler
888 * thread (i.e. we don't need to abort/reset), then this function
889 * should return SUCCESS.
891 int scsi_decide_disposition(Scsi_Cmnd * SCpnt)
893 int rtn;
896 * If the device is offline, then we clearly just pass the result back
897 * up to the top level.
899 if (SCpnt->device->online == FALSE) {
900 SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: device offline - report as SUCCESS\n"));
901 return SUCCESS;
904 * First check the host byte, to see if there is anything in there
905 * that would indicate what we need to do.
908 switch (host_byte(SCpnt->result)) {
909 case DID_PASSTHROUGH:
911 * No matter what, pass this through to the upper layer.
912 * Nuke this special code so that it looks like we are saying
913 * DID_OK.
915 SCpnt->result &= 0xff00ffff;
916 return SUCCESS;
917 case DID_OK:
919 * Looks good. Drop through, and check the next byte.
921 break;
922 case DID_NO_CONNECT:
923 case DID_BAD_TARGET:
924 case DID_ABORT:
926 * Note - this means that we just report the status back to the
927 * top level driver, not that we actually think that it indicates
928 * success.
930 return SUCCESS;
932 * When the low level driver returns DID_SOFT_ERROR,
933 * it is responsible for keeping an internal retry counter
934 * in order to avoid endless loops (DB)
936 case DID_SOFT_ERROR:
937 return NEEDS_RETRY;
939 case DID_BUS_BUSY:
940 case DID_PARITY:
941 case DID_ERROR:
942 goto maybe_retry;
943 case DID_TIME_OUT:
945 * When we scan the bus, we get timeout messages for
946 * these commands if there is no device available.
947 * Other hosts report DID_NO_CONNECT for the same thing.
949 if ((SCpnt->cmnd[0] == TEST_UNIT_READY ||
950 SCpnt->cmnd[0] == INQUIRY)) {
951 return SUCCESS;
952 } else {
953 return FAILED;
955 case DID_RESET:
957 * In the normal case where we haven't initiated a reset, this is
958 * a failure.
960 if (SCpnt->flags & IS_RESETTING) {
961 SCpnt->flags &= ~IS_RESETTING;
962 goto maybe_retry;
965 * Examine the sense data to figure out how to proceed from here.
966 * If there is no sense data, we will be forced into the error
967 * handler thread, where we get to examine the thing in a lot more
968 * detail.
970 return scsi_check_sense(SCpnt);
971 default:
972 return FAILED;
976 * Next, check the message byte.
978 if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
979 return FAILED;
982 * Now, check the status byte to see if this indicates anything special.
984 switch (status_byte(SCpnt->result)) {
985 case QUEUE_FULL:
987 * The case of trying to send too many commands to a tagged queueing
988 * device.
990 return ADD_TO_MLQUEUE;
991 case GOOD:
992 case COMMAND_TERMINATED:
993 return SUCCESS;
994 case CHECK_CONDITION:
995 rtn = scsi_check_sense(SCpnt);
996 if (rtn == NEEDS_RETRY) {
997 goto maybe_retry;
999 return rtn;
1000 case CONDITION_GOOD:
1001 case INTERMEDIATE_GOOD:
1002 case INTERMEDIATE_C_GOOD:
1004 * Who knows? FIXME(eric)
1006 return SUCCESS;
1007 case BUSY:
1008 case RESERVATION_CONFLICT:
1009 goto maybe_retry;
1010 default:
1011 return FAILED;
1013 return FAILED;
1015 maybe_retry:
1017 if ((++SCpnt->retries) < SCpnt->allowed) {
1018 return NEEDS_RETRY;
1019 } else {
1020 return FAILED;
1025 * Function: scsi_eh_completed_normally
1027 * Purpose: Examine a command block that has come back from the low-level
1028 * and figure out what to do next.
1030 * Returns: SUCCESS - pass on to upper level.
1031 * FAILED - pass on to error handler thread.
1032 * RETRY - command should be retried.
1033 * SOFTERR - command succeeded, but we need to log
1034 * a soft error.
1036 * Notes: This is *ONLY* called when we are examining the status
1037 * of commands queued during error recovery. The main
1038 * difference here is that we don't allow for the possibility
1039 * of retries here, and we are a lot more restrictive about what
1040 * we consider acceptable.
1042 STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt)
1044 int rtn;
1046 * First check the host byte, to see if there is anything in there
1047 * that would indicate what we need to do.
1049 if (host_byte(SCpnt->result) == DID_RESET) {
1050 if (SCpnt->flags & IS_RESETTING) {
1052 * OK, this is normal. We don't know whether in fact the
1053 * command in question really needs to be rerun or not -
1054 * if this was the original data command then the answer is yes,
1055 * otherwise we just flag it as success.
1057 SCpnt->flags &= ~IS_RESETTING;
1058 return NEEDS_RETRY;
1061 * Rats. We are already in the error handler, so we now get to try
1062 * and figure out what to do next. If the sense is valid, we have
1063 * a pretty good idea of what to do. If not, we mark it as failed.
1065 return scsi_check_sense(SCpnt);
1067 if (host_byte(SCpnt->result) != DID_OK) {
1068 return FAILED;
1071 * Next, check the message byte.
1073 if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
1074 return FAILED;
1077 * Now, check the status byte to see if this indicates anything special.
1079 switch (status_byte(SCpnt->result)) {
1080 case GOOD:
1081 case COMMAND_TERMINATED:
1082 return SUCCESS;
1083 case CHECK_CONDITION:
1084 rtn = scsi_check_sense(SCpnt);
1085 if (rtn == NEEDS_RETRY) {
1086 return FAILED;
1088 return rtn;
1089 case CONDITION_GOOD:
1090 case INTERMEDIATE_GOOD:
1091 case INTERMEDIATE_C_GOOD:
1093 * Who knows? FIXME(eric)
1095 return SUCCESS;
1096 case BUSY:
1097 case QUEUE_FULL:
1098 case RESERVATION_CONFLICT:
1099 default:
1100 return FAILED;
1102 return FAILED;
1106 * Function: scsi_check_sense
1108 * Purpose: Examine sense information - give suggestion as to what
1109 * we should do with it.
1111 STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt)
1113 if (!scsi_sense_valid(SCpnt)) {
1114 return FAILED;
1116 if (SCpnt->sense_buffer[2] & 0xe0)
1117 return SUCCESS;
1119 switch (SCpnt->sense_buffer[2] & 0xf) {
1120 case NO_SENSE:
1121 return SUCCESS;
1122 case RECOVERED_ERROR:
1123 return /* SOFT_ERROR */ SUCCESS;
1125 case ABORTED_COMMAND:
1126 return NEEDS_RETRY;
1127 case NOT_READY:
1128 case UNIT_ATTENTION:
1130 * If we are expecting a CC/UA because of a bus reset that we
1131 * performed, treat this just as a retry. Otherwise this is
1132 * information that we should pass up to the upper-level driver
1133 * so that we can deal with it there.
1135 if (SCpnt->device->expecting_cc_ua) {
1136 SCpnt->device->expecting_cc_ua = 0;
1137 return NEEDS_RETRY;
1139 return SUCCESS;
1141 /* these three are not supported */
1142 case COPY_ABORTED:
1143 case VOLUME_OVERFLOW:
1144 case MISCOMPARE:
1145 return SUCCESS;
1147 case MEDIUM_ERROR:
1148 return NEEDS_RETRY;
1150 case ILLEGAL_REQUEST:
1151 case BLANK_CHECK:
1152 case DATA_PROTECT:
1153 case HARDWARE_ERROR:
1154 default:
1155 return SUCCESS;
1161 * Function: scsi_restart_operations
1163 * Purpose: Restart IO operations to the specified host.
1165 * Arguments: host - host that we are restarting
1167 * Returns: Nothing
1169 * Notes: When we entered the error handler, we blocked all further
1170 * I/O to this device. We need to 'reverse' this process.
1172 STATIC void scsi_restart_operations(struct Scsi_Host *host)
1174 Scsi_Device *SDpnt;
1177 * Next free up anything directly waiting upon the host. This will be
1178 * requests for character device operations, and also for ioctls to queued
1179 * block devices.
1181 SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: Waking up host to restart\n"));
1183 wake_up(&host->host_wait);
1186 * Finally, block devices need an extra kick in the pants. This is because
1187 * the request queueing mechanism may have queued lots of pending requests
1188 * and there won't be a process waiting in a place where we can simply wake
1189 * it up. Thus we simply go through and call the request function to goose
1190 * the various top level drivers and get things moving again.
1192 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1193 SCSI_LOG_ERROR_RECOVERY(5, printk("Calling request function to restart things...\n"));
1195 if (SDpnt->scsi_request_fn != NULL)
1196 (*SDpnt->scsi_request_fn) ();
1201 * Function: scsi_unjam_host
1203 * Purpose: Attempt to fix a host which has a command that failed for
1204 * some reason.
1206 * Arguments: host - host that needs unjamming.
1208 * Returns: Nothing
1210 * Notes: When we come in here, we *know* that all commands on the
1211 * bus have either completed, failed or timed out. We also
1212 * know that no further commands are being sent to the host,
1213 * so things are relatively quiet and we have freedom to
1214 * fiddle with things as we wish.
1216 * Additional note: This is only the *default* implementation. It is possible
1217 * for individual drivers to supply their own version of this
1218 * function, and if the maintainer wishes to do this, it is
1219 * strongly suggested that this function be taken as a template
1220 * and modified. This function was designed to correctly handle
1221 * problems for about 95% of the different cases out there, and
1222 * it should always provide at least a reasonable amount of error
1223 * recovery.
1225 * Note3: Any command marked 'FAILED' or 'TIMEOUT' must eventually
1226 * have scsi_finish_command() called for it. We do all of
1227 * the retry stuff here, so when we restart the host after we
1228 * return it should have an empty queue.
1230 STATIC int scsi_unjam_host(struct Scsi_Host *host)
1232 int devices_failed;
1233 int numfailed;
1234 int ourrtn;
1235 int rtn = FALSE;
1236 int result;
1237 Scsi_Cmnd *SCloop;
1238 Scsi_Cmnd *SCpnt;
1239 Scsi_Device *SDpnt;
1240 Scsi_Device *SDloop;
1241 Scsi_Cmnd *SCdone;
1242 int timed_out;
1244 SCdone = NULL;
1247 * First, protect against any sort of race condition. If any of the outstanding
1248 * commands are in states that indicate that we are not yet blocked (i.e. we are
1249 * not in a quiet state) then we got woken up in error. If we ever end up here,
1250 * we need to re-examine some of the assumptions.
1252 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1253 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1254 if (SCpnt->state == SCSI_STATE_FAILED
1255 || SCpnt->state == SCSI_STATE_TIMEOUT
1256 || SCpnt->state == SCSI_STATE_INITIALIZING
1257 || SCpnt->state == SCSI_STATE_UNUSED) {
1258 continue;
1261 * Rats. Something is still floating around out there. This could
1262 * be the result of the fact that the upper level drivers are still frobbing
1263 * commands that might have succeeded. There are two outcomes. One is that
1264 * the command block will eventually be freed, and the other one is that
1265 * the command will be queued and will be finished along the way.
1267 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler prematurely woken - commands still active (%p %x %d)\n", SCpnt, SCpnt->state, SCpnt->target));
1270 * panic("SCSI Error handler woken too early\n");
1272 * This is no longer a problem, since now the code cares only about
1273 * SCSI_STATE_TIMEOUT and SCSI_STATE_FAILED.
1274 * Other states are useful only to release active commands when devices are
1275 * set offline. If (host->host_active == host->host_busy) we can safely assume
1276 * that there are no commands in state other then TIMEOUT od FAILED. (DB)
1278 * FIXME:
1279 * It is not easy to release correctly commands according to their state when
1280 * devices are set offline, when the state is neither TIMEOUT nor FAILED.
1281 * When a device is set offline, we can have some command with
1282 * rq_status=RQ_SCSY_BUSY, owner=SCSI_STATE_HIGHLEVEL,
1283 * state=SCSI_STATE_INITIALIZING and the driver module cannot be released.
1284 * (DB, 17 May 1998)
1290 * Next, see if we need to request sense information. if so,
1291 * then get it now, so we have a better idea of what to do.
1292 * FIXME(eric) this has the unfortunate side effect that if a host
1293 * adapter does not automatically request sense information, that we end
1294 * up shutting it down before we request it. All hosts should be doing this
1295 * anyways, so for now all I have to say is tough noogies if you end up in here.
1296 * On second thought, this is probably a good idea. We *really* want to give
1297 * authors an incentive to automatically request this.
1299 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we need to request sense\n"));
1301 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1302 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1303 if (SCpnt->state != SCSI_STATE_FAILED || scsi_sense_valid(SCpnt)) {
1304 continue;
1306 SCSI_LOG_ERROR_RECOVERY(2, printk("scsi_unjam_host: Requesting sense for %d\n",
1307 SCpnt->target));
1308 rtn = scsi_request_sense(SCpnt);
1309 if (rtn != SUCCESS) {
1310 continue;
1312 SCSI_LOG_ERROR_RECOVERY(3, printk("Sense requested for %p - result %x\n",
1313 SCpnt, SCpnt->result));
1314 SCSI_LOG_ERROR_RECOVERY(3, print_sense("bh", SCpnt));
1316 result = scsi_decide_disposition(SCpnt);
1319 * If the result was normal, then just pass it along to the
1320 * upper level.
1322 if (result == SUCCESS) {
1323 SCpnt->host->host_failed--;
1324 scsi_eh_finish_command(&SCdone, SCpnt);
1326 if (result != NEEDS_RETRY) {
1327 continue;
1330 * We only come in here if we want to retry a
1331 * command. The test to see whether the command
1332 * should be retried should be keeping track of the
1333 * number of tries, so we don't end up looping, of
1334 * course.
1336 SCpnt->state = NEEDS_RETRY;
1337 rtn = scsi_eh_retry_command(SCpnt);
1338 if (rtn != SUCCESS) {
1339 continue;
1342 * We eventually hand this one back to the top level.
1344 SCpnt->host->host_failed--;
1345 scsi_eh_finish_command(&SCdone, SCpnt);
1350 * Go through the list of commands and figure out where we stand and how bad things
1351 * really are.
1353 numfailed = 0;
1354 timed_out = 0;
1355 devices_failed = 0;
1356 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1357 unsigned int device_error = 0;
1359 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1360 if (SCpnt->state == SCSI_STATE_FAILED) {
1361 SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d failed\n",
1362 SCpnt->target));
1363 numfailed++;
1364 device_error++;
1366 if (SCpnt->state == SCSI_STATE_TIMEOUT) {
1367 SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d timedout\n",
1368 SCpnt->target));
1369 timed_out++;
1370 device_error++;
1373 if (device_error > 0) {
1374 devices_failed++;
1378 SCSI_LOG_ERROR_RECOVERY(2, printk("Total of %d+%d commands on %d devices require eh work\n",
1379 numfailed, timed_out, devices_failed));
1381 if (host->host_failed == 0) {
1382 ourrtn = TRUE;
1383 goto leave;
1386 * Next, try and see whether or not it makes sense to try and abort
1387 * the running command. This only works out to be the case if we have
1388 * one command that has timed out. If the command simply failed, it
1389 * makes no sense to try and abort the command, since as far as the
1390 * host adapter is concerned, it isn't running.
1393 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try abort\n"));
1395 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1396 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1397 if (SCloop->state != SCSI_STATE_TIMEOUT) {
1398 continue;
1400 rtn = scsi_try_to_abort_command(SCloop, ABORT_TIMEOUT);
1401 if (rtn == SUCCESS) {
1402 rtn = scsi_test_unit_ready(SCloop);
1404 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1405 rtn = scsi_eh_retry_command(SCloop);
1407 if (rtn == SUCCESS) {
1408 SCloop->host->host_failed--;
1409 scsi_eh_finish_command(&SCdone, SCloop);
1417 * If we have corrected all of the problems, then we are done.
1419 if (host->host_failed == 0) {
1420 ourrtn = TRUE;
1421 goto leave;
1424 * Either the abort wasn't appropriate, or it didn't succeed.
1425 * Now try a bus device reset. Still, look to see whether we have
1426 * multiple devices that are jammed or not - if we have multiple devices,
1427 * it makes no sense to try BUS_DEVICE_RESET - we really would need
1428 * to try a BUS_RESET instead.
1430 * Does this make sense - should we try BDR on each device individually?
1431 * Yes, definitely.
1433 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try BDR\n"));
1435 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1436 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1437 if (SCloop->state == SCSI_STATE_FAILED
1438 || SCloop->state == SCSI_STATE_TIMEOUT) {
1439 break;
1443 if (SCloop == NULL) {
1444 continue;
1447 * OK, we have a device that is having problems. Try and send
1448 * a bus device reset to it.
1450 * FIXME(eric) - make sure we handle the case where multiple
1451 * commands to the same device have failed. They all must
1452 * get properly restarted.
1454 rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT);
1456 if (rtn == SUCCESS) {
1457 rtn = scsi_test_unit_ready(SCloop);
1459 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1460 rtn = scsi_eh_retry_command(SCloop);
1462 if (rtn == SUCCESS) {
1463 SCloop->host->host_failed--;
1464 scsi_eh_finish_command(&SCdone, SCloop);
1470 if (host->host_failed == 0) {
1471 ourrtn = TRUE;
1472 goto leave;
1475 * If we ended up here, we have serious problems. The only thing left
1476 * to try is a full bus reset. If someone has grabbed the bus and isn't
1477 * letting go, then perhaps this will help.
1479 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n"));
1482 * We really want to loop over the various channels, and do this on
1483 * a channel by channel basis. We should also check to see if any
1484 * of the failed commands are on soft_reset devices, and if so, skip
1485 * the reset.
1487 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1488 next_device:
1489 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1490 if (SCpnt->state != SCSI_STATE_FAILED
1491 && SCpnt->state != SCSI_STATE_TIMEOUT) {
1492 continue;
1495 * We have a failed command. Make sure there are no other failed
1496 * commands on the same channel that are timed out and implement a
1497 * soft reset.
1499 for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1500 for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1501 if (SCloop->channel != SCpnt->channel) {
1502 continue;
1504 if (SCloop->state != SCSI_STATE_FAILED
1505 && SCloop->state != SCSI_STATE_TIMEOUT) {
1506 continue;
1508 if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) {
1510 * If this device uses the soft reset option, and this
1511 * is one of the devices acting up, then our only
1512 * option is to wait a bit, since the command is
1513 * supposedly still running.
1515 * FIXME(eric) - right now we will just end up falling
1516 * through to the 'take device offline' case.
1518 * FIXME(eric) - It is possible that the command completed
1519 * *after* the error recovery procedure started, and if this
1520 * is the case, we are worrying about nothing here.
1524 * Due to the spinlock, we will never get out of this
1525 * loop without a proper wait (DB)
1527 scsi_sleep(1 * HZ);
1529 goto next_device;
1535 * We now know that we are able to perform a reset for the
1536 * bus that SCpnt points to. There are no soft-reset devices
1537 * with outstanding timed out commands.
1539 rtn = scsi_try_bus_reset(SCpnt);
1540 if (rtn == SUCCESS) {
1541 for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1542 for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1543 if (SCloop->channel != SCpnt->channel) {
1544 continue;
1546 if (SCloop->state != SCSI_STATE_FAILED
1547 && SCloop->state != SCSI_STATE_TIMEOUT) {
1548 continue;
1550 rtn = scsi_test_unit_ready(SCloop);
1552 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1553 rtn = scsi_eh_retry_command(SCloop);
1555 if (rtn == SUCCESS) {
1556 SCpnt->host->host_failed--;
1557 scsi_eh_finish_command(&SCdone, SCloop);
1561 * If the bus reset worked, but we are still unable to
1562 * talk to the device, take it offline.
1563 * FIXME(eric) - is this really the correct thing to do?
1565 if (rtn != SUCCESS) {
1566 SCloop->device->online = FALSE;
1567 SCloop->host->host_failed--;
1568 scsi_eh_finish_command(&SCdone, SCloop);
1576 if (host->host_failed == 0) {
1577 ourrtn = TRUE;
1578 goto leave;
1581 * If we ended up here, we have serious problems. The only thing left
1582 * to try is a full host reset - perhaps the firmware on the device
1583 * crashed, or something like that.
1585 * It is assumed that a succesful host reset will cause *all* information
1586 * about the command to be flushed from both the host adapter *and* the
1587 * device.
1589 * FIXME(eric) - it isn't clear that devices that implement the soft reset
1590 * option can ever be cleared except via cycling the power. The problem is
1591 * that sending the host reset command will cause the host to forget
1592 * about the pending command, but the device won't forget. For now, we
1593 * skip the host reset option if any of the failed devices are configured
1594 * to use the soft reset option.
1596 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1597 next_device2:
1598 for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1599 if (SCpnt->state != SCSI_STATE_FAILED
1600 && SCpnt->state != SCSI_STATE_TIMEOUT) {
1601 continue;
1603 if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) {
1605 * If this device uses the soft reset option, and this
1606 * is one of the devices acting up, then our only
1607 * option is to wait a bit, since the command is
1608 * supposedly still running.
1610 * FIXME(eric) - right now we will just end up falling
1611 * through to the 'take device offline' case.
1613 SCSI_LOG_ERROR_RECOVERY(3,
1614 printk("scsi_unjam_host: Unable to try hard host reset\n"));
1617 * Due to the spinlock, we will never get out of this
1618 * loop without a proper wait. (DB)
1620 scsi_sleep(1 * HZ);
1622 goto next_device2;
1624 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n"));
1627 * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
1629 rtn = scsi_try_host_reset(SCpnt);
1630 if (rtn == SUCCESS) {
1632 * FIXME(eric) we assume that all commands are flushed from the
1633 * controller. We should get a DID_RESET for all of the commands
1634 * that were pending. We should ignore these so that we can
1635 * guarantee that we are in a consistent state.
1637 * I believe this to be the case right now, but this needs to be
1638 * tested.
1640 for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1641 for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1642 if (SCloop->state != SCSI_STATE_FAILED
1643 && SCloop->state != SCSI_STATE_TIMEOUT) {
1644 continue;
1646 rtn = scsi_test_unit_ready(SCloop);
1648 if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1649 rtn = scsi_eh_retry_command(SCloop);
1651 if (rtn == SUCCESS) {
1652 SCpnt->host->host_failed--;
1653 scsi_eh_finish_command(&SCdone, SCloop);
1656 if (rtn != SUCCESS) {
1657 SCloop->device->online = FALSE;
1658 SCloop->host->host_failed--;
1659 scsi_eh_finish_command(&SCdone, SCloop);
1668 * If we solved all of the problems, then let's rev up the engines again.
1670 if (host->host_failed == 0) {
1671 ourrtn = TRUE;
1672 goto leave;
1675 * If the HOST RESET failed, then for now we assume that the entire host
1676 * adapter is too hosed to be of any use. For our purposes, however, it is
1677 * easier to simply take the devices offline that correspond to commands
1678 * that failed.
1680 SCSI_LOG_ERROR_RECOVERY(1, printk("scsi_unjam_host: Take device offline\n"));
1682 for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1683 for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1684 if (SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT) {
1685 SCloop->device->online = FALSE;
1688 * This should pass the failure up to the top level driver, and
1689 * it will have to try and do something intelligent with it.
1691 SCloop->host->host_failed--;
1693 if (SCloop->state == SCSI_STATE_TIMEOUT) {
1694 SCloop->result |= (DRIVER_TIMEOUT << 24);
1696 SCSI_LOG_ERROR_RECOVERY(3, printk("Finishing command for device %d %x\n",
1697 SCloop->device->id, SCloop->result));
1699 scsi_eh_finish_command(&SCdone, SCloop);
1704 if (host->host_failed != 0) {
1705 panic("scsi_unjam_host: Miscount of number of failed commands.\n");
1707 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Returning\n"));
1709 ourrtn = FALSE;
1711 leave:
1714 * We should have a list of commands that we 'finished' during the course of
1715 * error recovery. This should be the same as the list of commands that timed out
1716 * or failed. We are currently holding these things in a linked list - we didn't
1717 * put them in the bottom half queue because we wanted to keep things quiet while
1718 * we were working on recovery, and passing them up to the top level could easily
1719 * cause the top level to try and queue something else again.
1721 * Start by marking that the host is no longer in error recovery.
1723 host->in_recovery = 0;
1726 * Take the list of commands, and stick them in the bottom half queue.
1727 * The current implementation of scsi_done will do this for us - if need
1728 * be we can create a special version of this function to do the
1729 * same job for us.
1731 for (SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone) {
1732 SCdone = SCpnt->bh_next;
1733 SCpnt->bh_next = NULL;
1734 scsi_done(SCpnt);
1737 return (ourrtn);
1742 * Function: scsi_error_handler
1744 * Purpose: Handle errors/timeouts of scsi commands, try and clean up
1745 * and unjam the bus, and restart things.
1747 * Arguments: host - host for which we are running.
1749 * Returns: Never returns.
1751 * Notes: This is always run in the context of a kernel thread. The
1752 * idea is that we start this thing up when the kernel starts
1753 * up (one per host that we detect), and it immediately goes to
1754 * sleep and waits for some event (i.e. failure). When this
1755 * takes place, we have the job of trying to unjam the bus
1756 * and restarting things.
1759 void scsi_error_handler(void *data)
1761 struct Scsi_Host *host = (struct Scsi_Host *) data;
1762 int rtn;
1763 DECLARE_MUTEX_LOCKED(sem);
1764 unsigned long flags;
1765 struct fs_struct *fs;
1767 lock_kernel();
1770 * If we were started as result of loading a module, close all of the
1771 * user space pages. We don't need them, and if we didn't close them
1772 * they would be locked into memory.
1774 exit_mm(current);
1776 current->session = 1;
1777 current->pgrp = 1;
1779 /* Become as one with the init task */
1781 exit_fs(current); /* current->fs->count--; */
1782 fs = init_task.fs;
1783 current->fs = fs;
1784 atomic_inc(&fs->count);
1786 siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
1790 * Set the name of this process.
1792 sprintf(current->comm, "scsi_eh_%d", host->host_no);
1794 host->eh_wait = &sem;
1795 host->ehandler = current;
1797 unlock_kernel();
1800 * Wake up the thread that created us.
1802 SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent %d\n", host->eh_notify->count.counter));
1804 up(host->eh_notify);
1806 while (1) {
1808 * If we get a signal, it means we are supposed to go
1809 * away and die. This typically happens if the user is
1810 * trying to unload a module.
1812 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleeping\n"));
1813 down_interruptible(&sem);
1815 if (signal_pending(current))
1816 break;
1818 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking up\n"));
1820 spin_lock_irqsave(&io_request_lock, flags);
1821 host->eh_active = 1;
1824 * We have a host that is failing for some reason. Figure out
1825 * what we need to do to get it up and online again (if we can).
1826 * If we fail, we end up taking the thing offline.
1828 if (host->hostt->eh_strategy_handler != NULL) {
1829 rtn = host->hostt->eh_strategy_handler(host);
1830 } else {
1831 rtn = scsi_unjam_host(host);
1834 host->eh_active = 0;
1837 * Note - if the above fails completely, the action is to take
1838 * individual devices offline and flush the queue of any
1839 * outstanding requests that may have been pending. When we
1840 * restart, we restart any I/O to any other devices on the bus
1841 * which are still online.
1843 scsi_restart_operations(host);
1845 /* The spinlock is really needed up to this point. (DB) */
1846 spin_unlock_irqrestore(&io_request_lock, flags);
1849 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exiting\n"));
1852 * Make sure that nobody tries to wake us up again.
1854 host->eh_wait = NULL;
1857 * Knock this down too. From this point on, the host is flying
1858 * without a pilot. If this is because the module is being unloaded,
1859 * that's fine. If the user sent a signal to this thing, we are
1860 * potentially in real danger.
1862 host->in_recovery = 0;
1863 host->eh_active = 0;
1864 host->ehandler = NULL;
1867 * If anyone is waiting for us to exit (i.e. someone trying to unload
1868 * a driver), then wake up that process to let them know we are on
1869 * the way out the door. This may be overkill - I *think* that we
1870 * could probably just unload the driver and send the signal, and when
1871 * the error handling thread wakes up that it would just exit without
1872 * needing to touch any memory associated with the driver itself.
1874 if (host->eh_notify != NULL)
1875 up(host->eh_notify);
1879 * Overrides for Emacs so that we follow Linus's tabbing style.
1880 * Emacs will notice this stuff at the end of the file and automatically
1881 * adjust the settings for this buffer only. This must remain at the end
1882 * of the file.
1883 * ---------------------------------------------------------------------------
1884 * Local variables:
1885 * c-indent-level: 4
1886 * c-brace-imaginary-offset: 0
1887 * c-brace-offset: -4
1888 * c-argdecl-indent: 4
1889 * c-label-offset: -4
1890 * c-continued-statement-offset: 4
1891 * c-continued-brace-offset: 0
1892 * indent-tabs-mode: nil
1893 * tab-width: 8
1894 * End: