2 * scsi_error.c Copyright (C) 1997 Eric Youngdale
4 * SCSI error/timeout handling
5 * Initial versions: Eric Youngdale. Based upon conversations with
6 * Leonard Zubkoff and David Miller at Linux Expo,
7 * ideas originating from all over the place.
11 #define __NO_VERSION__
12 #include <linux/module.h>
14 #include <linux/sched.h>
15 #include <linux/timer.h>
16 #include <linux/string.h>
17 #include <linux/malloc.h>
18 #include <linux/ioport.h>
19 #include <linux/kernel.h>
20 #include <linux/stat.h>
21 #include <linux/blk.h>
22 #include <linux/interrupt.h>
23 #include <linux/delay.h>
24 #include <linux/smp_lock.h>
26 #define __KERNEL_SYSCALLS__
28 #include <linux/unistd.h>
30 #include <asm/system.h>
36 #include "constants.h"
/*
 * Signal mask and per-operation timeouts used by the error handler.
 *
 * NOTE(review): SHUTDOWN_SIGS and each of the *_TIMEOUT macros are defined
 * twice below.  The gaps in the embedded line numbering suggest that the
 * preprocessor conditionals choosing between the two sets are not visible
 * in this view - confirm against the complete file before editing values.
 */
39 #define SHUTDOWN_SIGS (sigmask(SIGKILL)|sigmask(SIGINT)|sigmask(SIGTERM))
41 #define SHUTDOWN_SIGS (0UL)
45 #define SENSE_TIMEOUT SCSI_TIMEOUT
46 #define ABORT_TIMEOUT SCSI_TIMEOUT
47 #define RESET_TIMEOUT SCSI_TIMEOUT
49 #define SENSE_TIMEOUT (10*HZ)
50 #define RESET_TIMEOUT (2*HZ)
51 #define ABORT_TIMEOUT (15*HZ)
57 * These should *probably* be handled by the host itself.
58 * Since it is allowed to sleep, it probably should.
/*
 * Settle delays (in timer ticks) passed to scsi_sleep() after a bus reset
 * and after a host reset.  The replacement lists are parenthesized so the
 * macros keep their value when used inside a larger expression
 * (e.g. x / BUS_RESET_SETTLE_TIME).
 */
#define BUS_RESET_SETTLE_TIME   (5*HZ)
#define HOST_RESET_SETTLE_TIME  (10*HZ)
/* Revision-control identification string ($Header$ keyword) for this file. */
64 static const char RCSid
[] = "$Header: /mnt/ide/home/eric/CVSROOT/linux/drivers/scsi/scsi_error.c,v 1.10 1997/12/08 04:50:35 eric Exp $";
/*
 * Forward declarations for the error-handling helpers that are defined
 * further down in this file.  Each is declared with the STATIC macro.
 */
66 STATIC
int scsi_check_sense(Scsi_Cmnd
* SCpnt
);
67 STATIC
int scsi_request_sense(Scsi_Cmnd
*);
68 STATIC
void scsi_send_eh_cmnd(Scsi_Cmnd
* SCpnt
, int timeout
);
69 STATIC
int scsi_try_to_abort_command(Scsi_Cmnd
*, int);
70 STATIC
int scsi_test_unit_ready(Scsi_Cmnd
*);
71 STATIC
int scsi_try_bus_device_reset(Scsi_Cmnd
*, int timeout
);
72 STATIC
int scsi_try_bus_reset(Scsi_Cmnd
*);
73 STATIC
int scsi_try_host_reset(Scsi_Cmnd
*);
74 STATIC
int scsi_unit_is_ready(Scsi_Cmnd
*);
75 STATIC
void scsi_eh_action_done(Scsi_Cmnd
*, int);
76 STATIC
int scsi_eh_retry_command(Scsi_Cmnd
*);
77 STATIC
int scsi_eh_completed_normally(Scsi_Cmnd
* SCpnt
);
78 STATIC
void scsi_restart_operations(struct Scsi_Host
*);
79 STATIC
void scsi_eh_finish_command(Scsi_Cmnd
** SClist
, Scsi_Cmnd
* SCpnt
);
83 * Function: scsi_add_timer()
85 * Purpose: Start timeout timer for a single scsi command.
87 * Arguments: SCset - command that is about to start running.
88 * timeout - amount of time to allow this command to run.
89 * complete - timeout function to call if timer isn't
94 * Notes: This should be turned into an inline function.
96 * More Notes: Each scsi command has its own timer, and as it is added to
97 * the queue, we set up the timer. When the command completes,
98 * we cancel the timer. Pretty simple, really, especially
99 * compared to the old way of handling this crap.
/*
 * scsi_add_timer: arm the per-command timer SCset->eh_timeout.
 *
 * A timer whose .function is non-NULL is deleted first.  The timer is then
 * loaded with the command pointer as .data, jiffies + timeout as .expires
 * and `complete' (cast to the timer callback type) as .function, and is
 * handed to add_timer().
 *
 * NOTE(review): `timeout' is used below but its parameter declaration is
 * not among the visible lines of the signature.
 */
101 void scsi_add_timer(Scsi_Cmnd
* SCset
,
103 void (*complete
) (Scsi_Cmnd
*))
107 * If the clock was already running for this command, then
108 * first delete the timer. The timer handling code gets rather
109 * confused if we don't do this.
111 if (SCset
->eh_timeout
.function
!= NULL
) {
112 del_timer(&SCset
->eh_timeout
);
/* The command itself is the callback argument. */
114 SCset
->eh_timeout
.data
= (unsigned long) SCset
;
115 SCset
->eh_timeout
.expires
= jiffies
+ timeout
;
116 SCset
->eh_timeout
.function
= (void (*)(unsigned long)) complete
;
118 SCSI_LOG_ERROR_RECOVERY(5, printk("Adding timer for command %p at %d (%p)\n", SCset
, timeout
, complete
));
120 add_timer(&SCset
->eh_timeout
);
125 * Function: scsi_delete_timer()
127 * Purpose: Delete/cancel timer for a given function.
129 * Arguments: SCset - command that we are canceling timer for.
131 * Returns: Amount of time remaining before command would have timed out.
133 * Notes: This should be turned into an inline function.
/*
 * scsi_delete_timer: cancel the per-command timer SCset->eh_timeout.
 *
 * Computes rtn from the current jiffies and the timer's expiry, deletes the
 * timer, and clears .data and .function so that scsi_add_timer() sees the
 * timer as not running.
 *
 * NOTE(review): rtn is computed as jiffies - expires, which is the negation
 * of "time remaining" while the timer has not yet expired.  The declaration
 * of rtn and the return statement are not visible here - confirm what the
 * callers expect before relying on the sign of the result.
 */
135 int scsi_delete_timer(Scsi_Cmnd
* SCset
)
139 rtn
= jiffies
- SCset
->eh_timeout
.expires
;
140 del_timer(&SCset
->eh_timeout
);
142 SCSI_LOG_ERROR_RECOVERY(5, printk("Clearing timer for command %p\n", SCset
));
/* A NULL .function is what scsi_add_timer() tests for. */
144 SCset
->eh_timeout
.data
= (unsigned long) NULL
;
145 SCset
->eh_timeout
.function
= NULL
;
151 * Function: scsi_times_out()
153 * Purpose: Timeout function for normal scsi commands.
155 * Arguments: SCpnt - command that is timing out.
/*
 * do_scsi_times_out: handle expiry of a normal command's timer.
 *
 * If the host template supplies eh_ordered_queue_tag and that hook returns
 * nonzero, the command's timer is re-armed with SCpnt->internal_timeout;
 * the remaining arguments of that call and the rest of the branch
 * (including the queuecommand() call guarded by host->can_queue) are only
 * partly visible here.
 *
 * The visible tail of the function records serial_number_at_timeout, marks
 * the command FAILED / SCSI_STATE_TIMEOUT and owned by the error handler,
 * sets host->in_recovery, increments host->host_failed and, once
 * host_busy equals host_failed, ups host->eh_wait.
 *
 * scsi_times_out() calls this with io_request_lock held.
 */
161 static void do_scsi_times_out(Scsi_Cmnd
* SCpnt
)
165 * Notify the low-level code that this operation failed and we are
166 * reposessing the command.
171 * Allow the host adapter to push a queue ordering tag
172 * out to the bus to force the command in question to complete.
173 * If the host wants to do this, then we just restart the timer
174 * for the command. Before we really do this, some real thought
175 * as to the optimum way to handle this should be done. We *do*
176 * need to force ordering every so often to ensure that all requests
177 * do eventually complete, but I am not sure if this is the best way
178 * to actually go about it.
180 * Better yet, force a sync here, but don't block since we are in an
183 if (SCpnt
->host
->hostt
->eh_ordered_queue_tag
) {
184 if ((*SCpnt
->host
->hostt
->eh_ordered_queue_tag
) (SCpnt
)) {
185 scsi_add_timer(SCpnt
, SCpnt
->internal_timeout
,
191 * FIXME(eric) - add a second special interface to handle this
192 * case. Ideally that interface can also be used to request
195 if (SCpnt
->host
->can_queue
) {
196 SCpnt
->host
->hostt
->queuecommand(SCpnt
, NULL
);
200 /* Set the serial_number_at_timeout to the current serial_number */
201 SCpnt
->serial_number_at_timeout
= SCpnt
->serial_number
;
/* Hand the command over to the error handler. */
203 SCpnt
->eh_state
= FAILED
;
204 SCpnt
->state
= SCSI_STATE_TIMEOUT
;
205 SCpnt
->owner
= SCSI_OWNER_ERROR_HANDLER
;
207 SCpnt
->host
->in_recovery
= 1;
208 SCpnt
->host
->host_failed
++;
210 SCSI_LOG_TIMEOUT(3, printk("Command timed out active=%d busy=%d failed=%d\n",
211 atomic_read(&SCpnt
->host
->host_active
),
212 SCpnt
->host
->host_busy
,
213 SCpnt
->host
->host_failed
));
216 * If the host is having troubles, then look to see if this was the last
217 * command that might have failed. If so, wake up the error handler.
219 if (SCpnt
->host
->host_busy
== SCpnt
->host
->host_failed
) {
220 up(SCpnt
->host
->eh_wait
);
/*
 * scsi_times_out: locked wrapper around do_scsi_times_out().
 *
 * Takes io_request_lock with spin_lock_irqsave(), runs the real timeout
 * handler for SCpnt, then releases the lock with spin_unlock_irqrestore().
 * The declaration of `flags' is not among the visible lines.
 */
224 void scsi_times_out(Scsi_Cmnd
* SCpnt
)
228 spin_lock_irqsave(&io_request_lock
, flags
);
229 do_scsi_times_out(SCpnt
);
230 spin_unlock_irqrestore(&io_request_lock
, flags
);
234 * Function scsi_block_when_processing_errors
236 * Purpose: Prevent more commands from being queued while error recovery
239 * Arguments: SDpnt - device on which we are performing recovery.
241 * Returns: FALSE The device was taken offline by error recovery.
242 * TRUE OK to proceed.
244 * Notes: We block until the host is out of error recovery, and then
245 * check to see whether the host or the device is offline.
/*
 * scsi_block_when_processing_errors: wait on the device's host, then
 * report whether the device is online.
 *
 * Invokes SCSI_SLEEP() with the host's host_wait queue and its in_recovery
 * flag, logs, and returns SDpnt->online.
 * NOTE(review): SCSI_SLEEP is defined elsewhere; presumably it sleeps while
 * in_recovery is nonzero - confirm against its definition.
 */
247 int scsi_block_when_processing_errors(Scsi_Device
* SDpnt
)
250 SCSI_SLEEP(&SDpnt
->host
->host_wait
, SDpnt
->host
->in_recovery
);
252 SCSI_LOG_ERROR_RECOVERY(5, printk("Open returning %d\n", SDpnt
->online
));
254 return SDpnt
->online
;
258 * Function: scsi_eh_times_out()
260 * Purpose: Timeout function for error handling.
262 * Arguments: SCpnt - command that is timing out.
266 * Notes: During error handling, the kernel thread will be sleeping
267 * waiting for some action to complete on the device. Our only
268 * job is to record that it timed out, and to wake up the
/*
 * scsi_eh_times_out: timer callback for commands issued by the error
 * handler (installed by scsi_send_eh_cmnd()).
 *
 * Under io_request_lock: marks the command SCSI_STATE_TIMEOUT and owned by
 * the low-level driver, calls the host template's eh_abort_handler when one
 * is provided (result stored in rtn), then marks the request RQ_SCSI_DONE,
 * returns ownership to the error handler and ups host->eh_action when that
 * pointer is non-NULL.
 *
 * NOTE(review): the declarations of `flags' and `rtn' and the branch
 * structure around the "Missing scsi error handler thread" message are not
 * visible here; rtn is not read in any visible line.
 */
272 void scsi_eh_times_out(Scsi_Cmnd
* SCpnt
)
277 spin_lock_irqsave(&io_request_lock
, flags
);
279 SCpnt
->eh_state
= SCSI_STATE_TIMEOUT
;
280 SCpnt
->owner
= SCSI_OWNER_LOWLEVEL
;
283 * As far as the low level driver is concerned, this command is still
284 * active, so we must give the low level driver a chance to abort it. (DB)
286 if (SCpnt
->host
->hostt
->eh_abort_handler
)
287 rtn
= SCpnt
->host
->hostt
->eh_abort_handler(SCpnt
);
289 SCpnt
->request
.rq_status
= RQ_SCSI_DONE
;
290 SCpnt
->owner
= SCSI_OWNER_ERROR_HANDLER
;
292 SCSI_LOG_ERROR_RECOVERY(5, printk("In scsi_eh_times_out %p\n", SCpnt
));
/* Wake whoever registered a semaphore in host->eh_action. */
294 if (SCpnt
->host
->eh_action
!= NULL
)
295 up(SCpnt
->host
->eh_action
);
297 printk("Missing scsi error handler thread\n");
298 spin_unlock_irqrestore(&io_request_lock
, flags
);
303 * Function: scsi_eh_done()
305 * Purpose: Completion function for error handling.
307 * Arguments: SCpnt - command that is timing out.
311 * Notes: During error handling, the kernel thread will be sleeping
312 * waiting for some action to complete on the device. Our only
313 * job is to record that the action completed, and to wake up the
/*
 * scsi_eh_done: completion callback passed to queuecommand() by
 * scsi_send_eh_cmnd().
 *
 * Marks the request RQ_SCSI_DONE, gives ownership back to the error
 * handler, sets eh_state to SUCCESS, logs, and ups host->eh_action when
 * that pointer is non-NULL.  The trailing arguments of the log call are not
 * visible here.
 */
317 void scsi_eh_done(Scsi_Cmnd
* SCpnt
)
319 SCpnt
->request
.rq_status
= RQ_SCSI_DONE
;
321 SCpnt
->owner
= SCSI_OWNER_ERROR_HANDLER
;
322 SCpnt
->eh_state
= SUCCESS
;
324 SCSI_LOG_ERROR_RECOVERY(5, printk("In eh_done %p result:%x\n", SCpnt
,
327 if (SCpnt
->host
->eh_action
!= NULL
)
328 up(SCpnt
->host
->eh_action
);
332 * Function: scsi_eh_action_done()
334 * Purpose: Completion function for error handling.
336 * Arguments: SCpnt - command that is timing out.
337 * answer - boolean that indicates whether operation succeeded.
341 * Notes: This callback is only used for abort and reset operations.
/*
 * scsi_eh_action_done: completion callback that carries a boolean outcome.
 *
 * Marks the request RQ_SCSI_DONE, gives ownership back to the error
 * handler, sets eh_state to SUCCESS when `answer' is nonzero and FAILED
 * otherwise, and ups host->eh_action when that pointer is non-NULL.
 */
344 void scsi_eh_action_done(Scsi_Cmnd
* SCpnt
, int answer
)
346 SCpnt
->request
.rq_status
= RQ_SCSI_DONE
;
348 SCpnt
->owner
= SCSI_OWNER_ERROR_HANDLER
;
349 SCpnt
->eh_state
= (answer
? SUCCESS
: FAILED
);
351 if (SCpnt
->host
->eh_action
!= NULL
)
352 up(SCpnt
->host
->eh_action
);
356 * Function: scsi_sense_valid()
358 * Purpose: Determine whether a host has automatically obtained sense
359 * information or not. If we have it, then give a recommendation
360 * as to what we should do next.
/*
 * scsi_sense_valid: inspect byte 0 of SCpnt->sense_buffer.
 *
 * The visible test checks whether bits 4-6 of sense_buffer[0] differ
 * from 7.  Callers in this file treat a zero return as "no valid sense
 * data"; the return statements themselves are not visible here.
 */
362 int scsi_sense_valid(Scsi_Cmnd
* SCpnt
)
364 if (((SCpnt
->sense_buffer
[0] & 0x70) >> 4) != 7) {
371 * Function: scsi_eh_retry_command()
373 * Purpose: Retry the original command
375 * Returns: SUCCESS - the retried command completed successfully.
376 * FAILED - the retried command failed or timed out.
378 * Notes: This function will *NOT* return until the command either
379 * times out, or it completes.
/*
 * scsi_eh_retry_command: re-issue the original data command.
 *
 * Restores cmnd, request_buffer, request_bufflen, use_sg and cmd_len from
 * the saved copies (data_cmnd, buffer, bufflen, old_use_sg, old_cmd_len),
 * sends the command through scsi_send_eh_cmnd() with the command's own
 * timeout_per_command, and returns the resulting SCpnt->eh_state.
 */
381 STATIC
int scsi_eh_retry_command(Scsi_Cmnd
* SCpnt
)
383 memcpy((void *) SCpnt
->cmnd
, (void *) SCpnt
->data_cmnd
,
384 sizeof(SCpnt
->data_cmnd
));
385 SCpnt
->request_buffer
= SCpnt
->buffer
;
386 SCpnt
->request_bufflen
= SCpnt
->bufflen
;
387 SCpnt
->use_sg
= SCpnt
->old_use_sg
;
388 SCpnt
->cmd_len
= SCpnt
->old_cmd_len
;
390 scsi_send_eh_cmnd(SCpnt
, SCpnt
->timeout_per_command
);
393 * Hey, we are done. Let's look to see what happened.
395 return SCpnt
->eh_state
;
399 * Function: scsi_request_sense()
401 * Purpose: Request sense data from a particular target.
403 * Returns: SUCCESS - we were able to get the sense data.
404 * FAILED - we were not able to get the sense data.
406 * Notes: Some hosts automatically obtain this information, others
407 * require that we obtain it on our own.
409 * This function will *NOT* return until the command either
410 * times out, or it completes.
/*
 * scsi_request_sense: issue REQUEST SENSE for SCpnt's target and capture
 * the sense data.
 *
 * Builds the 6-byte CDB from generic_sense (byte 4 = 255) and puts
 * lun << 5 into byte 1.  The data buffer is the 256-byte on-stack array
 * unless the host template sets unchecked_isa_dma, in which case 512 bytes
 * are obtained from scsi_init_malloc(GFP_ATOMIC | GFP_DMA); an allocation
 * failure is logged (the rest of that failure path is not visible here).
 *
 * The sense buffer and the first 256 bytes of the data buffer are zeroed,
 * the command is sent via scsi_send_eh_cmnd() with SENSE_TIMEOUT, and if
 * scsi_sense_valid() returns zero afterwards the data buffer is copied into
 * SCpnt->sense_buffer.  A buffer that came from scsi_init_malloc() is
 * freed, the original command fields are restored from their saved copies,
 * and SCpnt->eh_state is returned.
 */
412 STATIC
int scsi_request_sense(Scsi_Cmnd
* SCpnt
)
414 static unsigned char generic_sense
[6] =
415 {REQUEST_SENSE
, 0, 0, 0, 255, 0};
416 unsigned char scsi_result0
[256], *scsi_result
= NULL
;
419 memcpy((void *) SCpnt
->cmnd
, (void *) generic_sense
,
420 sizeof(generic_sense
));
422 SCpnt
->cmnd
[1] = SCpnt
->lun
<< 5;
/* Stack buffer unless the host sets unchecked_isa_dma. */
424 scsi_result
= (!SCpnt
->host
->hostt
->unchecked_isa_dma
)
425 ? &scsi_result0
[0] : scsi_init_malloc(512, GFP_ATOMIC
| GFP_DMA
);
427 if (scsi_result
== NULL
) {
428 printk("cannot allocate scsi_result in scsi_request_sense.\n");
432 * Zero the sense buffer. Some host adapters automatically always request
433 * sense, so it is not a good idea that SCpnt->request_buffer and
434 * SCpnt->sense_buffer point to the same address (DB).
435 * 0 is not a valid sense code.
437 memset((void *) SCpnt
->sense_buffer
, 0, sizeof(SCpnt
->sense_buffer
));
438 memset((void *) scsi_result
, 0, 256);
440 SCpnt
->request_buffer
= scsi_result
;
441 SCpnt
->request_bufflen
= 256;
443 SCpnt
->cmd_len
= COMMAND_SIZE(SCpnt
->cmnd
[0]);
445 scsi_send_eh_cmnd(SCpnt
, SENSE_TIMEOUT
);
447 /* Last chance to have valid sense data */
448 if (!scsi_sense_valid(SCpnt
))
449 memcpy((void *) SCpnt
->sense_buffer
,
450 SCpnt
->request_buffer
,
451 sizeof(SCpnt
->sense_buffer
));
/* Only a buffer obtained from scsi_init_malloc() is freed. */
453 if (scsi_result
!= &scsi_result0
[0] && scsi_result
!= NULL
)
454 scsi_init_free(scsi_result
, 512);
457 * When we eventually call scsi_finish, we really wish to complete
458 * the original request, so let's restore the original data. (DB)
460 memcpy((void *) SCpnt
->cmnd
, (void *) SCpnt
->data_cmnd
,
461 sizeof(SCpnt
->data_cmnd
));
462 SCpnt
->request_buffer
= SCpnt
->buffer
;
463 SCpnt
->request_bufflen
= SCpnt
->bufflen
;
464 SCpnt
->use_sg
= SCpnt
->old_use_sg
;
465 SCpnt
->cmd_len
= SCpnt
->old_cmd_len
;
468 * Hey, we are done. Let's look to see what happened.
470 return SCpnt
->eh_state
;
474 * Function: scsi_test_unit_ready()
476 * Purpose: Run test unit ready command to see if the device is talking to us or not.
/*
 * scsi_test_unit_ready: issue TEST UNIT READY for SCpnt's target.
 *
 * Mirrors scsi_request_sense() step for step: builds the 6-byte CDB from
 * tur_command with lun << 5 in byte 1, picks the on-stack 256-byte buffer
 * or a 512-byte scsi_init_malloc(GFP_ATOMIC | GFP_DMA) buffer when the host
 * template sets unchecked_isa_dma, zeroes the sense and data buffers, and
 * sends the command via scsi_send_eh_cmnd().  Note that the timeout used is
 * SENSE_TIMEOUT.  Afterwards the data buffer is copied into sense_buffer if
 * scsi_sense_valid() returns zero, an allocated buffer is freed, the
 * original command fields are restored, and SCpnt->eh_state is returned.
 * The allocation-failure path after the log message is not visible here.
 */
479 STATIC
int scsi_test_unit_ready(Scsi_Cmnd
* SCpnt
)
481 static unsigned char tur_command
[6] =
482 {TEST_UNIT_READY
, 0, 0, 0, 0, 0};
483 unsigned char scsi_result0
[256], *scsi_result
= NULL
;
485 memcpy((void *) SCpnt
->cmnd
, (void *) tur_command
,
486 sizeof(tur_command
));
488 SCpnt
->cmnd
[1] = SCpnt
->lun
<< 5;
490 scsi_result
= (!SCpnt
->host
->hostt
->unchecked_isa_dma
)
491 ? &scsi_result0
[0] : scsi_init_malloc(512, GFP_ATOMIC
| GFP_DMA
);
493 if (scsi_result
== NULL
) {
494 printk("cannot allocate scsi_result in scsi_test_unit_ready.\n");
498 * Zero the sense buffer. Some host adapters automatically always request
499 * sense, so it is not a good idea that SCpnt->request_buffer and
500 * SCpnt->sense_buffer point to the same address (DB).
501 * 0 is not a valid sense code.
503 memset((void *) SCpnt
->sense_buffer
, 0, sizeof(SCpnt
->sense_buffer
));
504 memset((void *) scsi_result
, 0, 256);
506 SCpnt
->request_buffer
= scsi_result
;
507 SCpnt
->request_bufflen
= 256;
509 SCpnt
->cmd_len
= COMMAND_SIZE(SCpnt
->cmnd
[0]);
510 scsi_send_eh_cmnd(SCpnt
, SENSE_TIMEOUT
);
512 /* Last chance to have valid sense data */
513 if (!scsi_sense_valid(SCpnt
))
514 memcpy((void *) SCpnt
->sense_buffer
,
515 SCpnt
->request_buffer
,
516 sizeof(SCpnt
->sense_buffer
));
518 if (scsi_result
!= &scsi_result0
[0] && scsi_result
!= NULL
)
519 scsi_init_free(scsi_result
, 512);
522 * When we eventually call scsi_finish, we really wish to complete
523 * the original request, so let's restore the original data. (DB)
525 memcpy((void *) SCpnt
->cmnd
, (void *) SCpnt
->data_cmnd
,
526 sizeof(SCpnt
->data_cmnd
));
527 SCpnt
->request_buffer
= SCpnt
->buffer
;
528 SCpnt
->request_bufflen
= SCpnt
->bufflen
;
529 SCpnt
->use_sg
= SCpnt
->old_use_sg
;
530 SCpnt
->cmd_len
= SCpnt
->old_cmd_len
;
533 * Hey, we are done. Let's look to see what happened.
535 return SCpnt
->eh_state
;
539 * This would normally need to get the IO request lock,
540 * but as it doesn't actually touch anything that needs
541 * to be locked we can avoid the lock here..
/*
 * scsi_sleep_done: timer callback installed by scsi_sleep(), which passes
 * the address of its local semaphore as the timer data.
 * NOTE(review): the body is not visible in this view; presumably it
 * releases `sem' so the sleeper can continue - confirm.
 */
544 void scsi_sleep_done(struct semaphore
*sem
)
/*
 * scsi_sleep: pause the caller for `timeout' timer ticks.
 *
 * Declares a locked semaphore and an on-stack timer whose data is the
 * semaphore's address, whose expiry is jiffies + timeout and whose callback
 * is scsi_sleep_done().  io_request_lock is released with spin_unlock_irq()
 * and re-taken with spin_lock_irq(), so the caller is expected to hold that
 * lock on entry.
 *
 * NOTE(review): the lines that initialise and arm the timer, wait on the
 * semaphore and remove the timer are not visible here.
 */
552 void scsi_sleep(int timeout
)
554 DECLARE_MUTEX_LOCKED(sem
);
555 struct timer_list timer
;
558 timer
.data
= (unsigned long) &sem
;
559 timer
.expires
= jiffies
+ timeout
;
560 timer
.function
= (void (*)(unsigned long)) scsi_sleep_done
;
562 SCSI_LOG_ERROR_RECOVERY(5, printk("Sleeping for timer tics %d\n", timeout
));
566 spin_unlock_irq(&io_request_lock
);
568 spin_lock_irq(&io_request_lock
);
574 * Function: scsi_send_eh_cmnd
576 * Purpose: Send a command out to a device as part of error recovery.
578 * Notes: The initialization of the structures is quite a bit different
579 * in this case, and furthermore, there is a different completion
/*
 * scsi_send_eh_cmnd: send SCpnt to the low-level driver on behalf of the
 * error handler; the outcome is left in SCpnt->eh_state.
 *
 * Queueing hosts (host->can_queue nonzero): eh_state is set to
 * SCSI_STATE_QUEUED, a timer is armed with scsi_eh_times_out() as its
 * callback, a local locked semaphore is published through host->eh_action,
 * and the command is queued with scsi_eh_done() as the completion routine.
 * io_request_lock is dropped and re-taken around the wait (the wait itself
 * is not visible here).  Afterwards eh_action is cleared, the timer is
 * deleted, and an eh_state of SCSI_STATE_TIMEOUT is converted to FAILED.
 *
 * Non-queueing hosts: hostt->command() is called synchronously, its return
 * value is stored in SCpnt->result, and eh_state becomes SUCCESS or FAILED
 * depending on scsi_eh_completed_normally().
 *
 * Finally, when eh_state is SUCCESS, scsi_eh_completed_normally() is
 * consulted and eh_state is set to SUCCESS or FAILED; the case labels of
 * that switch are not visible here.
 *
 * NOTE(review): the assignment of the local `host' and the declaration of
 * `temp' are not among the visible lines.
 */
582 STATIC
void scsi_send_eh_cmnd(Scsi_Cmnd
* SCpnt
, int timeout
)
584 struct Scsi_Host
*host
;
590 * We will use a queued command if possible, otherwise we will emulate the
591 * queuing and calling of completion function ourselves.
593 SCpnt
->owner
= SCSI_OWNER_LOWLEVEL
;
595 if (host
->can_queue
) {
596 DECLARE_MUTEX_LOCKED(sem
);
598 SCpnt
->eh_state
= SCSI_STATE_QUEUED
;
600 scsi_add_timer(SCpnt
, timeout
, scsi_eh_times_out
);
603 * Set up the semaphore so we wait for the command to complete.
605 SCpnt
->host
->eh_action
= &sem
;
606 SCpnt
->request
.rq_status
= RQ_SCSI_BUSY
;
608 host
->hostt
->queuecommand(SCpnt
, scsi_eh_done
);
609 spin_unlock_irq(&io_request_lock
);
611 spin_lock_irq(&io_request_lock
);
/* The semaphore is a local; stop publishing it. */
613 SCpnt
->host
->eh_action
= NULL
;
615 del_timer(&SCpnt
->eh_timeout
);
618 * See if timeout. If so, tell the host to forget about it.
619 * In other words, we don't want a callback any more.
621 if (SCpnt
->eh_state
== SCSI_STATE_TIMEOUT
) {
622 SCpnt
->eh_state
= FAILED
;
624 SCSI_LOG_ERROR_RECOVERY(5, printk("send_eh_cmnd: %p eh_state:%x\n",
625 SCpnt
, SCpnt
->eh_state
));
630 * We damn well had better never use this code. There is no timeout
631 * protection here, since we would end up waiting in the actual low
632 * level driver, we don't know how to wake it up.
634 temp
= host
->hostt
->command(SCpnt
);
635 SCpnt
->result
= temp
;
636 if (scsi_eh_completed_normally(SCpnt
)) {
637 SCpnt
->eh_state
= SUCCESS
;
639 SCpnt
->eh_state
= FAILED
;
644 * Now examine the actual status codes to see whether the command actually
645 * did complete normally.
647 if (SCpnt
->eh_state
== SUCCESS
) {
648 switch (scsi_eh_completed_normally(SCpnt
)) {
650 SCpnt
->eh_state
= SUCCESS
;
656 SCpnt
->eh_state
= FAILED
;
660 SCpnt
->eh_state
= FAILED
;
665 * Function: scsi_unit_is_ready()
667 * Purpose: Called after TEST_UNIT_READY is run, to test to see if
668 * the unit responded in a way that indicates it is ready.
/*
 * scsi_unit_is_ready: interpret the result of a TEST UNIT READY.
 *
 * The outer test is true when the result carries sense (DRIVER_SENSE in the
 * driver byte or CHECK_CONDITION in the status byte) and bits 4-6 of
 * sense_buffer[0] equal 7.  The inner test is true when the sense key (low
 * nibble of sense_buffer[2]) is none of NOT_READY, UNIT_ATTENTION or
 * ILLEGAL_REQUEST.  The values returned from each branch are not visible
 * here.
 */
670 STATIC
int scsi_unit_is_ready(Scsi_Cmnd
* SCpnt
)
673 if (((driver_byte(SCpnt
->result
) & DRIVER_SENSE
) ||
674 (status_byte(SCpnt
->result
) & CHECK_CONDITION
)) &&
675 ((SCpnt
->sense_buffer
[0] & 0x70) >> 4) == 7) {
676 if (((SCpnt
->sense_buffer
[2] & 0xf) != NOT_READY
) &&
677 ((SCpnt
->sense_buffer
[2] & 0xf) != UNIT_ATTENTION
) &&
678 ((SCpnt
->sense_buffer
[2] & 0xf) != ILLEGAL_REQUEST
)) {
687 * Function: scsi_eh_finish_command
689 * Purpose: Handle a command that we are finished with WRT error handling.
691 * Arguments: SClist - pointer to list into which we are putting completed commands.
692 * SCpnt - command that is completing
694 * Notes: We don't want to use the normal command completion while we are
695 * still handling errors - it may cause other commands to be queued,
696 * and that would disturb what we are doing. Thus we really want to keep
697 * a list of pending commands for final completion, and once we
698 * are ready to leave error handling we handle completion for real.
/*
 * scsi_eh_finish_command: put a command the error handler is done with
 * onto a deferred-completion list.
 *
 * Sets the command's state to SCSI_STATE_BHQUEUE, links it in front of the
 * current list head through bh_next, and restores use_sg from old_use_sg.
 * NOTE(review): the statement that stores SCpnt back into *SClist is not
 * among the visible lines.
 */
700 STATIC
void scsi_eh_finish_command(Scsi_Cmnd
** SClist
, Scsi_Cmnd
* SCpnt
)
702 SCpnt
->state
= SCSI_STATE_BHQUEUE
;
703 SCpnt
->bh_next
= *SClist
;
705 * Set this back so that the upper level can correctly free up
708 SCpnt
->use_sg
= SCpnt
->old_use_sg
;
713 * Function: scsi_try_to_abort_command
715 * Purpose: Ask host adapter to abort a running command.
717 * Returns: FAILED Operation failed or not supported.
720 * Notes: This function will not return until the user's completion
721 * function has been called. There is no timeout on this
722 * operation. If the author of the low-level driver wishes
723 * this operation to be timed, they can provide this facility
724 * themselves. Helper functions in scsi_error.c can be supplied
725 * to make this easier to do.
727 * Notes: It may be possible to combine this with all of the reset
728 * handling to eliminate a lot of code duplication. I don't
729 * know what makes more sense at the moment - this is just a
/*
 * scsi_try_to_abort_command: ask the low-level driver to abort SCpnt.
 *
 * Pre-sets eh_state to FAILED, tests whether the host template lacks an
 * eh_abort_handler and whether the command's serial_number is 0 (the
 * actions taken in those two cases are not visible here), then marks the
 * command as owned by the low-level driver and returns the value of the
 * driver's eh_abort_handler().
 *
 * NOTE(review): `timeout' is not referenced in any visible line.
 */
732 STATIC
int scsi_try_to_abort_command(Scsi_Cmnd
* SCpnt
, int timeout
)
734 SCpnt
->eh_state
= FAILED
; /* Until we come up with something better */
736 if (SCpnt
->host
->hostt
->eh_abort_handler
== NULL
) {
740 * scsi_done was called just after the command timed out and before
741 * we had a chance to process it. (DB)
743 if (SCpnt
->serial_number
== 0)
746 SCpnt
->owner
= SCSI_OWNER_LOWLEVEL
;
748 return SCpnt
->host
->hostt
->eh_abort_handler(SCpnt
);
752 * Function: scsi_try_bus_device_reset
754 * Purpose: Ask host adapter to perform a bus device reset for a given
757 * Returns: FAILED Operation failed or not supported.
760 * Notes: There is no timeout for this operation. If this operation is
761 * unreliable for a given host, then the host itself needs to put a
762 * timer on it, and set the host back to a consistent state prior
/*
 * scsi_try_bus_device_reset: ask the low-level driver to reset the device
 * that SCpnt addresses.
 *
 * Pre-sets eh_state to FAILED, tests whether the host template lacks an
 * eh_device_reset_handler (the action in that case is not visible here),
 * hands ownership to the low-level driver, calls the handler and stores its
 * result in rtn, and returns SCpnt->eh_state.
 *
 * NOTE(review): the condition under which eh_state is set to SUCCESS is not
 * visible here - presumably it tests rtn.  `timeout' is not referenced in
 * any visible line.
 */
765 STATIC
int scsi_try_bus_device_reset(Scsi_Cmnd
* SCpnt
, int timeout
)
769 SCpnt
->eh_state
= FAILED
; /* Until we come up with something better */
771 if (SCpnt
->host
->hostt
->eh_device_reset_handler
== NULL
) {
774 SCpnt
->owner
= SCSI_OWNER_LOWLEVEL
;
776 rtn
= SCpnt
->host
->hostt
->eh_device_reset_handler(SCpnt
);
779 SCpnt
->eh_state
= SUCCESS
;
781 return SCpnt
->eh_state
;
785 * Function: scsi_try_bus_reset
787 * Purpose: Ask host adapter to perform a bus reset for a host.
789 * Returns: FAILED Operation failed or not supported.
/*
 * scsi_try_bus_reset: ask the low-level driver to reset the SCSI bus.
 *
 * Pre-sets eh_state to FAILED, hands ownership to the low-level driver and
 * records serial_number_at_timeout.  It tests whether the host template
 * lacks an eh_bus_reset_handler (the action in that case is not visible
 * here), calls the handler (result in rtn), and sleeps for
 * BUS_RESET_SETTLE_TIME via scsi_sleep().  When eh_state is SUCCESS, every
 * device on the host's host_queue whose channel matches SCpnt->channel gets
 * was_reset and expecting_cc_ua set.  Returns SCpnt->eh_state.
 *
 * NOTE(review): the condition under which eh_state is set to SUCCESS, and
 * the declarations of rtn and SDloop, are not visible here.
 */
794 STATIC
int scsi_try_bus_reset(Scsi_Cmnd
* SCpnt
)
798 SCpnt
->eh_state
= FAILED
; /* Until we come up with something better */
799 SCpnt
->owner
= SCSI_OWNER_LOWLEVEL
;
800 SCpnt
->serial_number_at_timeout
= SCpnt
->serial_number
;
802 if (SCpnt
->host
->hostt
->eh_bus_reset_handler
== NULL
) {
805 rtn
= SCpnt
->host
->hostt
->eh_bus_reset_handler(SCpnt
);
808 SCpnt
->eh_state
= SUCCESS
;
811 * If we had a successful bus reset, mark the command blocks to expect
812 * a condition code of unit attention.
814 scsi_sleep(BUS_RESET_SETTLE_TIME
);
815 if (SCpnt
->eh_state
== SUCCESS
) {
/* Only devices on the channel that was reset are flagged. */
817 for (SDloop
= SCpnt
->host
->host_queue
; SDloop
; SDloop
= SDloop
->next
) {
818 if (SCpnt
->channel
== SDloop
->channel
) {
819 SDloop
->was_reset
= 1;
820 SDloop
->expecting_cc_ua
= 1;
824 return SCpnt
->eh_state
;
828 * Function: scsi_try_host_reset
830 * Purpose: Ask host adapter to reset itself, and the bus.
832 * Returns: FAILED Operation failed or not supported.
/*
 * scsi_try_host_reset: ask the low-level driver to reset the host adapter.
 *
 * Same shape as scsi_try_bus_reset(), but it uses the host template's
 * eh_host_reset_handler and sleeps for HOST_RESET_SETTLE_TIME.  On SUCCESS
 * it flags every device on the host's host_queue (no channel filter) with
 * was_reset and expecting_cc_ua.  Returns SCpnt->eh_state.
 *
 * NOTE(review): the action taken when the handler is NULL, the condition
 * under which eh_state is set to SUCCESS, and the declarations of rtn and
 * SDloop are not visible here.
 */
837 STATIC
int scsi_try_host_reset(Scsi_Cmnd
* SCpnt
)
841 SCpnt
->eh_state
= FAILED
; /* Until we come up with something better */
842 SCpnt
->owner
= SCSI_OWNER_LOWLEVEL
;
843 SCpnt
->serial_number_at_timeout
= SCpnt
->serial_number
;
845 if (SCpnt
->host
->hostt
->eh_host_reset_handler
== NULL
) {
848 rtn
= SCpnt
->host
->hostt
->eh_host_reset_handler(SCpnt
);
851 SCpnt
->eh_state
= SUCCESS
;
854 * If we had a successful host reset, mark the command blocks to expect
855 * a condition code of unit attention.
857 scsi_sleep(HOST_RESET_SETTLE_TIME
);
858 if (SCpnt
->eh_state
== SUCCESS
) {
860 for (SDloop
= SCpnt
->host
->host_queue
; SDloop
; SDloop
= SDloop
->next
) {
861 SDloop
->was_reset
= 1;
862 SDloop
->expecting_cc_ua
= 1;
865 return SCpnt
->eh_state
;
869 * Function: scsi_decide_disposition
871 * Purpose: Examine a command block that has come back from the low-level
872 * and figure out what to do next.
874 * Returns: SUCCESS - pass on to upper level.
875 * FAILED - pass on to error handler thread.
876 * RETRY - command should be retried.
877 * SOFTERR - command succeeded, but we need to log
880 * Notes: This is *ONLY* called when we are examining the status
881 * after sending out the actual data command. Any commands
882 * that are queued for error recovery (i.e. TEST_UNIT_READY)
883 * do *NOT* come through here.
885 * NOTE - When this routine returns FAILED, it means the error
886 * handler thread is woken. In cases where the error code
887 * indicates an error that doesn't require the error handler
888 * thread (i.e. we don't need to abort/reset), then this function
889 * should return SUCCESS.
/*
 * scsi_decide_disposition: classify the result of a completed data command.
 *
 * The visible checks run in this order:
 *   1. device offline (device->online == FALSE) - logged as being reported
 *      as SUCCESS;
 *   2. a switch on host_byte(result); DID_PASSTHROUGH clears bits 16-23 of
 *      the result, a TEST_UNIT_READY / INQUIRY opcode is special-cased, and
 *      the IS_RESETTING flag is cleared when set - one path returns
 *      scsi_check_sense();
 *   3. msg_byte(result) compared against COMMAND_COMPLETE;
 *   4. a switch on status_byte(result); one case returns ADD_TO_MLQUEUE and
 *      CHECK_CONDITION consults scsi_check_sense();
 *   5. a retry test that increments SCpnt->retries and compares it with
 *      SCpnt->allowed.
 *
 * NOTE(review): most case labels and return statements are not visible in
 * this view, so the value returned on each path cannot be confirmed here.
 */
891 int scsi_decide_disposition(Scsi_Cmnd
* SCpnt
)
896 * If the device is offline, then we clearly just pass the result back
897 * up to the top level.
899 if (SCpnt
->device
->online
== FALSE
) {
900 SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: device offline - report as SUCCESS\n"));
904 * First check the host byte, to see if there is anything in there
905 * that would indicate what we need to do.
908 switch (host_byte(SCpnt
->result
)) {
909 case DID_PASSTHROUGH
:
911 * No matter what, pass this through to the upper layer.
912 * Nuke this special code so that it looks like we are saying
915 SCpnt
->result
&= 0xff00ffff;
919 * Looks good. Drop through, and check the next byte.
926 * Note - this means that we just report the status back to the
927 * top level driver, not that we actually think that it indicates
932 * When the low level driver returns DID_SOFT_ERROR,
933 * it is responsible for keeping an internal retry counter
934 * in order to avoid endless loops (DB)
945 * When we scan the bus, we get timeout messages for
946 * these commands if there is no device available.
947 * Other hosts report DID_NO_CONNECT for the same thing.
949 if ((SCpnt
->cmnd
[0] == TEST_UNIT_READY
||
950 SCpnt
->cmnd
[0] == INQUIRY
)) {
957 * In the normal case where we haven't initiated a reset, this is
960 if (SCpnt
->flags
& IS_RESETTING
) {
961 SCpnt
->flags
&= ~IS_RESETTING
;
965 * Examine the sense data to figure out how to proceed from here.
966 * If there is no sense data, we will be forced into the error
967 * handler thread, where we get to examine the thing in a lot more
970 return scsi_check_sense(SCpnt
);
976 * Next, check the message byte.
978 if (msg_byte(SCpnt
->result
) != COMMAND_COMPLETE
) {
982 * Now, check the status byte to see if this indicates anything special.
984 switch (status_byte(SCpnt
->result
)) {
987 * The case of trying to send too many commands to a tagged queueing
990 return ADD_TO_MLQUEUE
;
992 case COMMAND_TERMINATED
:
994 case CHECK_CONDITION
:
995 rtn
= scsi_check_sense(SCpnt
);
996 if (rtn
== NEEDS_RETRY
) {
1000 case CONDITION_GOOD
:
1001 case INTERMEDIATE_GOOD
:
1002 case INTERMEDIATE_C_GOOD
:
1004 * Who knows? FIXME(eric)
1008 case RESERVATION_CONFLICT
:
/* Counts this attempt against the command's retry allowance. */
1017 if ((++SCpnt
->retries
) < SCpnt
->allowed
) {
1025 * Function: scsi_eh_completed_normally
1027 * Purpose: Examine a command block that has come back from the low-level
1028 * and figure out what to do next.
1030 * Returns: SUCCESS - pass on to upper level.
1031 * FAILED - pass on to error handler thread.
1032 * RETRY - command should be retried.
1033 * SOFTERR - command succeeded, but we need to log
1036 * Notes: This is *ONLY* called when we are examining the status
1037 * of commands queued during error recovery. The main
1038 * difference here is that we don't allow for the possibility
1039 * of retries here, and we are a lot more restrictive about what
1040 * we consider acceptable.
/*
 * scsi_eh_completed_normally: classify the result of a command that was
 * issued by the error handler itself.
 *
 * The visible checks run in this order:
 *   1. host byte DID_RESET: the IS_RESETTING flag is cleared when set, and
 *      one path returns scsi_check_sense();
 *   2. host byte other than DID_OK;
 *   3. message byte other than COMMAND_COMPLETE;
 *   4. a switch on the status byte, in which CHECK_CONDITION consults
 *      scsi_check_sense() and tests for NEEDS_RETRY.
 *
 * scsi_send_eh_cmnd() treats a nonzero return as success on its
 * non-queueing path and switches on the value otherwise.
 * NOTE(review): the return statements of most paths are not visible here.
 */
1042 STATIC
int scsi_eh_completed_normally(Scsi_Cmnd
* SCpnt
)
1046 * First check the host byte, to see if there is anything in there
1047 * that would indicate what we need to do.
1049 if (host_byte(SCpnt
->result
) == DID_RESET
) {
1050 if (SCpnt
->flags
& IS_RESETTING
) {
1052 * OK, this is normal. We don't know whether in fact the
1053 * command in question really needs to be rerun or not -
1054 * if this was the original data command then the answer is yes,
1055 * otherwise we just flag it as success.
1057 SCpnt
->flags
&= ~IS_RESETTING
;
1061 * Rats. We are already in the error handler, so we now get to try
1062 * and figure out what to do next. If the sense is valid, we have
1063 * a pretty good idea of what to do. If not, we mark it as failed.
1065 return scsi_check_sense(SCpnt
);
1067 if (host_byte(SCpnt
->result
) != DID_OK
) {
1071 * Next, check the message byte.
1073 if (msg_byte(SCpnt
->result
) != COMMAND_COMPLETE
) {
1077 * Now, check the status byte to see if this indicates anything special.
1079 switch (status_byte(SCpnt
->result
)) {
1081 case COMMAND_TERMINATED
:
1083 case CHECK_CONDITION
:
1084 rtn
= scsi_check_sense(SCpnt
);
1085 if (rtn
== NEEDS_RETRY
) {
1089 case CONDITION_GOOD
:
1090 case INTERMEDIATE_GOOD
:
1091 case INTERMEDIATE_C_GOOD
:
1093 * Who knows? FIXME(eric)
1098 case RESERVATION_CONFLICT
:
1106 * Function: scsi_check_sense
1108 * Purpose: Examine sense information - give suggestion as to what
1109 * we should do with it.
/*
 * scsi_check_sense: turn the command's sense data into a disposition.
 *
 * First tests scsi_sense_valid(), then tests the top three bits of
 * sense_buffer[2] (mask 0xe0), then switches on the sense key (low nibble
 * of sense_buffer[2]).  Visible outcomes: RECOVERED_ERROR returns SUCCESS;
 * UNIT_ATTENTION clears device->expecting_cc_ua when it was set.
 *
 * NOTE(review): the values returned for invalid sense, for the 0xe0 test
 * and for the remaining sense keys are not visible in this view.
 */
1111 STATIC
int scsi_check_sense(Scsi_Cmnd
* SCpnt
)
1113 if (!scsi_sense_valid(SCpnt
)) {
1116 if (SCpnt
->sense_buffer
[2] & 0xe0)
1119 switch (SCpnt
->sense_buffer
[2] & 0xf) {
1122 case RECOVERED_ERROR
:
1123 return /* SOFT_ERROR */ SUCCESS
;
1125 case ABORTED_COMMAND
:
1128 case UNIT_ATTENTION
:
1130 * If we are expecting a CC/UA because of a bus reset that we
1131 * performed, treat this just as a retry. Otherwise this is
1132 * information that we should pass up to the upper-level driver
1133 * so that we can deal with it there.
1135 if (SCpnt
->device
->expecting_cc_ua
) {
1136 SCpnt
->device
->expecting_cc_ua
= 0;
1141 /* these three are not supported */
1143 case VOLUME_OVERFLOW
:
1150 case ILLEGAL_REQUEST
:
1153 case HARDWARE_ERROR
:
1161 * Function: scsi_restart_operations
1163 * Purpose: Restart IO operations to the specified host.
1165 * Arguments: host - host that we are restarting
1169 * Notes: When we entered the error handler, we blocked all further
1170 * I/O to this device. We need to 'reverse' this process.
/*
 * scsi_restart_operations: get I/O flowing again on `host'.
 *
 * Wakes everything sleeping on host->host_wait, then walks the host's
 * host_queue device list and calls each device's scsi_request_fn when that
 * pointer is non-NULL.
 * NOTE(review): the declaration of SDpnt is not visible here, and no
 * visible line in this function clears host->in_recovery.
 */
1172 STATIC
void scsi_restart_operations(struct Scsi_Host
*host
)
1177 * Next free up anything directly waiting upon the host. This will be
1178 * requests for character device operations, and also for ioctls to queued
1181 SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: Waking up host to restart\n"));
1183 wake_up(&host
->host_wait
);
1186 * Finally, block devices need an extra kick in the pants. This is because
1187 * the request queueing mechanism may have queued lots of pending requests
1188 * and there won't be a process waiting in a place where we can simply wake
1189 * it up. Thus we simply go through and call the request function to goose
1190 * the various top level drivers and get things moving again.
1192 for (SDpnt
= host
->host_queue
; SDpnt
; SDpnt
= SDpnt
->next
) {
1193 SCSI_LOG_ERROR_RECOVERY(5, printk("Calling request function to restart things...\n"));
1195 if (SDpnt
->scsi_request_fn
!= NULL
)
1196 (*SDpnt
->scsi_request_fn
) ();
1201 * Function: scsi_unjam_host
1203 * Purpose: Attempt to fix a host which has a command that failed for
1206 * Arguments: host - host that needs unjamming.
1210 * Notes: When we come in here, we *know* that all commands on the
1211 * bus have either completed, failed or timed out. We also
1212 * know that no further commands are being sent to the host,
1213 * so things are relatively quiet and we have freedom to
1214 * fiddle with things as we wish.
1216 * Additional note: This is only the *default* implementation. It is possible
1217 * for individual drivers to supply their own version of this
1218 * function, and if the maintainer wishes to do this, it is
1219 * strongly suggested that this function be taken as a template
1220 * and modified. This function was designed to correctly handle
1221 * problems for about 95% of the different cases out there, and
1222 * it should always provide at least a reasonable amount of error
1225 * Note3: Any command marked 'FAILED' or 'TIMEOUT' must eventually
1226 * have scsi_finish_command() called for it. We do all of
1227 * the retry stuff here, so when we restart the host after we
1228 * return it should have an empty queue.
1230 STATIC
int scsi_unjam_host(struct Scsi_Host
*host
)
1240 Scsi_Device
*SDloop
;
1247 * First, protect against any sort of race condition. If any of the outstanding
1248 * commands are in states that indicate that we are not yet blocked (i.e. we are
1249 * not in a quiet state) then we got woken up in error. If we ever end up here,
1250 * we need to re-examine some of the assumptions.
1252 for (SDpnt
= host
->host_queue
; SDpnt
; SDpnt
= SDpnt
->next
) {
1253 for (SCpnt
= SDpnt
->device_queue
; SCpnt
; SCpnt
= SCpnt
->next
) {
1254 if (SCpnt
->state
== SCSI_STATE_FAILED
1255 || SCpnt
->state
== SCSI_STATE_TIMEOUT
1256 || SCpnt
->state
== SCSI_STATE_INITIALIZING
1257 || SCpnt
->state
== SCSI_STATE_UNUSED
) {
1261 * Rats. Something is still floating around out there. This could
1262 * be the result of the fact that the upper level drivers are still frobbing
1263 * commands that might have succeeded. There are two outcomes. One is that
1264 * the command block will eventually be freed, and the other one is that
1265 * the command will be queued and will be finished along the way.
1267 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler prematurely woken - commands still active (%p %x %d)\n", SCpnt
, SCpnt
->state
, SCpnt
->target
));
1270 * panic("SCSI Error handler woken too early\n");
1272 * This is no longer a problem, since now the code cares only about
1273 * SCSI_STATE_TIMEOUT and SCSI_STATE_FAILED.
1274 * Other states are useful only to release active commands when devices are
1275 * set offline. If (host->host_active == host->host_busy) we can safely assume
1276 * that there are no commands in state other then TIMEOUT od FAILED. (DB)
1279 * It is not easy to release correctly commands according to their state when
1280 * devices are set offline, when the state is neither TIMEOUT nor FAILED.
1281 * When a device is set offline, we can have some command with
1282 * rq_status=RQ_SCSY_BUSY, owner=SCSI_STATE_HIGHLEVEL,
1283 * state=SCSI_STATE_INITIALIZING and the driver module cannot be released.
1290 * Next, see if we need to request sense information. if so,
1291 * then get it now, so we have a better idea of what to do.
1292 * FIXME(eric) this has the unfortunate side effect that if a host
1293 * adapter does not automatically request sense information, that we end
1294 * up shutting it down before we request it. All hosts should be doing this
1295 * anyways, so for now all I have to say is tough noogies if you end up in here.
1296 * On second thought, this is probably a good idea. We *really* want to give
1297 * authors an incentive to automatically request this.
1299 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we need to request sense\n"));
1301 for (SDpnt
= host
->host_queue
; SDpnt
; SDpnt
= SDpnt
->next
) {
1302 for (SCpnt
= SDpnt
->device_queue
; SCpnt
; SCpnt
= SCpnt
->next
) {
1303 if (SCpnt
->state
!= SCSI_STATE_FAILED
|| scsi_sense_valid(SCpnt
)) {
1306 SCSI_LOG_ERROR_RECOVERY(2, printk("scsi_unjam_host: Requesting sense for %d\n",
1308 rtn
= scsi_request_sense(SCpnt
);
1309 if (rtn
!= SUCCESS
) {
1312 SCSI_LOG_ERROR_RECOVERY(3, printk("Sense requested for %p - result %x\n",
1313 SCpnt
, SCpnt
->result
));
1314 SCSI_LOG_ERROR_RECOVERY(3, print_sense("bh", SCpnt
));
1316 result
= scsi_decide_disposition(SCpnt
);
1319 * If the result was normal, then just pass it along to the
1322 if (result
== SUCCESS
) {
1323 SCpnt
->host
->host_failed
--;
1324 scsi_eh_finish_command(&SCdone
, SCpnt
);
1326 if (result
!= NEEDS_RETRY
) {
1330 * We only come in here if we want to retry a
1331 * command. The test to see whether the command
1332 * should be retried should be keeping track of the
1333 * number of tries, so we don't end up looping, of
1336 SCpnt
->state
= NEEDS_RETRY
;
1337 rtn
= scsi_eh_retry_command(SCpnt
);
1338 if (rtn
!= SUCCESS
) {
1342 * We eventually hand this one back to the top level.
1344 SCpnt
->host
->host_failed
--;
1345 scsi_eh_finish_command(&SCdone
, SCpnt
);
1350 * Go through the list of commands and figure out where we stand and how bad things
1356 for (SDpnt
= host
->host_queue
; SDpnt
; SDpnt
= SDpnt
->next
) {
1357 unsigned int device_error
= 0;
1359 for (SCpnt
= SDpnt
->device_queue
; SCpnt
; SCpnt
= SCpnt
->next
) {
1360 if (SCpnt
->state
== SCSI_STATE_FAILED
) {
1361 SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d failed\n",
1366 if (SCpnt
->state
== SCSI_STATE_TIMEOUT
) {
1367 SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d timedout\n",
1373 if (device_error
> 0) {
1378 SCSI_LOG_ERROR_RECOVERY(2, printk("Total of %d+%d commands on %d devices require eh work\n",
1379 numfailed
, timed_out
, devices_failed
));
1381 if (host
->host_failed
== 0) {
1386 * Next, try and see whether or not it makes sense to try and abort
1387 * the running command. This only works out to be the case if we have
1388 * one command that has timed out. If the command simply failed, it
1389 * makes no sense to try and abort the command, since as far as the
1390 * host adapter is concerned, it isn't running.
1393 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try abort\n"));
1395 for (SDpnt
= host
->host_queue
; SDpnt
; SDpnt
= SDpnt
->next
) {
1396 for (SCloop
= SDpnt
->device_queue
; SCloop
; SCloop
= SCloop
->next
) {
1397 if (SCloop
->state
!= SCSI_STATE_TIMEOUT
) {
1400 rtn
= scsi_try_to_abort_command(SCloop
, ABORT_TIMEOUT
);
1401 if (rtn
== SUCCESS
) {
1402 rtn
= scsi_test_unit_ready(SCloop
);
1404 if (rtn
== SUCCESS
&& scsi_unit_is_ready(SCloop
)) {
1405 rtn
= scsi_eh_retry_command(SCloop
);
1407 if (rtn
== SUCCESS
) {
1408 SCloop
->host
->host_failed
--;
1409 scsi_eh_finish_command(&SCdone
, SCloop
);
1417 * If we have corrected all of the problems, then we are done.
1419 if (host
->host_failed
== 0) {
1424 * Either the abort wasn't appropriate, or it didn't succeed.
1425 * Now try a bus device reset. Still, look to see whether we have
1426 * multiple devices that are jammed or not - if we have multiple devices,
1427 * it makes no sense to try BUS_DEVICE_RESET - we really would need
1428 * to try a BUS_RESET instead.
1430 * Does this make sense - should we try BDR on each device individually?
1433 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try BDR\n"));
1435 for (SDpnt
= host
->host_queue
; SDpnt
; SDpnt
= SDpnt
->next
) {
1436 for (SCloop
= SDpnt
->device_queue
; SCloop
; SCloop
= SCloop
->next
) {
1437 if (SCloop
->state
== SCSI_STATE_FAILED
1438 || SCloop
->state
== SCSI_STATE_TIMEOUT
) {
1443 if (SCloop
== NULL
) {
1447 * OK, we have a device that is having problems. Try and send
1448 * a bus device reset to it.
1450 * FIXME(eric) - make sure we handle the case where multiple
1451 * commands to the same device have failed. They all must
1452 * get properly restarted.
1454 rtn
= scsi_try_bus_device_reset(SCloop
, RESET_TIMEOUT
);
1456 if (rtn
== SUCCESS
) {
1457 rtn
= scsi_test_unit_ready(SCloop
);
1459 if (rtn
== SUCCESS
&& scsi_unit_is_ready(SCloop
)) {
1460 rtn
= scsi_eh_retry_command(SCloop
);
1462 if (rtn
== SUCCESS
) {
1463 SCloop
->host
->host_failed
--;
1464 scsi_eh_finish_command(&SCdone
, SCloop
);
1470 if (host
->host_failed
== 0) {
1475 * If we ended up here, we have serious problems. The only thing left
1476 * to try is a full bus reset. If someone has grabbed the bus and isn't
1477 * letting go, then perhaps this will help.
1479 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n"));
1482 * We really want to loop over the various channels, and do this on
1483 * a channel by channel basis. We should also check to see if any
1484 * of the failed commands are on soft_reset devices, and if so, skip
1487 for (SDpnt
= host
->host_queue
; SDpnt
; SDpnt
= SDpnt
->next
) {
1489 for (SCpnt
= SDpnt
->device_queue
; SCpnt
; SCpnt
= SCpnt
->next
) {
1490 if (SCpnt
->state
!= SCSI_STATE_FAILED
1491 && SCpnt
->state
!= SCSI_STATE_TIMEOUT
) {
1495 * We have a failed command. Make sure there are no other failed
1496 * commands on the same channel that are timed out and implement a
1499 for (SDloop
= host
->host_queue
; SDloop
; SDloop
= SDloop
->next
) {
1500 for (SCloop
= SDloop
->device_queue
; SCloop
; SCloop
= SCloop
->next
) {
1501 if (SCloop
->channel
!= SCpnt
->channel
) {
1504 if (SCloop
->state
!= SCSI_STATE_FAILED
1505 && SCloop
->state
!= SCSI_STATE_TIMEOUT
) {
1508 if (SDloop
->soft_reset
&& SCloop
->state
== SCSI_STATE_TIMEOUT
) {
1510 * If this device uses the soft reset option, and this
1511 * is one of the devices acting up, then our only
1512 * option is to wait a bit, since the command is
1513 * supposedly still running.
1515 * FIXME(eric) - right now we will just end up falling
1516 * through to the 'take device offline' case.
1518 * FIXME(eric) - It is possible that the command completed
1519 * *after* the error recovery procedure started, and if this
1520 * is the case, we are worrying about nothing here.
1524 * Due to the spinlock, we will never get out of this
1525 * loop without a proper wait (DB)
1535 * We now know that we are able to perform a reset for the
1536 * bus that SCpnt points to. There are no soft-reset devices
1537 * with outstanding timed out commands.
1539 rtn
= scsi_try_bus_reset(SCpnt
);
1540 if (rtn
== SUCCESS
) {
1541 for (SDloop
= host
->host_queue
; SDloop
; SDloop
= SDloop
->next
) {
1542 for (SCloop
= SDloop
->device_queue
; SCloop
; SCloop
= SCloop
->next
) {
1543 if (SCloop
->channel
!= SCpnt
->channel
) {
1546 if (SCloop
->state
!= SCSI_STATE_FAILED
1547 && SCloop
->state
!= SCSI_STATE_TIMEOUT
) {
1550 rtn
= scsi_test_unit_ready(SCloop
);
1552 if (rtn
== SUCCESS
&& scsi_unit_is_ready(SCloop
)) {
1553 rtn
= scsi_eh_retry_command(SCloop
);
1555 if (rtn
== SUCCESS
) {
1556 SCpnt
->host
->host_failed
--;
1557 scsi_eh_finish_command(&SCdone
, SCloop
);
1561 * If the bus reset worked, but we are still unable to
1562 * talk to the device, take it offline.
1563 * FIXME(eric) - is this really the correct thing to do?
1565 if (rtn
!= SUCCESS
) {
1566 SCloop
->device
->online
= FALSE
;
1567 SCloop
->host
->host_failed
--;
1568 scsi_eh_finish_command(&SCdone
, SCloop
);
1576 if (host
->host_failed
== 0) {
1581 * If we ended up here, we have serious problems. The only thing left
1582 * to try is a full host reset - perhaps the firmware on the device
1583 * crashed, or something like that.
1585 * It is assumed that a succesful host reset will cause *all* information
1586 * about the command to be flushed from both the host adapter *and* the
1589 * FIXME(eric) - it isn't clear that devices that implement the soft reset
1590 * option can ever be cleared except via cycling the power. The problem is
1591 * that sending the host reset command will cause the host to forget
1592 * about the pending command, but the device won't forget. For now, we
1593 * skip the host reset option if any of the failed devices are configured
1594 * to use the soft reset option.
1596 for (SDpnt
= host
->host_queue
; SDpnt
; SDpnt
= SDpnt
->next
) {
1598 for (SCpnt
= SDpnt
->device_queue
; SCpnt
; SCpnt
= SCpnt
->next
) {
1599 if (SCpnt
->state
!= SCSI_STATE_FAILED
1600 && SCpnt
->state
!= SCSI_STATE_TIMEOUT
) {
1603 if (SDpnt
->soft_reset
&& SCpnt
->state
== SCSI_STATE_TIMEOUT
) {
1605 * If this device uses the soft reset option, and this
1606 * is one of the devices acting up, then our only
1607 * option is to wait a bit, since the command is
1608 * supposedly still running.
1610 * FIXME(eric) - right now we will just end up falling
1611 * through to the 'take device offline' case.
1613 SCSI_LOG_ERROR_RECOVERY(3,
1614 printk("scsi_unjam_host: Unable to try hard host reset\n"));
1617 * Due to the spinlock, we will never get out of this
1618 * loop without a proper wait. (DB)
1624 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n"));
1627 * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
1629 rtn
= scsi_try_host_reset(SCpnt
);
1630 if (rtn
== SUCCESS
) {
1632 * FIXME(eric) we assume that all commands are flushed from the
1633 * controller. We should get a DID_RESET for all of the commands
1634 * that were pending. We should ignore these so that we can
1635 * guarantee that we are in a consistent state.
1637 * I believe this to be the case right now, but this needs to be
1640 for (SDloop
= host
->host_queue
; SDloop
; SDloop
= SDloop
->next
) {
1641 for (SCloop
= SDloop
->device_queue
; SCloop
; SCloop
= SCloop
->next
) {
1642 if (SCloop
->state
!= SCSI_STATE_FAILED
1643 && SCloop
->state
!= SCSI_STATE_TIMEOUT
) {
1646 rtn
= scsi_test_unit_ready(SCloop
);
1648 if (rtn
== SUCCESS
&& scsi_unit_is_ready(SCloop
)) {
1649 rtn
= scsi_eh_retry_command(SCloop
);
1651 if (rtn
== SUCCESS
) {
1652 SCpnt
->host
->host_failed
--;
1653 scsi_eh_finish_command(&SCdone
, SCloop
);
1656 if (rtn
!= SUCCESS
) {
1657 SCloop
->device
->online
= FALSE
;
1658 SCloop
->host
->host_failed
--;
1659 scsi_eh_finish_command(&SCdone
, SCloop
);
1668 * If we solved all of the problems, then let's rev up the engines again.
1670 if (host
->host_failed
== 0) {
1675 * If the HOST RESET failed, then for now we assume that the entire host
1676 * adapter is too hosed to be of any use. For our purposes, however, it is
1677 * easier to simply take the devices offline that correspond to commands
1680 SCSI_LOG_ERROR_RECOVERY(1, printk("scsi_unjam_host: Take device offline\n"));
1682 for (SDpnt
= host
->host_queue
; SDpnt
; SDpnt
= SDpnt
->next
) {
1683 for (SCloop
= SDpnt
->device_queue
; SCloop
; SCloop
= SCloop
->next
) {
1684 if (SCloop
->state
== SCSI_STATE_FAILED
|| SCloop
->state
== SCSI_STATE_TIMEOUT
) {
1685 SCloop
->device
->online
= FALSE
;
1688 * This should pass the failure up to the top level driver, and
1689 * it will have to try and do something intelligent with it.
1691 SCloop
->host
->host_failed
--;
1693 if (SCloop
->state
== SCSI_STATE_TIMEOUT
) {
1694 SCloop
->result
|= (DRIVER_TIMEOUT
<< 24);
1696 SCSI_LOG_ERROR_RECOVERY(3, printk("Finishing command for device %d %x\n",
1697 SCloop
->device
->id
, SCloop
->result
));
1699 scsi_eh_finish_command(&SCdone
, SCloop
);
1704 if (host
->host_failed
!= 0) {
1705 panic("scsi_unjam_host: Miscount of number of failed commands.\n");
1707 SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Returning\n"));
1714 * We should have a list of commands that we 'finished' during the course of
1715 * error recovery. This should be the same as the list of commands that timed out
1716 * or failed. We are currently holding these things in a linked list - we didn't
1717 * put them in the bottom half queue because we wanted to keep things quiet while
1718 * we were working on recovery, and passing them up to the top level could easily
1719 * cause the top level to try and queue something else again.
1721 * Start by marking that the host is no longer in error recovery.
1723 host
->in_recovery
= 0;
1726 * Take the list of commands, and stick them in the bottom half queue.
1727 * The current implementation of scsi_done will do this for us - if need
1728 * be we can create a special version of this function to do the
1731 for (SCpnt
= SCdone
; SCpnt
!= NULL
; SCpnt
= SCdone
) {
1732 SCdone
= SCpnt
->bh_next
;
1733 SCpnt
->bh_next
= NULL
;
1742 * Function: scsi_error_handler
1744 * Purpose: Handle errors/timeouts of scsi commands, try and clean up
1745 * and unjam the bus, and restart things.
1747 * Arguments: host - host for which we are running.
1749 * Returns: Never returns.
1751 * Notes: This is always run in the context of a kernel thread. The
1752 * idea is that we start this thing up when the kernel starts
1753 * up (one per host that we detect), and it immediately goes to
1754 * sleep and waits for some event (i.e. failure). When this
1755 * takes place, we have the job of trying to unjam the bus
1756 * and restarting things.
1759 void scsi_error_handler(void *data
)
1761 struct Scsi_Host
*host
= (struct Scsi_Host
*) data
;
1763 DECLARE_MUTEX_LOCKED(sem
);
1764 unsigned long flags
;
1765 struct fs_struct
*fs
;
1770 * If we were started as result of loading a module, close all of the
1771 * user space pages. We don't need them, and if we didn't close them
1772 * they would be locked into memory.
1776 current
->session
= 1;
1779 /* Become as one with the init task */
1781 exit_fs(current
); /* current->fs->count--; */
1784 atomic_inc(&fs
->count
);
1786 siginitsetinv(¤t
->blocked
, SHUTDOWN_SIGS
);
1790 * Set the name of this process.
1792 sprintf(current
->comm
, "scsi_eh_%d", host
->host_no
);
1794 host
->eh_wait
= &sem
;
1795 host
->ehandler
= current
;
1800 * Wake up the thread that created us.
1802 SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent %d\n", host
->eh_notify
->count
.counter
));
1804 up(host
->eh_notify
);
1808 * If we get a signal, it means we are supposed to go
1809 * away and die. This typically happens if the user is
1810 * trying to unload a module.
1812 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleeping\n"));
1813 down_interruptible(&sem
);
1815 if (signal_pending(current
))
1818 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking up\n"));
1820 spin_lock_irqsave(&io_request_lock
, flags
);
1821 host
->eh_active
= 1;
1824 * We have a host that is failing for some reason. Figure out
1825 * what we need to do to get it up and online again (if we can).
1826 * If we fail, we end up taking the thing offline.
1828 if (host
->hostt
->eh_strategy_handler
!= NULL
) {
1829 rtn
= host
->hostt
->eh_strategy_handler(host
);
1831 rtn
= scsi_unjam_host(host
);
1834 host
->eh_active
= 0;
1837 * Note - if the above fails completely, the action is to take
1838 * individual devices offline and flush the queue of any
1839 * outstanding requests that may have been pending. When we
1840 * restart, we restart any I/O to any other devices on the bus
1841 * which are still online.
1843 scsi_restart_operations(host
);
1845 /* The spinlock is really needed up to this point. (DB) */
1846 spin_unlock_irqrestore(&io_request_lock
, flags
);
1849 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exiting\n"));
1852 * Make sure that nobody tries to wake us up again.
1854 host
->eh_wait
= NULL
;
1857 * Knock this down too. From this point on, the host is flying
1858 * without a pilot. If this is because the module is being unloaded,
1859 * that's fine. If the user sent a signal to this thing, we are
1860 * potentially in real danger.
1862 host
->in_recovery
= 0;
1863 host
->eh_active
= 0;
1864 host
->ehandler
= NULL
;
1867 * If anyone is waiting for us to exit (i.e. someone trying to unload
1868 * a driver), then wake up that process to let them know we are on
1869 * the way out the door. This may be overkill - I *think* that we
1870 * could probably just unload the driver and send the signal, and when
1871 * the error handling thread wakes up that it would just exit without
1872 * needing to touch any memory associated with the driver itself.
1874 if (host
->eh_notify
!= NULL
)
1875 up(host
->eh_notify
);
1879 * Overrides for Emacs so that we follow Linus's tabbing style.
1880 * Emacs will notice this stuff at the end of the file and automatically
1881 * adjust the settings for this buffer only. This must remain at the end
1883 * ---------------------------------------------------------------------------
1886 * c-brace-imaginary-offset: 0
1887 * c-brace-offset: -4
1888 * c-argdecl-indent: 4
1889 * c-label-offset: -4
1890 * c-continued-statement-offset: 4
1891 * c-continued-brace-offset: 0
1892 * indent-tabs-mode: nil