Merge commit '2cedd8f0ecbd2b29bf0aac72bb8b7413b0326938' into merges
[unleashed.git] / usr / src / cmd / format / analyze.c
blob64417130d2e1e94f3fc329a5f3da4bd80982c29d
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains routines to analyze the surface of a disk.
 */
29 #include "global.h"
30 #include "analyze.h"
31 #include <stdlib.h>
32 #include <errno.h>
33 #include "misc.h"
34 #include "defect.h"
35 #include "label.h"
36 #include "param.h"
37 #include "checkdev.h"
41 * These global variables control the surface analysis process. They
42 * are set from a command in the defect menu.
44 int scan_entire = 1; /* scan whole disk flag */
45 diskaddr_t scan_lower = 0; /* lower bound */
46 diskaddr_t scan_upper = 0; /* upper bound */
47 int scan_correct = 1; /* correct errors flag */
48 int scan_stop = 0; /* stop after error flag */
49 int scan_loop = 0; /* loop forever flag */
50 int scan_passes = 2; /* number of passes */
51 int scan_random = 0; /* random patterns flag */
52 uint_t scan_size = 0; /* sectors/scan operation */
53 int scan_auto = 1; /* scan after format flag */
54 int scan_restore_defects = 1; /* restore defect list after writing */
55 int scan_restore_label = 1; /* restore label after writing */
58 * These are summary variables to print out info after analysis.
59 * Values less than 0 imply they are invalid.
61 offset_t scan_cur_block = -1; /* current block */
62 int64_t scan_blocks_fixed = -1; /* # blocks repaired */
65 * This variable is used to tell whether the most recent surface
66 * analysis error was caused by a media defect or some other problem.
68 int media_error; /* error was caused by defect */
70 int disk_error; /* disk errors during analysis */
/*
 * These are the data patterns used if random patterns are not chosen.
 * They are designed to show pattern dependent errors.  do_scan() picks
 * one entry per pass, indexed by the pass number modulo the table size.
 */
static const unsigned int scan_patterns[] = {
	0xc6dec6de,
	0x6db6db6d,
	0x00000000,
	0xffffffff,
	0xaaaaaaaa,
};

/*
 * Number of predefined patterns.  Computed from the table itself so the
 * count cannot drift out of step when an entry is added or removed; the
 * cast keeps the value an int, as the former literal was.
 */
#define	NPATTERNS	\
	((int)(sizeof (scan_patterns) / sizeof (scan_patterns[0])))

/*
 * These are the data patterns from the SunFed requirements document.
 */
static const unsigned int purge_patterns[] = {	/* patterns to be written */
	0xaaaaaaaa,		/* 10101010... */
	0x55555555,		/* 01010101... == UUUU... */
	0xaaaaaaaa,		/* 10101010... */
	0xaaaaaaaa,		/* 10101010... */
};

static const unsigned int alpha_pattern = 0x40404040; /* 10000000... == @@@@... */
97 /* Function prototypes */
98 #ifdef __STDC__
100 static int scan_repair(diskaddr_t bn, int mode);
101 static int analyze_blocks(int flags, diskaddr_t blkno, uint_t blkcnt,
102 unsigned data, int init, int driver_flags, int *xfercntp);
103 static int handle_error_conditions(void);
104 static int verify_blocks(int flags, diskaddr_t blkno, uint_t blkcnt,
105 unsigned data, int driver_flags, int *xfercntp);
106 #else /* __STDC__ */
108 static int scan_repair();
109 static int analyze_blocks();
110 static int handle_error_conditions();
111 static int verify_blocks();
113 #endif /* __STDC__ */
116 * This routine performs a surface analysis based upon the global
117 * parameters. It is called from several commands in the defect menu,
118 * and from the format command in the command menu (if post-format
119 * analysis is enable).
122 do_scan(flags, mode)
123 int flags, mode;
125 diskaddr_t start, end, curnt;
126 int pass, needinit, data;
127 uint_t size;
128 int status, founderr, i, j;
129 int error = 0;
130 int pattern = 0;
131 int xfercnt;
134 * Check to be sure we aren't correcting without a defect list
135 * if the controller can correct the defect.
137 if (scan_correct && !EMBEDDED_SCSI && (cur_ops->op_repair != NULL) &&
138 (cur_list.list == NULL)) {
139 err_print("Current Defect List must be initialized ");
140 err_print("to do automatic repair.\n");
141 return (-1);
144 * Define the bounds of the scan.
146 if (scan_entire) {
147 start = 0;
148 if (cur_label == L_TYPE_SOLARIS) {
149 if (cur_ctype->ctype_flags & CF_SCSI)
150 end = datasects() - 1;
151 else
152 end = physsects() - 1;
153 } else if (cur_label == L_TYPE_EFI) {
154 end = cur_parts->etoc->efi_last_lba;
156 } else {
157 start = scan_lower;
158 end = scan_upper;
161 * Make sure the user knows if we are scanning over a mounted
162 * partition.
164 if ((flags & (SCAN_PATTERN | SCAN_WRITE)) &&
165 (checkmount(start, end))) {
166 err_print("Cannot do analysis on a mounted partition.\n");
167 return (-1);
171 * Make sure the user knows if we are scanning over a
172 * partition being used for swapping.
174 if ((flags & (SCAN_PATTERN | SCAN_WRITE)) &&
175 (checkswap(start, end))) {
176 err_print("Cannot do analysis on a partition \
177 which is currently being used for swapping.\n");
178 return (-1);
182 * Check to see if any partitions used for svm, vxvm, ZFS zpool
183 * or live upgrade are on the disk.
185 if ((flags & (SCAN_PATTERN | SCAN_WRITE)) &&
186 (checkdevinuse(cur_disk->disk_name, (diskaddr_t)-1,
187 (diskaddr_t)-1, 0, 0))) {
188 err_print("Cannot do analysis on a partition "
189 "while it in use as described above.\n");
190 return (-1);
194 * If we are scanning destructively over certain sectors,
195 * we mark the defect list and/or label dirty so it will get rewritten.
197 if (flags & (SCAN_PATTERN | SCAN_WRITE)) {
198 if (cur_label == L_TYPE_SOLARIS) {
199 if (start < (diskaddr_t)totalsects() &&
200 end >= (diskaddr_t)datasects()) {
201 if (!EMBEDDED_SCSI) {
202 cur_list.flags |= LIST_DIRTY;
204 if (cur_disk->disk_flags & DSK_LABEL)
205 cur_flags |= LABEL_DIRTY;
208 if (start == 0) {
209 if (cur_disk->disk_flags & DSK_LABEL)
210 cur_flags |= LABEL_DIRTY;
214 * Initialize the summary info on sectors repaired.
216 scan_blocks_fixed = 0;
218 * Loop through the passes of the scan. If required, loop forever.
220 for (pass = 0; pass < scan_passes || scan_loop; pass++) {
222 * Determine the data pattern to use if pattern testing
223 * is to be done.
225 if (flags & SCAN_PATTERN) {
226 if (scan_random)
227 data = (int)mrand48();
228 else
229 data = scan_patterns[pass % NPPATTERNS];
231 if (flags & SCAN_PURGE) {
232 flags &= ~(SCAN_PURGE_READ_PASS
233 | SCAN_PURGE_ALPHA_PASS);
234 switch (pattern % (NPPATTERNS + 1)) {
235 case NPPATTERNS:
236 pattern = 0;
237 if (!error) {
238 fmt_print(
239 "\nThe last %d passes were successful, running alpha pattern pass", NPPATTERNS);
240 flags |= SCAN_PURGE_ALPHA_PASS;
241 data = alpha_pattern;
242 } else {
243 data = purge_patterns[pattern];
244 pattern++;
246 break;
247 case READPATTERN:
248 flags |= SCAN_PURGE_READ_PASS;
249 default:
250 data = purge_patterns[pattern];
251 pattern++;
252 break;
255 fmt_print("\n pass %d", pass);
256 fmt_print(" - pattern = 0x%x", data);
257 } else
258 fmt_print("\n pass %d", pass);
260 fmt_print("\n");
262 * Mark the pattern buffer as corrupt, since it
263 * hasn't been initialized.
265 needinit = 1;
267 * Print the first block number to the log file if
268 * logging is on so there is some record of what
269 * analysis was performed.
271 if (log_file) {
272 pr_dblock(log_print, start);
273 log_print("\n");
276 * Loop through this pass, each time analyzing an amount
277 * specified by the global parameters.
279 xfercnt = 0;
280 for (curnt = start; curnt <= end; curnt += size) {
281 if ((end - curnt) < scan_size)
282 size = end - curnt + 1;
283 else
284 size = scan_size;
286 * Print out where we are, so we don't look dead.
287 * Also store it in summary info for logging.
289 scan_cur_block = curnt;
290 nolog_print(" ");
291 pr_dblock(nolog_print, curnt);
292 nolog_print(" \015");
293 (void) fflush(stdout);
294 disk_error = 0;
296 * Do the actual analysis.
298 status = analyze_blocks(flags, curnt, size,
299 (unsigned)data, needinit, (F_ALLERRS | F_SILENT),
300 &xfercnt);
302 * If there were no errors, the pattern buffer is
303 * still initialized, and we just loop to next chunk.
305 needinit = 0;
306 if (!status)
307 continue;
309 * There was an error. Check if surface analysis
310 * can be continued.
312 if (handle_error_conditions()) {
313 scan_blocks_fixed = scan_cur_block = -1;
314 return (-1);
317 * There was an error. Mark the pattern buffer
318 * corrupt so it will get reinitialized.
320 needinit = 1;
322 * If it was not a media error, ignore it.
324 if (!media_error)
325 continue;
327 * Loop 5 times through each sector of the chunk,
328 * analyzing them individually.
330 nolog_print(" ");
331 pr_dblock(nolog_print, curnt);
332 nolog_print(" \015");
333 (void) fflush(stdout);
334 founderr = 0;
335 for (j = 0; j < size * 5; j++) {
336 i = j % size;
337 disk_error = 0;
338 status = analyze_blocks(flags, (curnt + i), 1,
339 (unsigned)data, needinit, F_ALLERRS, NULL);
340 needinit = 0;
341 if (!status)
342 continue;
344 * There was an error. Check if surface analysis
345 * can be continued.
347 if (handle_error_conditions()) {
348 scan_blocks_fixed = scan_cur_block = -1;
349 return (-1);
352 * An error occurred. Mark the buffer
353 * corrupt and see if it was media
354 * related.
356 needinit = 1;
357 if (!media_error)
358 continue;
360 * We found a bad sector. Print out a message
361 * and fix it if required.
363 founderr = 1;
364 if (scan_correct && (flags != SCAN_VALID)) {
365 if (scan_repair(curnt+i, mode)) {
366 error = -1;
368 } else
369 err_print("\n");
371 * Stop after the error if required.
373 if (scan_stop)
374 goto out;
377 * Mark the pattern buffer corrupt to be safe.
379 needinit = 1;
381 * We didn't find an individual sector that was bad.
382 * Print out a warning.
384 if (!founderr) {
385 err_print("Warning: unable to pinpoint ");
386 err_print("defective block.\n");
390 * Print the end of each pass to the log file.
392 enter_critical();
393 if (log_file) {
394 pr_dblock(log_print, scan_cur_block);
395 log_print("\n");
397 scan_cur_block = -1;
398 exit_critical();
399 fmt_print("\n");
402 * alternate the read and write for SCAN_VERIFY test
404 if (flags & SCAN_VERIFY) {
405 flags ^= SCAN_VERIFY_READ_PASS;
408 out:
410 * We got here either by giving up after an error or falling
411 * through after all passes were completed.
413 fmt_print("\n");
414 enter_critical();
416 * If the defect list is dirty, write it to disk,
417 * if scan_restore_defects (the default) is true.
419 if (!EMBEDDED_SCSI && (cur_list.flags & LIST_DIRTY) &&
420 (scan_restore_defects)) {
421 cur_list.flags = 0;
422 write_deflist(&cur_list);
425 * If the label is dirty, write it to disk.
426 * if scan_restore_label (the default) is true.
428 if ((cur_flags & LABEL_DIRTY) && (scan_restore_label)) {
429 cur_flags &= ~LABEL_DIRTY;
430 (void) write_label();
433 * If we dropped down to here after an error, we need to write
434 * the final block number to the log file for record keeping.
436 if (log_file && scan_cur_block >= 0) {
437 pr_dblock(log_print, scan_cur_block);
438 log_print("\n");
440 fmt_print("Total of %lld defective blocks repaired.\n",
441 scan_blocks_fixed);
443 * Reinitialize the logging variables so they don't get used
444 * when they are not really valid.
446 scan_blocks_fixed = scan_cur_block = -1;
447 exit_critical();
448 return (error);
453 * This routine is called to repair a bad block discovered
454 * during a scan operation. Return 0 for success, 1 for failure.
455 * (This has been extracted out of do_scan(), to simplify it.)
457 static int
458 scan_repair(bn, mode)
459 diskaddr_t bn;
460 int mode;
462 int status;
463 int result = 1;
464 char *buf;
465 int buf_is_good;
466 int i;
468 if (cur_ops->op_repair == NULL) {
469 err_print("Warning: Controller does ");
470 err_print("not support repairing.\n\n");
471 return (result);
474 buf = malloc(cur_blksz);
475 if (buf == NULL) {
476 err_print("Warning: no memory.\n\n");
477 return (result);
479 enter_critical();
482 * Determine if the error appears to be hard or soft. We
483 * already assume there's an error. If we can get any
484 * good data out of the sector, write that data back
485 * after the repair.
487 buf_is_good = 0;
488 for (i = 0; i < 5; i++) {
489 status = (*cur_ops->op_rdwr)(DIR_READ, cur_file, bn, 1,
490 buf, F_SILENT, NULL);
491 if (status == 0) {
492 buf_is_good = 1;
493 break;
497 fmt_print("Repairing %s error on %llu (",
498 buf_is_good ? "soft" : "hard", bn);
499 pr_dblock(fmt_print, bn);
500 fmt_print(")...");
502 status = (*cur_ops->op_repair)(bn, mode);
503 if (status) {
505 * If the repair failed, we note it and will return the
506 * failure. However, the analysis goes on.
508 fmt_print("failed.\n\n");
509 } else {
511 * The repair worked. Write the good data we could
512 * recover from the failed block, if possible.
513 * If not, zero the block. In doing so, try to
514 * determine if the new block appears ok.
516 if (!buf_is_good) {
517 bzero(buf, cur_blksz);
518 fmt_print("Warning: Block %llu zero-filled.\n", bn);
519 } else {
520 fmt_print("ok.\n");
522 status = (*cur_ops->op_rdwr)(DIR_WRITE, cur_file, bn,
523 1, buf, (F_SILENT | F_ALLERRS), NULL);
524 if (status == 0) {
525 status = (*cur_ops->op_rdwr)(DIR_READ, cur_file, bn,
526 1, buf, (F_SILENT | F_ALLERRS), NULL);
528 if (status) {
529 fmt_print("The new block also appears defective.\n");
531 fmt_print("\n");
533 * add the defect to the list and write the list out.
534 * Also, kill the working list so it will get resynced
535 * with the current list.
537 * For embedded scsi, we don't require a defect list.
538 * However, if we have one, add the defect if the
539 * list includes the grown list. If not, kill it
540 * to force a resync if we need the list later.
542 if (EMBEDDED_SCSI) {
543 if (cur_list.list != NULL) {
544 if (cur_list.flags & LIST_PGLIST) {
545 add_ldef(bn, &cur_list);
546 } else {
547 kill_deflist(&cur_list);
551 * The next "if" statement reflects the fix for
552 * bug id 1026096 where format keeps adding the
553 * same defect to the defect list.
555 } else if (cur_ctype->ctype_flags & CF_WLIST) {
556 kill_deflist(&cur_list);
557 (*cur_ops->op_ex_cur)(&cur_list);
558 fmt_print("Current list updated\n");
559 } else {
560 add_ldef(bn, &cur_list);
561 write_deflist(&cur_list);
563 kill_deflist(&work_list);
565 /* Log the repair. */
566 scan_blocks_fixed++;
568 /* return ok */
569 result = 0;
572 exit_critical();
573 free(buf);
574 return (result);
579 * This routine analyzes a set of sectors on the disk. It simply returns
580 * an error if a defect is found. It is called by do_scan().
582 static int
583 analyze_blocks(flags, blkno, blkcnt, data, init, driver_flags, xfercntp)
584 int flags, driver_flags, init;
585 uint_t blkcnt;
586 register unsigned data;
587 diskaddr_t blkno;
588 int *xfercntp;
590 int corrupt = 0;
591 int status;
592 register diskaddr_t i, nints;
593 register unsigned *ptr = (uint_t *)pattern_buf;
595 media_error = 0;
596 if (flags & SCAN_VERIFY) {
597 return (verify_blocks(flags, blkno, blkcnt, data,
598 driver_flags, xfercntp));
602 * Initialize the pattern buffer if necessary.
604 nints = (diskaddr_t)blkcnt * cur_blksz / sizeof (int);
605 if ((flags & SCAN_PATTERN) && init) {
606 for (i = 0; i < nints; i++)
607 *((int *)((int *)pattern_buf + i)) = data;
610 * Lock out interrupts so we can insure valid data will get
611 * restored. This is necessary because there are modes
612 * of scanning that corrupt the disk data then restore it at
613 * the end of the analysis.
615 enter_critical();
617 * If the disk data is valid, read it into the data buffer.
619 if (flags & SCAN_VALID) {
620 status = (*cur_ops->op_rdwr)(DIR_READ, cur_file, blkno,
621 blkcnt, (caddr_t)cur_buf, driver_flags, xfercntp);
622 if (status)
623 goto bad;
626 * If we are doing pattern testing, write and read the pattern
627 * from the pattern buffer.
629 if (flags & SCAN_PATTERN) {
631 * If the disk data was valid, mark it corrupt so we know
632 * to restore it later.
634 if (flags & SCAN_VALID)
635 corrupt++;
637 * Only write if we're not on the read pass of SCAN_PURGE.
639 if (!(flags & SCAN_PURGE_READ_PASS)) {
640 status = (*cur_ops->op_rdwr)(DIR_WRITE, cur_file, blkno,
641 blkcnt, (caddr_t)pattern_buf, driver_flags,
642 xfercntp);
643 if (status)
644 goto bad;
647 * Only read if we are on the read pass of SCAN_PURGE, if we
648 * are purging.
650 if ((!(flags & SCAN_PURGE)) || (flags & SCAN_PURGE_READ_PASS)) {
651 status = (*cur_ops->op_rdwr)(DIR_READ, cur_file, blkno,
652 blkcnt, (caddr_t)pattern_buf, driver_flags,
653 xfercntp);
654 if (status)
655 goto bad;
659 * If we are doing a data compare, make sure the pattern
660 * came back intact.
661 * Only compare if we are on the read pass of SCAN_PURGE, or
662 * we wrote random data instead of the expected data pattern.
664 if ((flags & SCAN_COMPARE) || (flags & SCAN_PURGE_READ_PASS)) {
665 for (i = nints, ptr = (uint_t *)pattern_buf; i; i--)
666 if (*ptr++ != data) {
667 err_print("Data miscompare error (expecting ");
668 err_print("0x%x, got 0x%x) at ", data,
669 *((int *)((int *)pattern_buf +
670 (nints - i))));
671 pr_dblock(err_print, blkno);
672 err_print(", offset = 0x%llx.\n",
673 (nints - i) * sizeof (int));
674 goto bad;
678 * If we are supposed to write data out, do so.
680 if (flags & SCAN_WRITE) {
681 status = (*cur_ops->op_rdwr)(DIR_WRITE, cur_file, blkno,
682 blkcnt, (caddr_t)cur_buf, driver_flags, xfercntp);
683 if (status)
684 goto bad;
686 exit_critical();
688 * No errors occurred, return ok.
690 return (0);
691 bad:
693 * There was an error. If the data was corrupted, we write it
694 * out from the data buffer to restore it.
696 if (corrupt) {
697 if ((*cur_ops->op_rdwr)(DIR_WRITE, cur_file, blkno,
698 blkcnt, (caddr_t)cur_buf, F_NORMAL, xfercntp))
699 err_print("Warning: unable to restore original data.\n");
701 exit_critical();
703 * Return the error.
705 return (-1);
710 * This routine analyzes a set of sectors on the disk. It simply returns
711 * an error if a defect is found. It is called by analyze_blocks().
712 * For simplicity, this is done as a separate function instead of
713 * making the analyze_block routine complex.
715 * This routine implements the 'verify' command. It writes the disk
716 * by writing unique data for each block; after the write pass, it
717 * reads the data and verifies for correctness. Note that the entire
718 * disk (or the range of disk) is fully written first and then read.
719 * This should eliminate any caching effect on the drives.
721 static int
722 verify_blocks(int flags,
723 diskaddr_t blkno,
724 uint_t blkcnt,
725 unsigned data,
726 int driver_flags,
727 int *xfercntp)
729 int status, i, nints;
730 unsigned *ptr = (uint_t *)pattern_buf;
732 nints = cur_blksz / sizeof (int);
735 * Initialize the pattern buffer if we are in write pass.
736 * Use the block number itself as data, each block has unique
737 * buffer data that way.
739 if (!(flags & SCAN_VERIFY_READ_PASS)) {
740 for (data = blkno; data < blkno + blkcnt; data++) {
741 for (i = 0; i < nints; i++) {
742 *ptr++ = data;
745 ptr = (uint_t *)pattern_buf;
749 * Only write if we're not on the read pass of SCAN_VERIFY.
751 if (!(flags & SCAN_VERIFY_READ_PASS)) {
752 status = (*cur_ops->op_rdwr)(DIR_WRITE, cur_file, blkno,
753 blkcnt, (caddr_t)pattern_buf, driver_flags, xfercntp);
754 if (status)
755 goto bad;
756 } else {
758 * Only read if we are on the read pass of SCAN_VERIFY
760 status = (*cur_ops->op_rdwr)(DIR_READ, cur_file, blkno,
761 blkcnt, (caddr_t)pattern_buf, driver_flags, xfercntp);
762 if (status)
763 goto bad;
765 * compare and make sure the pattern came back intact.
767 for (data = blkno; data < blkno + blkcnt; data++) {
768 for (i = 0; i < nints; i++) {
769 if (*ptr++ != data) {
770 ptr--;
771 err_print("Data miscompare error "
772 "(expecting 0x%x, got 0x%x) at ",
773 data, *ptr);
774 pr_dblock(err_print, blkno);
775 err_print(", offset = 0x%x.\n",
776 (ptr - (uint_t *)pattern_buf) *
777 sizeof (int));
778 goto bad;
784 * No errors occurred, return ok.
786 return (0);
787 bad:
788 return (-1);
792 static int
793 handle_error_conditions()
797 * Check if the errno is ENXIO.
799 if (errno == ENXIO) {
800 fmt_print("\n\nWarning:Cannot access drive, ");
801 fmt_print("aborting surface analysis.\n");
802 return (-1);
805 * check for disk errors
807 switch (disk_error) {
808 case DISK_STAT_RESERVED:
809 case DISK_STAT_UNAVAILABLE:
810 fmt_print("\n\nWarning:Drive may be reserved ");
811 fmt_print("or has been removed, ");
812 fmt_print("aborting surface analysis.\n");
813 return (-1);
814 case DISK_STAT_NOTREADY:
815 fmt_print("\n\nWarning: Drive not ready, ");
816 fmt_print("aborting surface analysis.\n");
817 return (-1);
818 case DISK_STAT_DATA_PROTECT:
819 fmt_print("\n\nWarning: Drive is write protected, ");
820 fmt_print("aborting surface analysis.\n");
821 return (-1);
822 default:
823 break;
825 return (0);