4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2014, Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
25 * Copyright 2016 Joyent, Inc.
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/systm.h>
38 #include <sys/vnode.h>
39 #include <sys/errno.h>
40 #include <sys/memlist.h>
41 #include <sys/dumphdr.h>
42 #include <sys/dumpadm.h>
43 #include <sys/ksyms.h>
44 #include <sys/compress.h>
45 #include <sys/stream.h>
46 #include <sys/strsun.h>
47 #include <sys/cmn_err.h>
48 #include <sys/bitmap.h>
49 #include <sys/modctl.h>
50 #include <sys/utsname.h>
51 #include <sys/systeminfo.h>
55 #include <sys/debug.h>
56 #include <sys/sunddi.h>
57 #include <sys/fs_subr.h>
58 #include <sys/fs/snode.h>
59 #include <sys/ontrap.h>
60 #include <sys/panic.h>
63 #include <sys/errorq.h>
64 #include <sys/fm/util.h>
65 #include <sys/fs/zfs.h>
72 #include <vm/seg_kmem.h>
73 #include <sys/clock_impl.h>
74 #include <sys/hold_page.h>
/*
 * Global dump configuration state.  Per the comment on dump_lock below,
 * configuration changes are serialized by dump_lock (the dump functions
 * ASSERT(MUTEX_HELD(&dump_lock)) before touching these).
 */
kmutex_t dump_lock;		/* lock for dump configuration */
dumphdr_t *dumphdr;		/* dump header */
int dump_conflags = DUMP_KERNEL; /* dump configuration flags */
vnode_t *dumpvp;		/* dump device vnode pointer */
uoff_t dumpvp_size;		/* size of dump device, in bytes */
char *dumppath;			/* pathname of dump device */
int dump_timeout = 120;		/* timeout for dumping pages */
int dump_timeleft;		/* portion of dump_timeout remaining */
int dump_ioerr;			/* dump i/o error */
char *dump_stack_scratch;	/* scratch area for saving stack summary */
/*
 * Tunables for dump.  These can be set via /etc/system.
 *
 * dump_metrics_on	if set, metrics are collected in the kernel, passed
 *			to savecore via the dump file, and recorded by savecore
 *			in ... (remainder of this comment elided in excerpt)
 */

/* tunables for pre-reserved heap */
uint_t dump_kmem_permap = 1024;	/* passed to kmem_dump_init() below */
uint_t dump_kmem_pages = 8;	/* scaled by PAGESIZE in kmem_dump_init() */
/*
 * Compression metrics are accumulated nano-second subtotals. The
 * results are normalized by the number of pages dumped. A report is
 * generated when dumpsys() completes and is saved in the dump image
 * after the trailing dump header.
 *
 * Metrics are always collected. Set the variable dump_metrics_on to
 * cause metrics to be saved in the crash file, where savecore will
 * save it in the file METRICS.txt.
 */
/*
 * NOTE(review): this excerpt elides the macro that carries the metric
 * list (only the partial continuation line below is visible) and the
 * remainder of the perpage typedef (its closing brace / tag).
 */
	PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \

typedef struct perpage {
#define	PERPAGE(x) hrtime_t x;

/*
 * If dump_metrics_on is set to 1, the timing information is passed to
 * savecore via the crash file, where it is appended to the file
 * dump-dir/METRICS.txt.
 */
uint_t dump_metrics_on = 0;	/* set to 1 to enable recording metrics */
/*
 * Per-metric timing helpers: HRSTART(v, m) records a start timestamp for
 * metric m in the token-pasted "v##ts" (timestamp) struct; HRSTOP(v, m)
 * accumulates the elapsed nanoseconds into v.m.  Used with the dumpcfg
 * perpage/perpagets pairs during dumpsys().
 */
#define	HRSTART(v, m)		v##ts.m = gethrtime()
#define	HRSTOP(v, m)		v.m += gethrtime() - v##ts.m
/*
 * OS image UUID: 36 characters plus NUL terminator.  Set once by
 * dump_set_uuid() and read back via dump_get_uuid().
 */
static char dump_osimage_uuid[36 + 1];
/*
 * Minimal ASCII digit / hex-digit classifiers, used by dump_set_uuid()
 * to validate a UUID string.  Arguments may be evaluated more than once,
 * so callers must pass side-effect-free expressions.
 */
#define	isdigit(ch)	((ch) >= '0' && (ch) <= '9')
#define	isxdigit(ch)	(isdigit(ch) || \
	((ch) >= 'A' && (ch) <= 'F') || \
	((ch) >= 'a' && (ch) <= 'f'))
/*
 * configuration vars for dumpsys
 *
 * NOTE(review): this excerpt elides several members and the closing of
 * the typedef; visible members are reproduced verbatim.
 */
typedef struct dumpcfg {
	char *page;		/* buffer for page copy */
	char *lzbuf;		/* lzjb output */
	char *cmap;		/* array of input (map) buffers */
	ulong_t *bitmap;	/* bitmap for marking pages to dump */
	pgcnt_t bitmapsize;	/* size of bitmap */
	pid_t *pids;		/* list of process IDs at dump time */
	perpage_t perpage;	/* per page metrics */
	perpage_t perpagets;	/* per page metrics (timestamps) */
	pgcnt_t npages;		/* subtotal of pages dumped */
	pgcnt_t pages_mapped;	/* subtotal of pages mapped */
	pgcnt_t pages_used;	/* subtotal of pages used per map */
	size_t nwrite;		/* subtotal of bytes written */
	hrtime_t elapsed;	/* elapsed time when completed */
	hrtime_t iotime;	/* time spent writing nwrite bytes */
	hrtime_t iowait;	/* time spent waiting for output */
	hrtime_t iowaitts;	/* iowait timestamp */

	/*
	 * There is one I/O buffer used by dumpvp_write and dumvp_flush. It
	 * is sized according to the optimum device transfer speed.
	 * (NOTE(review): elsewhere these are accessed as dumpcfg.buf.*,
	 * so the members below presumably live in a nested buffer struct
	 * whose declaration is elided here — verify against full source.)
	 */
	vnode_t *cdev_vp;	/* VCHR open of the dump device */
	len_t vp_limit;		/* maximum write offset */
	offset_t vp_off;	/* current dump device offset */
	char *cur;		/* dump write pointer */
	char *start;		/* dump buffer address */
	char *end;		/* dump buffer end */
	size_t size;		/* size of dump buf in bytes */
	size_t iosize;		/* best transfer size for device */

static dumpcfg_t dumpcfg;	/* config vars */
/*
 * The dump I/O buffer must be at least one page, at most xfer_size bytes,
 * and should scale with physmem in between.  The transfer size passed in
 * will either represent a global default (maxphys) or the best size for the
 * device.  The size of the dump I/O buffer is limited by dumpbuf_limit (8MB
 * by default) because the dump performance saturates beyond a certain size.
 * The default is to select 1/4096 of the memory.
 */
static int dumpbuf_fraction = 12;	/* memory size scale factor */
static size_t dumpbuf_limit = 8 << 20;	/* max I/O buf size */
/*
 * Compute the dump I/O buffer size: scale physmem down by
 * dumpbuf_fraction, clamp against dumpbuf_limit, and round down to a
 * page multiple.  NOTE(review): the return type, braces, and the two
 * clamp assignments for the PAGESIZE/xfer_size tests are elided in this
 * excerpt; visible tokens reproduced verbatim.
 */
dumpbuf_iosize(size_t xfer_size)
	size_t iosize = ptob(physmem >> dumpbuf_fraction);

	if (iosize < PAGESIZE)
		/* (clamp statement elided) */
	else if (iosize > xfer_size)
		/* (clamp statement elided) */
	if (iosize > dumpbuf_limit)
		iosize = dumpbuf_limit;
	return (iosize & PAGEMASK);
/*
 * resize the I/O buffer
 *
 * Grow the dump I/O buffer to the best size for the configured device
 * (never shrink: a new buffer is only allocated when the computed size
 * exceeds the current one).  Caller must hold dump_lock.
 * NOTE(review): the function signature, braces, and the declarations of
 * new_size/new_buf are elided in this excerpt.
 */
	char *old_buf = dumpcfg.buf.start;
	size_t old_size = dumpcfg.buf.size;

	ASSERT(MUTEX_HELD(&dump_lock));

	new_size = dumpbuf_iosize(MAX(dumpcfg.buf.iosize, maxphys));
	if (new_size <= old_size)
		return; /* no need to reallocate buffer */

	new_buf = kmem_alloc(new_size, KM_SLEEP);
	dumpcfg.buf.size = new_size;
	dumpcfg.buf.start = new_buf;
	dumpcfg.buf.end = new_buf + new_size;
	kmem_free(old_buf, old_size);
/*
 * dump_update_clevel is called when dumpadm configures the dump device.
 * Allocate the minimum configuration for now.
 *
 * When the dump file is configured we reserve a minimum amount of
 * memory for use at crash time. But we reserve VA for all the memory
 * we really want in order to do the fastest dump possible. The VA is
 * backed by pages not being dumped, according to the bitmap. If
 * there is insufficient spare memory, however, we fall back to the
 * ... (comment continues; elided in this excerpt)
 *
 * Live dump (savecore -L) always uses the minimum config.
 *
 * NOTE(review): the function signature, braces, and several statements
 * are elided in this excerpt; visible code reproduced verbatim.
 */
	dumpcfg_t *old = &dumpcfg;
	dumpcfg_t newcfg = *old;
	dumpcfg_t *new = &newcfg;

	ASSERT(MUTEX_HELD(&dump_lock));

	/*
	 * Free the previously allocated bufs and VM.
	 */
	kmem_free(old->lzbuf, PAGESIZE);
	kmem_free(old->page, PAGESIZE);

	/* VM space for mapping pages */
	vmem_xfree(heap_arena, old->cmap, PAGESIZE);

	/*
	 * Allocate new data structures and buffers, and also figure the max
	 * ... (comment continues; elided)
	 */
	new->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
	new->page = kmem_alloc(PAGESIZE, KM_SLEEP);
	new->cmap = vmem_xalloc(heap_arena, PAGESIZE, PAGESIZE,
	    0, 0, NULL, NULL, VM_SLEEP);

	/*
	 * Reserve memory for kmem allocation calls made during crash
	 * dump. The hat layer allocates memory for each mapping
	 * created, and the I/O path allocates buffers and data structs.
	 * Add a few pages for safety.
	 */
	kmem_dump_init(dump_kmem_permap + (dump_kmem_pages * PAGESIZE));

	/* set new config pointers */
/*
 * Define a struct memlist walker to optimize bitnum to pfn
 * lookup. The walker maintains the state of the list traversal.
 *
 * NOTE(review): the closing of this typedef is elided in the excerpt.
 */
typedef struct dumpmlw {
	struct memlist *mp;	/* current memlist */
	pgcnt_t basenum;	/* bitnum base offset */
	pgcnt_t mppages;	/* current memlist size */
	pgcnt_t mpleft;		/* size to end of current memlist */
	pfn_t mpaddr;		/* first pfn in memlist */
/* initialize the walker */
/*
 * Reset the walker to the head of phys_install and derive the page
 * count and first pfn of that first memlist segment.
 * NOTE(review): return type, braces, and possibly a bzero of *pw are
 * elided in this excerpt.
 */
dump_init_memlist_walker(dumpmlw_t *pw)
	pw->mp = phys_install;
	pw->mppages = pw->mp->ml_size >> PAGESHIFT;
	pw->mpleft = pw->mppages;
	pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
/*
 * Lookup pfn given bitnum. The memlist can be quite long on some
 * systems (e.g.: one per board). To optimize sequential lookups, the
 * caller initializes and presents a memlist walker.
 *
 * Returns PFN_INVALID when the bitnum is past the end of the memlists.
 * NOTE(review): return type, braces and some closing punctuation are
 * elided in this excerpt; visible code reproduced verbatim.
 */
dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
	/* work relative to the walker's cached base offset */
	bitnum -= pw->basenum;
	while (pw->mp != NULL) {
		if (bitnum < pw->mppages) {
			pw->mpleft = pw->mppages - bitnum;
			return (pw->mpaddr + bitnum);
		/* advance to the next memlist segment */
		bitnum -= pw->mppages;
		pw->basenum += pw->mppages;
		pw->mp = pw->mp->ml_next;
		if (pw->mp != NULL) {
			pw->mppages = pw->mp->ml_size >> PAGESHIFT;
			pw->mpleft = pw->mppages;
			pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
	return (PFN_INVALID);
/*
 * Inverse of dump_bitnum_to_pfn(): scan phys_install linearly and
 * return the bitmap index for pfn, or (pgcnt_t)-1 if the pfn is not in
 * any memlist segment.
 * NOTE(review): return type, braces, and the declarations of mp/bitnum
 * are elided in this excerpt.
 */
dump_pfn_to_bitnum(pfn_t pfn)
	for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
		if (pfn >= (mp->ml_address >> PAGESHIFT) &&
		    pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
			return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
		bitnum += mp->ml_size >> PAGESHIFT;
	return ((pgcnt_t)-1);
/*
 * One-time allocation of the dump header, I/O buffer, pid list and
 * stack-summary scratch area, plus (re)allocation of the page bitmap
 * whenever the physical page count changes.  Caller holds dump_lock.
 * NOTE(review): the function signature, braces, the npages declaration
 * and some statements are elided in this excerpt; one kmem_free call is
 * visibly truncated mid-expression.
 */
	ASSERT(MUTEX_HELD(&dump_lock));

	if (dumphdr == NULL) {
		/* first call: build the header and fixed buffers */
		dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP);
		dumphdr->dump_magic = DUMP_MAGIC;
		dumphdr->dump_version = DUMP_VERSION;
		dumphdr->dump_wordsize = DUMP_WORDSIZE;
		dumphdr->dump_pageshift = PAGESHIFT;
		dumphdr->dump_pagesize = PAGESIZE;
		dumphdr->dump_utsname = utsname;
		(void) strcpy(dumphdr->dump_platform, platform);
		dumpcfg.buf.size = dumpbuf_iosize(maxphys);
		dumpcfg.buf.start = kmem_alloc(dumpcfg.buf.size, KM_SLEEP);
		dumpcfg.buf.end = dumpcfg.buf.start + dumpcfg.buf.size;
		dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP);
		dump_stack_scratch = kmem_alloc(STACK_BUF_SIZE, KM_SLEEP);
		(void) strncpy(dumphdr->dump_uuid, dump_get_uuid(),
		    sizeof (dumphdr->dump_uuid));

	npages = num_phys_pages();

	if (dumpcfg.bitmapsize != npages) {
		/* page count changed: swap in a correctly-sized bitmap */
		void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP);

		if (dumpcfg.bitmap != NULL)
			kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.
		dumpcfg.bitmap = map;
		dumpcfg.bitmapsize = npages;
/*
 * Establish a new dump device.
 *
 * Validates the candidate vnode, records its size/path, and — for block
 * devices — opens the corresponding character device to discover the
 * best transfer size and to "dumpify" zvols.  Caller holds dump_lock.
 * NOTE(review): the return type, braces, several error-path statements
 * and string arguments are elided in this excerpt; visible code is
 * reproduced verbatim.
 */
dumpinit(vnode_t *vp, char *name, int justchecking)
	ASSERT(MUTEX_HELD(&dump_lock));

	cvp = common_specvp(vp);

	/*
	 * Determine whether this is a plausible dump device.  We want either:
	 * (1) a real device that's not mounted and has a cb_dump routine, or
	 * (2) a swapfile on some filesystem that has a vop_dump routine.
	 */
	if ((error = fop_open(&cvp, FREAD | FWRITE, kcred, NULL)) != 0)

	vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV;
	if ((error = fop_getattr(cvp, &vattr, 0, kcred, NULL)) == 0) {
		if (vattr.va_type == VBLK || vattr.va_type == VCHR) {
			if (devopsp[getmajor(vattr.va_rdev)]->
			    devo_cb_ops->cb_dump == nodev)
			else if (vfs_devismounted(vattr.va_rdev))
			if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
			IS_SWAPVP(common_specvp(cvp)))
		if (cvp->v_op->vop_dump == fs_nosys ||
		    cvp->v_op->vop_dump == NULL ||

	/* device must have room for the log/ereport save areas */
	if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE)

	if (error || justchecking) {
		(void) fop_close(cvp, FREAD | FWRITE, 1, 0,

	dumpfini();	/* unconfigure the old dump device */

	dumpvp_size = vattr.va_size & -DUMP_OFFSET;
	dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP);
	(void) strcpy(dumppath, name);
	dumpcfg.buf.iosize = 0;

	/*
	 * If the dump device is a block device, attempt to open up the
	 * corresponding character device and determine its maximum transfer
	 * size. We use this information to potentially resize dump buffer
	 * to a larger and more optimal size for performing i/o to the dump
	 * device.
	 */
	if (cvp->v_type == VBLK &&
	    (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) {
		if (fop_open(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			struct dk_minfo minf;

			/* prefer the device's real logical block size */
			if (fop_ioctl(cdev_vp, DKIOCGMEDIAINFO,
			    (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL)
			    == 0 && minf.dki_lbsize != 0)
				blk_size = minf.dki_lbsize;
				blk_size = DEV_BSIZE;

			if (fop_ioctl(cdev_vp, DKIOCINFO, (intptr_t)&dki,
			    FKIOCTL, kcred, NULL, NULL) == 0) {
				dumpcfg.buf.iosize =
				    dki.dki_maxtransfer * blk_size;

			/*
			 * If we are working with a zvol then dumpify it
			 * if it's not being used as swap.
			 */
			if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
				if (IS_SWAPVP(common_specvp(cvp)))
				else if ((error = fop_ioctl(cdev_vp,
				    DKIOCDUMPINIT, (intptr_t)NULL, FKIOCTL,
				    kcred, NULL, NULL)) != 0)

			(void) fop_close(cdev_vp, FREAD | FWRITE, 1, 0,

	cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name,
	    dumpvp_size >> 20);

	dump_update_clevel();
/*
 * Unconfigure the current dump device: free the recorded path, give a
 * zvol-backed device a chance to clean up (DKIOCDUMPFINI) and close the
 * dump vnode.  Caller holds dump_lock.
 * NOTE(review): the function signature, braces, NULL-ing of the globals
 * and the is_zfs guard around the zvol block are elided in this excerpt.
 */
	boolean_t is_zfs = B_FALSE;

	ASSERT(MUTEX_HELD(&dump_lock));

	kmem_free(dumppath, strlen(dumppath) + 1);

	/*
	 * Determine if we are using zvols for our dump device
	 */
	vattr.va_mask = AT_RDEV;
	if (fop_getattr(dumpvp, &vattr, 0, kcred, NULL) == 0) {
		is_zfs = (getmajor(vattr.va_rdev) ==
		    ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE;

	/*
	 * If we have a zvol dump device then we call into zfs so
	 * that it may have a chance to cleanup.
	 */
	(cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) {
		if (fop_open(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			(void) fop_ioctl(cdev_vp, DKIOCDUMPFINI,
			    (intptr_t)NULL,
			    FKIOCTL, kcred, NULL, NULL);
			(void) fop_close(cdev_vp, FREAD | FWRITE, 1, 0,

	(void) fop_close(dumpvp, FREAD | FWRITE, 1, 0, kcred, NULL);
/*
 * Flush the dump I/O buffer to the device: round the pending bytes up
 * to a page multiple, write via fop_dump() or vn_rdwr() (both call
 * sites are visible; the selecting condition is elided), account the
 * I/O and wait times, reset the write pointer, and return the new
 * device offset.  Sets dump_ioerr on the first write error.
 * NOTE(review): the function signature, braces, the iotime/err
 * declarations and several statements are elided in this excerpt.
 */
	size_t size = P2ROUNDUP(dumpcfg.buf.cur - dumpcfg.buf.start, PAGESIZE);

	if (dumpcfg.buf.vp_off + size > dumpcfg.buf.vp_limit) {
		/* would overrun the device limit: clamp and drop the data */
		dumpcfg.buf.vp_off = dumpcfg.buf.vp_limit;
	} else if (size != 0) {
		iotime = gethrtime();
		/* time since last write completion counts as I/O wait */
		dumpcfg.iowait += iotime - dumpcfg.iowaitts;
		err = fop_dump(dumpvp, dumpcfg.buf.start,
		    lbtodb(dumpcfg.buf.vp_off), btod(size), NULL);
		err = vn_rdwr(UIO_WRITE, dumpcfg.buf.cdev_vp != NULL ?
		    dumpcfg.buf.cdev_vp : dumpvp, dumpcfg.buf.start, size,
		    dumpcfg.buf.vp_off, UIO_SYSSPACE, 0, dumpcfg.buf.vp_limit,
		if (err && dump_ioerr == 0)
		dumpcfg.iowaitts = gethrtime();
		dumpcfg.iotime += dumpcfg.iowaitts - iotime;
		dumpcfg.nwrite += size;
		dumpcfg.buf.vp_off += size;

	dumpcfg.buf.cur = dumpcfg.buf.start;
	dump_timeleft = dump_timeout;
	return (dumpcfg.buf.vp_off);
/* maximize write speed by keeping seek offset aligned with size */
/*
 * Buffered write to the dump device.  Copies from va into the I/O
 * buffer, flushing as the buffer fills; when the device offset is not
 * aligned with the (power-of-two) buffer size, it flushes an aligned
 * prefix and slides the remainder down with ovbcopy() so subsequent
 * flushes stay aligned.
 * NOTE(review): return type, braces, the surrounding while-loop and the
 * len/off/sz declarations are elided in this excerpt.
 */
dumpvp_write(const void *va, size_t size)
	len = MIN(size, dumpcfg.buf.end - dumpcfg.buf.cur);
	off = P2PHASE(dumpcfg.buf.vp_off, dumpcfg.buf.size);
	if (off == 0 || !ISP2(dumpcfg.buf.size)) {
		(void) dumpvp_flush();
	sz = dumpcfg.buf.size - off;
	dumpcfg.buf.cur = dumpcfg.buf.start + sz;
	(void) dumpvp_flush();
	/* regions overlap, so ovbcopy (not bcopy) is required */
	ovbcopy(dumpcfg.buf.start + sz, dumpcfg.buf.start, off);
	dumpcfg.buf.cur += off;
	bcopy(va, dumpcfg.buf.cur, len);
	va = (char *)va + len;
	dumpcfg.buf.cur += len;
/*
 * ksyms_snapshot() callback: forward the symbol-table chunk straight to
 * the buffered dump writer (dst is unused here).
 * NOTE(review): return type, braces and the return statement are elided
 * in this excerpt.
 */
dumpvp_ksyms_write(const void *src, void *dst, size_t size)
	dumpvp_write(src, size);
/*
 * Mark 'pfn' in the bitmap and dump its translation table entry.
 *
 * Counts the page once (dump_npages) on first sighting, always counts
 * the <as, va, pfn> translation (dump_nvtop), and writes a mem_vtop_t
 * record.  Refreshes the dump watchdog (dump_timeleft).
 * NOTE(review): return type, braces, and the mem_vtop m_as/m_va field
 * assignments are elided in this excerpt.
 */
dump_addpage(struct as *as, void *va, pfn_t pfn)
	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
			dumphdr->dump_npages++;
			BT_SET(dumpcfg.bitmap, bitnum);
		dumphdr->dump_nvtop++;
		mem_vtop.m_pfn = pfn;
		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
	dump_timeleft = dump_timeout;
/*
 * Mark 'pfn' in the bitmap
 *
 * Same as dump_addpage() minus the translation record: count the page
 * on first sighting and set its bitmap bit, then refresh the watchdog.
 * NOTE(review): the function signature and braces are elided in this
 * excerpt.
 */
	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
			dumphdr->dump_npages++;
			BT_SET(dumpcfg.bitmap, bitnum);
	dump_timeleft = dump_timeout;
/*
 * Dump the <as, va, pfn> information for a given address space.
 * segop_dump() will call dump_addpage() for each page in the segment.
 *
 * NOTE(review): return type, braces, the segment-dump call inside the
 * loop and the AS_LOCK_EXIT are elided in this excerpt; the cmn_err is
 * presumably inside an error branch of the loop — verify against full
 * source.
 */
dump_as(struct as *as)
	AS_LOCK_ENTER(as, RW_READER);
	for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
		if (seg->s_ops == NULL)
	cmn_err(CE_WARN, "invalid segment %p in address space %p",
	    (void *)seg, (void *)as);
/*
 * Dump one process's address space (skipping the kernel's kas), under
 * sprlock()/p_lock.
 * NOTE(review): return type, braces, the dump_as() call and the
 * sprunlock path are elided in this excerpt.
 */
dump_process(pid_t pid)
	proc_t *p = sprlock(pid);

	if (p->p_as != &kas) {
		/* drop p_lock around the address-space walk */
		mutex_exit(&p->p_lock);
		mutex_enter(&p->p_lock);
/*
 * The following functions (dump_summary(), dump_ereports(), and
 * dump_messages()), write data to an uncompressed area within the
 * crashdump. The layout of these is
 *
 * +------------------------------------------------------------+
 * |     compressed pages       | summary | ereports | messages |
 * +------------------------------------------------------------+
 *
 * With the advent of saving a compressed crash dump by default, we
 * need to save a little more data to describe the failure mode in
 * an uncompressed buffer available before savecore uncompresses
 * the dump. Initially this is a copy of the stack trace. Additional
 * summary information should be added here.
 *
 * dump_summary(): write the stack-summary record (magic + checksum +
 * scratch buffer) into the summary area, terminated by a zero magic.
 * NOTE(review): the function signature, braces, local declarations and
 * part of the vp_limit expression are elided in this excerpt.
 */
	if (dumpvp == NULL || dumphdr == NULL)

	dumpcfg.buf.cur = dumpcfg.buf.start;
	dumpcfg.buf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE +
	dumpvp_start = dumpcfg.buf.vp_limit - DUMP_SUMMARYSIZE;
	dumpcfg.buf.vp_off = dumpvp_start;

	sd.sd_magic = SUMMARY_MAGIC;
	sd.sd_ssum = checksum32(dump_stack_scratch, STACK_BUF_SIZE);
	dumpvp_write(&sd, sizeof (sd));
	dumpvp_write(dump_stack_scratch, STACK_BUF_SIZE);

	sd.sd_magic = 0;	/* indicate end of summary */
	dumpvp_write(&sd, sizeof (sd));
	(void) dumpvp_flush();
/*
 * Write the pending error reports into the ereport save area (the
 * errorq drain itself is elided in this excerpt), terminate with a
 * zeroed record, flush, and invalidate any cached pages so savecore
 * reads fresh data.
 * NOTE(review): function signature, braces, local declarations and the
 * ereport-writing loop are elided.
 */
	if (dumpvp == NULL || dumphdr == NULL)

	dumpcfg.buf.cur = dumpcfg.buf.start;
	dumpcfg.buf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE);
	dumpvp_start = dumpcfg.buf.vp_limit - DUMP_ERPTSIZE;
	dumpcfg.buf.vp_off = dumpvp_start;

	bzero(&ed, sizeof (ed));	/* indicate end of ereports */
	dumpvp_write(&ed, sizeof (ed));
	(void) dumpvp_flush();

	/* drop any pages the writes left in the page cache */
	(void) fop_putpage(dumpvp, dumpvp_start,
	    (size_t)(dumpcfg.buf.vp_off - dumpvp_start),
	    B_INVAL | B_FORCE, kcred, NULL);
780 mblk_t
*mctl
, *mdata
;
784 if (dumpvp
== NULL
|| dumphdr
== NULL
|| log_consq
== NULL
)
787 dumpcfg
.buf
.cur
= dumpcfg
.buf
.start
;
788 dumpcfg
.buf
.vp_limit
= dumpvp_size
- DUMP_OFFSET
;
789 dumpvp_start
= dumpcfg
.buf
.vp_limit
- DUMP_LOGSIZE
;
790 dumpcfg
.buf
.vp_off
= dumpvp_start
;
794 for (q
= log_consq
; q
->q_next
!= qlast
; q
= q
->q_next
)
796 for (mctl
= q
->q_first
; mctl
!= NULL
; mctl
= mctl
->b_next
) {
797 dump_timeleft
= dump_timeout
;
798 mdata
= mctl
->b_cont
;
799 ld
.ld_magic
= LOG_MAGIC
;
800 ld
.ld_msgsize
= MBLKL(mctl
->b_cont
);
801 ld
.ld_csum
= checksum32(mctl
->b_rptr
, MBLKL(mctl
));
802 ld
.ld_msum
= checksum32(mdata
->b_rptr
, MBLKL(mdata
));
803 dumpvp_write(&ld
, sizeof (ld
));
804 dumpvp_write(mctl
->b_rptr
, MBLKL(mctl
));
805 dumpvp_write(mdata
->b_rptr
, MBLKL(mdata
));
807 } while ((qlast
= q
) != log_consq
);
809 ld
.ld_magic
= 0; /* indicate end of messages */
810 dumpvp_write(&ld
, sizeof (ld
));
811 (void) dumpvp_flush();
813 (void) fop_putpage(dumpvp
, dumpvp_start
,
814 (size_t)(dumpcfg
.buf
.vp_off
- dumpvp_start
),
815 B_INVAL
| B_FORCE
, kcred
, NULL
);
/*
 * Copy pages, trapping ECC errors. Also, for robustness, trap data
 * access in case something goes wrong in the hat layer and the
 * ... (comment continues; elided in this excerpt)
 *
 * On a trapped fault the offending word is replaced with a recognizable
 * "bad ECC" / "bad address" marker (64-bit or 32-bit pattern depending
 * on word size) and the copy resumes at the next word.
 * NOTE(review): function return type, braces, the word index 'w'
 * declaration, the actual copy loop body and the no_trap/ueoff handling
 * are elided in this excerpt.
 */
dump_pagecopy(void *src, void *dst)
	long *wsrc = (long *)src;
	long *wdst = (long *)dst;
	const ulong_t ncopies = PAGESIZE / sizeof (long);
	volatile int ueoff = -1;

	if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) {
			ueoff = w * sizeof (long);
		/* report "bad ECC" or "bad address" */
			if (otd.ot_trap & OT_DATA_EC)
				wdst[w++] = 0x00badecc00badecc;
				wdst[w++] = 0x00badadd00badadd;
			if (otd.ot_trap & OT_DATA_EC)
				wdst[w++] = 0x00badecc;
				wdst[w++] = 0x00badadd;
	while (w < ncopies) {
/*
 * Render the collected dump metrics as CSV-ish text into buf (at most
 * size bytes) for savecore to store as METRICS.txt.  Returns the number
 * of bytes written (return statement elided in this excerpt).
 * NOTE(review): the return type, braces, several local declarations,
 * the PERPAGE redefinition used by the per-page section, and parts of
 * some P() argument lists are elided; visible code reproduced verbatim.
 */
dumpsys_metrics(char *buf, size_t size)
	dumpcfg_t *cfg = &dumpcfg;
	char *e = buf + size;

	sec = cfg->elapsed / (1000 * 1000 * 1000ULL);
	/* MB/s scaled by 100 to print two decimals without floats */
	iorate = (cfg->nwrite * 100000ULL) / cfg->iotime;
	compress_ratio = 100LL * cfg->npages / btopr(cfg->nwrite + 1);

/* append to p, never writing past e; no-op once the buffer is full */
#define	P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0)

	P("Master cpu_seqid,%d\n", CPU->cpu_seqid);
	P("Master cpu_id,%d\n", CPU->cpu_id);
	P("dump_flags,0x%x\n", dumphdr->dump_flags);
	P("dump_ioerr,%d\n", dump_ioerr);

	P("Compression type,serial lzjb\n");
	P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %

	P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
	P("..total bytes,%lld\n", (u_longlong_t)cfg->nwrite);
	P("..total nsec,%lld\n", (u_longlong_t)cfg->iotime);
	P("dumpbuf.iosize,%ld\n", dumpcfg.buf.iosize);
	P("dumpbuf.size,%ld\n", dumpcfg.buf.size);

	P("Dump pages/sec,%llu\n", (u_longlong_t)cfg->npages / sec);
	P("Dump pages,%llu\n", (u_longlong_t)cfg->npages);
	P("Dump time,%d\n", sec);

	if (cfg->pages_mapped > 0)
		P("per-cent map utilization,%d\n", (int)((100 * cfg->pages_used)
		    / cfg->pages_mapped));

	P("\nPer-page metrics:\n");
	if (cfg->npages > 0) {
		P("%s nsec/page,%d\n", #x, (int)(cfg->perpage.x / cfg->npages));
		P("I/O wait nsec/page,%llu\n", (u_longlong_t)(cfg->iowait /
923 dumpcfg_t
*cfg
= &dumpcfg
;
924 uint_t percent_done
; /* dump progress reported */
926 hrtime_t start
; /* start time */
936 dumpdatahdr_t datahdr
;
938 if (dumpvp
== NULL
|| dumphdr
== NULL
) {
939 uprintf("skipping system dump - no dump device configured\n");
942 dumpcfg
.buf
.cur
= dumpcfg
.buf
.start
;
944 /* clear the sync variables */
946 cfg
->pages_mapped
= 0;
955 * Calculate the starting block for dump. If we're dumping on a
956 * swap device, start 1/5 of the way in; otherwise, start at the
957 * beginning. And never use the first page -- it may be a disk label.
959 if (dumpvp
->v_flag
& VISSWAP
)
960 dumphdr
->dump_start
= P2ROUNDUP(dumpvp_size
/ 5, DUMP_OFFSET
);
962 dumphdr
->dump_start
= DUMP_OFFSET
;
964 dumphdr
->dump_flags
= DF_VALID
| DF_COMPLETE
| DF_LIVE
| DF_COMPRESSED
;
965 dumphdr
->dump_crashtime
= gethrestime_sec();
966 dumphdr
->dump_npages
= 0;
967 dumphdr
->dump_nvtop
= 0;
968 bzero(dumpcfg
.bitmap
, BT_SIZEOFMAP(dumpcfg
.bitmapsize
));
969 dump_timeleft
= dump_timeout
;
972 dumphdr
->dump_flags
&= ~DF_LIVE
;
973 (void) fop_dumpctl(dumpvp
, DUMP_FREE
, NULL
, NULL
);
974 (void) fop_dumpctl(dumpvp
, DUMP_ALLOC
, NULL
, NULL
);
975 (void) vsnprintf(dumphdr
->dump_panicstring
, DUMP_PANICSIZE
,
976 panicstr
, panicargs
);
979 if (dump_conflags
& DUMP_ALL
)
981 else if (dump_conflags
& DUMP_CURPROC
)
982 content
= "kernel + curproc";
985 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath
,
986 dumphdr
->dump_start
, content
);
988 /* Make sure nodename is current */
989 bcopy(utsname
.nodename
, dumphdr
->dump_utsname
.nodename
, SYS_NMLN
);
992 * If this is a live dump, try to open a VCHR vnode for better
993 * performance. We must take care to flush the buffer cache
997 vnode_t
*cdev_vp
, *cmn_cdev_vp
;
999 ASSERT(dumpcfg
.buf
.cdev_vp
== NULL
);
1000 cdev_vp
= makespecvp(VTOS(dumpvp
)->s_dev
, VCHR
);
1001 if (cdev_vp
!= NULL
) {
1002 cmn_cdev_vp
= common_specvp(cdev_vp
);
1003 if (fop_open(&cmn_cdev_vp
, FREAD
| FWRITE
, kcred
, NULL
)
1005 if (vn_has_cached_data(dumpvp
))
1006 (void) pvn_vplist_dirty(dumpvp
, 0, NULL
,
1007 B_INVAL
| B_TRUNC
, kcred
);
1008 dumpcfg
.buf
.cdev_vp
= cmn_cdev_vp
;
1016 * Store a hires timestamp so we can look it up during debugging.
1018 lbolt_debug_entry();
1021 * Leave room for the message and ereport save areas and terminal dump
1024 dumpcfg
.buf
.vp_limit
= dumpvp_size
- DUMP_LOGSIZE
- DUMP_OFFSET
-
1028 * Write out the symbol table. It's no longer compressed,
1029 * so its 'size' and 'csize' are equal.
1031 dumpcfg
.buf
.vp_off
= dumphdr
->dump_ksyms
= dumphdr
->dump_start
+ PAGESIZE
;
1032 dumphdr
->dump_ksyms_size
= dumphdr
->dump_ksyms_csize
=
1033 ksyms_snapshot(dumpvp_ksyms_write
, NULL
, LONG_MAX
);
1036 * Write out the translation map.
1038 dumphdr
->dump_map
= dumpvp_flush();
1040 dumphdr
->dump_nvtop
+= dump_plat_addr();
1043 * call into hat, which may have unmapped pages that also need to
1048 if (dump_conflags
& DUMP_ALL
) {
1049 mutex_enter(&pidlock
);
1051 for (npids
= 0, p
= practive
; p
!= NULL
; p
= p
->p_next
)
1052 dumpcfg
.pids
[npids
++] = p
->p_pid
;
1054 mutex_exit(&pidlock
);
1056 for (pidx
= 0; pidx
< npids
; pidx
++)
1057 (void) dump_process(dumpcfg
.pids
[pidx
]);
1059 dump_init_memlist_walker(&mlw
);
1060 for (bitnum
= 0; bitnum
< dumpcfg
.bitmapsize
; bitnum
++) {
1061 dump_timeleft
= dump_timeout
;
1062 pfn
= dump_bitnum_to_pfn(bitnum
, &mlw
);
1064 * Some hypervisors do not have all pages available to
1065 * be accessed by the guest OS. Check for page
1068 if (plat_hold_page(pfn
, PLAT_HOLD_NO_LOCK
, NULL
) !=
1071 BT_SET(dumpcfg
.bitmap
, bitnum
);
1073 dumphdr
->dump_npages
= dumpcfg
.bitmapsize
;
1074 dumphdr
->dump_flags
|= DF_ALL
;
1076 } else if (dump_conflags
& DUMP_CURPROC
) {
1078 * Determine which pid is to be dumped. If we're panicking, we
1079 * dump the process associated with panic_thread (if any). If
1080 * this is a live dump, we dump the process associated with
1085 if (panic_thread
!= NULL
&&
1086 panic_thread
->t_procp
!= NULL
&&
1087 panic_thread
->t_procp
!= &p0
) {
1088 dumpcfg
.pids
[npids
++] =
1089 panic_thread
->t_procp
->p_pid
;
1092 dumpcfg
.pids
[npids
++] = curthread
->t_procp
->p_pid
;
1095 if (npids
&& dump_process(dumpcfg
.pids
[0]) == 0)
1096 dumphdr
->dump_flags
|= DF_CURPROC
;
1098 dumphdr
->dump_flags
|= DF_KERNEL
;
1101 dumphdr
->dump_flags
|= DF_KERNEL
;
1104 dumphdr
->dump_hashmask
= (1 << highbit(dumphdr
->dump_nvtop
- 1)) - 1;
1107 * Write out the pfn table.
1109 dumphdr
->dump_pfn
= dumpvp_flush();
1110 dump_init_memlist_walker(&mlw
);
1111 for (bitnum
= 0; bitnum
< dumpcfg
.bitmapsize
; bitnum
++) {
1112 dump_timeleft
= dump_timeout
;
1113 if (!BT_TEST(dumpcfg
.bitmap
, bitnum
))
1115 pfn
= dump_bitnum_to_pfn(bitnum
, &mlw
);
1116 ASSERT(pfn
!= PFN_INVALID
);
1117 dumpvp_write(&pfn
, sizeof (pfn_t
));
1122 * Write out all the pages.
1123 * Map pages, copy them handling UEs, compress, and write them out.
1125 dumphdr
->dump_data
= dumpvp_flush();
1127 ASSERT(dumpcfg
.page
);
1128 bzero(&dumpcfg
.perpage
, sizeof (dumpcfg
.perpage
));
1130 start
= gethrtime();
1131 cfg
->iowaitts
= start
;
1139 dump_init_memlist_walker(&mlw
);
1140 for (bitnum
= 0; bitnum
< dumpcfg
.bitmapsize
; bitnum
++) {
1145 dump_timeleft
= dump_timeout
;
1146 HRSTART(cfg
->perpage
, bitmap
);
1147 if (!BT_TEST(dumpcfg
.bitmap
, bitnum
)) {
1148 HRSTOP(cfg
->perpage
, bitmap
);
1151 HRSTOP(cfg
->perpage
, bitmap
);
1153 pfn
= dump_bitnum_to_pfn(bitnum
, &mlw
);
1154 ASSERT(pfn
!= PFN_INVALID
);
1156 HRSTART(cfg
->perpage
, map
);
1157 hat_devload(kas
.a_hat
, dumpcfg
.cmap
, PAGESIZE
, pfn
, PROT_READ
,
1158 HAT_LOAD_NOCONSIST
);
1159 HRSTOP(cfg
->perpage
, map
);
1161 dump_pagecopy(dumpcfg
.cmap
, dumpcfg
.page
);
1163 HRSTART(cfg
->perpage
, unmap
);
1164 hat_unload(kas
.a_hat
, dumpcfg
.cmap
, PAGESIZE
, HAT_UNLOAD
);
1165 HRSTOP(cfg
->perpage
, unmap
);
1167 HRSTART(dumpcfg
.perpage
, compress
);
1168 csize
= compress(dumpcfg
.page
, dumpcfg
.lzbuf
, PAGESIZE
);
1169 HRSTOP(dumpcfg
.perpage
, compress
);
1171 HRSTART(dumpcfg
.perpage
, write
);
1172 dumpvp_write(&csize
, sizeof (csize
));
1173 dumpvp_write(dumpcfg
.lzbuf
, csize
);
1174 HRSTOP(dumpcfg
.perpage
, write
);
1177 dumphdr
->dump_flags
&= ~DF_COMPLETE
;
1178 dumphdr
->dump_npages
= cfg
->npages
;
1182 sec
= (gethrtime() - start
) / NANOSEC
;
1183 percent
= ++cfg
->npages
* 100LL / dumphdr
->dump_npages
;
1186 * Render a simple progress display on the system console to
1187 * make clear to the operator that the system has not hung.
1188 * Emit an update when dump progress has advanced by one
1189 * percent, or when no update has been drawn in the last
1192 if (percent
> percent_done
|| sec
> sec_done
) {
1193 percent_done
= percent
;
1196 uprintf("^\r%2d:%02d %3d%% done", sec
/ 60, sec
% 60,
1199 delay(1); /* let the output be sent */
1203 cfg
->elapsed
= gethrtime() - start
;
1204 if (cfg
->elapsed
< 1)
1207 /* record actual pages dumped */
1208 dumphdr
->dump_npages
= cfg
->npages
;
1210 /* platform-specific data */
1211 dumphdr
->dump_npages
+= dump_plat_data(dumpcfg
.page
);
1213 /* note any errors by clearing DF_COMPLETE */
1214 if (dump_ioerr
|| cfg
->npages
< dumphdr
->dump_npages
)
1215 dumphdr
->dump_flags
&= ~DF_COMPLETE
;
1217 /* end of stream blocks */
1219 dumpvp_write(&datatag
, sizeof (datatag
));
1221 bzero(&datahdr
, sizeof (datahdr
));
1223 /* buffer for metrics */
1225 size
= MIN(PAGESIZE
, DUMP_OFFSET
- sizeof (dumphdr_t
) -
1226 sizeof (dumpdatahdr_t
));
1228 /* finish the kmem intercepts, collect kmem verbose info */
1230 datahdr
.dump_metrics
= kmem_dump_finish(buf
, size
);
1231 buf
+= datahdr
.dump_metrics
;
1232 size
-= datahdr
.dump_metrics
;
1235 /* record in the header whether this is a fault-management panic */
1237 dumphdr
->dump_fm_panic
= is_fm_panic();
1239 /* compression info in data header */
1240 datahdr
.dump_datahdr_magic
= DUMP_DATAHDR_MAGIC
;
1241 datahdr
.dump_datahdr_version
= DUMP_DATAHDR_VERSION
;
1242 datahdr
.dump_maxcsize
= PAGESIZE
;
1243 datahdr
.dump_maxrange
= 1;
1244 datahdr
.dump_nstreams
= 1;
1245 datahdr
.dump_clevel
= 0;
1247 if (dump_metrics_on
)
1248 datahdr
.dump_metrics
+= dumpsys_metrics(buf
, size
);
1250 datahdr
.dump_data_csize
= dumpvp_flush() - dumphdr
->dump_data
;
1253 * Write out the initial and terminal dump headers.
1255 dumpcfg
.buf
.vp_off
= dumphdr
->dump_start
;
1256 dumpvp_write(dumphdr
, sizeof (dumphdr_t
));
1257 (void) dumpvp_flush();
1259 dumpcfg
.buf
.vp_limit
= dumpvp_size
;
1260 dumpcfg
.buf
.vp_off
= dumpcfg
.buf
.vp_limit
- DUMP_OFFSET
;
1261 dumpvp_write(dumphdr
, sizeof (dumphdr_t
));
1262 dumpvp_write(&datahdr
, sizeof (dumpdatahdr_t
));
1263 dumpvp_write(dumpcfg
.page
, datahdr
.dump_metrics
);
1265 (void) dumpvp_flush();
1267 uprintf("\r%3d%% done: %llu pages dumped, ",
1268 percent_done
, (u_longlong_t
)cfg
->npages
);
1270 if (dump_ioerr
== 0) {
1271 uprintf("dump succeeded\n");
1273 uprintf("dump failed: error %d\n", dump_ioerr
);
1276 debug_enter("dump failed");
1281 * Write out all undelivered messages. This has to be the *last*
1282 * thing we do because the dump process itself emits messages.
1290 ddi_sleep(2); /* let people see the 'done' message */
1294 /* restore settings after live dump completes */
1296 /* release any VCHR open of the dump device */
1297 if (dumpcfg
.buf
.cdev_vp
!= NULL
) {
1298 (void) fop_close(dumpcfg
.buf
.cdev_vp
, FREAD
| FWRITE
, 1, 0,
1300 VN_RELE(dumpcfg
.buf
.cdev_vp
);
1301 dumpcfg
.buf
.cdev_vp
= NULL
;
/*
 * This function is called whenever the memory size, as represented
 * by the phys_install list, changes.
 *
 * Re-derives the dump configuration under dump_lock.
 * NOTE(review): the function signature and some statements between the
 * lock acquire and the clevel update are elided in this excerpt.
 */
	mutex_enter(&dump_lock);
	dump_update_clevel();
	mutex_exit(&dump_lock);
/*
 * This function allows for dynamic resizing of a dump area. It assumes that
 * the underlying device has update its appropriate size(9P).
 *
 * Re-reads the device size, rejects devices too small for the log and
 * ereport save areas, and updates dumpvp_size (rounded down to a
 * DUMP_OFFSET multiple).  All under dump_lock.
 * NOTE(review): the function signature, braces and the return
 * statements of the error paths are elided in this excerpt.
 */
	mutex_enter(&dump_lock);
	vattr.va_mask = AT_SIZE;
	if ((error = fop_getattr(dumpvp, &vattr, 0, kcred, NULL)) != 0) {
		mutex_exit(&dump_lock);

	if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) {
		mutex_exit(&dump_lock);

	dumpvp_size = vattr.va_size & -DUMP_OFFSET;
	mutex_exit(&dump_lock);
/*
 * Record the OS image UUID (write-once): validate that uuidstr is
 * exactly 36 characters of hex digits (separator positions are checked
 * in code elided from this excerpt), refuse to overwrite an existing
 * value, then store it and log it.
 * NOTE(review): return type, braces, error returns and part of the
 * cmn_err argument list are elided in this excerpt.
 */
dump_set_uuid(const char *uuidstr)
	if (uuidstr == NULL || strnlen(uuidstr, 36 + 1) != 36)

	/* uuid_parse is not common code so check manually */
	for (i = 0, ptr = uuidstr; i < 36; i++, ptr++) {
		if (!isxdigit(*ptr))

	if (dump_osimage_uuid[0] != '\0')

	(void) strncpy(dump_osimage_uuid, uuidstr, 36 + 1);

	cmn_err(CE_CONT, "?This Solaris instance has UUID %s\n",
	/*
	 * Accessor tail: return the stored OS image UUID, or the empty
	 * string when dump_set_uuid() has not been called.
	 * NOTE(review): the enclosing function's signature is elided in
	 * this excerpt.
	 */
	return (dump_osimage_uuid[0] != '\0' ? dump_osimage_uuid : "");