3909 "zfs send -D" does not work
[illumos-gate.git] / usr / src / uts / i86pc / os / memscrub.c
blobc0681bcadb739f7f3a8ed25e2b2a70abb0e7f8e4
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * i86pc Memory Scrubbing
 *
 * On detection of a correctable memory ECC error, the i86pc hardware
 * returns the corrected data to the requester and may re-write it
 * to memory (DRAM or NVRAM).  Machines which do not re-write this to
 * memory should add an NMI handler to correct and rewrite.
 *
 * Scrubbing thus reduces the likelihood that multiple transient errors
 * will occur in the same memory word, making uncorrectable errors due
 * to transients less likely.
 *
 * Thus is born the desire that every memory location be periodically
 * accessed.
 *
 * This file implements a memory scrubbing thread.  This scrubber
 * guarantees that all of physical memory is accessed periodically
 * (memscrub_period_sec -- 12 hours).
 *
 * It attempts to do this as unobtrusively as possible.  The thread
 * schedules itself to wake up at an interval such that if it reads
 * memscrub_span_pages (4MB) on each wakeup, it will read all of physical
 * memory in memscrub_period_sec (12 hours).
 *
 * The scrubber uses the REP LODS instruction, so it reads 4MB in 0.15
 * seconds (on a P5-200).  When it completes a span, if all the CPUs are
 * idle, it reads another span.  Typically it soaks up idle time this way
 * to reach its deadline early -- and sleeps until the next period begins.
 *
 * Maximal Cost Estimate:  8GB @ xxMB/s = xxx seconds spent in 640 wakeups
 * that run for 0.15 seconds at intervals of 67 seconds.
 *
 * In practice, the scrubber finds enough idle time to finish in a few
 * minutes, and sleeps until its 12 hour deadline.
 *
 * The scrubber maintains a private copy of the phys_install memory list
 * to keep track of what memory should be scrubbed.
 *
 * The following parameters can be set via /etc/system (an example
 * fragment appears after this comment block):
 *
 * memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES (4MB)
 * memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC (12 hours)
 * memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI (0)
 * memscrub_delay_start_sec = (10 seconds)
 * disable_memscrub = (0)
 *
 * The scrubber will exit (or never be started) if it finds the variable
 * "disable_memscrub" set.
 *
 * MEMSCRUB_DFL_SPAN_PAGES is based on the guess that 0.15 sec
 * is a "good" amount of minimum time for the thread to run at a time.
 *
 * MEMSCRUB_DFL_PERIOD_SEC (12 hours) is nearly a total guess --
 * twice the frequency the hardware folk estimated would be necessary.
 *
 * MEMSCRUB_DFL_THREAD_PRI (0) is based on the assumption that nearly
 * any other use of the system should be higher priority than scrubbing.
 */
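
/*
 * Example /etc/system fragment (the values here are illustrative, not
 * the defaults from this file): scan 8MB per wakeup over a 24 hour
 * period, and delay the first scan by one minute.
 *
 *	set memscrub_span_pages = 0x800
 *	set memscrub_period_sec = 86400
 *	set memscrub_delay_start_sec = 60
 *
 * Setting "set disable_memscrub = 1" prevents the scrubber from
 * starting at all.
 */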

#include <sys/types.h>
#include <sys/systm.h>		/* timeout, types, t_lock */
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>	/* MIN */
#include <sys/memlist.h>	/* memlist */
#include <sys/kmem.h>		/* KMEM_NOSLEEP */
#include <sys/cpuvar.h>		/* ncpus_online */
#include <sys/debug.h>		/* ASSERTs */
#include <sys/vmem.h>
#include <sys/mman.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat_i86.h>
#include <sys/callb.h>		/* CPR callback */

static caddr_t	memscrub_window;
static hat_mempte_t memscrub_pte;

/*
 * Global Data:
 */
/*
 * scan all of physical memory at least once every MEMSCRUB_PERIOD_SEC
 */
#define	MEMSCRUB_DFL_PERIOD_SEC	(12 * 60 * 60)	/* 12 hours */

/*
 * start only if at least MEMSCRUB_MIN_PAGES in system
 */
#define	MEMSCRUB_MIN_PAGES	((32 * 1024 * 1024) / PAGESIZE)

/*
 * scan at least MEMSCRUB_DFL_SPAN_PAGES each iteration
 */
#define	MEMSCRUB_DFL_SPAN_PAGES	((4 * 1024 * 1024) / PAGESIZE)

/*
 * almost anything is higher priority than scrubbing
 */
#define	MEMSCRUB_DFL_THREAD_PRI	0

/*
 * we can patch these defaults in /etc/system if necessary
 */
uint_t disable_memscrub = 0;
static uint_t disable_memscrub_quietly = 0;
pgcnt_t memscrub_min_pages = MEMSCRUB_MIN_PAGES;
pgcnt_t memscrub_span_pages = MEMSCRUB_DFL_SPAN_PAGES;
time_t memscrub_period_sec = MEMSCRUB_DFL_PERIOD_SEC;
uint_t memscrub_thread_pri = MEMSCRUB_DFL_THREAD_PRI;
time_t memscrub_delay_start_sec = 10;

/*
 * Static Routines
 */
static void memscrubber(void);
static int system_is_idle(void);
static int memscrub_add_span(uint64_t, uint64_t);

/*
 * Static Data
 */
static struct memlist *memscrub_memlist;
static uint_t memscrub_phys_pages;

static kcondvar_t memscrub_cv;
static kmutex_t memscrub_lock;

/*
 * memscrub_lock protects memscrub_memlist
 */
uint_t memscrub_scans_done;

uint_t memscrub_done_early;
uint_t memscrub_early_sec;

uint_t memscrub_done_late;
time_t memscrub_late_sec;

/*
 * create memscrub_memlist from phys_install list
 * initialize locks, set memscrub_phys_pages.
 */
void
memscrub_init()
{
	struct memlist *src;

	if (physmem < memscrub_min_pages)
		return;
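
	/*
	 * Without segkpm there is no permanent kernel mapping of all
	 * physical memory, so reserve a page of kernel VA as a window
	 * that the scan loop remaps onto each physical page in turn.
	 */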
	if (!kpm_enable) {
		memscrub_window = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
		memscrub_pte = hat_mempte_setup(memscrub_window);
	}

	/*
	 * copy phys_install to memscrub_memlist
	 */
	for (src = phys_install; src; src = src->ml_next) {
		if (memscrub_add_span(src->ml_address, src->ml_size)) {
			cmn_err(CE_WARN,
			    "Software memory scrubber failed to initialize\n");
			return;
		}
	}

	mutex_init(&memscrub_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&memscrub_cv, NULL, CV_DRIVER, NULL);

	/*
	 * create memscrubber thread
	 */
	(void) thread_create(NULL, 0, (void (*)())memscrubber, NULL, 0, &p0,
	    TS_RUN, memscrub_thread_pri);
}

/*
 * Function to cause the software memscrubber to exit quietly if the
 * platform support has located a hardware scrubber and enabled it.
 */
void
memscrub_disable(void)
{
	disable_memscrub_quietly = 1;
}

#ifdef MEMSCRUB_DEBUG
static void
memscrub_printmemlist(char *title, struct memlist *listp)
{
	struct memlist *list;

	cmn_err(CE_CONT, "%s:\n", title);

	for (list = listp; list; list = list->ml_next) {
		cmn_err(CE_CONT, "addr = 0x%llx, size = 0x%llx\n",
		    list->ml_address, list->ml_size);
	}
}
#endif /* MEMSCRUB_DEBUG */

/* ARGSUSED */
static void
memscrub_wakeup(void *c)
{
	/*
	 * grab mutex to guarantee that our wakeup call
	 * arrives after we go to sleep -- so we can't sleep forever.
	 */
	mutex_enter(&memscrub_lock);
	cv_signal(&memscrub_cv);
	mutex_exit(&memscrub_lock);
}

/*
 * this calculation doesn't account for the time that the actual scan
 * consumes -- so we'd fall slightly behind schedule with this
 * interval_sec.  but the idle loop optimization below usually makes us
 * come in way ahead of schedule.
 */
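
/*
 * Worked example (illustrative, assuming 4KB pages): on an 8GB machine
 * memscrub_phys_pages is 0x200000 and memscrub_span_pages is 0x400, so
 * a full scan takes 0x200000 / 0x400 = 2048 iterations; 43200 sec /
 * 2048 yields a wakeup interval of roughly 21 seconds.
 */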
static int
compute_interval_sec()
{
	if (memscrub_phys_pages <= memscrub_span_pages)
		return (memscrub_period_sec);
	else
		return (memscrub_period_sec/
		    (memscrub_phys_pages/memscrub_span_pages));
}

static void
memscrubber()
{
	time_t deadline;
	uint64_t mlp_last_addr;
	uint64_t mlp_next_addr;
	int reached_end = 1;
	time_t interval_sec = 0;
	struct memlist *mlp;

	extern void scan_memory(caddr_t, size_t);
	callb_cpr_t cprinfo;

	/*
	 * notify CPR of our existence
	 */
	CALLB_CPR_INIT(&cprinfo, &memscrub_lock, callb_generic_cpr, "memscrub");

	if (memscrub_memlist == NULL) {
		cmn_err(CE_WARN, "memscrub_memlist not initialized.");
		goto memscrub_exit;
	}

	mlp = memscrub_memlist;
	mlp_next_addr = mlp->ml_address;
	mlp_last_addr = mlp->ml_address + mlp->ml_size;

	deadline = gethrestime_sec() + memscrub_delay_start_sec;

	for (;;) {
		if (disable_memscrub || disable_memscrub_quietly)
			break;

		mutex_enter(&memscrub_lock);

		/*
		 * did we just reach the end of memory?
		 */
		if (reached_end) {
			time_t now = gethrestime_sec();

			if (now >= deadline) {
				memscrub_done_late++;
				memscrub_late_sec += (now - deadline);
				/*
				 * past deadline, start right away
				 */
				interval_sec = 0;

				deadline = now + memscrub_period_sec;
			} else {
				/*
				 * we finished ahead of schedule.
				 * wait till previous deadline before
				 * restarting.
				 */
				interval_sec = deadline - now;
				memscrub_done_early++;
				memscrub_early_sec += interval_sec;
				deadline += memscrub_period_sec;
			}
		} else {
			interval_sec = compute_interval_sec();
		}

		/*
		 * it is safe from our standpoint for CPR to
		 * suspend the system
		 */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);

		/*
		 * hit the snooze bar
		 */
		(void) timeout(memscrub_wakeup, NULL, interval_sec * hz);

		/*
		 * go to sleep
		 */
		cv_wait(&memscrub_cv, &memscrub_lock);

		/* we need to go to work */
		CALLB_CPR_SAFE_END(&cprinfo, &memscrub_lock);

		mutex_exit(&memscrub_lock);
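
		/*
		 * Scan one span per wakeup; while the system stays idle
		 * and we have not wrapped past the end of memory, keep
		 * soaking up idle time with additional spans.
		 */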
		do {
			pgcnt_t pages = memscrub_span_pages;
			uint64_t address = mlp_next_addr;

			if (disable_memscrub || disable_memscrub_quietly)
				break;

			mutex_enter(&memscrub_lock);

			/*
			 * Make sure we don't try to scan beyond the end of
			 * the current memlist.  If we would, then resize
			 * our scan target for this iteration, and prepare
			 * to read the next memlist entry on the next
			 * iteration.
			 */
			reached_end = 0;
			if (address + mmu_ptob(pages) >= mlp_last_addr) {
				pages = mmu_btop(mlp_last_addr - address);
				mlp = mlp->ml_next;
				if (mlp == NULL) {
					reached_end = 1;
					mlp = memscrub_memlist;
				}
				mlp_next_addr = mlp->ml_address;
				mlp_last_addr = mlp->ml_address + mlp->ml_size;
			} else {
				mlp_next_addr += mmu_ptob(pages);
			}

			mutex_exit(&memscrub_lock);

			while (pages--) {
				pfn_t pfn = btop(address);

				/*
				 * Without segkpm, the memscrubber cannot
				 * be allowed to migrate across CPUs, as
				 * the CPU-specific mapping of
				 * memscrub_window would be incorrect.
				 * With segkpm, switching CPUs is legal, but
				 * inefficient.  We don't use
				 * kpreempt_disable as it might hold a
				 * higher priority thread (eg, RT) too long
				 * off CPU.
				 */
				thread_affinity_set(curthread, CPU_CURRENT);
				if (kpm_enable)
					memscrub_window = hat_kpm_pfn2va(pfn);
				else
					hat_mempte_remap(pfn, memscrub_window,
					    memscrub_pte,
					    PROT_READ, HAT_LOAD_NOCONSIST);

				scan_memory(memscrub_window, PAGESIZE);

				thread_affinity_clear(curthread);
				address += MMU_PAGESIZE;
			}

			memscrub_scans_done++;
		} while (!reached_end && system_is_idle());
	}

memscrub_exit:

	if (!disable_memscrub_quietly)
		cmn_err(CE_NOTE, "Software memory scrubber exiting.");
	/*
	 * We are about to bail, but don't have the memscrub_lock,
	 * and it is needed for CALLB_CPR_EXIT.
	 */
	mutex_enter(&memscrub_lock);
	CALLB_CPR_EXIT(&cprinfo);

	cv_destroy(&memscrub_cv);

	thread_exit();
}

/*
 * return 1 if we're MP and all the other CPUs are idle
 */
static int
system_is_idle()
{
	int cpu_id;
	int found = 0;

	if (1 == ncpus_online)
		return (0);

	for (cpu_id = 0; cpu_id < NCPU; ++cpu_id) {
		if (!cpu[cpu_id])
			continue;

		found++;
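
		/*
		 * The CPU the scrubber itself is running on counts as
		 * idle when nothing else is runnable there; any other
		 * CPU running a non-idle thread makes the system busy.
		 */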
		if (cpu[cpu_id]->cpu_thread != cpu[cpu_id]->cpu_idle_thread) {
			if (CPU->cpu_id == cpu_id &&
			    CPU->cpu_disp->disp_nrunnable == 0)
				continue;
			return (0);
		}

		if (found == ncpus)
			break;
	}
	return (1);
}

/*
 * add a span to the memscrub list
 */
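
/*
 * Worked example (illustrative addresses): if the list holds an entry
 * covering [0x0, 0x9ffff] and a span starting at 0xa0000 is added, the
 * append case below simply grows that entry; a span at 0x100000 instead
 * becomes a new entry, linked in sorted order by start address.
 */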
static int
memscrub_add_span(uint64_t start, uint64_t bytes)
{
	struct memlist *dst;
	struct memlist *prev, *next;
	uint64_t end = start + bytes - 1;
	int retval = 0;

	mutex_enter(&memscrub_lock);

#ifdef MEMSCRUB_DEBUG
	memscrub_printmemlist("memscrub_memlist before", memscrub_memlist);
	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
	cmn_err(CE_CONT, "memscrub_add_span: address: 0x%llx"
	    " size: 0x%llx\n", start, bytes);
#endif /* MEMSCRUB_DEBUG */

	/*
	 * Scan through the list to find the proper place to install it.
	 */
	prev = NULL;
	next = memscrub_memlist;
	while (next) {
		uint64_t ns = next->ml_address;
		uint64_t ne = next->ml_address + next->ml_size - 1;

		/*
		 * If this span overlaps with an existing span, then
		 * something has gone horribly wrong with the phys_install
		 * list.  In fact, I'm surprised we made it this far.
		 */
		if ((start >= ns && start <= ne) || (end >= ns && end <= ne) ||
		    (start < ns && end > ne))
			panic("memscrub found overlapping memory ranges "
			    "(0x%p-0x%p) and (0x%p-0x%p)",
			    (void *)(uintptr_t)start, (void *)(uintptr_t)end,
			    (void *)(uintptr_t)ns, (void *)(uintptr_t)ne);

		/*
		 * New span can be appended to an existing one.
		 */
		if (start == ne + 1) {
			next->ml_size += bytes;
			goto add_done;
		}

		/*
		 * New span can be prepended to an existing one.
		 */
		if (end + 1 == ns) {
			next->ml_size += bytes;
			next->ml_address = start;
			goto add_done;
		}

		/*
		 * If the next span has a higher start address than the new
		 * one, then we have found the right spot for our
		 * insertion.
		 */
		if (ns > start)
			break;

		prev = next;
		next = next->ml_next;
	}

	/*
	 * allocate a new struct memlist
	 */
	dst = kmem_alloc(sizeof (struct memlist), KM_NOSLEEP);
	if (dst == NULL) {
		retval = -1;
		goto add_done;
	}
	dst->ml_address = start;
	dst->ml_size = bytes;
	dst->ml_prev = prev;
	dst->ml_next = next;

	if (prev)
		prev->ml_next = dst;
	else
		memscrub_memlist = dst;

	if (next)
		next->ml_prev = dst;

add_done:

	if (retval != -1)
		memscrub_phys_pages += mmu_btop(bytes);

#ifdef MEMSCRUB_DEBUG
	memscrub_printmemlist("memscrub_memlist after", memscrub_memlist);
	cmn_err(CE_CONT, "memscrub_phys_pages: 0x%x\n", memscrub_phys_pages);
#endif /* MEMSCRUB_DEBUG */

	mutex_exit(&memscrub_lock);
	return (retval);
}