4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
21 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
22 /* All Rights Reserved */
26 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/tuneable.h>
34 #include <sys/inline.h>
35 #include <sys/systm.h>
43 #include <sys/vnode.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/sysinfo.h>
49 #include <sys/callb.h>
50 #include <sys/reboot.h>
52 #include <sys/fs/ufs_inode.h>
53 #include <sys/fs/ufs_bio.h>
58 #include <vm/seg_kmem.h>
/*
 * Master on/off switches for the two fsflush duties; both default to on
 * and may be cleared (e.g. via /etc/system) to disable that work entirely.
 */
int doiflush = 1;	/* non-zero to turn inode flushing on */
int dopageflush = 1;	/* non-zero to turn page flushing on */
/*
 * To improve boot performance, don't run the inode flushing loop until
 * the specified number of seconds after boot.  To revert to the old
 * behavior, set fsflush_iflush_delay to 0.  We have not created any new
 * filesystem danger that did not exist previously, since there is always a
 * window in between when fsflush does the inode flush loop during which the
 * system could crash, fail to sync the filesystem, and fsck will be needed
 * to recover.  We have, however, widened this window.  Finally,
 * we never delay inode flushing if we're booting into single user mode,
 * where the administrator may be modifying files or using fsck.  This
 * modification avoids inode flushes during boot whose only purpose is to
 * update atimes on files which have been accessed during boot.
 */
int fsflush_iflush_delay = 60;	/* seconds after boot before iflush starts */
kcondvar_t fsflush_cv;		/* signalled once per tick to wake the daemon */
static kmutex_t fsflush_lock;	/* just for the cv_wait */
ksema_t fsflush_sema;		/* to serialize with reboot */
/*
 * some statistics for fsflush_do_pages
 */
ulong_t fsf_scan;	/* number of pages scanned */
ulong_t fsf_examined;	/* number of page_t's actually examined, can */
			/* be less than fsf_scan due to large pages */
ulong_t fsf_locked;	/* pages we actually page_lock()ed */
ulong_t fsf_modified;	/* number of modified pages found */
ulong_t fsf_coalesce;	/* number of page coalesces done */
ulong_t fsf_time;	/* nanoseconds of run time */
ulong_t fsf_releases;	/* number of page_release() done */

/* NOTE(review): fsf_stat_t's typedef is not visible in this chunk — confirm */
fsf_stat_t fsf_recent;	/* counts for most recent duty cycle */
fsf_stat_t fsf_total;	/* total of counts */
ulong_t fsf_cycles;	/* number of runs reflected in fsf_total */
/*
 * data used to determine when we can coalesce consecutive free pages
 * into larger page sizes
 */
#define	MAX_PAGESIZES	32
static ulong_t fsf_npgsz;		/* number of hardware page sizes */
static pgcnt_t fsf_pgcnt[MAX_PAGESIZES];	/* pages per next-larger size */
static pgcnt_t fsf_mask[MAX_PAGESIZES];		/* alignment mask per size */
111 * Scan page_t's and issue I/O's for modified pages.
113 * Also coalesces consecutive small sized free pages into the next larger
114 * pagesize. This costs a tiny bit of time in fsflush, but will reduce time
115 * spent scanning on later passes and for anybody allocating large pages.
122 hrtime_t timer
= gethrtime();
123 ulong_t releases
= 0;
124 ulong_t nexamined
= 0;
126 ulong_t nmodified
= 0;
127 ulong_t ncoalesce
= 0;
134 page_t
*coal_page
= NULL
; /* 1st page in group to coalesce */
135 uint_t coal_szc
= 0; /* size code, coal_page->p_szc */
136 uint_t coal_cnt
= 0; /* count of pages seen */
138 static ulong_t nscan
= 0;
139 static pgcnt_t last_total_pages
= 0;
140 static page_t
*pp
= NULL
;
143 * Check to see if total_pages has changed.
145 if (total_pages
!= last_total_pages
) {
146 last_total_pages
= total_pages
;
147 nscan
= (last_total_pages
* (tune
.t_fsflushr
))/v
.v_autoup
;
154 while (pcount
< nscan
) {
157 * move to the next page, skipping over large pages
158 * and issuing prefetches.
160 if (pp
->p_szc
&& fspage
== 0) {
163 pfn
= page_pptonum(pp
);
164 cnt
= page_get_pagecnt(pp
->p_szc
);
165 cnt
-= pfn
& (cnt
- 1);
169 pp
= page_nextn(pp
, cnt
);
170 prefetch_page_r((void *)pp
);
175 * Do a bunch of dirty tests (ie. no locking) to determine
176 * if we can quickly skip this page. These tests are repeated
177 * after acquiring the page lock.
187 * skip free pages too, but try coalescing them into larger
192 * skip pages with a file system identity or that
193 * are already maximum size
197 if (pp
->p_vnode
!= NULL
|| szc
== fsf_npgsz
- 1) {
203 * If not in a coalescing candidate page or the size
204 * codes are different, start a new candidate.
206 if (coal_page
== NULL
|| coal_szc
!= szc
) {
209 * page must be properly aligned
211 if ((page_pptonum(pp
) & fsf_mask
[szc
]) != 0) {
222 * acceptable to add this to existing candidate page
225 if (coal_cnt
< fsf_pgcnt
[coal_szc
])
229 * We've got enough pages to coalesce, so do it.
230 * After promoting, we clear coal_page, so it will
231 * take another pass to promote this to an even
235 (void) page_promote_size(coal_page
, coal_szc
);
252 * Reject pages that can't be "exclusively" locked.
254 if (!page_trylock(pp
, SE_EXCL
))
260 * After locking the page, redo the above checks.
261 * Since we locked the page, leave out the PAGE_LOCKED() test.
268 (vp
->v_flag
& VISSWAP
) != 0) {
273 if (pp
->p_lckcnt
!= 0 || pp
->p_cowcnt
!= 0) {
279 ASSERT(vp
->v_type
!= VCHR
);
282 * Check the modified bit. Leaving the bit alone in hardware.
283 * It will be cleared if we do the putpage.
288 mod
= hat_pagesync(pp
,
289 HAT_SYNC_DONTZERO
| HAT_SYNC_STOPON_MOD
) & P_MOD
;
293 offset
= pp
->p_offset
;
296 * Hold the vnode before releasing the page lock
297 * to prevent it from being freed and re-used by
304 (void) VOP_PUTPAGE(vp
, offset
, PAGESIZE
, B_ASYNC
,
311 * Catch any pages which should be on the cache list,
314 if (hat_page_is_mapped(pp
) == 0) {
316 (void) page_release(pp
, 1);
324 * maintain statistics
325 * reset every million wakeups, just to avoid overflow
327 if (++fsf_cycles
== 1000000) {
329 fsf_total
.fsf_scan
= 0;
330 fsf_total
.fsf_examined
= 0;
331 fsf_total
.fsf_locked
= 0;
332 fsf_total
.fsf_modified
= 0;
333 fsf_total
.fsf_coalesce
= 0;
334 fsf_total
.fsf_time
= 0;
335 fsf_total
.fsf_releases
= 0;
337 fsf_total
.fsf_scan
+= fsf_recent
.fsf_scan
= nscan
;
338 fsf_total
.fsf_examined
+= fsf_recent
.fsf_examined
= nexamined
;
339 fsf_total
.fsf_locked
+= fsf_recent
.fsf_locked
= nlocked
;
340 fsf_total
.fsf_modified
+= fsf_recent
.fsf_modified
= nmodified
;
341 fsf_total
.fsf_coalesce
+= fsf_recent
.fsf_coalesce
= ncoalesce
;
342 fsf_total
.fsf_time
+= fsf_recent
.fsf_time
= gethrtime() - timer
;
343 fsf_total
.fsf_releases
+= fsf_recent
.fsf_releases
= releases
;
348 * As part of file system hardening, this daemon is awakened
349 * every second to flush cached data which includes the
350 * buffer cache, the inode cache and mapped pages.
355 struct buf
*bp
, *dwp
;
358 unsigned int ix
, icount
, count
= 0;
364 proc_fsflush
= ttoproc(curthread
);
365 proc_fsflush
->p_cstime
= 0;
366 proc_fsflush
->p_stime
= 0;
367 proc_fsflush
->p_cutime
= 0;
368 proc_fsflush
->p_utime
= 0;
369 bcopy("fsflush", curproc
->p_user
.u_psargs
, 8);
370 bcopy("fsflush", curproc
->p_user
.u_comm
, 7);
372 mutex_init(&fsflush_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
373 sema_init(&fsflush_sema
, 0, NULL
, SEMA_DEFAULT
, NULL
);
376 * Setup page coalescing.
378 fsf_npgsz
= page_num_pagesizes();
379 ASSERT(fsf_npgsz
< MAX_PAGESIZES
);
380 for (ix
= 0; ix
< fsf_npgsz
- 1; ++ix
) {
382 page_get_pagesize(ix
+ 1) / page_get_pagesize(ix
);
383 fsf_mask
[ix
] = page_get_pagecnt(ix
+ 1) - 1;
386 autoup
= v
.v_autoup
* hz
;
387 icount
= v
.v_autoup
/ tune
.t_fsflushr
;
388 CALLB_CPR_INIT(&cprinfo
, &fsflush_lock
, callb_generic_cpr
, "fsflush");
390 sema_v(&fsflush_sema
);
391 mutex_enter(&fsflush_lock
);
392 CALLB_CPR_SAFE_BEGIN(&cprinfo
);
393 cv_wait(&fsflush_cv
, &fsflush_lock
); /* wait for clock */
394 CALLB_CPR_SAFE_END(&cprinfo
, &fsflush_lock
);
395 mutex_exit(&fsflush_lock
);
396 sema_p(&fsflush_sema
);
399 * Write back all old B_DELWRI buffers on the freelist.
402 for (ix
= 0; ix
< v
.v_hbuf
; ix
++) {
405 dwp
= (struct buf
*)&dwbuf
[ix
];
407 bcount
+= (hp
->b_length
);
409 if (dwp
->av_forw
== dwp
) {
413 hmp
= &hbuf
[ix
].b_lock
;
418 * Go down only on the delayed write lists.
422 ASSERT(bp
->b_flags
& B_DELWRI
);
424 if ((bp
->b_flags
& B_DELWRI
) &&
425 (ddi_get_lbolt() - bp
->b_start
>= autoup
) &&
426 sema_tryp(&bp
->b_sem
)) {
427 bp
->b_flags
|= B_ASYNC
;
431 if (bp
->b_vp
== NULL
) {
434 UFS_BWRITE(VTOI(bp
->b_vp
)->i_ufsvfs
,
448 * There is no need to wakeup any thread waiting on bio_mem_cv
449 * since brelse will wake them up as soon as IO is complete.
451 bfreelist
.b_bcount
= bcount
;
460 * If the system was not booted to single user mode, skip the
461 * inode flushing until after fsflush_iflush_delay secs have elapsed.
463 if ((boothowto
& RB_SINGLE
) == 0 &&
464 (ddi_get_lbolt64() / hz
) < fsflush_iflush_delay
)
468 * Flush cached attribute information (e.g. inodes).
470 if (++count
>= icount
) {
474 * Sync back cached data.
477 for (vswp
= &vfssw
[1]; vswp
< &vfssw
[nfstype
]; vswp
++) {
478 if (ALLOCATED_VFSSW(vswp
) && VFS_INSTALLED(vswp
)) {
481 (void) fsop_sync_by_kind(vswp
- vfssw
,
483 vfs_unrefvfssw(vswp
);