4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright 2012 Milan Jurik. All rights reserved.
28 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/signal.h>
33 #include <sys/sysmacros.h>
34 #include <sys/cmn_err.h>
38 #include <sys/project.h>
40 #include <sys/vnode.h>
42 #include <sys/fcntl.h>
43 #include <sys/flock.h>
45 #include <sys/stream.h>
46 #include <sys/strsubr.h>
48 #include <sys/class.h>
52 #include <sys/exechdr.h>
54 #include <sys/resource.h>
57 #include <vm/seg_kmem.h>
58 #include <sys/vmparam.h>
59 #include <sys/machparam.h>
60 #include <sys/utsname.h>
62 #include <sys/stack.h>
63 #include <sys/modctl.h>
64 #include <sys/fdbuffer.h>
65 #include <sys/cyclic_impl.h>
67 #include <sys/tuneable.h>
68 #include <sys/systeminfo.h>
71 #include <sys/clock.h>
72 #include <sys/clock_impl.h>
73 #include <sys/serializer.h>
76 * The following few lines describe generic things that must be compiled
77 * into the booted executable (unix) rather than genunix or any other
78 * module because they're required by crash dump readers, etc.
80 struct modctl modules
; /* head of linked list of modules */
81 char *default_path
; /* default module loading path */
82 struct swapinfo
*swapinfo
; /* protected by the swapinfo_lock */
83 proc_t
*practive
; /* active process list */
84 uint_t nproc
; /* current number of processes */
85 proc_t p0
; /* process 0 */
86 struct plock p0lock
; /* p0's p_lock */
87 klwp_t lwp0
; /* t0's lwp */
88 task_t
*task0p
; /* task 0 */
89 kproject_t
*proj0p
; /* location of project 0 */
92 * The following are "implementation architecture" dependent constants made
93 * available here in the form of initialized data for use by "implementation
94 * architecture" independent modules. See machparam.h.
96 const unsigned long _pagesize
= (unsigned long)PAGESIZE
;
97 const unsigned int _pageshift
= (unsigned int)PAGESHIFT
;
98 const unsigned long _pageoffset
= (unsigned long)PAGEOFFSET
;
100 * XXX - This value pagemask has to be a 64bit size because
101 * large file support uses this mask on offsets which are 64 bit size.
102 * using unsigned leaves the higher 32 bits value as zero thus
103 * corrupting offset calculations in the file system and VM.
105 const u_longlong_t _pagemask
= (u_longlong_t
)PAGEMASK
;
106 const unsigned long _mmu_pagesize
= (unsigned long)MMU_PAGESIZE
;
107 const unsigned int _mmu_pageshift
= (unsigned int)MMU_PAGESHIFT
;
108 const unsigned long _mmu_pageoffset
= (unsigned long)MMU_PAGEOFFSET
;
109 const unsigned long _mmu_pagemask
= (unsigned long)MMU_PAGEMASK
;
110 uintptr_t _kernelbase
= (uintptr_t)KERNELBASE
;
111 uintptr_t _userlimit
= (uintptr_t)USERLIMIT
;
112 uintptr_t _userlimit32
= (uintptr_t)USERLIMIT32
;
113 const uintptr_t _argsbase
= (uintptr_t)ARGSBASE
;
114 const unsigned int _diskrpm
= (unsigned int)DISKRPM
;
115 const unsigned long _pgthresh
= (unsigned long)PGTHRESH
;
116 const unsigned int _maxslp
= (unsigned int)MAXSLP
;
117 const unsigned long _maxhandspreadpages
= (unsigned long)MAXHANDSPREADPAGES
;
118 const int _ncpu
= (int)NCPU
;
119 const int _ncpu_log2
= (int)NCPU_LOG2
;
120 const int _ncpu_p2
= (int)NCPU_P2
;
121 const unsigned long _defaultstksz
= (unsigned long)DEFAULTSTKSZ
;
122 const unsigned int _nbpg
= (unsigned int)MMU_PAGESIZE
;
125 * System parameter formulae.
127 * This file is copied into each directory where we compile
128 * the kernel; it should be modified there to suit local taste
133 * Default hz is 100, but if we set hires_tick we get higher resolution
134 * clock behavior (currently defined to be 1000 hz). Higher values seem
135 * to work, but are not supported.
137 * If we do decide to play with higher values, remember that hz should
138 * satisfy the following constraints to avoid integer round-off problems:
140 * (1) hz should be in the range 100 <= hz <= MICROSEC. If hz exceeds
141 * MICROSEC, usec_per_tick will be zero and lots of stuff will break.
142 * Similarly, if hz < 100 then hz / 100 == 0 and stuff will break.
144 * (2) If hz <= 1000, it should be both a multiple of 100 and a divisor of 1000.
147 * (3) If hz > 1000, it should be both a multiple of 1000 and a
148 * divisor of MICROSEC.
150 * Thus the only reasonable values of hz (i.e. the values that won't
151 * cause roundoff error) are: 100, 200, 500, 1000, 2000, 4000, 5000,
152 * 8000, 10000, 20000, 25000, 40000, 50000, 100000, 125000, 200000,
153 * 250000, 500000, 1000000. As of this writing (1996) a clock rate
154 * of more than about 10 kHz seems utterly ridiculous, although
155 * this observation will no doubt seem quaintly amusing one day.
/* Tick rate used for hires_hz unless overridden. */
#define	HIRES_HZ_DEFAULT	1000

int hires_hz = HIRES_HZ_DEFAULT;

int cpu_decay_factor = 10;	/* this is no longer tied to clock */
int max_hres_adj;	/* maximum adjustment of hrtime per tick */
int tick_per_msec;	/* clock ticks per millisecond (zero if hz < 1000) */
168 * Milliseconds, Microseconds, and Nanoseconds per clock tick
171 * msec_per_tick is zero if hz > 1000
178 * Time Resolution values. These are defined in condvar.h and initialized in
179 * param_init(). Consumers of cv_reltimedwait() and cv_reltimedwait_sig()
180 * need to specify how accurate the timeout argument should be through
181 * one of these values. The intention is to allow the underlying implementation
182 * to anticipate or defer the expiration of timeouts, preventing unnecessary
183 * wakeups by batch processing similarly expiring events.
185 time_res_t time_res
[TR_COUNT
];
188 * Setting "snooping" to a non-zero value will cause a deadman panic if
189 * snoop_interval microseconds elapse without lbolt increasing. The default
190 * snoop_interval is 50 seconds.
192 #define SNOOP_INTERVAL_MIN (MICROSEC)
193 #define SNOOP_INTERVAL_DEFAULT (50 * MICROSEC)
196 uint_t snoop_interval
= SNOOP_INTERVAL_DEFAULT
;
/*
 * Tables of initialization functions, called from main().
 *
 * Forward declarations of the initialization routines; all of them are
 * defined elsewhere and take no arguments.
 */
extern void system_taskq_init(void);
extern void binit(void);
extern void space_init(void);
extern void dnlc_init(void);
extern void vfsinit(void);
extern void finit(void);
extern void strinit(void);
extern void flk_init(void);
extern void ftrace_init(void);
extern void softcall_init(void);
extern void ttyinit(void);
extern void schedctl_init(void);
extern void deadman_init(void);
extern void clock_timer_init(void);
extern void clock_realtime_init(void);
extern void clock_highres_init(void);
extern void clock_tick_mp_init(void);
extern void cu_init(void);
extern void callout_mp_init(void);
extern void cpu_seq_tbl_init(void);
223 void (*init_tbl
[])(void) = {
/*
 * Declared with a full prototype: an empty parameter list "()" would
 * leave the arguments unchecked by the compiler.
 */
extern void siron_mp_init(void);
255 * Any per cpu resources should be initialized via
256 * an entry in mp_init_tbl().
258 void (*mp_init_tbl
[])(void) = {
270 int maxusers
; /* kitchen-sink knob for dynamic configuration */
273 * pidmax -- highest pid value assigned by the system
274 * Settable in /etc/system
276 int pidmax
= DEFAULT_MAXPID
;
279 * jump_pid - if set, this value is where pid numbers should start
280 * after the first few system pids (0-3) are used. If 0, pids are
281 * chosen in the usual way. This variable can be used to quickly
282 * create large pids (by setting it to 100000, for example). pids
283 * less than this value will never be chosen.
285 pid_t jump_pid
= DEFAULT_JUMPPID
;
/*
 * autoup -- used in struct var for dynamic config of the age a delayed-write
 * buffer must be in seconds before bdflush will write it out.
 */
#define	DEFAULT_AUTOUP	30
int autoup = DEFAULT_AUTOUP;
295 * bufhwm -- tuneable variable for struct var for v_bufhwm.
296 * high water mark for buffer cache mem usage in units of K bytes.
298 * bufhwm_pct -- ditto, but given in % of physmem.
int max_nprocs;		/* set in param_init() */
int maxuprc;		/* set in param_init() */

int ufs_ninode;		/* declared here due to backwards compatibility */
int ndquot;		/* declared here due to backwards compatibility */
/*
 * Exec switch table. This is used by the generic exec module
 * to switch out to the desired executable type, based on the
 * magic number. The currently supported types are ELF, a.out
 * (both NMAGIC and ZMAGIC), interpreter (#!) files,
 * and Java executables.
 */
short elfmagic = 0x7f45;	/* bytes 0x7f, 'E' */
short intpmagic = 0x2321;	/* bytes '#', '!' */
short jmagic = 0x504b;		/* bytes 'P', 'K' */
333 short aout_nmagic
= NMAGIC
;
334 short aout_zmagic
= ZMAGIC
;
335 short aout_omagic
= OMAGIC
;
/*
 * Magic strings, one per executable type.
 *
 * The ELF strings are written as two adjacent literals so that the
 * "\x7f" escape ends before the 'E' (which would otherwise be consumed
 * as a further hexadecimal digit).
 */
#define	ELF32MAGIC_STRING	"\x7f""ELF\x1"
#define	ELF64MAGIC_STRING	"\x7f""ELF\x2"
#define	INTPMAGIC_STRING	"#!"
#define	JAVAMAGIC_STRING	"PK\003\004"
#define	AOUT_OMAGIC_STRING	"\x1""\x07"	/* 0407 */
#define	AOUT_NMAGIC_STRING	"\x1""\x08"	/* 0410 */
#define	AOUT_ZMAGIC_STRING	"\x1""\x0b"	/* 0413 */
#define	NOMAGIC_STRING		""

/* SHBIN_CNTL(x) keeps the low five bits of x (the control-key form). */
#define	SHBIN_CNTL(x)	((x)&037)
#define	SHBINMAGIC_STRING {SHBIN_CNTL('k'), SHBIN_CNTL('s'), SHBIN_CNTL('h'), 0}
#define	SHBINMAGIC_LEN	4

char elf32magicstr[] = ELF32MAGIC_STRING;
char elf64magicstr[] = ELF64MAGIC_STRING;
char intpmagicstr[] = INTPMAGIC_STRING;
char shbinmagicstr[] = SHBINMAGIC_STRING;
char javamagicstr[] = JAVAMAGIC_STRING;

char aout_nmagicstr[] = AOUT_NMAGIC_STRING;
char aout_zmagicstr[] = AOUT_ZMAGIC_STRING;
char aout_omagicstr[] = AOUT_OMAGIC_STRING;

char nomagicstr[] = NOMAGIC_STRING;
367 char *execswnames
[] = {
368 "elfexec", /* Elf32 */
370 "elfexec", /* Elf64 */
385 struct execsw execsw
[] = {
386 { elf32magicstr
, 0, 5, NULL
, NULL
, NULL
},
388 { elf64magicstr
, 0, 5, NULL
, NULL
, NULL
},
390 { intpmagicstr
, 0, 2, NULL
, NULL
, NULL
},
391 { shbinmagicstr
, 0, SHBINMAGIC_LEN
, NULL
, NULL
, NULL
},
392 { javamagicstr
, 0, 4, NULL
, NULL
, NULL
},
394 { aout_zmagicstr
, 2, 2, NULL
, NULL
, NULL
},
395 { aout_nmagicstr
, 2, 2, NULL
, NULL
, NULL
},
396 { aout_omagicstr
, 2, 2, NULL
, NULL
, NULL
},
398 { nomagicstr
, 0, 0, NULL
, NULL
, NULL
},
399 { nomagicstr
, 0, 0, NULL
, NULL
, NULL
},
400 { nomagicstr
, 0, 0, NULL
, NULL
, NULL
},
401 { nomagicstr
, 0, 0, NULL
, NULL
, NULL
}
403 int nexectype
= sizeof (execsw
) / sizeof (execsw
[0]); /* # of exec types */
404 kmutex_t execsw_lock
; /* Used for allocation of execsw entries */
407 * symbols added to make changing proc.max-file-descriptor
408 * simple via /etc/system
410 #define RLIM_FD_CUR 0x10000
411 #define RLIM_FD_MAX 0x10000
413 uint_t rlim_fd_cur
= RLIM_FD_CUR
;
414 uint_t rlim_fd_max
= RLIM_FD_MAX
;
417 * (Default resource limits were formerly declared here, but are now provided by
418 * the more general resource controls framework.)
/* Limits on stream depth and on the size of user-generated messages. */
int nstrpush = 9;		/* maximum # of modules/drivers on a stream */
ssize_t strctlsz = 1024;	/* maximum size of user-generated M_PROTO */
ssize_t strmsgsz = 0x10000;	/* maximum size of user-generated M_DATA */
				/* for `strmsgsz', zero means unlimited */
429 * Filesystem tunables
431 int rstchown
= 1; /* POSIX_CHOWN_RESTRICTED is enabled */
432 int ngroups_max
= NGROUPS_MAX_DEFAULT
;
435 * generic scheduling stuff
437 * Configurable parameters for RT and TS are in the respective
438 * scheduling class modules.
441 pri_t maxclsyspri
= MAXCLSYSPRI
;
442 pri_t minclsyspri
= MINCLSYSPRI
;
443 char sys_name
[] = "SYS";
445 extern pri_t
sys_init(id_t
, int, classfuncs_t
**);
446 extern classfuncs_t sys_classfuncs
;
448 sclass_t sclass
[] = {
449 { "SYS", sys_init
, &sys_classfuncs
, STATIC_SCHED
, 0 },
450 { "", NULL
, NULL
, NULL
, 0 },
451 { "", NULL
, NULL
, NULL
, 0 },
452 { "", NULL
, NULL
, NULL
, 0 },
453 { "", NULL
, NULL
, NULL
, 0 },
454 { "", NULL
, NULL
, NULL
, 0 },
455 { "", NULL
, NULL
, NULL
, 0 },
456 { "", NULL
, NULL
, NULL
, 0 },
457 { "", NULL
, NULL
, NULL
, 0 },
458 { "", NULL
, NULL
, NULL
, 0 }
461 int loaded_classes
= 1; /* for loaded classes */
462 kmutex_t class_lock
; /* lock for class[] */
464 int nclass
= sizeof (sclass
) / sizeof (sclass_t
);
465 char initcls
[] = "TS";
466 char *defaultclass
= initcls
;
469 * Tunable system parameters.
473 * The integers tune_* are done this way so that the tune
474 * data structure may be "tuned" if necessary from the /etc/system
475 * file. The tune data structure is initialized in param_init();
/*
 * If freemem < t_gpgslo, then start to steal pages from processes.
 */
int tune_t_gpgslo = 25;

/*
 * Rate at which fsflush is run, in seconds.
 */
#define	DEFAULT_TUNE_T_FSFLUSHR	1
int tune_t_fsflushr = DEFAULT_TUNE_T_FSFLUSHR;

/*
 * The minimum available resident (not swappable) memory to maintain
 * in order to avoid deadlock. In pages.
 */
int tune_t_minarmem = 25;

/*
 * The minimum available swappable memory to maintain in order to avoid
 * deadlock. In pages.
 */
int tune_t_minasmem = 25;

int tune_t_flckrec = 512;	/* max # of active frlocks */
506 * Number of currently available pages that cannot be 'locked'
507 * This is set in init_pages_pp_maximum, and must be initialized
508 * to zero here to detect an override in /etc/system
510 pgcnt_t pages_pp_maximum
= 0;
512 int boothowto
; /* boot flags passed to kernel */
513 struct var v
; /* System Configuration Information */
516 * System Configuration Information
520 * The physical system's host identifier, expressed as a decimal string.
521 * Code should only directly access this value when writing to it (setting the
522 * physical system's host identifier). Code that reads the physical system's
523 * host identifier should use zone_get_hostid(NULL) instead.
525 char hw_serial
[HW_HOSTID_LEN
] = "0";
530 * On sparc machines, read hw_serial from the firmware at boot time
531 * and simply assert Oracle is the hardware provider.
533 char architecture
[] = "sparcv9";
534 char architecture_32
[] = "sparc";
535 char hw_provider
[] = "Oracle Corporation";
537 #elif defined(__i386)
539 char architecture
[] = "i386";
540 char architecture_32
[] = "i386";
541 char hw_provider
[SYS_NMLN
] = "";
543 #elif defined(__amd64)
545 char architecture
[] = "amd64";
546 char architecture_32
[] = "i386";
547 char hw_provider
[SYS_NMLN
] = "";
550 #error "unknown processor architecture"
553 char srpc_domain
[SYS_NMLN
] = "";
554 char platform
[SYS_NMLN
] = ""; /* read from the devinfo root node */
556 /* Initialize isa_list */
557 char *isa_list
= architecture
;
559 static pgcnt_t original_physmem
= 0;
561 #define MIN_DEFAULT_MAXUSERS 8u
562 #define MAX_DEFAULT_MAXUSERS 2048u
563 #define MAX_MAXUSERS 4096u
568 original_physmem
= physmem
;
572 param_calc(int platform_max_nprocs
)
575 * Default to about one "user" per megabyte, taking into
576 * account both physical and virtual constraints.
577 * Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT)
578 * converts pages to megs without integer overflow.
581 if (physmem
> original_physmem
) {
582 physmem
= original_physmem
;
583 cmn_err(CE_NOTE
, "physmem limited to %ld", physmem
);
587 pgcnt_t physmegs
= physmem
>> (20 - PAGESHIFT
);
588 pgcnt_t virtmegs
= vmem_size(heap_arena
, VMEM_FREE
) >> 20;
589 maxusers
= MIN(MAX(MIN(physmegs
, virtmegs
),
590 MIN_DEFAULT_MAXUSERS
), MAX_DEFAULT_MAXUSERS
);
592 if (maxusers
> MAX_MAXUSERS
) {
593 maxusers
= MAX_MAXUSERS
;
594 cmn_err(CE_NOTE
, "maxusers limited to %d", MAX_MAXUSERS
);
599 * The purpose of maxusers is to prevent memory overcommit.
600 * DEBUG kernels take more space, so reduce maxusers a bit.
602 maxusers
= (3 * maxusers
) / 4;
606 * We need to dynamically change any variables now so that
607 * the setting of maxusers and pidmax propagate to the other
608 * variables that are dependent on them.
610 if (reserved_procs
== 0)
612 if (pidmax
< reserved_procs
|| pidmax
> MAX_MAXPID
)
618 * This allows platform-dependent code to constrain the maximum
619 * number of processes allowed in case there are e.g. VM limitations
620 * with how many contexts are available.
623 max_nprocs
= (10 + 16 * maxusers
);
624 if (platform_max_nprocs
> 0 && max_nprocs
> platform_max_nprocs
)
625 max_nprocs
= platform_max_nprocs
;
626 if (max_nprocs
> maxpid
)
630 maxuprc
= (max_nprocs
- reserved_procs
);
637 * Set each individual element of struct var v to be the
638 * default value. This is done this way
639 * so that a user can set the assigned integer value in the
640 * /etc/system file *IF* tuning is needed.
642 v
.v_proc
= max_nprocs
; /* v_proc - max # of processes system wide */
643 v
.v_maxupttl
= max_nprocs
- reserved_procs
;
644 v
.v_maxsyspri
= (int)maxclsyspri
; /* max global pri for sysclass */
645 v
.v_maxup
= MIN(maxuprc
, v
.v_maxupttl
); /* max procs per user */
646 v
.v_autoup
= autoup
; /* v_autoup - delay for delayed writes */
649 * Set each individual element of struct tune to be the
650 * default value. This is done this way
651 * so that a user can set the assigned integer value in the
652 * /etc/system file *IF* tuning is needed.
654 tune
.t_gpgslo
= tune_t_gpgslo
;
655 tune
.t_fsflushr
= tune_t_fsflushr
;
656 tune
.t_minarmem
= tune_t_minarmem
;
657 tune
.t_minasmem
= tune_t_minasmem
;
658 tune
.t_flckrec
= tune_t_flckrec
;
661 * Initialization for file descriptors to correct mistaken settings in
662 * /etc/system. Initialization of limits is performed by the resource control framework.
665 if (rlim_fd_cur
> rlim_fd_max
)
666 rlim_fd_cur
= rlim_fd_max
;
669 * calculations needed if hz was set in /etc/system
674 tick_per_msec
= hz
/ MILLISEC
;
675 msec_per_tick
= MILLISEC
/ hz
;
676 usec_per_tick
= MICROSEC
/ hz
;
677 nsec_per_tick
= NANOSEC
/ hz
;
678 max_hres_adj
= nsec_per_tick
>> ADJ_SHIFT
;
681 * Consumers of relative timedwait functions must specify how accurately
682 * the given timeout must expire. This is currently TR_CLOCK_TICK for
683 * the vast majority of consumers, but nsec_per_tick becomes an
684 * artificial value in a tickless world. Each caller of such routines
685 * should re-evaluate their usage and specify the appropriate resolution.
688 time_res
[TR_NANOSEC
] = NANOSEC
/ NANOSEC
;
689 time_res
[TR_MICROSEC
] = NANOSEC
/ MICROSEC
;
690 time_res
[TR_MILLISEC
] = NANOSEC
/ MILLISEC
;
691 time_res
[TR_SEC
] = NANOSEC
/ SEC
;
692 time_res
[TR_CLOCK_TICK
] = nsec_per_tick
;
696 * Validate tuneable parameters following /etc/system processing,
697 * but prior to param_init().
703 if (physmem
!= original_physmem
) {
704 cmn_err(CE_NOTE
, "physmem cannot be modified to 0x%lx"
705 " via /etc/system. Please use eeprom(1M) instead.",
707 physmem
= original_physmem
;
710 if (ngroups_max
< NGROUPS_UMIN
)
711 ngroups_max
= NGROUPS_UMIN
;
712 if (ngroups_max
> NGROUPS_UMAX
)
713 ngroups_max
= NGROUPS_UMAX
;
715 /* If we have many groups then the ucred proto message also grows. */
716 if (ngroups_max
> NGROUPS_OLDMAX
&&
717 strctlsz
< (ngroups_max
- NGROUPS_OLDMAX
) * sizeof (gid_t
) + 1024) {
718 strctlsz
= (ngroups_max
- NGROUPS_OLDMAX
) * sizeof (gid_t
) +
723 autoup
= DEFAULT_AUTOUP
;
724 cmn_err(CE_WARN
, "autoup <= 0; defaulting to %d", autoup
);
727 if (tune_t_fsflushr
<= 0) {
728 tune_t_fsflushr
= DEFAULT_TUNE_T_FSFLUSHR
;
729 cmn_err(CE_WARN
, "tune_t_fsflushr <= 0; defaulting to %d",
733 if (jump_pid
< 0 || jump_pid
>= pidmax
) {
735 cmn_err(CE_WARN
, "jump_pid < 0 or >= pidmax; ignored");
738 if (snoop_interval
< SNOOP_INTERVAL_MIN
) {
739 snoop_interval
= SNOOP_INTERVAL_DEFAULT
;
740 cmn_err(CE_WARN
, "snoop_interval < minimum (%d); defaulting"
741 " to %d", SNOOP_INTERVAL_MIN
, SNOOP_INTERVAL_DEFAULT
);