treewide: open64() -> open()
[unleashed.git] / usr / src / cmd / zonestat / zonestatd / zonestatd.c
blob85795393e86aa115cc7100c090f4518a49e5fe88
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2014 Garrett D'Amore <garrett@damore.org>
24 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
26 #include <alloca.h>
27 #include <assert.h>
28 #include <dirent.h>
29 #include <dlfcn.h>
30 #include <door.h>
31 #include <errno.h>
32 #include <exacct.h>
33 #include <ctype.h>
34 #include <fcntl.h>
35 #include <kstat.h>
36 #include <libcontract.h>
37 #include <libintl.h>
38 #include <libscf.h>
39 #include <zonestat.h>
40 #include <zonestat_impl.h>
41 #include <limits.h>
42 #include <pool.h>
43 #include <procfs.h>
44 #include <priv.h>
45 #include <rctl.h>
46 #include <thread.h>
47 #include <signal.h>
48 #include <stdarg.h>
49 #include <stddef.h>
50 #include <stdio.h>
51 #include <stdlib.h>
52 #include <strings.h>
53 #include <synch.h>
54 #include <sys/acctctl.h>
55 #include <sys/contract/process.h>
56 #include <sys/ctfs.h>
57 #include <sys/fork.h>
58 #include <sys/param.h>
59 #include <sys/priocntl.h>
60 #include <sys/fxpriocntl.h>
61 #include <sys/processor.h>
62 #include <sys/pset.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/statvfs.h>
66 #include <sys/swap.h>
67 #include <sys/systeminfo.h>
68 #include <thread.h>
69 #include <sys/list.h>
70 #include <sys/time.h>
71 #include <sys/types.h>
72 #include <sys/vm_usage.h>
73 #include <sys/wait.h>
74 #include <sys/zone.h>
75 #include <time.h>
76 #include <ucred.h>
77 #include <unistd.h>
78 #include <vm/anon.h>
79 #include <zone.h>
80 #include <zonestat.h>
82 #define MAX_PSET_NAME 1024 /* Taken from PV_NAME_MAX_LEN */
83 #define ZSD_PSET_UNLIMITED UINT16_MAX
84 #define ZONESTAT_EXACCT_FILE "/var/adm/exacct/zonestat-process"
87 * zonestatd implements gathering cpu and memory utilization data for
88 * running zones. It has these components:
90 * zsd_server:
91 * Door server to respond to client connections. Each client
92 * will connect using libzonestat.so, which will open and
93 * call /var/tmp/.zonestat_door. Each connecting client is given
94 * a file descriptor to the stat server.
96 * The zsd_server also responds to zoneadmd, which reports when a
97 * new zone is booted. This is used to fattach the zsd_server door
98 * into the new zone.
100 * zsd_stat_server:
101 * Receives client requests for the current utilization data. Each
102 * client request will cause zonestatd to update the current utilization
103 * data by kicking the stat_thread.
105 * If the client is in a non-global zone, the utilization data will
106 * be filtered to only show the given zone. The usage by all other zones
107 * will be added to the system utilization.
109 * stat_thread:
110 * The stat thread implements querying the system to determine the
111 * current utilization data for each running zone. This includes
112 * inspecting the system's processor set configuration, as well as details
113 * of each zone, such as their configured limits, and which processor
114 * sets they are running in.
116 * The stat_thread will only update memory utilization data as often as
117 * the configured config/sample_interval on the zones-monitoring service.
121 * The private vmusage structure unfortunately uses size_t types, and assumes
122 * the caller's bitness matches the kernel's bitness. Since the getvmusage()
123 * system call is contracted, and zonestatd is 32 bit, the following structures
124 * are used to interact with a 32bit or 64 bit kernel.
126 typedef struct zsd_vmusage32 {
127 id_t vmu_zoneid;
128 uint_t vmu_type;
129 id_t vmu_id;
131 uint32_t vmu_rss_all;
132 uint32_t vmu_rss_private;
133 uint32_t vmu_rss_shared;
134 uint32_t vmu_swap_all;
135 uint32_t vmu_swap_private;
136 uint32_t vmu_swap_shared;
137 } zsd_vmusage32_t;
139 typedef struct zsd_vmusage64 {
140 id_t vmu_zoneid;
141 uint_t vmu_type;
142 id_t vmu_id;
144 * An amd64 kernel will align the following uint64_t members, but a
145 * 32bit i386 process will not without help.
147 int vmu_align_next_members_on_8_bytes;
148 uint64_t vmu_rss_all;
149 uint64_t vmu_rss_private;
150 uint64_t vmu_rss_shared;
151 uint64_t vmu_swap_all;
152 uint64_t vmu_swap_private;
153 uint64_t vmu_swap_shared;
154 } zsd_vmusage64_t;
156 struct zsd_zone;
158 /* Used to store a zone's usage of a pset */
159 typedef struct zsd_pset_usage {
160 struct zsd_zone *zsu_zone;
161 struct zsd_pset *zsu_pset;
163 list_node_t zsu_next;
165 zoneid_t zsu_zoneid;
166 boolean_t zsu_found; /* zone bound at end of interval */
167 boolean_t zsu_active; /* zone was bound during interval */
168 boolean_t zsu_new; /* zone newly bound in this interval */
169 boolean_t zsu_deleted; /* zone was unbound in this interval */
170 boolean_t zsu_empty; /* no procs in pset in this interval */
171 time_t zsu_start; /* time when zone was found in pset */
172 hrtime_t zsu_hrstart; /* time when zone was found in pset */
173 uint64_t zsu_cpu_shares;
174 uint_t zsu_scheds; /* schedulers found in this pass */
175 timestruc_t zsu_cpu_usage; /* cpu time used */
176 } zsd_pset_usage_t;
178 /* Used to store a pset's utilization */
179 typedef struct zsd_pset {
180 psetid_t zsp_id;
181 list_node_t zsp_next;
182 char zsp_name[ZS_PSETNAME_MAX];
184 uint_t zsp_cputype; /* default, dedicated or shared */
185 boolean_t zsp_found; /* pset found at end of interval */
186 boolean_t zsp_new; /* pset new in this interval */
187 boolean_t zsp_deleted; /* pset deleted in this interval */
188 boolean_t zsp_active; /* pset existed during interval */
189 boolean_t zsp_empty; /* no processes in pset */
190 time_t zsp_start;
191 hrtime_t zsp_hrstart;
193 uint64_t zsp_online; /* online cpus in interval */
194 uint64_t zsp_size; /* size in this interval */
195 uint64_t zsp_min; /* configured min in this interval */
196 uint64_t zsp_max; /* configured max in this interval */
197 int64_t zsp_importance; /* configured max in this interval */
199 uint_t zsp_scheds; /* scheds of processes found in pset */
200 uint64_t zsp_cpu_shares; /* total shares in this interval */
202 timestruc_t zsp_total_time;
203 timestruc_t zsp_usage_kern;
204 timestruc_t zsp_usage_zones;
206 /* Individual zone usages of pset */
207 list_t zsp_usage_list;
208 int zsp_nusage;
210 /* Summed kstat values from individual cpus in pset */
211 timestruc_t zsp_idle;
212 timestruc_t zsp_intr;
213 timestruc_t zsp_kern;
214 timestruc_t zsp_user;
216 } zsd_pset_t;
218 /* Used to track an individual cpu's utilization as reported by kstats */
219 typedef struct zsd_cpu {
220 processorid_t zsc_id;
221 list_node_t zsc_next;
222 psetid_t zsc_psetid;
223 psetid_t zsc_psetid_prev;
224 zsd_pset_t *zsc_pset;
226 boolean_t zsc_found; /* cpu online in this interval */
227 boolean_t zsc_onlined; /* cpu onlined during this interval */
228 boolean_t zsc_offlined; /* cpu offlined during this interval */
229 boolean_t zsc_active; /* cpu online during this interval */
230 boolean_t zsc_allocated; /* True if cpu has ever been found */
232 /* kstats this interval */
233 uint64_t zsc_nsec_idle;
234 uint64_t zsc_nsec_intr;
235 uint64_t zsc_nsec_kern;
236 uint64_t zsc_nsec_user;
238 /* kstats in most recent interval */
239 uint64_t zsc_nsec_idle_prev;
240 uint64_t zsc_nsec_intr_prev;
241 uint64_t zsc_nsec_kern_prev;
242 uint64_t zsc_nsec_user_prev;
244 /* Total kstat increases since zonestatd started reading kstats */
245 timestruc_t zsc_idle;
246 timestruc_t zsc_intr;
247 timestruc_t zsc_kern;
248 timestruc_t zsc_user;
250 } zsd_cpu_t;
252 /* Used to describe an individual zone and its utilization */
253 typedef struct zsd_zone {
254 zoneid_t zsz_id;
255 list_node_t zsz_next;
256 char zsz_name[ZS_ZONENAME_MAX];
257 uint_t zsz_cputype;
258 uint_t zsz_iptype;
259 time_t zsz_start;
260 hrtime_t zsz_hrstart;
262 char zsz_pool[ZS_POOLNAME_MAX];
263 char zsz_pset[ZS_PSETNAME_MAX];
264 int zsz_default_sched;
265 /* These are deduced by inspecting processes */
266 psetid_t zsz_psetid;
267 uint_t zsz_scheds;
269 boolean_t zsz_new; /* zone booted during this interval */
270 boolean_t zsz_deleted; /* halted during this interval */
271 boolean_t zsz_active; /* running in this interval */
272 boolean_t zsz_empty; /* no processes in this interval */
273 boolean_t zsz_gone; /* not installed in this interval */
274 boolean_t zsz_found; /* Running at end of this interval */
276 uint64_t zsz_cpu_shares;
277 uint64_t zsz_cpu_cap;
278 uint64_t zsz_ram_cap;
279 uint64_t zsz_locked_cap;
280 uint64_t zsz_vm_cap;
282 uint64_t zsz_cpus_online;
283 timestruc_t zsz_cpu_usage; /* cpu time of cpu cap */
284 timestruc_t zsz_cap_time; /* cpu time of cpu cap */
285 timestruc_t zsz_share_time; /* cpu time of share of cpu */
286 timestruc_t zsz_pset_time; /* time of all psets zone is bound to */
288 uint64_t zsz_usage_ram;
289 uint64_t zsz_usage_locked;
290 uint64_t zsz_usage_vm;
292 uint64_t zsz_processes_cap;
293 uint64_t zsz_lwps_cap;
294 uint64_t zsz_shm_cap;
295 uint64_t zsz_shmids_cap;
296 uint64_t zsz_semids_cap;
297 uint64_t zsz_msgids_cap;
298 uint64_t zsz_lofi_cap;
300 uint64_t zsz_processes;
301 uint64_t zsz_lwps;
302 uint64_t zsz_shm;
303 uint64_t zsz_shmids;
304 uint64_t zsz_semids;
305 uint64_t zsz_msgids;
306 uint64_t zsz_lofi;
308 } zsd_zone_t;
311 * Used to track the cpu usage of an individual process.
313 * zonestatd sweeps /proc each interval and charges the cpu usage of processes.
314 * to their zone. As processes exit, their extended accounting records are
315 * read and the difference of their total and known usage is charged to their
316 * zone.
318 * If a process is never seen in /proc, the total usage on its extended
319 * accounting record will be charged to its zone.
321 typedef struct zsd_proc {
322 list_node_t zspr_next;
323 pid_t zspr_ppid;
324 psetid_t zspr_psetid;
325 zoneid_t zspr_zoneid;
326 int zspr_sched;
327 timestruc_t zspr_usage;
328 } zsd_proc_t;
330 /* Used to track the overall resource usage of the system */
331 typedef struct zsd_system {
333 uint64_t zss_ram_total;
334 uint64_t zss_ram_kern;
335 uint64_t zss_ram_zones;
337 uint64_t zss_locked_kern;
338 uint64_t zss_locked_zones;
340 uint64_t zss_vm_total;
341 uint64_t zss_vm_kern;
342 uint64_t zss_vm_zones;
344 uint64_t zss_swap_total;
345 uint64_t zss_swap_used;
347 timestruc_t zss_idle;
348 timestruc_t zss_intr;
349 timestruc_t zss_kern;
350 timestruc_t zss_user;
352 timestruc_t zss_cpu_total_time;
353 timestruc_t zss_cpu_usage_kern;
354 timestruc_t zss_cpu_usage_zones;
356 uint64_t zss_maxpid;
357 uint64_t zss_processes_max;
358 uint64_t zss_lwps_max;
359 uint64_t zss_shm_max;
360 uint64_t zss_shmids_max;
361 uint64_t zss_semids_max;
362 uint64_t zss_msgids_max;
363 uint64_t zss_lofi_max;
365 uint64_t zss_processes;
366 uint64_t zss_lwps;
367 uint64_t zss_shm;
368 uint64_t zss_shmids;
369 uint64_t zss_semids;
370 uint64_t zss_msgids;
371 uint64_t zss_lofi;
373 uint64_t zss_ncpus;
374 uint64_t zss_ncpus_online;
376 } zsd_system_t;
379 * A dumping ground for various information and structures used to compute
380 * utilization.
382 * This structure is used to track the system while clients are connected.
383 * When The first client connects, a zsd_ctl is allocated and configured by
384 * zsd_open(). When all clients disconnect, the zsd_ctl is closed.
386 typedef struct zsd_ctl {
387 kstat_ctl_t *zsctl_kstat_ctl;
389 /* To track extended accounting */
390 int zsctl_proc_fd; /* Log currently being used */
391 ea_file_t zsctl_proc_eaf;
392 struct stat zsctl_proc_stat;
393 int zsctl_proc_open;
394 int zsctl_proc_fd_next; /* Log file to use next */
395 ea_file_t zsctl_proc_eaf_next;
396 struct stat zsctl_proc_stat_next;
397 int zsctl_proc_open_next;
399 /* pool configuration handle */
400 pool_conf_t *zsctl_pool_conf;
401 int zsctl_pool_status;
402 int zsctl_pool_changed;
404 /* The above usage tacking structures */
405 zsd_system_t *zsctl_system;
406 list_t zsctl_zones;
407 list_t zsctl_psets;
408 list_t zsctl_cpus;
409 zsd_cpu_t *zsctl_cpu_array;
410 zsd_proc_t *zsctl_proc_array;
412 /* Various system info */
413 uint64_t zsctl_maxcpuid;
414 uint64_t zsctl_maxproc;
415 uint64_t zsctl_kern_bits;
416 uint64_t zsctl_pagesize;
418 /* Used to track time available under a cpu cap. */
419 uint64_t zsctl_hrtime;
420 uint64_t zsctl_hrtime_prev;
421 timestruc_t zsctl_hrtime_total;
423 struct timeval zsctl_timeofday;
425 /* Caches for arrays allocated for use by various system calls */
426 psetid_t *zsctl_pset_cache;
427 uint_t zsctl_pset_ncache;
428 processorid_t *zsctl_cpu_cache;
429 uint_t zsctl_cpu_ncache;
430 zoneid_t *zsctl_zone_cache;
431 uint_t zsctl_zone_ncache;
432 struct swaptable *zsctl_swap_cache;
433 uint64_t zsctl_swap_cache_size;
434 uint64_t zsctl_swap_cache_num;
435 zsd_vmusage64_t *zsctl_vmusage_cache;
436 uint64_t zsctl_vmusage_cache_num;
438 /* Info about procfs for scanning /proc */
439 struct dirent *zsctl_procfs_dent;
440 long zsctl_procfs_dent_size;
441 pool_value_t *zsctl_pool_vals[3];
443 /* Counts on tracked entities */
444 uint_t zsctl_nzones;
445 uint_t zsctl_npsets;
446 uint_t zsctl_npset_usages;
447 } zsd_ctl_t;
449 zsd_ctl_t *g_ctl;
450 boolean_t g_open; /* True if g_ctl is open */
451 int g_hasclient; /* True if any clients are connected */
454 * The usage cache is updated by the stat_thread, and copied to clients by
455 * the zsd_stat_server. Mutex and cond are to synchronize between the
456 * stat_thread and the stat_server.
458 zs_usage_cache_t *g_usage_cache;
459 mutex_t g_usage_cache_lock;
460 cond_t g_usage_cache_kick;
461 uint_t g_usage_cache_kickers;
462 cond_t g_usage_cache_wait;
463 char *g_usage_cache_buf;
464 uint_t g_usage_cache_bufsz;
465 uint64_t g_gen_next;
467 /* fds of door servers */
468 int g_server_door;
469 int g_stat_door;
472 * Starting and current time. Used to throttle memory calculation, and to
473 * mark new zones and psets with their boot and creation time.
475 time_t g_now;
476 time_t g_start;
477 hrtime_t g_hrnow;
478 hrtime_t g_hrstart;
479 uint64_t g_interval;
482 * main() thread.
484 thread_t g_main;
/*
 * Print a non-fatal warning on stderr, prefixed with "zonestat: Warning: "
 * and terminated with a newline.
 */
/* PRINTFLIKE1 */
static void
zsd_warn(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) fprintf(stderr, gettext("zonestat: Warning: "));
	(void) vfprintf(stderr, fmt, ap);
	(void) fprintf(stderr, "\n");
	va_end(ap);
}
/*
 * Print a fatal error on stderr, prefixed with "zonestat: Error: ", then
 * exit the daemon with status 1.  Never returns.
 */
/* PRINTFLIKE1 */
static void
zsd_error(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	(void) fprintf(stderr, gettext("zonestat: Error: "));
	(void) vfprintf(stderr, fmt, ap);
	(void) fprintf(stderr, "\n");
	va_end(ap);
	exit(1);
}
515 /* Turns on extended accounting if not configured externally */
517 zsd_enable_cpu_stats()
519 char *path = ZONESTAT_EXACCT_FILE;
520 char oldfile[MAXPATHLEN];
521 int ret, state = AC_ON;
522 ac_res_t res[6];
525 * Start a new accounting file if accounting not configured
526 * externally.
529 res[0].ar_id = AC_PROC_PID;
530 res[0].ar_state = AC_ON;
531 res[1].ar_id = AC_PROC_ANCPID;
532 res[1].ar_state = AC_ON;
533 res[2].ar_id = AC_PROC_CPU;
534 res[2].ar_state = AC_ON;
535 res[3].ar_id = AC_PROC_TIME;
536 res[3].ar_state = AC_ON;
537 res[4].ar_id = AC_PROC_ZONENAME;
538 res[4].ar_state = AC_ON;
539 res[5].ar_id = AC_NONE;
540 res[5].ar_state = AC_ON;
541 if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
542 zsd_warn(gettext("Unable to set accounting resources"));
543 return (-1);
545 /* Only set accounting file if none is configured */
546 ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
547 if (ret < 0) {
549 (void) unlink(path);
550 if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1)
551 == -1) {
552 zsd_warn(gettext("Unable to set accounting file"));
553 return (-1);
556 if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
557 zsd_warn(gettext("Unable to enable accounting"));
558 return (-1);
560 return (0);
563 /* Turns off extended accounting if not configured externally */
565 zsd_disable_cpu_stats()
567 char *path = ZONESTAT_EXACCT_FILE;
568 int ret, state = AC_OFF;
569 ac_res_t res[6];
570 char oldfile[MAXPATHLEN];
572 /* If accounting file is externally configured, leave it alone */
573 ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
574 if (ret == 0 && strcmp(oldfile, path) != 0)
575 return (0);
577 res[0].ar_id = AC_PROC_PID;
578 res[0].ar_state = AC_OFF;
579 res[1].ar_id = AC_PROC_ANCPID;
580 res[1].ar_state = AC_OFF;
581 res[2].ar_id = AC_PROC_CPU;
582 res[2].ar_state = AC_OFF;
583 res[3].ar_id = AC_PROC_TIME;
584 res[3].ar_state = AC_OFF;
585 res[4].ar_id = AC_PROC_ZONENAME;
586 res[4].ar_state = AC_OFF;
587 res[5].ar_id = AC_NONE;
588 res[5].ar_state = AC_OFF;
589 if (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)) != 0) {
590 zsd_warn(gettext("Unable to clear accounting resources"));
591 return (-1);
593 if (acctctl(AC_PROC | AC_FILE_SET, NULL, 0) == -1) {
594 zsd_warn(gettext("Unable to clear accounting file"));
595 return (-1);
597 if (acctctl(AC_PROC | AC_STATE_SET, &state, sizeof (state)) == -1) {
598 zsd_warn(gettext("Unable to diable accounting"));
599 return (-1);
602 (void) unlink(path);
603 return (0);
607 * If not configured externally, deletes the current extended accounting file
608 * and starts a new one.
610 * Since the stat_thread holds an open handle to the accounting file, it will
611 * read all remaining entries from the old file before switching to
612 * read the new one.
615 zsd_roll_exacct(void)
617 int ret;
618 char *path = ZONESTAT_EXACCT_FILE;
619 char oldfile[MAXPATHLEN];
621 /* If accounting file is externally configured, leave it alone */
622 ret = acctctl(AC_PROC | AC_FILE_GET, oldfile, sizeof (oldfile));
623 if (ret == 0 && strcmp(oldfile, path) != 0)
624 return (0);
626 if (unlink(path) != 0)
627 /* Roll it next time */
628 return (0);
630 if (acctctl(AC_PROC | AC_FILE_SET, path, strlen(path) + 1) == -1) {
631 zsd_warn(gettext("Unable to set accounting file"));
632 return (-1);
634 return (0);
637 /* Contract stuff for zone_enter() */
639 init_template(void)
641 int fd;
642 int err = 0;
644 fd = open(CTFS_ROOT "/process/template", O_RDWR);
645 if (fd == -1)
646 return (-1);
649 * For now, zoneadmd doesn't do anything with the contract.
650 * Deliver no events, don't inherit, and allow it to be orphaned.
652 err |= ct_tmpl_set_critical(fd, 0);
653 err |= ct_tmpl_set_informative(fd, 0);
654 err |= ct_pr_tmpl_set_fatal(fd, CT_PR_EV_HWERR);
655 err |= ct_pr_tmpl_set_param(fd, CT_PR_PGRPONLY | CT_PR_REGENT);
656 if (err || ct_tmpl_activate(fd)) {
657 (void) close(fd);
658 return (-1);
661 return (fd);
665 * Contract stuff for zone_enter()
668 contract_latest(ctid_t *id)
670 int cfd, r;
671 ct_stathdl_t st;
672 ctid_t result;
674 if ((cfd = open(CTFS_ROOT "/process/latest", O_RDONLY)) == -1)
675 return (errno);
677 if ((r = ct_status_read(cfd, CTD_COMMON, &st)) != 0) {
678 (void) close(cfd);
679 return (r);
682 result = ct_status_get_id(st);
683 ct_status_free(st);
684 (void) close(cfd);
686 *id = result;
687 return (0);
/*
 * Set FD_CLOEXEC on the given fd.  Returns 0 on success, -1 on failure.
 */
static int
close_on_exec(int fd)
{
	int flags;

	flags = fcntl(fd, F_GETFD, 0);
	if (flags == -1)
		return (-1);
	if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1)
		return (-1);
	return (0);
}
700 contract_open(ctid_t ctid, const char *type, const char *file, int oflag)
702 char path[PATH_MAX];
703 int n, fd;
705 if (type == NULL)
706 type = "all";
708 n = snprintf(path, PATH_MAX, CTFS_ROOT "/%s/%ld/%s", type, ctid, file);
709 if (n >= sizeof (path)) {
710 errno = ENAMETOOLONG;
711 return (-1);
714 fd = open(path, oflag);
715 if (fd != -1) {
716 if (close_on_exec(fd) == -1) {
717 int err = errno;
718 (void) close(fd);
719 errno = err;
720 return (-1);
723 return (fd);
727 contract_abandon_id(ctid_t ctid)
729 int fd, err;
731 fd = contract_open(ctid, "all", "ctl", O_WRONLY);
732 if (fd == -1)
733 return (errno);
735 err = ct_ctl_abandon(fd);
736 (void) close(fd);
738 return (err);
741 * Attach the zsd_server to a zone. Called for each zone when zonestatd
742 * starts, and for each newly booted zone when zoneadmd contacts the zsd_server
744 * Zone_enter is used to avoid reaching into zone to fattach door.
746 static void
747 zsd_fattach_zone(zoneid_t zid, int door, boolean_t detach_only)
749 char *path = ZS_DOOR_PATH;
750 int fd, pid, stat, tmpl_fd;
751 ctid_t ct;
753 if ((tmpl_fd = init_template()) == -1) {
754 zsd_warn("Unable to init template");
755 return;
758 pid = forkx(0);
759 if (pid < 0) {
760 (void) ct_tmpl_clear(tmpl_fd);
761 zsd_warn(gettext(
762 "Unable to fork to add zonestat to zoneid %d\n"), zid);
763 return;
766 if (pid == 0) {
767 (void) ct_tmpl_clear(tmpl_fd);
768 (void) close(tmpl_fd);
769 if (zid != 0 && zone_enter(zid) != 0) {
770 if (errno == EINVAL) {
771 _exit(0);
773 _exit(1);
775 (void) fdetach(path);
776 (void) unlink(path);
777 if (detach_only)
778 _exit(0);
779 fd = open(path, O_CREAT|O_RDWR, 0644);
780 if (fd < 0)
781 _exit(2);
782 if (fattach(door, path) != 0)
783 _exit(3);
784 _exit(0);
786 if (contract_latest(&ct) == -1)
787 ct = -1;
788 (void) ct_tmpl_clear(tmpl_fd);
789 (void) close(tmpl_fd);
790 (void) contract_abandon_id(ct);
791 while (waitpid(pid, &stat, 0) != pid)
793 if (WIFEXITED(stat) && WEXITSTATUS(stat) == 0)
794 return;
796 zsd_warn(gettext("Unable to attach door to zoneid: %d"), zid);
798 if (WEXITSTATUS(stat) == 1)
799 zsd_warn(gettext("Cannot entering zone"));
800 else if (WEXITSTATUS(stat) == 2)
801 zsd_warn(gettext("Unable to create door file: %s"), path);
802 else if (WEXITSTATUS(stat) == 3)
803 zsd_warn(gettext("Unable to fattach file: %s"), path);
805 zsd_warn(gettext("Internal error entering zone: %d"), zid);
809 * Zone lookup and allocation functions to manage list of currently running
810 * zones.
812 static zsd_zone_t *
813 zsd_lookup_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
815 zsd_zone_t *zone;
817 for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
818 zone = list_next(&ctl->zsctl_zones, zone)) {
819 if (strcmp(zone->zsz_name, zonename) == 0) {
820 if (zoneid != -1)
821 zone->zsz_id = zoneid;
822 return (zone);
825 return (NULL);
828 static zsd_zone_t *
829 zsd_lookup_zone_byid(zsd_ctl_t *ctl, zoneid_t zoneid)
831 zsd_zone_t *zone;
833 for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
834 zone = list_next(&ctl->zsctl_zones, zone)) {
835 if (zone->zsz_id == zoneid)
836 return (zone);
838 return (NULL);
841 static zsd_zone_t *
842 zsd_allocate_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
844 zsd_zone_t *zone;
846 if ((zone = (zsd_zone_t *)calloc(1, sizeof (zsd_zone_t))) == NULL)
847 return (NULL);
849 (void) strlcpy(zone->zsz_name, zonename, sizeof (zone->zsz_name));
850 zone->zsz_id = zoneid;
851 zone->zsz_found = B_FALSE;
854 * Allocate as deleted so if not found in first pass, zone is deleted
855 * from list. This can happen if zone is returned by zone_list, but
856 * exits before first attempt to fetch zone details.
858 zone->zsz_start = g_now;
859 zone->zsz_hrstart = g_hrnow;
860 zone->zsz_deleted = B_TRUE;
862 zone->zsz_cpu_shares = ZS_LIMIT_NONE;
863 zone->zsz_cpu_cap = ZS_LIMIT_NONE;
864 zone->zsz_ram_cap = ZS_LIMIT_NONE;
865 zone->zsz_locked_cap = ZS_LIMIT_NONE;
866 zone->zsz_vm_cap = ZS_LIMIT_NONE;
868 zone->zsz_processes_cap = ZS_LIMIT_NONE;
869 zone->zsz_lwps_cap = ZS_LIMIT_NONE;
870 zone->zsz_shm_cap = ZS_LIMIT_NONE;
871 zone->zsz_shmids_cap = ZS_LIMIT_NONE;
872 zone->zsz_semids_cap = ZS_LIMIT_NONE;
873 zone->zsz_msgids_cap = ZS_LIMIT_NONE;
874 zone->zsz_lofi_cap = ZS_LIMIT_NONE;
876 ctl->zsctl_nzones++;
878 return (zone);
881 static zsd_zone_t *
882 zsd_lookup_insert_zone(zsd_ctl_t *ctl, char *zonename, zoneid_t zoneid)
884 zsd_zone_t *zone, *tmp;
886 if ((zone = zsd_lookup_zone(ctl, zonename, zoneid)) != NULL)
887 return (zone);
889 if ((zone = zsd_allocate_zone(ctl, zonename, zoneid)) == NULL)
890 return (NULL);
892 /* Insert sorted by zonename */
893 tmp = list_head(&ctl->zsctl_zones);
894 while (tmp != NULL && strcmp(zonename, tmp->zsz_name) > 0)
895 tmp = list_next(&ctl->zsctl_zones, tmp);
897 list_insert_before(&ctl->zsctl_zones, tmp, zone);
898 return (zone);
902 * Mark all zones as not existing. As zones are found, they will
903 * be marked as existing. If a zone is not found, then it must have
904 * halted.
906 static void
907 zsd_mark_zones_start(zsd_ctl_t *ctl)
910 zsd_zone_t *zone;
912 for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
913 zone = list_next(&ctl->zsctl_zones, zone)) {
914 zone->zsz_found = B_FALSE;
919 * Mark each zone as not using pset. If processes are found using the
920 * pset, the zone will remain bound to the pset. If none of a zones
921 * processes are bound to the pset, the zone's usage of the pset will
922 * be deleted.
925 static void
926 zsd_mark_pset_usage_start(zsd_pset_t *pset)
928 zsd_pset_usage_t *usage;
930 for (usage = list_head(&pset->zsp_usage_list);
931 usage != NULL;
932 usage = list_next(&pset->zsp_usage_list, usage)) {
933 usage->zsu_found = B_FALSE;
934 usage->zsu_empty = B_TRUE;
939 * Mark each pset as not existing. If a pset is found, it will be marked
940 * as existing. If a pset is not found, it will be deleted.
942 static void
943 zsd_mark_psets_start(zsd_ctl_t *ctl)
945 zsd_pset_t *pset;
947 for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
948 pset = list_next(&ctl->zsctl_psets, pset)) {
949 pset->zsp_found = B_FALSE;
950 zsd_mark_pset_usage_start(pset);
955 * A pset was found. Update its information
957 static void
958 zsd_mark_pset_found(zsd_pset_t *pset, uint_t type, uint64_t online,
959 uint64_t size, uint64_t min, uint64_t max, int64_t importance)
961 pset->zsp_empty = B_TRUE;
962 pset->zsp_deleted = B_FALSE;
964 assert(pset->zsp_found == B_FALSE);
966 /* update pset flags */
967 if (pset->zsp_active == B_FALSE)
968 /* pset not seen on previous interval. It is new. */
969 pset->zsp_new = B_TRUE;
970 else
971 pset->zsp_new = B_FALSE;
973 pset->zsp_found = B_TRUE;
974 pset->zsp_cputype = type;
975 pset->zsp_online = online;
976 pset->zsp_size = size;
977 pset->zsp_min = min;
978 pset->zsp_max = max;
979 pset->zsp_importance = importance;
980 pset->zsp_cpu_shares = 0;
981 pset->zsp_scheds = 0;
982 pset->zsp_active = B_TRUE;
986 * A zone's process was found using a pset. Charge the process to the pset and
987 * the per-zone data for the pset.
989 static void
990 zsd_mark_pset_usage_found(zsd_pset_usage_t *usage, uint_t sched)
992 zsd_zone_t *zone = usage->zsu_zone;
993 zsd_pset_t *pset = usage->zsu_pset;
995 /* Nothing to do if already found */
996 if (usage->zsu_found == B_TRUE)
997 goto add_stats;
999 usage->zsu_found = B_TRUE;
1000 usage->zsu_empty = B_FALSE;
1002 usage->zsu_deleted = B_FALSE;
1003 /* update usage flags */
1004 if (usage->zsu_active == B_FALSE)
1005 usage->zsu_new = B_TRUE;
1006 else
1007 usage->zsu_new = B_FALSE;
1009 usage->zsu_scheds = 0;
1010 usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1011 usage->zsu_active = B_TRUE;
1012 pset->zsp_empty = B_FALSE;
1013 zone->zsz_empty = B_FALSE;
1015 add_stats:
1016 /* Detect zone's pset id, and if it is bound to multiple psets */
1017 if (zone->zsz_psetid == ZS_PSET_ERROR)
1018 zone->zsz_psetid = pset->zsp_id;
1019 else if (zone->zsz_psetid != pset->zsp_id)
1020 zone->zsz_psetid = ZS_PSET_MULTI;
1022 usage->zsu_scheds |= sched;
1023 pset->zsp_scheds |= sched;
1024 zone->zsz_scheds |= sched;
1026 /* Record if FSS is co-habitating with conflicting scheduler */
1027 if ((pset->zsp_scheds & ZS_SCHED_FSS) &&
1028 usage->zsu_scheds & (
1029 ZS_SCHED_TS | ZS_SCHED_IA | ZS_SCHED_FX)) {
1030 usage->zsu_scheds |= ZS_SCHED_CONFLICT;
1032 pset->zsp_scheds |= ZS_SCHED_CONFLICT;
1037 /* Add cpu time for a process to a pset, zone, and system totals */
1038 static void
1039 zsd_add_usage(zsd_ctl_t *ctl, zsd_pset_usage_t *usage, timestruc_t *delta)
1041 zsd_system_t *system = ctl->zsctl_system;
1042 zsd_zone_t *zone = usage->zsu_zone;
1043 zsd_pset_t *pset = usage->zsu_pset;
1045 TIMESTRUC_ADD_TIMESTRUC(usage->zsu_cpu_usage, *delta);
1046 TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_zones, *delta);
1047 TIMESTRUC_ADD_TIMESTRUC(zone->zsz_cpu_usage, *delta);
1048 TIMESTRUC_ADD_TIMESTRUC(system->zss_cpu_usage_zones, *delta);
1051 /* Determine which processor sets have been deleted */
1052 static void
1053 zsd_mark_psets_end(zsd_ctl_t *ctl)
1055 zsd_pset_t *pset, *tmp;
1058 * Mark pset as not exists, and deleted if it existed
1059 * previous interval.
1061 pset = list_head(&ctl->zsctl_psets);
1062 while (pset != NULL) {
1063 if (pset->zsp_found == B_FALSE) {
1064 pset->zsp_empty = B_TRUE;
1065 if (pset->zsp_deleted == B_TRUE) {
1066 tmp = pset;
1067 pset = list_next(&ctl->zsctl_psets, pset);
1068 list_remove(&ctl->zsctl_psets, tmp);
1069 free(tmp);
1070 ctl->zsctl_npsets--;
1071 continue;
1072 } else {
1073 /* Pset vanished during this interval */
1074 pset->zsp_new = B_FALSE;
1075 pset->zsp_deleted = B_TRUE;
1076 pset->zsp_active = B_TRUE;
1079 pset = list_next(&ctl->zsctl_psets, pset);
1083 /* Determine which zones are no longer bound to processor sets */
1084 static void
1085 zsd_mark_pset_usages_end(zsd_ctl_t *ctl)
1087 zsd_pset_t *pset;
1088 zsd_zone_t *zone;
1089 zsd_pset_usage_t *usage, *tmp;
1092 * Mark pset as not exists, and deleted if it existed previous
1093 * interval.
1095 for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1096 pset = list_next(&ctl->zsctl_psets, pset)) {
1097 usage = list_head(&pset->zsp_usage_list);
1098 while (usage != NULL) {
1100 * Mark pset as not exists, and deleted if it existed
1101 * previous interval.
1103 if (usage->zsu_found == B_FALSE ||
1104 usage->zsu_zone->zsz_deleted == B_TRUE ||
1105 usage->zsu_pset->zsp_deleted == B_TRUE) {
1106 tmp = usage;
1107 usage = list_next(&pset->zsp_usage_list,
1108 usage);
1109 list_remove(&pset->zsp_usage_list, tmp);
1110 free(tmp);
1111 pset->zsp_nusage--;
1112 ctl->zsctl_npset_usages--;
1113 continue;
1114 } else {
1115 usage->zsu_new = B_FALSE;
1116 usage->zsu_deleted = B_TRUE;
1117 usage->zsu_active = B_TRUE;
1119 /* Add cpu shares for usages that are in FSS */
1120 zone = usage->zsu_zone;
1121 if (usage->zsu_scheds & ZS_SCHED_FSS &&
1122 zone->zsz_cpu_shares != ZS_SHARES_UNLIMITED &&
1123 zone->zsz_cpu_shares != 0) {
1124 zone = usage->zsu_zone;
1125 usage->zsu_cpu_shares = zone->zsz_cpu_shares;
1126 pset->zsp_cpu_shares += zone->zsz_cpu_shares;
1128 usage = list_next(&pset->zsp_usage_list,
1129 usage);
1134 /* A zone has been found. Update its information */
1135 static void
1136 zsd_mark_zone_found(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t cpu_shares,
1137 uint64_t cpu_cap, uint64_t ram_cap, uint64_t locked_cap,
1138 uint64_t vm_cap, uint64_t processes_cap, uint64_t processes,
1139 uint64_t lwps_cap, uint64_t lwps, uint64_t shm_cap, uint64_t shm,
1140 uint64_t shmids_cap, uint64_t shmids, uint64_t semids_cap,
1141 uint64_t semids, uint64_t msgids_cap, uint64_t msgids, uint64_t lofi_cap,
1142 uint64_t lofi, char *poolname, char *psetname, uint_t sched, uint_t cputype,
1143 uint_t iptype)
1145 zsd_system_t *sys = ctl->zsctl_system;
1147 assert(zone->zsz_found == B_FALSE);
1150 * Mark zone as exists, and new if it did not exist in previous
1151 * interval.
1153 zone->zsz_found = B_TRUE;
1154 zone->zsz_empty = B_TRUE;
1155 zone->zsz_deleted = B_FALSE;
1158 * Zone is new. Assume zone's properties are the same over entire
1159 * interval.
1161 if (zone->zsz_active == B_FALSE)
1162 zone->zsz_new = B_TRUE;
1163 else
1164 zone->zsz_new = B_FALSE;
1166 (void) strlcpy(zone->zsz_pool, poolname, sizeof (zone->zsz_pool));
1167 (void) strlcpy(zone->zsz_pset, psetname, sizeof (zone->zsz_pset));
1168 zone->zsz_default_sched = sched;
1170 /* Schedulers updated later as processes are found */
1171 zone->zsz_scheds = 0;
1173 /* Cpus updated later as psets bound are identified */
1174 zone->zsz_cpus_online = 0;
1176 zone->zsz_cputype = cputype;
1177 zone->zsz_iptype = iptype;
1178 zone->zsz_psetid = ZS_PSET_ERROR;
1179 zone->zsz_cpu_cap = cpu_cap;
1180 zone->zsz_cpu_shares = cpu_shares;
1181 zone->zsz_ram_cap = ram_cap;
1182 zone->zsz_locked_cap = locked_cap;
1183 zone->zsz_vm_cap = vm_cap;
1184 zone->zsz_processes_cap = processes_cap;
1185 zone->zsz_processes = processes;
1186 zone->zsz_lwps_cap = lwps_cap;
1187 zone->zsz_lwps = lwps;
1188 zone->zsz_shm_cap = shm_cap;
1189 zone->zsz_shm = shm;
1190 zone->zsz_shmids_cap = shmids_cap;
1191 zone->zsz_shmids = shmids;
1192 zone->zsz_semids_cap = semids_cap;
1193 zone->zsz_semids = semids;
1194 zone->zsz_msgids_cap = msgids_cap;
1195 zone->zsz_msgids = msgids;
1196 zone->zsz_lofi_cap = lofi_cap;
1197 zone->zsz_lofi = lofi;
1199 sys->zss_processes += processes;
1200 sys->zss_lwps += lwps;
1201 sys->zss_shm += shm;
1202 sys->zss_shmids += shmids;
1203 sys->zss_semids += semids;
1204 sys->zss_msgids += msgids;
1205 sys->zss_lofi += lofi;
1206 zone->zsz_active = B_TRUE;
1210 /* Determine which zones have halted */
1211 static void
1212 zsd_mark_zones_end(zsd_ctl_t *ctl)
1214 zsd_zone_t *zone, *tmp;
1217 * Mark zone as not existing, or delete if it did not exist in
1218 * previous interval.
1220 zone = list_head(&ctl->zsctl_zones);
1221 while (zone != NULL) {
1222 if (zone->zsz_found == B_FALSE) {
1223 zone->zsz_empty = B_TRUE;
1224 if (zone->zsz_deleted == B_TRUE) {
1226 * Zone deleted in prior interval,
1227 * so it no longer exists.
1229 tmp = zone;
1230 zone = list_next(&ctl->zsctl_zones, zone);
1231 list_remove(&ctl->zsctl_zones, tmp);
1232 free(tmp);
1233 ctl->zsctl_nzones--;
1234 continue;
1235 } else {
1236 zone->zsz_new = B_FALSE;
1237 zone->zsz_deleted = B_TRUE;
1238 zone->zsz_active = B_TRUE;
1241 zone = list_next(&ctl->zsctl_zones, zone);
1246 * Mark cpus as not existing. If a cpu is found, it will be updated. If
1247 * a cpu is not found, then it must have gone offline, so it will be
1248 * deleted.
1250 * The kstat tracking data is rolled so that the usage since the previous
1251 * interval can be determined.
1253 static void
1254 zsd_mark_cpus_start(zsd_ctl_t *ctl, boolean_t roll)
1256 zsd_cpu_t *cpu;
1259 * Mark all cpus as not existing. As cpus are found, they will
1260 * be marked as existing.
1262 for (cpu = list_head(&ctl->zsctl_cpus); cpu != NULL;
1263 cpu = list_next(&ctl->zsctl_cpus, cpu)) {
1264 cpu->zsc_found = B_FALSE;
1265 if (cpu->zsc_active == B_TRUE && roll) {
1266 cpu->zsc_psetid_prev = cpu->zsc_psetid;
1267 cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
1268 cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
1269 cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
1270 cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
1276 * An array the size of the maximum number of cpus is kept. Within this array
1277 * a list of the online cpus is maintained.
1279 zsd_cpu_t *
1280 zsd_lookup_insert_cpu(zsd_ctl_t *ctl, processorid_t cpuid)
1282 zsd_cpu_t *cpu;
1284 assert(cpuid < ctl->zsctl_maxcpuid);
1285 cpu = &(ctl->zsctl_cpu_array[cpuid]);
1286 assert(cpuid == cpu->zsc_id);
1288 if (cpu->zsc_allocated == B_FALSE) {
1289 cpu->zsc_allocated = B_TRUE;
1290 list_insert_tail(&ctl->zsctl_cpus, cpu);
1292 return (cpu);
1295 /* A cpu has been found. Update its information */
1296 static void
1297 zsd_mark_cpu_found(zsd_cpu_t *cpu, zsd_pset_t *pset, psetid_t psetid)
1300 * legacy processor sets, the cpu may move while zonestatd is
1301 * inspecting, causing it to be found twice. In this case, just
1302 * leave cpu in the first processor set in which it was found.
1304 if (cpu->zsc_found == B_TRUE)
1305 return;
1307 /* Mark cpu as online */
1308 cpu->zsc_found = B_TRUE;
1309 cpu->zsc_offlined = B_FALSE;
1310 cpu->zsc_pset = pset;
1312 * cpu is newly online.
1314 if (cpu->zsc_active == B_FALSE) {
1316 * Cpu is newly online.
1318 cpu->zsc_onlined = B_TRUE;
1319 cpu->zsc_psetid = psetid;
1320 cpu->zsc_psetid_prev = psetid;
1321 } else {
1323 * cpu online during previous interval. Save properties at
1324 * start of interval
1326 cpu->zsc_onlined = B_FALSE;
1327 cpu->zsc_psetid = psetid;
1330 cpu->zsc_active = B_TRUE;
1333 /* Remove all offlined cpus from the list of tracked cpus */
1334 static void
1335 zsd_mark_cpus_end(zsd_ctl_t *ctl)
1337 zsd_cpu_t *cpu, *tmp;
1338 int id;
1340 /* Mark cpu as online or offline */
1341 cpu = list_head(&ctl->zsctl_cpus);
1342 while (cpu != NULL) {
1343 if (cpu->zsc_found == B_FALSE) {
1344 if (cpu->zsc_offlined == B_TRUE) {
1346 * cpu offlined in prior interval. It is gone.
1348 tmp = cpu;
1349 cpu = list_next(&ctl->zsctl_cpus, cpu);
1350 list_remove(&ctl->zsctl_cpus, tmp);
1351 /* Clear structure for future use */
1352 id = tmp->zsc_id;
1353 bzero(tmp, sizeof (zsd_cpu_t));
1354 tmp->zsc_id = id;
1355 tmp->zsc_allocated = B_FALSE;
1356 tmp->zsc_psetid = ZS_PSET_ERROR;
1357 tmp->zsc_psetid_prev = ZS_PSET_ERROR;
1359 } else {
1361 * cpu online at start of interval. Treat
1362 * as still online, since it was online for
1363 * some portion of the interval.
1365 cpu->zsc_offlined = B_TRUE;
1366 cpu->zsc_onlined = B_FALSE;
1367 cpu->zsc_active = B_TRUE;
1368 cpu->zsc_psetid = cpu->zsc_psetid_prev;
1369 cpu->zsc_pset = NULL;
1372 cpu = list_next(&ctl->zsctl_cpus, cpu);
1376 /* Some utility functions for managing the list of processor sets */
1377 static zsd_pset_t *
1378 zsd_lookup_pset_byid(zsd_ctl_t *ctl, psetid_t psetid)
1380 zsd_pset_t *pset;
1382 for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1383 pset = list_next(&ctl->zsctl_psets, pset)) {
1384 if (pset->zsp_id == psetid)
1385 return (pset);
1387 return (NULL);
1390 static zsd_pset_t *
1391 zsd_lookup_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1393 zsd_pset_t *pset;
1395 for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
1396 pset = list_next(&ctl->zsctl_psets, pset)) {
1397 if (strcmp(pset->zsp_name, psetname) == 0) {
1398 if (psetid != -1)
1399 pset->zsp_id = psetid;
1400 return (pset);
1403 return (NULL);
1406 static zsd_pset_t *
1407 zsd_allocate_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1409 zsd_pset_t *pset;
1411 if ((pset = (zsd_pset_t *)calloc(1, sizeof (zsd_pset_t))) == NULL)
1412 return (NULL);
1414 (void) strlcpy(pset->zsp_name, psetname, sizeof (pset->zsp_name));
1415 pset->zsp_id = psetid;
1416 pset->zsp_found = B_FALSE;
1418 * Allocate as deleted so if not found in first pass, pset is deleted
1419 * from list. This can happen if pset is returned by pset_list, but
1420 * is destroyed before first attempt to fetch pset details.
1422 list_create(&pset->zsp_usage_list, sizeof (zsd_pset_usage_t),
1423 offsetof(zsd_pset_usage_t, zsu_next));
1425 pset->zsp_hrstart = g_hrnow;
1426 pset->zsp_deleted = B_TRUE;
1427 pset->zsp_empty = B_TRUE;
1428 ctl->zsctl_npsets++;
1430 return (pset);
1433 static zsd_pset_t *
1434 zsd_lookup_insert_pset(zsd_ctl_t *ctl, char *psetname, psetid_t psetid)
1436 zsd_pset_t *pset, *tmp;
1438 if ((pset = zsd_lookup_pset(ctl, psetname, psetid)) != NULL)
1439 return (pset);
1441 if ((pset = zsd_allocate_pset(ctl, psetname, psetid)) == NULL)
1442 return (NULL);
1444 /* Insert sorted by psetname */
1445 tmp = list_head(&ctl->zsctl_psets);
1446 while (tmp != NULL && strcmp(psetname, tmp->zsp_name) > 0)
1447 tmp = list_next(&ctl->zsctl_psets, tmp);
1449 list_insert_before(&ctl->zsctl_psets, tmp, pset);
1450 return (pset);
1453 /* Some utility functions for managing the list of zones using each pset */
1454 static zsd_pset_usage_t *
1455 zsd_lookup_usage(zsd_pset_t *pset, zsd_zone_t *zone)
1457 zsd_pset_usage_t *usage;
1459 for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
1460 usage = list_next(&pset->zsp_usage_list, usage))
1461 if (usage->zsu_zone == zone)
1462 return (usage);
1464 return (NULL);
1467 static zsd_pset_usage_t *
1468 zsd_allocate_pset_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1470 zsd_pset_usage_t *usage;
1472 if ((usage = (zsd_pset_usage_t *)calloc(1, sizeof (zsd_pset_usage_t)))
1473 == NULL)
1474 return (NULL);
1476 list_link_init(&usage->zsu_next);
1477 usage->zsu_zone = zone;
1478 usage->zsu_zoneid = zone->zsz_id;
1479 usage->zsu_pset = pset;
1480 usage->zsu_found = B_FALSE;
1481 usage->zsu_active = B_FALSE;
1482 usage->zsu_new = B_FALSE;
1484 * Allocate as not deleted. If a process is found in a pset for
1485 * a zone, the usage will not be deleted until at least the next
1486 * interval.
1488 usage->zsu_start = g_now;
1489 usage->zsu_hrstart = g_hrnow;
1490 usage->zsu_deleted = B_FALSE;
1491 usage->zsu_empty = B_TRUE;
1492 usage->zsu_scheds = 0;
1493 usage->zsu_cpu_shares = ZS_LIMIT_NONE;
1495 ctl->zsctl_npset_usages++;
1496 pset->zsp_nusage++;
1498 return (usage);
1501 static zsd_pset_usage_t *
1502 zsd_lookup_insert_usage(zsd_ctl_t *ctl, zsd_pset_t *pset, zsd_zone_t *zone)
1504 zsd_pset_usage_t *usage, *tmp;
1506 if ((usage = zsd_lookup_usage(pset, zone))
1507 != NULL)
1508 return (usage);
1510 if ((usage = zsd_allocate_pset_usage(ctl, pset, zone)) == NULL)
1511 return (NULL);
1513 tmp = list_head(&pset->zsp_usage_list);
1514 while (tmp != NULL && strcmp(zone->zsz_name, tmp->zsu_zone->zsz_name)
1515 > 0)
1516 tmp = list_next(&pset->zsp_usage_list, tmp);
1518 list_insert_before(&pset->zsp_usage_list, tmp, usage);
1519 return (usage);
1522 static void
1523 zsd_refresh_system(zsd_ctl_t *ctl)
1525 zsd_system_t *system = ctl->zsctl_system;
1527 /* Re-count these values each interval */
1528 system->zss_processes = 0;
1529 system->zss_lwps = 0;
1530 system->zss_shm = 0;
1531 system->zss_shmids = 0;
1532 system->zss_semids = 0;
1533 system->zss_msgids = 0;
1534 system->zss_lofi = 0;
/*
 * Read this cpu's "cpu:sys" kstat and fold the cpu-time deltas since the
 * previous interval into the cpu's running totals, its processor set's
 * totals, and the system-wide totals.  Returns silently (skipping the
 * update) if the kstat cannot be looked up or read, e.g. if the cpu went
 * offline mid-refresh.
 */
static void
zsd_update_cpu_stats(zsd_ctl_t *ctl, zsd_cpu_t *cpu)
{
	zsd_system_t *sys;
	processorid_t cpuid;
	zsd_pset_t *pset_prev;
	zsd_pset_t *pset;
	kstat_t *kstat;
	kstat_named_t *knp;
	kid_t kid;
	uint64_t idle, intr, kern, user;

	sys = ctl->zsctl_system;
	pset = cpu->zsc_pset;
	knp = NULL;
	kid = -1;
	cpuid = cpu->zsc_id;

	/* Get the cpu time totals for this cpu */
	kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "cpu", cpuid, "sys");
	if (kstat == NULL)
		return;

	kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
	if (kid == -1)
		return;

	knp = kstat_data_lookup(kstat, "cpu_nsec_idle");
	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
		return;

	idle = knp->value.ui64;

	knp = kstat_data_lookup(kstat, "cpu_nsec_kernel");
	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
		return;

	kern = knp->value.ui64;

	knp = kstat_data_lookup(kstat, "cpu_nsec_user");
	if (knp == NULL || knp->data_type != KSTAT_DATA_UINT64)
		return;

	user = knp->value.ui64;

	/*
	 * Tracking intr time per cpu just exists for future enhancements.
	 * The value is presently always zero.
	 */
	intr = 0;
	cpu->zsc_nsec_idle = idle;
	cpu->zsc_nsec_intr = intr;
	cpu->zsc_nsec_kern = kern;
	cpu->zsc_nsec_user = user;

	if (cpu->zsc_onlined == B_TRUE) {
		/*
		 * cpu is newly online.  There is no reference value, so
		 * just record its current stats for comparison on the
		 * next stat read.
		 */
		cpu->zsc_nsec_idle_prev = cpu->zsc_nsec_idle;
		cpu->zsc_nsec_intr_prev = cpu->zsc_nsec_intr;
		cpu->zsc_nsec_kern_prev = cpu->zsc_nsec_kern;
		cpu->zsc_nsec_user_prev = cpu->zsc_nsec_user;
		return;
	}

	/*
	 * Calculate relative time since previous refresh.
	 * Paranoia.  Don't let time go backwards (kstats can reset if the
	 * cpu is offlined and re-onlined between reads).
	 */
	idle = intr = kern = user = 0;
	if (cpu->zsc_nsec_idle > cpu->zsc_nsec_idle_prev)
		idle = cpu->zsc_nsec_idle - cpu->zsc_nsec_idle_prev;

	if (cpu->zsc_nsec_intr > cpu->zsc_nsec_intr_prev)
		intr = cpu->zsc_nsec_intr - cpu->zsc_nsec_intr_prev;

	if (cpu->zsc_nsec_kern > cpu->zsc_nsec_kern_prev)
		kern = cpu->zsc_nsec_kern - cpu->zsc_nsec_kern_prev;

	if (cpu->zsc_nsec_user > cpu->zsc_nsec_user_prev)
		user = cpu->zsc_nsec_user - cpu->zsc_nsec_user_prev;

	/* Update totals for cpu usage */
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_idle, idle);
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_intr, intr);
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_kern, kern);
	TIMESTRUC_ADD_NANOSEC(cpu->zsc_user, user);

	/*
	 * Add the cpu's stats to its pset if it is known to have been in
	 * the pset since the previous read.  Note the pset_prev lookup is
	 * performed as a short-circuit side effect of the last condition.
	 */
	if (cpu->zsc_psetid == cpu->zsc_psetid_prev ||
	    cpu->zsc_psetid_prev == ZS_PSET_ERROR ||
	    (pset_prev = zsd_lookup_pset_byid(ctl,
	    cpu->zsc_psetid_prev)) == NULL) {
		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle, idle);
		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr, intr);
		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern, kern);
		TIMESTRUC_ADD_NANOSEC(pset->zsp_user, user);
	} else {
		/*
		 * Last pset was different than current pset.
		 * Best guess is to split usage between the two; the
		 * current pset gets the remainder so no time is lost.
		 */
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_idle, idle / 2);
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_intr, intr / 2);
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_kern, kern / 2);
		TIMESTRUC_ADD_NANOSEC(pset_prev->zsp_user, user / 2);

		TIMESTRUC_ADD_NANOSEC(pset->zsp_idle,
		    (idle / 2) + (idle % 2));
		TIMESTRUC_ADD_NANOSEC(pset->zsp_intr,
		    (intr / 2) + (intr % 2));
		TIMESTRUC_ADD_NANOSEC(pset->zsp_kern,
		    (kern / 2) + (kern % 2));
		TIMESTRUC_ADD_NANOSEC(pset->zsp_user,
		    (user / 2) + (user % 2));
	}
	TIMESTRUC_ADD_NANOSEC(sys->zss_idle, idle);
	TIMESTRUC_ADD_NANOSEC(sys->zss_intr, intr);
	TIMESTRUC_ADD_NANOSEC(sys->zss_kern, kern);
	TIMESTRUC_ADD_NANOSEC(sys->zss_user, user);
}
/*
 * Determine the details of a processor set by pset_id: its name, cpu
 * type classification, online/total cpu counts, min/max size, and
 * importance.  The ids of the pset's online cpus are left in
 * ctl->zsctl_cpu_cache for the caller.
 *
 * Returns 0 on success.  On failure returns -1 with errno set to EINTR
 * if the caller should retry (the pset vanished mid-inspection), ENOMEM
 * on allocation failure, or EINVAL for a non-retryable libpool failure.
 */
static int
zsd_get_pool_pset(zsd_ctl_t *ctl, psetid_t psetid, char *psetname,
    size_t namelen, uint_t *cputype, uint64_t *online, uint64_t *size,
    uint64_t *min, uint64_t *max, int64_t *importance)
{
	uint_t old, num;

	pool_conf_t *conf = ctl->zsctl_pool_conf;
	pool_value_t **vals = ctl->zsctl_pool_vals;
	pool_resource_t **res_list = NULL;
	pool_resource_t *pset;
	pool_component_t **cpus = NULL;
	processorid_t *cache;
	const char *string;
	uint64_t uint64;
	int64_t int64;
	int i, ret, type;

	if (ctl->zsctl_pool_status == POOL_DISABLED) {

		/*
		 * Inspect legacy psets
		 */
		for (;;) {
			old = num = ctl->zsctl_cpu_ncache;
			ret = pset_info(psetid, &type, &num,
			    ctl->zsctl_cpu_cache);
			if (ret < 0) {
				/* pset is gone.  Tell caller to retry */
				errno = EINTR;
				return (-1);
			}
			if (num <= old) {
				/* Success */
				break;
			}
			/* Grow the cpu id cache and ask again */
			if ((cache = reallocarray(ctl->zsctl_cpu_cache, num,
			    sizeof (processorid_t))) != NULL) {
				ctl->zsctl_cpu_ncache = num;
				ctl->zsctl_cpu_cache = cache;
			} else {
				/*
				 * Could not allocate to get new cpu list.
				 */
				zsd_warn(gettext(
				    "Could not allocate for cpu list"));
				errno = ENOMEM;
				return (-1);
			}
		}
		/*
		 * Old school pset.  Just make min and max equal
		 * to its size
		 */
		if (psetid == ZS_PSET_DEFAULT) {
			*cputype = ZS_CPUTYPE_DEFAULT_PSET;
			(void) strlcpy(psetname, "pset_default", namelen);
		} else {
			*cputype = ZS_CPUTYPE_PSRSET_PSET;
			(void) snprintf(psetname, namelen,
			    "SUNWlegacy_pset_%d", psetid);
		}

		/*
		 * Just treat legacy pset as a simple pool pset
		 */
		*online = num;
		*size = num;
		*min = num;
		*max = num;
		*importance = 1;

		return (0);
	}

	/* Look up the pool pset using the pset id */
	res_list = NULL;
	pool_value_set_int64(vals[1], psetid);
	if (pool_value_set_name(vals[1], "pset.sys_id")
	    != PO_SUCCESS)
		goto err;

	if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
		goto err;
	if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
		goto err;
	if ((res_list = pool_query_resources(conf, &num, vals)) == NULL)
		goto err;
	if (num != 1)
		goto err;
	pset = res_list[0];
	free(res_list);
	res_list = NULL;
	if (pool_get_property(conf, pool_resource_to_elem(conf, pset),
	    "pset.name", vals[0]) != POC_STRING ||
	    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
		goto err;

	(void) strlcpy(psetname, string, namelen);
	/* Temporary (dedicated-cpu) psets are named "SUNWtmp..." */
	if (strncmp(psetname, "SUNWtmp", strlen("SUNWtmp")) == 0)
		*cputype = ZS_CPUTYPE_DEDICATED;
	else if (psetid == ZS_PSET_DEFAULT)
		*cputype = ZS_CPUTYPE_DEFAULT_PSET;
	else
		*cputype = ZS_CPUTYPE_POOL_PSET;

	/* Get size, min, max, and importance */
	if (pool_get_property(conf, pool_resource_to_elem(conf,
	    pset), "pset.size", vals[0]) == POC_UINT &&
	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
		*size = uint64;
	else
		*size = 0;

	if (pool_get_property(conf, pool_resource_to_elem(conf,
	    pset), "pset.min", vals[0]) == POC_UINT &&
	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
		*min = uint64;
	else
		*min = 0;
	/* Map the "unlimited" sentinel to ZS_LIMIT_NONE */
	if (*min >= ZSD_PSET_UNLIMITED)
		*min = ZS_LIMIT_NONE;

	if (pool_get_property(conf, pool_resource_to_elem(conf,
	    pset), "pset.max", vals[0]) == POC_UINT &&
	    pool_value_get_uint64(vals[0], &uint64) == PO_SUCCESS)
		*max = uint64;
	else
		*max = ZS_LIMIT_NONE;

	if (*max >= ZSD_PSET_UNLIMITED)
		*max = ZS_LIMIT_NONE;

	if (pool_get_property(conf, pool_resource_to_elem(conf,
	    pset), "pset.importance", vals[0]) == POC_INT &&
	    pool_value_get_int64(vals[0], &int64) == PO_SUCCESS)
		*importance = int64;
	else
		/* NOTE(review): cast looks odd for an int64_t target */
		*importance = (uint64_t)1;

	*online = 0;
	if (*size == 0)
		return (0);

	/* get cpus */
	cpus = pool_query_resource_components(conf, pset, &num, NULL);
	if (cpus == NULL)
		goto err;

	/* Make sure there is space for cpu id list */
	if (num > ctl->zsctl_cpu_ncache) {
		if ((cache = reallocarray(ctl->zsctl_cpu_cache, num,
		    sizeof (processorid_t))) != NULL) {
			ctl->zsctl_cpu_ncache = num;
			ctl->zsctl_cpu_cache = cache;
		} else {
			/*
			 * Could not allocate to get new cpu list.
			 */
			zsd_warn(gettext(
			    "Could not allocate for cpu list"));
			goto err;
		}
	}

	/* count the online cpus */
	for (i = 0; i < num; i++) {
		if (pool_get_property(conf, pool_component_to_elem(
		    conf, cpus[i]), "cpu.status", vals[0]) != POC_STRING ||
		    pool_value_get_string(vals[0], &string) != PO_SUCCESS)
			goto err;

		if (strcmp(string, "on-line") != 0 &&
		    strcmp(string, "no-intr") != 0)
			continue;

		if (pool_get_property(conf, pool_component_to_elem(
		    conf, cpus[i]), "cpu.sys_id", vals[0]) != POC_INT ||
		    pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
			goto err;

		(*online)++;
		ctl->zsctl_cpu_cache[i] = (psetid_t)int64;
	}
	free(cpus);
	return (0);
err:
	free(res_list);
	free(cpus);

	/*
	 * The pools operations should succeed since the conf is a consistent
	 * snapshot.  Tell caller there is no need to retry.
	 */
	errno = EINVAL;
	return (-1);
}
/*
 * Update the current list of processor sets.
 * This also updates the list of online cpus, and each cpu's pset
 * membership.
 *
 * Uses libpool when pools are enabled, falling back to the legacy
 * pset_list(2) interface otherwise.  Restarts from scratch (goto retry
 * or psets_changed) when the pool or pset configuration changes while
 * it is being inspected.
 */
static void
zsd_refresh_psets(zsd_ctl_t *ctl)
{
	int i, j, ret, state;
	uint_t old, num;
	uint_t cputype;
	int64_t sys_id, importance;
	uint64_t online, size, min, max;
	zsd_system_t *system;
	zsd_pset_t *pset;
	zsd_cpu_t *cpu;
	psetid_t *cache;
	char psetname[ZS_PSETNAME_MAX];
	processorid_t cpuid;
	pool_value_t *pv_save = NULL;
	pool_resource_t **res_list = NULL;
	pool_resource_t *res;
	pool_value_t **vals;
	pool_conf_t *conf;
	boolean_t roll_cpus = B_TRUE;

	/* Zero cpu counters to recount them */
	system = ctl->zsctl_system;
	system->zss_ncpus = 0;
	system->zss_ncpus_online = 0;
retry:
	ret = pool_get_status(&state);
	if (ret == 0 && state == POOL_ENABLED) {

		conf = ctl->zsctl_pool_conf;
		vals = ctl->zsctl_pool_vals;
		/* Stash vals[1] so only vals[0] is used as a query filter */
		pv_save = vals[1];
		vals[1] = NULL;

		if (ctl->zsctl_pool_status == POOL_DISABLED) {
			/* Pools just became enabled; open the dynamic conf */
			if (pool_conf_open(ctl->zsctl_pool_conf,
			    pool_dynamic_location(), PO_RDONLY) == 0) {
				ctl->zsctl_pool_status = POOL_ENABLED;
				ctl->zsctl_pool_changed = POU_PSET;
			}
		} else {
			ctl->zsctl_pool_changed = 0;
			ret = pool_conf_update(ctl->zsctl_pool_conf,
			    &(ctl->zsctl_pool_changed));
			if (ret < 0) {
				/* Pools must have become disabled */
				(void) pool_conf_close(ctl->zsctl_pool_conf);
				ctl->zsctl_pool_status = POOL_DISABLED;
				if (pool_error() == POE_SYSTEM && errno ==
				    ENOTACTIVE)
					goto retry;

				zsd_warn(gettext(
				    "Unable to update pool configuration"));
				/* Not able to get pool info.  Don't update. */
				goto err;
			}
		}
		/* Get the list of psets using libpool */
		if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
			goto err;

		if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
			goto err;
		if ((res_list = pool_query_resources(conf, &num, vals))
		    == NULL)
			goto err;

		if (num > ctl->zsctl_pset_ncache) {
			if ((cache = reallocarray(ctl->zsctl_pset_cache, num,
			    sizeof (psetid_t))) == NULL) {
				goto err;
			}
			ctl->zsctl_pset_ncache = num;
			ctl->zsctl_pset_cache = cache;
		}
		/* Save the pset id of each pset */
		for (i = 0; i < num; i++) {
			res = res_list[i];
			if (pool_get_property(conf, pool_resource_to_elem(conf,
			    res), "pset.sys_id", vals[0]) != POC_INT ||
			    pool_value_get_int64(vals[0], &sys_id)
			    != PO_SUCCESS)
				goto err;
			ctl->zsctl_pset_cache[i] = (int)sys_id;
		}
		vals[1] = pv_save;
		pv_save = NULL;
	} else {
		/* Pools disabled; close any previously opened conf */
		if (ctl->zsctl_pool_status == POOL_ENABLED) {
			(void) pool_conf_close(ctl->zsctl_pool_conf);
			ctl->zsctl_pool_status = POOL_DISABLED;
		}
		/* Get the pset list using legacy psets */
		for (;;) {
			old = num = ctl->zsctl_pset_ncache;
			(void) pset_list(ctl->zsctl_pset_cache, &num);
			/* The +1 leaves room for the default pset below */
			if ((num + 1) <= old) {
				break;
			}
			if ((cache = reallocarray(ctl->zsctl_pset_cache,
			    num + 1, sizeof (psetid_t))) != NULL) {
				ctl->zsctl_pset_ncache = num + 1;
				ctl->zsctl_pset_cache = cache;
			} else {
				/*
				 * Could not allocate to get new pset list.
				 * Give up
				 */
				return;
			}
		}
		/* Add the default pset to list */
		ctl->zsctl_pset_cache[num] = ctl->zsctl_pset_cache[0];
		ctl->zsctl_pset_cache[0] = ZS_PSET_DEFAULT;
		num++;
	}
psets_changed:
	zsd_mark_cpus_start(ctl, roll_cpus);
	zsd_mark_psets_start(ctl);
	/* Only roll cpu kstat reference values on the first pass */
	roll_cpus = B_FALSE;

	/* Refresh cpu membership of all psets */
	for (i = 0; i < num; i++) {

		/* Get pool pset information */
		sys_id = ctl->zsctl_pset_cache[i];
		if (zsd_get_pool_pset(ctl, sys_id, psetname, sizeof (psetname),
		    &cputype, &online, &size, &min, &max, &importance)
		    != 0) {
			if (errno == EINTR)
				goto psets_changed;
			zsd_warn(gettext("Failed to get info for pset %d"),
			    sys_id);
			continue;
		}

		system->zss_ncpus += size;
		system->zss_ncpus_online += online;

		pset = zsd_lookup_insert_pset(ctl, psetname,
		    ctl->zsctl_pset_cache[i]);

		/* update pset info */
		zsd_mark_pset_found(pset, cputype, online, size, min,
		    max, importance);

		/* update each cpu in pset */
		for (j = 0; j < pset->zsp_online; j++) {
			cpuid = ctl->zsctl_cpu_cache[j];
			cpu = zsd_lookup_insert_cpu(ctl, cpuid);
			zsd_mark_cpu_found(cpu, pset, sys_id);
		}
	}
err:
	free(res_list);
	if (pv_save != NULL)
		vals[1] = pv_save;
}
2034 * Fetch the current pool and pset name for the given zone.
2036 static void
2037 zsd_get_zone_pool_pset(zsd_ctl_t *ctl, zsd_zone_t *zone,
2038 char *pool, int poollen, char *pset, int psetlen, uint_t *cputype)
2040 poolid_t poolid;
2041 pool_t **pools = NULL;
2042 pool_resource_t **res_list = NULL;
2043 char poolname[ZS_POOLNAME_MAX];
2044 char psetname[ZS_PSETNAME_MAX];
2045 pool_conf_t *conf = ctl->zsctl_pool_conf;
2046 pool_value_t *pv_save = NULL;
2047 pool_value_t **vals = ctl->zsctl_pool_vals;
2048 const char *string;
2049 int ret;
2050 int64_t int64;
2051 uint_t num;
2053 ret = zone_getattr(zone->zsz_id, ZONE_ATTR_POOLID,
2054 &poolid, sizeof (poolid));
2055 if (ret < 0)
2056 goto lookup_done;
2058 pv_save = vals[1];
2059 vals[1] = NULL;
2060 pools = NULL;
2061 res_list = NULL;
2063 /* Default values if lookup fails */
2064 (void) strlcpy(poolname, "pool_default", sizeof (poolname));
2065 (void) strlcpy(psetname, "pset_default", sizeof (poolname));
2066 *cputype = ZS_CPUTYPE_DEFAULT_PSET;
2068 /* no dedicated cpu if pools are disabled */
2069 if (ctl->zsctl_pool_status == POOL_DISABLED)
2070 goto lookup_done;
2072 /* Get the pool name using the id */
2073 pool_value_set_int64(vals[0], poolid);
2074 if (pool_value_set_name(vals[0], "pool.sys_id") != PO_SUCCESS)
2075 goto lookup_done;
2077 if ((pools = pool_query_pools(conf, &num, vals)) == NULL)
2078 goto lookup_done;
2080 if (num != 1)
2081 goto lookup_done;
2083 if (pool_get_property(conf, pool_to_elem(conf, pools[0]),
2084 "pool.name", vals[0]) != POC_STRING ||
2085 pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2086 goto lookup_done;
2087 (void) strlcpy(poolname, (char *)string, sizeof (poolname));
2089 /* Get the name of the pset for the pool */
2090 if (pool_value_set_name(vals[0], "type") != PO_SUCCESS)
2091 goto lookup_done;
2093 if (pool_value_set_string(vals[0], "pset") != PO_SUCCESS)
2094 goto lookup_done;
2096 if ((res_list = pool_query_pool_resources(conf, pools[0], &num, vals))
2097 == NULL)
2098 goto lookup_done;
2100 if (num != 1)
2101 goto lookup_done;
2103 if (pool_get_property(conf, pool_resource_to_elem(conf,
2104 res_list[0]), "pset.sys_id", vals[0]) != POC_INT ||
2105 pool_value_get_int64(vals[0], &int64) != PO_SUCCESS)
2106 goto lookup_done;
2108 if (int64 == ZS_PSET_DEFAULT)
2109 *cputype = ZS_CPUTYPE_DEFAULT_PSET;
2111 if (pool_get_property(conf, pool_resource_to_elem(conf,
2112 res_list[0]), "pset.name", vals[0]) != POC_STRING ||
2113 pool_value_get_string(vals[0], &string) != PO_SUCCESS)
2114 goto lookup_done;
2116 (void) strlcpy(psetname, (char *)string, sizeof (psetname));
2118 if (strncmp(psetname, "SUNWtmp_", strlen("SUNWtmp_")) == 0)
2119 *cputype = ZS_CPUTYPE_DEDICATED;
2120 if (strncmp(psetname, "SUNW_legacy_", strlen("SUNW_legacy_")) == 0)
2121 *cputype = ZS_CPUTYPE_PSRSET_PSET;
2122 else
2123 *cputype = ZS_CPUTYPE_POOL_PSET;
2125 lookup_done:
2127 if (pv_save != NULL)
2128 vals[1] = pv_save;
2130 free(res_list);
2131 free(pools);
2133 (void) strlcpy(pool, poolname, poollen);
2134 (void) strlcpy(pset, psetname, psetlen);
2137 /* Convert scheduler names to ZS_* scheduler flags */
2138 static uint_t
2139 zsd_schedname2int(char *clname, int pri)
2141 uint_t sched = 0;
2143 if (strcmp(clname, "TS") == 0) {
2144 sched = ZS_SCHED_TS;
2145 } else if (strcmp(clname, "IA") == 0) {
2146 sched = ZS_SCHED_IA;
2147 } else if (strcmp(clname, "FX") == 0) {
2148 if (pri > 59) {
2149 sched = ZS_SCHED_FX_60;
2150 } else {
2151 sched = ZS_SCHED_FX;
2153 } else if (strcmp(clname, "RT") == 0) {
2154 sched = ZS_SCHED_RT;
2156 } else if (strcmp(clname, "FSS") == 0) {
2157 sched = ZS_SCHED_FSS;
2159 return (sched);
2162 static uint64_t
2163 zsd_get_zone_rctl_limit(char *name)
2165 rctlblk_t *rblk;
2167 rblk = (rctlblk_t *)alloca(rctlblk_size());
2168 if (getrctl(name, NULL, rblk, RCTL_FIRST)
2169 != 0) {
2170 return (ZS_LIMIT_NONE);
2172 return (rctlblk_get_value(rblk));
2175 static uint64_t
2176 zsd_get_zone_rctl_usage(char *name)
2178 rctlblk_t *rblk;
2180 rblk = (rctlblk_t *)alloca(rctlblk_size());
2181 if (getrctl(name, NULL, rblk, RCTL_USAGE)
2182 != 0) {
2183 return (0);
2185 return (rctlblk_get_value(rblk));
2188 #define ZSD_NUM_RCTL_VALS 19
/*
 * Fetch the limit information for a zone.  This uses zone_enter() as the
 * getrctl(2) system call only returns rctl information for the zone of
 * the caller.
 *
 * A child is forked under a contract template (so the resulting contract
 * can be abandoned afterwards), enters the target zone, reads each rctl
 * limit and usage, and writes the values back to the parent over a pipe.
 * Returns 0 on success, -1 on failure; on error all caps default to
 * ZS_LIMIT_NONE and all usages to 0.
 */
static int
zsd_get_zone_caps(zsd_ctl_t *ctl, zsd_zone_t *zone, uint64_t *cpu_shares,
    uint64_t *cpu_cap, uint64_t *ram_cap, uint64_t *locked_cap,
    uint64_t *vm_cap, uint64_t *processes_cap, uint64_t *processes,
    uint64_t *lwps_cap, uint64_t *lwps, uint64_t *shm_cap, uint64_t *shm,
    uint64_t *shmids_cap, uint64_t *shmids, uint64_t *semids_cap,
    uint64_t *semids, uint64_t *msgids_cap, uint64_t *msgids,
    uint64_t *lofi_cap, uint64_t *lofi, uint_t *sched)
{
	int p[2], pid, tmpl_fd, ret;
	ctid_t ct;
	char class[PC_CLNMSZ];
	uint64_t vals[ZSD_NUM_RCTL_VALS];
	zsd_system_t *sys = ctl->zsctl_system;
	int i = 0;
	int res = 0;

	/* Treat all caps as no cap on error */
	*cpu_shares = ZS_LIMIT_NONE;
	*cpu_cap = ZS_LIMIT_NONE;
	*ram_cap = ZS_LIMIT_NONE;
	*locked_cap = ZS_LIMIT_NONE;
	*vm_cap = ZS_LIMIT_NONE;

	*processes_cap = ZS_LIMIT_NONE;
	*lwps_cap = ZS_LIMIT_NONE;
	*shm_cap = ZS_LIMIT_NONE;
	*shmids_cap = ZS_LIMIT_NONE;
	*semids_cap = ZS_LIMIT_NONE;
	*msgids_cap = ZS_LIMIT_NONE;
	*lofi_cap = ZS_LIMIT_NONE;

	*processes = 0;
	*lwps = 0;
	*shm = 0;
	*shmids = 0;
	*semids = 0;
	*msgids = 0;
	*lofi = 0;

	/* Get the ram cap first since it is a zone attr */
	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_PHYS_MCAP,
	    ram_cap, sizeof (*ram_cap));
	if (ret < 0 || *ram_cap == 0)
		*ram_cap = ZS_LIMIT_NONE;

	/* Get the zone's default scheduling class */
	ret = zone_getattr(zone->zsz_id, ZONE_ATTR_SCHED_CLASS,
	    class, sizeof (class));
	if (ret < 0)
		return (-1);

	*sched = zsd_schedname2int(class, 0);

	/* rctl caps must be fetched from within the zone */
	if (pipe(p) != 0)
		return (-1);

	if ((tmpl_fd = init_template()) == -1) {
		(void) close(p[0]);
		(void) close(p[1]);
		return (-1);
	}
	pid = forkx(0);
	if (pid < 0) {
		(void) ct_tmpl_clear(tmpl_fd);
		(void) close(p[0]);
		(void) close(p[1]);
		return (-1);
	}
	if (pid == 0) {
		/* Child: enter the zone and report its rctls on the pipe */
		(void) ct_tmpl_clear(tmpl_fd);
		(void) close(tmpl_fd);
		(void) close(p[0]);
		if (zone->zsz_id != getzoneid()) {
			if (zone_enter(zone->zsz_id) < 0) {
				(void) close(p[1]);
				_exit(0);
			}
		}

		/* Get caps for zone, and write them to zonestatd parent. */
		vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-shares");
		vals[i++] = zsd_get_zone_rctl_limit("zone.cpu-cap");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-locked-memory");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-swap");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-processes");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-processes");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-lwps");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-lwps");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-memory");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-memory");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-shm-ids");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-shm-ids");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-sem-ids");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-sem-ids");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-msg-ids");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-msg-ids");
		vals[i++] = zsd_get_zone_rctl_limit("zone.max-lofi");
		vals[i++] = zsd_get_zone_rctl_usage("zone.max-lofi");

		if (write(p[1], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
		    ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
			(void) close(p[1]);
			_exit(1);
		}

		(void) close(p[1]);
		_exit(0);
	}

	/* Parent: collect the contract of the child and reap it */
	if (contract_latest(&ct) == -1)
		ct = -1;

	(void) ct_tmpl_clear(tmpl_fd);
	(void) close(tmpl_fd);
	(void) close(p[1]);
	while (waitpid(pid, NULL, 0) != pid)
		;

	/* Read cap from child in zone */
	if (read(p[0], vals, ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) !=
	    ZSD_NUM_RCTL_VALS * sizeof (uint64_t)) {
		res = -1;
		goto cleanup;
	}
	i = 0;
	*cpu_shares = vals[i++];
	*cpu_cap = vals[i++];
	*locked_cap = vals[i++];
	*vm_cap = vals[i++];
	*processes_cap = vals[i++];
	*processes = vals[i++];
	*lwps_cap = vals[i++];
	*lwps = vals[i++];
	*shm_cap = vals[i++];
	*shm = vals[i++];
	*shmids_cap = vals[i++];
	*shmids = vals[i++];
	*semids_cap = vals[i++];
	*semids = vals[i++];
	*msgids_cap = vals[i++];
	*msgids = vals[i++];
	*lofi_cap = vals[i++];
	*lofi = vals[i++];

	/* Interpret maximum values as no cap */
	if (*cpu_cap == UINT32_MAX || *cpu_cap == 0)
		*cpu_cap = ZS_LIMIT_NONE;
	if (*processes_cap == sys->zss_processes_max)
		*processes_cap = ZS_LIMIT_NONE;
	if (*lwps_cap == sys->zss_lwps_max)
		*lwps_cap = ZS_LIMIT_NONE;
	if (*shm_cap == sys->zss_shm_max)
		*shm_cap = ZS_LIMIT_NONE;
	if (*shmids_cap == sys->zss_shmids_max)
		*shmids_cap = ZS_LIMIT_NONE;
	if (*semids_cap == sys->zss_semids_max)
		*semids_cap = ZS_LIMIT_NONE;
	if (*msgids_cap == sys->zss_msgids_max)
		*msgids_cap = ZS_LIMIT_NONE;
	if (*lofi_cap == sys->zss_lofi_max)
		*lofi_cap = ZS_LIMIT_NONE;

cleanup:
	(void) close(p[0]);
	/*
	 * NOTE(review): tmpl_fd was already cleared and closed before
	 * waitpid() above; this second clear/close is redundant.
	 */
	(void) ct_tmpl_clear(tmpl_fd);
	(void) close(tmpl_fd);
	(void) contract_abandon_id(ct);

	return (res);
}
2369 /* Update the current list of running zones */
2370 static void
2371 zsd_refresh_zones(zsd_ctl_t *ctl)
2373 zsd_zone_t *zone;
2374 uint_t old, num;
2375 ushort_t flags;
2376 int i, ret;
2377 zoneid_t *cache;
2378 uint64_t cpu_shares;
2379 uint64_t cpu_cap;
2380 uint64_t ram_cap;
2381 uint64_t locked_cap;
2382 uint64_t vm_cap;
2383 uint64_t processes_cap;
2384 uint64_t processes;
2385 uint64_t lwps_cap;
2386 uint64_t lwps;
2387 uint64_t shm_cap;
2388 uint64_t shm;
2389 uint64_t shmids_cap;
2390 uint64_t shmids;
2391 uint64_t semids_cap;
2392 uint64_t semids;
2393 uint64_t msgids_cap;
2394 uint64_t msgids;
2395 uint64_t lofi_cap;
2396 uint64_t lofi;
2398 char zonename[ZS_ZONENAME_MAX];
2399 char poolname[ZS_POOLNAME_MAX];
2400 char psetname[ZS_PSETNAME_MAX];
2401 uint_t sched;
2402 uint_t cputype;
2403 uint_t iptype;
2405 /* Get the current list of running zones */
2406 for (;;) {
2407 old = num = ctl->zsctl_zone_ncache;
2408 (void) zone_list(ctl->zsctl_zone_cache, &num);
2409 if (num <= old)
2410 break;
2411 if ((cache = reallocarray(ctl->zsctl_zone_cache, num,
2412 sizeof (zoneid_t))) != NULL) {
2413 ctl->zsctl_zone_ncache = num;
2414 ctl->zsctl_zone_cache = cache;
2415 } else {
2416 /* Could not allocate to get new zone list. Give up */
2417 return;
2421 zsd_mark_zones_start(ctl);
2423 for (i = 0; i < num; i++) {
2425 ret = getzonenamebyid(ctl->zsctl_zone_cache[i],
2426 zonename, sizeof (zonename));
2427 if (ret < 0)
2428 continue;
2430 zone = zsd_lookup_insert_zone(ctl, zonename,
2431 ctl->zsctl_zone_cache[i]);
2433 ret = zone_getattr(ctl->zsctl_zone_cache[i], ZONE_ATTR_FLAGS,
2434 &flags, sizeof (flags));
2435 if (ret < 0)
2436 continue;
2438 if (flags & ZF_NET_EXCL)
2439 iptype = ZS_IPTYPE_EXCLUSIVE;
2440 else
2441 iptype = ZS_IPTYPE_SHARED;
2443 zsd_get_zone_pool_pset(ctl, zone, poolname, sizeof (poolname),
2444 psetname, sizeof (psetname), &cputype);
2446 if (zsd_get_zone_caps(ctl, zone, &cpu_shares, &cpu_cap,
2447 &ram_cap, &locked_cap, &vm_cap, &processes_cap, &processes,
2448 &lwps_cap, &lwps, &shm_cap, &shm, &shmids_cap, &shmids,
2449 &semids_cap, &semids, &msgids_cap, &msgids, &lofi_cap,
2450 &lofi, &sched) != 0)
2451 continue;
2453 zsd_mark_zone_found(ctl, zone, cpu_shares, cpu_cap, ram_cap,
2454 locked_cap, vm_cap, processes_cap, processes, lwps_cap,
2455 lwps, shm_cap, shm, shmids_cap, shmids, semids_cap,
2456 semids, msgids_cap, msgids, lofi_cap, lofi, poolname,
2457 psetname, sched, cputype, iptype);
2461 /* Fetch the details of a process from its psinfo_t */
2462 static void
2463 zsd_get_proc_info(zsd_ctl_t *ctl, psinfo_t *psinfo, psetid_t *psetid,
2464 psetid_t *prev_psetid, zoneid_t *zoneid, zoneid_t *prev_zoneid,
2465 timestruc_t *delta, uint_t *sched)
2467 timestruc_t d;
2468 zsd_proc_t *proc;
2470 /* Get cached data for proc */
2471 proc = &(ctl->zsctl_proc_array[psinfo->pr_pid]);
2472 *psetid = psinfo->pr_lwp.pr_bindpset;
2474 if (proc->zspr_psetid == ZS_PSET_ERROR)
2475 *prev_psetid = *psetid;
2476 else
2477 *prev_psetid = proc->zspr_psetid;
2479 *zoneid = psinfo->pr_zoneid;
2480 if (proc->zspr_zoneid == -1)
2481 *prev_zoneid = *zoneid;
2482 else
2483 *prev_zoneid = proc->zspr_zoneid;
2485 TIMESTRUC_DELTA(d, psinfo->pr_time, proc->zspr_usage);
2486 *delta = d;
2488 *sched = zsd_schedname2int(psinfo->pr_lwp.pr_clname,
2489 psinfo->pr_lwp.pr_pri);
2491 /* Update cached data for proc */
2492 proc->zspr_psetid = psinfo->pr_lwp.pr_bindpset;
2493 proc->zspr_zoneid = psinfo->pr_zoneid;
2494 proc->zspr_sched = *sched;
2495 proc->zspr_usage.tv_sec = psinfo->pr_time.tv_sec;
2496 proc->zspr_usage.tv_nsec = psinfo->pr_time.tv_nsec;
2497 proc->zspr_ppid = psinfo->pr_ppid;
2501 * Reset the known cpu usage of a process. This is done after a process
2502 * exits so that if the pid is recycled, data from its previous life is
2503 * not reused
2505 static void
2506 zsd_flush_proc_info(zsd_proc_t *proc)
2508 proc->zspr_usage.tv_sec = 0;
2509 proc->zspr_usage.tv_nsec = 0;
2513 * Open the current extended accounting file. On initialization, open the
2514 * file as the current file to be used. Otherwise, open the file as the
2515 * next file to use of the current file reaches EOF.
/*
 * Returns 0 on success (with *opn set to 1), -1 on failure (with the
 * selected fd reset to -1 and *opn to 0).
 */
2517 static int
2518 zsd_open_exacct(zsd_ctl_t *ctl, boolean_t init)
2520 int ret, oret, state, trys = 0, flags;
/* fd/eaf/stat/opn alias either the "current" or the "next" file slot */
2521 int *fd, *opn;
2522 ea_file_t *eaf;
2523 struct stat *stat;
2524 char path[MAXPATHLEN];
2527 * The accounting file is first opened at the tail. Following
2528 * opens to new accounting files are opened at the head.
2530 if (init == B_TRUE) {
2531 flags = EO_NO_VALID_HDR | EO_TAIL;
2532 fd = &ctl->zsctl_proc_fd;
2533 eaf = &ctl->zsctl_proc_eaf;
2534 stat = &ctl->zsctl_proc_stat;
2535 opn = &ctl->zsctl_proc_open;
2536 } else {
2537 flags = EO_NO_VALID_HDR | EO_HEAD;
2538 fd = &ctl->zsctl_proc_fd_next;
2539 eaf = &ctl->zsctl_proc_eaf_next;
2540 stat = &ctl->zsctl_proc_stat_next;
2541 opn = &ctl->zsctl_proc_open_next;
2544 *fd = -1;
2545 *opn = 0;
2546 retry:
2547 /* open accounting files for cpu consumption */
2548 ret = acctctl(AC_STATE_GET | AC_PROC, &state, sizeof (state));
2549 if (ret != 0) {
2550 zsd_warn(gettext("Unable to get process accounting state"));
2551 goto err;
/* Accounting is off: try once to enable it, then retry the whole open */
2553 if (state != AC_ON) {
2554 if (trys > 0) {
2555 zsd_warn(gettext(
2556 "Unable to enable process accounting"));
2557 goto err;
2559 (void) zsd_enable_cpu_stats();
2560 trys++;
2561 goto retry;
2564 ret = acctctl(AC_FILE_GET | AC_PROC, path, sizeof (path));
2565 if (ret != 0) {
2566 zsd_warn(gettext("Unable to get process accounting file"));
2567 goto err;
/* Open the file, attach libexacct to it, and remember its identity */
2570 if ((*fd = open(path, O_RDONLY, 0)) >= 0 &&
2571 (oret = ea_fdopen(eaf, *fd, NULL, flags, O_RDONLY)) == 0)
2572 ret = fstat(*fd, stat);
2574 if (*fd < 0 || oret < 0 || ret < 0) {
2575 struct timespec ts;
2578 * It is possible the accounting file is momentarily unavailable
2579 * because it is being rolled. Try for up to half a second.
2581 * If failure to open accounting file persists, give up.
2583 if (oret == 0)
2584 (void) ea_close(eaf);
2585 else if (*fd >= 0)
2586 (void) close(*fd);
/* trys also counts the 1ms open retries: ~0.5s total before giving up */
2587 if (trys > 500) {
2588 zsd_warn(gettext(
2589 "Unable to open process accounting file"));
2590 goto err;
2592 /* wait one millisecond */
2593 ts.tv_sec = 0;
2594 ts.tv_nsec = NANOSEC / 1000;
2595 (void) nanosleep(&ts, NULL);
2596 goto retry;
2598 *opn = 1;
2599 return (0);
2600 err:
2601 if (*fd >= 0)
2602 (void) close(*fd);
2603 *opn = 0;
2604 *fd = -1;
2605 return (-1);
2609 * Walk /proc and charge each process to its zone and processor set.
2610 * Then read exacct data for exited processes, and charge them as well.
/*
 * NOTE(review): on an exacct read error this function abandons the
 * exited-process data for the interval (see ea_err) instead of failing hard.
 */
2612 static void
2613 zsd_refresh_procs(zsd_ctl_t *ctl, boolean_t init)
2615 DIR *dir;
2616 struct dirent *dent, *dresult;
2617 psinfo_t psinfo;
2618 int fd, ret;
2619 zsd_proc_t *proc, *pproc, *tmp, *next;
/* plist: exited procs with unknown pset; pplist: parent in another zone */
2620 list_t pplist, plist;
2621 zsd_zone_t *zone, *prev_zone;
2622 zsd_pset_t *pset, *prev_pset;
2623 psetid_t psetid, prev_psetid;
2624 zoneid_t zoneid, prev_zoneid;
2625 zsd_pset_usage_t *usage, *prev_usage;
2626 char path[MAXPATHLEN];
2628 ea_object_t object;
2629 ea_object_t pobject;
2630 boolean_t hrtime_expired = B_FALSE;
2631 struct timeval interval_end;
2633 timestruc_t delta, d1, d2;
2634 uint_t sched = 0;
2637 * Get the current accounting file. The current accounting file
2638 * may be different than the file in use, as the accounting file
2639 * may have been rolled, or manually changed by an admin.
2641 ret = zsd_open_exacct(ctl, init);
2642 if (ret != 0) {
2643 zsd_warn(gettext("Unable to track process accounting"));
2644 return;
2648 * Mark the current time as the interval end time. Don't track
2649 * processes that exit after this time.
2651 (void) gettimeofday(&interval_end, NULL);
2653 dir = opendir("/proc");
2654 if (dir == NULL) {
2655 zsd_warn(gettext("Unable to open /proc"));
2656 return;
2659 dent = ctl->zsctl_procfs_dent;
2661 (void) memset(dent, 0, ctl->zsctl_procfs_dent_size);
2663 /* Walk all processes and compute each zone's usage on each pset. */
2664 while (readdir_r(dir, dent, &dresult) == 0) {
2666 if (strcmp(dent->d_name, ".") == 0 ||
2667 strcmp(dent->d_name, "..") == 0)
2668 continue;
2670 (void) snprintf(path, sizeof (path), "/proc/%s/psinfo",
2671 dent->d_name);
2673 fd = open(path, O_RDONLY);
2674 if (fd < 0)
2675 continue;
2677 if (read(fd, &psinfo, sizeof (psinfo)) != sizeof (psinfo)) {
2678 (void) close(fd);
2679 continue;
2681 (void) close(fd);
2683 zsd_get_proc_info(ctl, &psinfo, &psetid, &prev_psetid,
2684 &zoneid, &prev_zoneid, &delta, &sched);
/* Split the delta in half in case the proc changed zone/pset mid-interval */
2686 d1.tv_sec = delta.tv_sec / 2;
2687 d1.tv_nsec = delta.tv_nsec / 2;
2688 d2.tv_sec = (delta.tv_sec / 2) + (delta.tv_sec % 2);
2689 d2.tv_nsec = (delta.tv_nsec / 2) + (delta.tv_nsec % 2);
2691 /* Get the zone and pset this process is running in */
2692 zone = zsd_lookup_zone_byid(ctl, zoneid);
2693 if (zone == NULL)
2694 continue;
2695 pset = zsd_lookup_pset_byid(ctl, psetid);
2696 if (pset == NULL)
2697 continue;
2698 usage = zsd_lookup_insert_usage(ctl, pset, zone);
2699 if (usage == NULL)
2700 continue;
2703 * Get the usage of the previous zone and pset if they were
2704 * different.
2706 if (zoneid != prev_zoneid)
2707 prev_zone = zsd_lookup_zone_byid(ctl, prev_zoneid);
2708 else
2709 prev_zone = NULL;
2711 if (psetid != prev_psetid)
2712 prev_pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2713 else
2714 prev_pset = NULL;
2716 prev_usage = NULL;
2717 if (prev_zone != NULL || prev_pset != NULL) {
2718 if (prev_zone == NULL)
2719 prev_zone = zone;
2720 if (prev_pset == NULL)
2721 prev_pset = pset;
2723 prev_usage = zsd_lookup_insert_usage(ctl, prev_pset,
2724 prev_zone);
2727 /* Update the usage with the processes info */
2728 if (prev_usage == NULL) {
2729 zsd_mark_pset_usage_found(usage, sched);
2730 } else {
2731 zsd_mark_pset_usage_found(usage, sched);
2732 zsd_mark_pset_usage_found(prev_usage, sched);
2736 * First time around is just to get a starting point. All
2737 * usages will be zero.
2739 if (init == B_TRUE)
2740 continue;
2742 if (prev_usage == NULL) {
2743 zsd_add_usage(ctl, usage, &delta);
2744 } else {
2745 zsd_add_usage(ctl, usage, &d1);
2746 zsd_add_usage(ctl, prev_usage, &d2);
2749 (void) closedir(dir);
2752 * No need to collect exited proc data on initialization. Just
2753 * caching the usage of the known processes to get a zero starting
2754 * point.
2756 if (init == B_TRUE)
2757 return;
2760 * Add accounting records to account for processes which have
2761 * exited.
2763 list_create(&plist, sizeof (zsd_proc_t),
2764 offsetof(zsd_proc_t, zspr_next));
2765 list_create(&pplist, sizeof (zsd_proc_t),
2766 offsetof(zsd_proc_t, zspr_next));
/* Read exacct records for processes that exited during the interval */
2768 for (;;) {
2769 pid_t pid;
2770 pid_t ppid;
2771 timestruc_t user, sys, proc_usage;
2772 timestruc_t finish;
2773 int numfound = 0;
2775 bzero(&object, sizeof (object));
2776 proc = NULL;
2777 zone = NULL;
2778 pset = NULL;
2779 usage = NULL;
2780 ret = ea_get_object(&ctl->zsctl_proc_eaf, &object);
2781 if (ret == EO_ERROR) {
2782 if (ea_error() == EXR_EOF) {
2784 struct stat *stat;
2785 struct stat *stat_next;
2788 * See if the next accounting file is the
2789 * same as the current accounting file.
2791 stat = &(ctl->zsctl_proc_stat);
2792 stat_next = &(ctl->zsctl_proc_stat_next);
2793 if (stat->st_ino == stat_next->st_ino &&
2794 stat->st_dev == stat_next->st_dev) {
2796 * End of current accounting file is
2797 * reached, so finished. Clear EOF
2798 * bit for next time around.
2800 ea_clear(&ctl->zsctl_proc_eaf);
2801 break;
2802 } else {
2804 * Accounting file has changed. Move
2805 * to current accounting file.
2807 (void) ea_close(&ctl->zsctl_proc_eaf);
2809 ctl->zsctl_proc_fd =
2810 ctl->zsctl_proc_fd_next;
2811 ctl->zsctl_proc_eaf =
2812 ctl->zsctl_proc_eaf_next;
2813 ctl->zsctl_proc_stat =
2814 ctl->zsctl_proc_stat_next;
2816 ctl->zsctl_proc_fd_next = -1;
2817 ctl->zsctl_proc_open_next = 0;
2818 continue;
2820 } else {
2822 * Other accounting error. Give up on
2823 * accounting.
2825 goto ea_err;
2828 /* Skip if not a process group */
2829 if ((object.eo_catalog & EXT_TYPE_MASK) != EXT_GROUP ||
2830 (object.eo_catalog & EXD_DATA_MASK) != EXD_GROUP_PROC) {
2831 (void) ea_free_item(&object, EUP_ALLOC);
2832 continue;
2835 /* The process group entry should be complete */
2836 while (numfound < 9) {
2837 bzero(&pobject, sizeof (pobject));
2838 ret = ea_get_object(&ctl->zsctl_proc_eaf,
2839 &pobject);
2840 if (ret < 0) {
2841 (void) ea_free_item(&object, EUP_ALLOC);
2842 zsd_warn(
2843 "unable to get process accounting data");
2844 goto ea_err;
2846 /* Next entries should be process data */
2847 if ((pobject.eo_catalog & EXT_TYPE_MASK) ==
2848 EXT_GROUP) {
2849 (void) ea_free_item(&object, EUP_ALLOC);
2850 (void) ea_free_item(&pobject, EUP_ALLOC);
2851 zsd_warn(
2852 "process data of wrong type");
2853 goto ea_err;
2855 switch (pobject.eo_catalog & EXD_DATA_MASK) {
2856 case EXD_PROC_PID:
2857 pid = pobject.eo_item.ei_uint32;
2858 proc = &(ctl->zsctl_proc_array[pid]);
2860 * This process should not be currently in
2861 * the list of processes to process.
2863 assert(!list_link_active(&proc->zspr_next));
2864 numfound++;
2865 break;
2866 case EXD_PROC_ANCPID:
2867 ppid = pobject.eo_item.ei_uint32;
2868 pproc = &(ctl->zsctl_proc_array[ppid]);
2869 numfound++;
2870 break;
2871 case EXD_PROC_ZONENAME:
2872 zone = zsd_lookup_zone(ctl,
2873 pobject.eo_item.ei_string, -1);
2874 numfound++;
2875 break;
2876 case EXD_PROC_CPU_USER_SEC:
2877 user.tv_sec =
2878 pobject.eo_item.ei_uint64;
2879 numfound++;
2880 break;
2881 case EXD_PROC_CPU_USER_NSEC:
2882 user.tv_nsec =
2883 pobject.eo_item.ei_uint64;
2884 numfound++;
2885 break;
2886 case EXD_PROC_CPU_SYS_SEC:
2887 sys.tv_sec =
2888 pobject.eo_item.ei_uint64;
2889 numfound++;
2890 break;
2891 case EXD_PROC_CPU_SYS_NSEC:
2892 sys.tv_nsec =
2893 pobject.eo_item.ei_uint64;
2894 numfound++;
2895 break;
2896 case EXD_PROC_FINISH_SEC:
2897 finish.tv_sec =
2898 pobject.eo_item.ei_uint64;
2899 numfound++;
2900 break;
2901 case EXD_PROC_FINISH_NSEC:
2902 finish.tv_nsec =
2903 pobject.eo_item.ei_uint64;
2904 numfound++;
2905 break;
2907 (void) ea_free_item(&pobject, EUP_ALLOC);
2909 (void) ea_free_item(&object, EUP_ALLOC);
2910 if (numfound != 9) {
2911 zsd_warn(gettext(
2912 "Malformed process accounting entry found"));
2913 goto proc_done;
/* Stop after this record if the process finished past the interval end */
2916 if (finish.tv_sec > interval_end.tv_sec ||
2917 (finish.tv_sec == interval_end.tv_sec &&
2918 finish.tv_nsec > (interval_end.tv_usec * 1000)))
2919 hrtime_expired = B_TRUE;
2922 * Try to identify the zone and pset to which this
2923 * exited process belongs.
2925 if (zone == NULL)
2926 goto proc_done;
2928 /* Save proc info */
2929 proc->zspr_ppid = ppid;
2930 proc->zspr_zoneid = zone->zsz_id;
2932 prev_psetid = ZS_PSET_ERROR;
2933 sched = 0;
2936 * The following tries to deduce the processes pset.
2938 * First choose pset and sched using cached value from the
2939 * most recent time the process has been seen.
2941 * pset and sched can change across zone_enter, so make sure
2942 * most recent sighting of this process was in the same
2943 * zone before using most recent known value.
2945 * If there is no known value, use value of processes
2946 * parent. If parent is unknown, walk parents until a known
2947 * parent is found.
2949 * If no parent in the zone is found, use the zone's default
2950 * pset and scheduling class.
2952 if (proc->zspr_psetid != ZS_PSET_ERROR) {
2953 prev_psetid = proc->zspr_psetid;
2954 pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2955 sched = proc->zspr_sched;
2956 } else if (pproc->zspr_zoneid == zone->zsz_id &&
2957 pproc->zspr_psetid != ZS_PSET_ERROR) {
2958 prev_psetid = pproc->zspr_psetid;
2959 pset = zsd_lookup_pset_byid(ctl, prev_psetid);
2960 sched = pproc->zspr_sched;
2963 if (pset == NULL) {
2965 * Process or processes parent has never been seen.
2966 * Save to deduce a known parent later.
2968 proc_usage = sys;
2969 TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2970 TIMESTRUC_DELTA(delta, proc_usage,
2971 proc->zspr_usage);
2972 proc->zspr_usage = delta;
2973 list_insert_tail(&plist, proc);
2974 continue;
2977 /* Add the zone's usage to the pset */
2978 usage = zsd_lookup_insert_usage(ctl, pset, zone);
2979 if (usage == NULL)
2980 goto proc_done;
2982 zsd_mark_pset_usage_found(usage, sched);
2984 /* compute the usage to add for the exited proc */
2985 proc_usage = sys;
2986 TIMESTRUC_ADD_TIMESTRUC(proc_usage, user);
2987 TIMESTRUC_DELTA(delta, proc_usage,
2988 proc->zspr_usage);
2990 zsd_add_usage(ctl, usage, &delta);
2991 proc_done:
2992 zsd_flush_proc_info(proc);
2994 if (hrtime_expired == B_TRUE)
2995 break;
2998 * close next accounting file.
3000 if (ctl->zsctl_proc_open_next) {
3001 (void) ea_close(
3002 &ctl->zsctl_proc_eaf_next);
3003 ctl->zsctl_proc_open_next = 0;
3004 ctl->zsctl_proc_fd_next = -1;
3007 /* For the remaining processes, use pset and sched of a known parent */
3008 proc = list_head(&plist);
3009 while (proc != NULL) {
3010 next = proc;
3011 for (;;) {
3012 if (next->zspr_ppid == 0 || next->zspr_ppid == -1) {
3014 * Kernel process, or parent is unknown, skip
3015 * process, remove from process list.
3017 tmp = proc;
3018 proc = list_next(&plist, proc);
3019 list_link_init(&tmp->zspr_next);
3020 break;
3022 pproc = &(ctl->zsctl_proc_array[next->zspr_ppid]);
3023 if (pproc->zspr_zoneid != proc->zspr_zoneid) {
3025 * Parent in different zone. Save process and
3026 * use zone's default pset and sched below
3028 tmp = proc;
3029 proc = list_next(&plist, proc);
3030 list_remove(&plist, tmp);
3031 list_insert_tail(&pplist, tmp);
3032 break;
3034 /* Parent has unknown pset, Search parent's parent */
3035 if (pproc->zspr_psetid == ZS_PSET_ERROR) {
3036 next = pproc;
3037 continue;
3039 /* Found parent with known pset. Use its info */
3040 proc->zspr_psetid = pproc->zspr_psetid;
3041 proc->zspr_sched = pproc->zspr_sched;
3042 next->zspr_psetid = pproc->zspr_psetid;
3043 next->zspr_sched = pproc->zspr_sched;
3044 zone = zsd_lookup_zone_byid(ctl,
3045 proc->zspr_zoneid);
3046 if (zone == NULL) {
3047 tmp = proc;
3048 proc = list_next(&plist, proc);
3049 list_remove(&plist, tmp);
3050 list_link_init(&tmp->zspr_next);
3051 break;
3053 pset = zsd_lookup_pset_byid(ctl,
3054 proc->zspr_psetid);
3055 if (pset == NULL) {
3056 tmp = proc;
3057 proc = list_next(&plist, proc);
3058 list_remove(&plist, tmp);
3059 list_link_init(&tmp->zspr_next);
3060 break;
3062 /* Add the zone's usage to the pset */
3063 usage = zsd_lookup_insert_usage(ctl, pset, zone);
3064 if (usage == NULL) {
3065 tmp = proc;
3066 proc = list_next(&plist, proc);
3067 list_remove(&plist, tmp);
3068 list_link_init(&tmp->zspr_next);
3069 break;
3071 zsd_mark_pset_usage_found(usage, proc->zspr_sched);
3072 zsd_add_usage(ctl, usage, &proc->zspr_usage);
3073 zsd_flush_proc_info(proc);
3074 tmp = proc;
3075 proc = list_next(&plist, proc);
3076 list_remove(&plist, tmp);
3077 list_link_init(&tmp->zspr_next);
3078 break;
3082 * Process has never been seen. Using zone info to
3083 * determine pset and scheduling class.
3085 proc = list_head(&pplist);
3086 while (proc != NULL) {
3088 zone = zsd_lookup_zone_byid(ctl, proc->zspr_zoneid);
3089 if (zone == NULL)
3090 goto next;
3091 if (zone->zsz_psetid != ZS_PSET_ERROR &&
3092 zone->zsz_psetid != ZS_PSET_MULTI) {
3093 prev_psetid = zone->zsz_psetid;
3094 pset = zsd_lookup_pset_byid(ctl, prev_psetid);
3095 } else {
3096 pset = zsd_lookup_pset(ctl, zone->zsz_pset, -1);
3097 if (pset != NULL)
3098 prev_psetid = pset->zsp_id;
3100 if (pset == NULL)
3101 goto next;
3103 sched = zone->zsz_scheds;
3105 * Ignore FX high scheduling class if it is not the
3106 * only scheduling class in the zone.
3108 if (sched != ZS_SCHED_FX_60)
3109 sched &= (~ZS_SCHED_FX_60);
3111 * If more than one scheduling class has been found
3112 * in the zone, use zone's default scheduling class for
3113 * this process.
3115 if ((sched & (sched - 1)) != 0)
3116 sched = zone->zsz_default_sched;
3118 /* Add the zone's usage to the pset */
3119 usage = zsd_lookup_insert_usage(ctl, pset, zone);
3120 if (usage == NULL)
3121 goto next;
3123 zsd_mark_pset_usage_found(usage, sched);
3124 zsd_add_usage(ctl, usage, &proc->zspr_usage);
3125 next:
3126 tmp = proc;
3127 proc = list_next(&pplist, proc);
3128 zsd_flush_proc_info(tmp);
3129 list_link_init(&tmp->zspr_next);
3131 return;
3132 ea_err:
3134 * Close the next accounting file if we have not transitioned to it
3135 * yet.
3137 if (ctl->zsctl_proc_open_next) {
3138 (void) ea_close(&ctl->zsctl_proc_eaf_next);
3139 ctl->zsctl_proc_open_next = 0;
3140 ctl->zsctl_proc_fd_next = -1;
3145 * getvmusage(2) uses size_t's in the passwd data structure, which differ
3146 * in size for 32bit and 64 bit kernels. Since this is a contracted interface,
3147 * and zonestatd does not necessarily match the kernel's bitness, marshal
3148 * results appropriately.
3150 static int
3151 zsd_getvmusage(zsd_ctl_t *ctl, uint_t flags, time_t age, zsd_vmusage64_t *buf,
3152 uint64_t *nres)
3154 zsd_vmusage32_t *vmu32;
3155 zsd_vmusage64_t *vmu64;
3156 uint32_t nres32;
3157 int i;
3158 int ret;
3160 if (ctl->zsctl_kern_bits == 32) {
3161 nres32 = *nres;
3162 ret = syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3163 flags, age, (uintptr_t)buf, (uintptr_t)&nres32);
3164 *nres = nres32;
3165 if (ret == 0 && buf != NULL) {
3167 * An array of vmusage32_t's has been returned.
3168 * Convert it to an array of vmusage64_t's.
3170 vmu32 = (zsd_vmusage32_t *)buf;
3171 vmu64 = (zsd_vmusage64_t *)buf;
3172 for (i = nres32 - 1; i >= 0; i--) {
3174 vmu64[i].vmu_zoneid = vmu32[i].vmu_zoneid;
3175 vmu64[i].vmu_type = vmu32[i].vmu_type;
3176 vmu64[i].vmu_type = vmu32[i].vmu_type;
3177 vmu64[i].vmu_rss_all = vmu32[i].vmu_rss_all;
3178 vmu64[i].vmu_rss_private =
3179 vmu32[i].vmu_rss_private;
3180 vmu64[i].vmu_rss_shared =
3181 vmu32[i].vmu_rss_shared;
3182 vmu64[i].vmu_swap_all = vmu32[i].vmu_swap_all;
3183 vmu64[i].vmu_swap_private =
3184 vmu32[i].vmu_swap_private;
3185 vmu64[i].vmu_swap_shared =
3186 vmu32[i].vmu_swap_shared;
3189 return (ret);
3190 } else {
3192 * kernel is 64 bit, so use 64 bit structures as zonestat
3193 * expects.
3195 return (syscall(SYS_rusagesys, _RUSAGESYS_GETVMUSAGE,
3196 flags, age, (uintptr_t)buf, (uintptr_t)nres));
3202 * Update the current physical, virtual, and locked memory usage of the
3203 * running zones.
/*
 * NOTE(review): data comes from swapctl(2), the unix/zfs/caps kstats, and
 * getvmusage via zsd_getvmusage.  On the init pass nothing is collected.
 */
3205 static void
3206 zsd_refresh_memory(zsd_ctl_t *ctl, boolean_t init)
3209 uint64_t phys_total;
3210 uint64_t phys_used;
3211 uint64_t phys_zones;
/* overcount: sum of per-zone rss, which double counts shared text pages */
3212 uint64_t phys_zones_overcount;
3213 uint64_t phys_zones_extra;
3214 uint64_t phys_zones_credit;
3216 uint64_t vm_free;
3217 uint64_t vm_used;
3219 uint64_t disk_swap_total;
3220 uint64_t disk_swap_used; /* disk swap with contents */
3222 uint64_t physmem;
3223 uint64_t pp_kernel;
3224 uint64_t arc_size = 0;
3225 struct anoninfo ani;
3227 int num_swap_devices;
3228 struct swaptable *swt;
3229 struct swapent *swent;
3230 size_t swt_size;
3231 char *path;
3233 zsd_vmusage64_t *vmusage;
3234 uint64_t num_vmusage;
3236 int i, ret;
3238 zsd_system_t *sys;
3239 zsd_zone_t *zone;
3240 int vmu_nzones;
3242 kstat_t *kstat;
3243 char kstat_name[KSTAT_STRLEN];
3244 kstat_named_t *knp;
3245 kid_t kid;
3247 if (init)
3248 return;
3250 sys = ctl->zsctl_system;
3252 /* interrogate swap devices to find the amount of disk swap */
3253 disk_swap_again:
3254 num_swap_devices = swapctl(SC_GETNSWP, NULL);
3256 if (num_swap_devices == 0) {
3257 sys->zss_swap_total = disk_swap_total = 0;
3258 sys->zss_swap_used = disk_swap_used = 0;
3259 /* No disk swap */
3260 goto disk_swap_done;
3262 /* see if swap table needs to be larger */
3263 if (num_swap_devices > ctl->zsctl_swap_cache_num) {
3264 swt_size = sizeof (int) +
3265 (num_swap_devices * sizeof (struct swapent)) +
3266 (num_swap_devices * MAXPATHLEN);
3267 free(ctl->zsctl_swap_cache);
3269 swt = (struct swaptable *)malloc(swt_size);
3270 if (swt == NULL) {
3272 * Could not allocate to get list of swap devices.
3273 * Just use data from the most recent read, which will
3274 * be zero if this is the first read.
3276 zsd_warn(gettext("Unable to allocate to determine "
3277 "virtual memory"));
3278 disk_swap_total = sys->zss_swap_total;
3279 disk_swap_used = sys->zss_swap_used;
3280 goto disk_swap_done;
/* Point each swapent's path at its slice of the trailing name buffer */
3282 swent = swt->swt_ent;
3283 path = (char *)swt + (sizeof (int) +
3284 num_swap_devices * sizeof (swapent_t));
3285 for (i = 0; i < num_swap_devices; i++, swent++) {
3286 swent->ste_path = path;
3287 path += MAXPATHLEN;
3289 swt->swt_n = num_swap_devices;
3290 ctl->zsctl_swap_cache = swt;
3291 ctl->zsctl_swap_cache_size = swt_size;
3292 ctl->zsctl_swap_cache_num = num_swap_devices;
3294 num_swap_devices = swapctl(SC_LIST, ctl->zsctl_swap_cache);
3295 if (num_swap_devices < 0) {
3296 /* More swap devices have arrived */
3297 if (errno == ENOMEM)
3298 goto disk_swap_again;
3300 zsd_warn(gettext("Unable to determine disk swap devices"));
3301 /* Unexpected error. Use existing data */
3302 disk_swap_total = sys->zss_swap_total;
3303 disk_swap_used = sys->zss_swap_used;
3304 goto disk_swap_done;
3307 /* add up the disk swap */
3308 disk_swap_total = 0;
3309 disk_swap_used = 0;
3310 swent = ctl->zsctl_swap_cache->swt_ent;
3311 for (i = 0; i < num_swap_devices; i++, swent++) {
3312 disk_swap_total += swent->ste_pages;
3313 disk_swap_used += (swent->ste_pages - swent->ste_free);
3315 disk_swap_total *= ctl->zsctl_pagesize;
3316 disk_swap_used *= ctl->zsctl_pagesize;
3318 sys->zss_swap_total = disk_swap_total;
3319 sys->zss_swap_used = disk_swap_used;
3321 disk_swap_done:
3323 /* get system pages kstat */
3324 kid = -1;
3325 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "unix", 0, "system_pages");
3326 if (kstat == NULL)
3327 zsd_warn(gettext("Unable to lookup system pages kstat"));
3328 else
3329 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3331 if (kid == -1) {
3332 zsd_warn(gettext("Unable to read system pages kstat"));
3333 return;
3334 } else {
3335 knp = kstat_data_lookup(kstat, "physmem");
3336 if (knp == NULL) {
3337 zsd_warn(gettext("Unable to read physmem"));
3338 } else {
3339 if (knp->data_type == KSTAT_DATA_UINT64)
3340 physmem = knp->value.ui64;
3341 else if (knp->data_type == KSTAT_DATA_UINT32)
3342 physmem = knp->value.ui32;
3343 else
3344 return;
3346 knp = kstat_data_lookup(kstat, "pp_kernel");
3347 if (knp == NULL) {
3348 zsd_warn(gettext("Unable to read pp_kernel"));
3349 } else {
3350 if (knp->data_type == KSTAT_DATA_UINT64)
3351 pp_kernel = knp->value.ui64;
3352 else if (knp->data_type == KSTAT_DATA_UINT32)
3353 pp_kernel = knp->value.ui32;
3354 else
3355 return;
3358 physmem *= ctl->zsctl_pagesize;
3359 pp_kernel *= ctl->zsctl_pagesize;
3361 /* get the zfs arc size if available */
3362 arc_size = 0;
3363 kid = -1;
3364 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "zfs", 0, "arcstats");
3365 if (kstat != NULL)
3366 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3367 if (kid != -1) {
3368 knp = kstat_data_lookup(kstat, "size");
3369 if (knp != NULL)
3370 if (knp->data_type == KSTAT_DATA_UINT64)
3371 arc_size = knp->value.ui64;
3374 /* Try to get swap information */
3375 if (swapctl(SC_AINFO, &ani) < 0) {
3376 zsd_warn(gettext("Unable to get swap info"));
3377 return;
3380 vmusage_again:
3381 /* getvmusage to get physical memory usage */
3382 vmusage = ctl->zsctl_vmusage_cache;
3383 num_vmusage = ctl->zsctl_vmusage_cache_num;
3385 ret = zsd_getvmusage(ctl, VMUSAGE_SYSTEM | VMUSAGE_ALL_ZONES, 0,
3386 vmusage, &num_vmusage);
3388 if (ret != 0) {
3389 /* Unexpected error. Use existing data */
3390 if (errno != EOVERFLOW) {
3391 zsd_warn(gettext(
3392 "Unable to read physical memory usage"));
3393 phys_zones = sys->zss_ram_zones;
3394 goto vmusage_done;
3397 /* vmusage results cache too small */
3398 if (num_vmusage > ctl->zsctl_vmusage_cache_num) {
3400 size_t size = sizeof (zsd_vmusage64_t) * num_vmusage;
3402 free(ctl->zsctl_vmusage_cache);
3403 vmusage = (zsd_vmusage64_t *)malloc(size);
3404 if (vmusage == NULL) {
3405 zsd_warn(gettext("Unable to alloc to determine "
3406 "physical memory usage"));
3407 phys_zones = sys->zss_ram_zones;
3408 goto vmusage_done;
3410 ctl->zsctl_vmusage_cache = vmusage;
3411 ctl->zsctl_vmusage_cache_num = num_vmusage;
3412 goto vmusage_again;
3415 phys_zones_overcount = 0;
3416 vmu_nzones = 0;
3417 for (i = 0; i < num_vmusage; i++) {
3418 switch (vmusage[i].vmu_type) {
3419 case VMUSAGE_SYSTEM:
3420 /* total pages backing user process mappings */
3421 phys_zones = sys->zss_ram_zones =
3422 vmusage[i].vmu_rss_all;
3423 break;
3424 case VMUSAGE_ZONE:
3425 vmu_nzones++;
3426 phys_zones_overcount += vmusage[i].vmu_rss_all;
3427 zone = zsd_lookup_zone_byid(ctl, vmusage[i].vmu_id);
3428 if (zone != NULL)
3429 zone->zsz_usage_ram = vmusage[i].vmu_rss_all;
3430 break;
3431 default:
3432 break;
3436 * Figure how much memory was double counted due to text sharing
3437 * between zones. Credit this back so that the sum of the zones
3438 * equals the total zone ram usage;
3440 phys_zones_extra = phys_zones_overcount - phys_zones;
3441 phys_zones_credit = phys_zones_extra / vmu_nzones;
3443 vmusage_done:
3445 /* walk the zones to get swap and locked kstats. Fetch ram cap. */
3446 sys->zss_locked_zones = 0;
3447 sys->zss_vm_zones = 0;
3448 for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3449 zone = list_next(&ctl->zsctl_zones, zone)) {
3451 /* If zone halted during interval, show memory usage as none */
3452 if (zone->zsz_active == B_FALSE ||
3453 zone->zsz_deleted == B_TRUE) {
3454 zone->zsz_usage_ram = 0;
3455 zone->zsz_usage_vm = 0;
3456 zone->zsz_usage_locked = 0;
3457 continue;
3460 if (phys_zones_credit > 0) {
3461 if (zone->zsz_usage_ram > phys_zones_credit) {
3462 zone->zsz_usage_ram -= phys_zones_credit;
3466 * Get zone's swap usage. Since zone could have halted,
3467 * treats as zero if cannot read
3469 zone->zsz_usage_vm = 0;
3470 (void) snprintf(kstat_name, sizeof (kstat_name),
3471 "swapresv_zone_%d", zone->zsz_id);
3472 kid = -1;
3473 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3474 zone->zsz_id, kstat_name);
3475 if (kstat != NULL)
3476 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3477 if (kid != -1) {
3478 knp = kstat_data_lookup(kstat, "usage");
3479 if (knp != NULL &&
3480 knp->data_type == KSTAT_DATA_UINT64) {
3481 zone->zsz_usage_vm = knp->value.ui64;
3482 sys->zss_vm_zones += knp->value.ui64;
3486 * Get zone's locked usage. Since zone could have halted,
3487 * treats as zero if cannot read
3489 zone->zsz_usage_locked = 0;
3490 (void) snprintf(kstat_name, sizeof (kstat_name),
3491 "lockedmem_zone_%d", zone->zsz_id);
3492 kid = -1;
3493 kstat = kstat_lookup(ctl->zsctl_kstat_ctl, "caps",
3494 zone->zsz_id, kstat_name);
3495 if (kstat != NULL)
3496 kid = kstat_read(ctl->zsctl_kstat_ctl, kstat, NULL);
3497 if (kid != -1) {
3498 knp = kstat_data_lookup(kstat, "usage");
3499 if (knp != NULL &&
3500 knp->data_type == KSTAT_DATA_UINT64) {
3501 zone->zsz_usage_locked = knp->value.ui64;
3503 * Since locked memory accounting for zones
3504 * can double count ddi locked memory, cap each
3505 * zone's locked usage at its ram usage.
3507 if (zone->zsz_usage_locked >
3508 zone->zsz_usage_ram)
3509 zone->zsz_usage_locked =
3510 zone->zsz_usage_ram;
3511 sys->zss_locked_zones +=
3512 zone->zsz_usage_locked;
/* System-wide totals from sysconf(3C) page counts */
3517 phys_total =
3518 sysconf(_SC_PHYS_PAGES) * ctl->zsctl_pagesize;
3520 phys_used = (sysconf(_SC_PHYS_PAGES) - sysconf(_SC_AVPHYS_PAGES))
3521 * ctl->zsctl_pagesize;
3523 /* Compute remaining statistics */
3524 sys->zss_ram_total = phys_total;
3525 sys->zss_ram_zones = phys_zones;
3526 sys->zss_ram_kern = phys_used - phys_zones - arc_size;
3529 * The total for kernel locked memory should include
3530 * segkp locked pages, but oh well. The arc size is subtracted,
3531 * as that physical memory is reclaimable.
3533 sys->zss_locked_kern = pp_kernel - arc_size;
3534 /* Add memory used by kernel startup and obp to kernel locked */
3535 if ((phys_total - physmem) > 0)
3536 sys->zss_locked_kern += phys_total - physmem;
3539 * Add in the portion of (RAM+DISK) that is not available as swap,
3540 * and consider it swap used by the kernel.
3542 sys->zss_vm_total = phys_total + disk_swap_total;
3543 vm_free = (ani.ani_max - ani.ani_resv) * ctl->zsctl_pagesize;
3544 vm_used = sys->zss_vm_total - vm_free;
3545 sys->zss_vm_kern = vm_used - sys->zss_vm_zones - arc_size;
3549 * Charge each cpu's usage to its processor sets. Also add the cpu's total
3550 * time to each zone using the processor set. This tracks the maximum
3551 * amount of cpu time that a zone could have used.
3553 static void
3554 zsd_refresh_cpu_stats(zsd_ctl_t *ctl, boolean_t init)
3556 zsd_system_t *sys;
3557 zsd_zone_t *zone;
3558 zsd_pset_usage_t *usage;
3559 zsd_cpu_t *cpu;
3560 zsd_cpu_t *cpu_next;
3561 zsd_pset_t *pset;
3562 timestruc_t ts;
3563 uint64_t hrtime;
3564 timestruc_t delta;
3566 /* Update the per-cpu kstat data */
3567 cpu_next = list_head(&ctl->zsctl_cpus);
3568 while (cpu_next != NULL) {
3569 cpu = cpu_next;
3570 cpu_next = list_next(&ctl->zsctl_cpus, cpu);
3571 zsd_update_cpu_stats(ctl, cpu);
3573 /* Update the elapsed real time */
3574 hrtime = gethrtime();
3575 if (init) {
3576 /* first time around, store hrtime for future comparision */
3577 ctl->zsctl_hrtime = hrtime;
3578 ctl->zsctl_hrtime_prev = hrtime;
3580 } else {
3581 /* Compute increase in hrtime since the most recent read */
3582 ctl->zsctl_hrtime_prev = ctl->zsctl_hrtime;
3583 ctl->zsctl_hrtime = hrtime;
3584 if ((hrtime = hrtime - ctl->zsctl_hrtime_prev) > 0)
3585 TIMESTRUC_ADD_NANOSEC(ctl->zsctl_hrtime_total, hrtime);
3588 /* On initialization, all psets have zero time */
3589 if (init)
3590 return;
3592 for (pset = list_head(&ctl->zsctl_psets); pset != NULL;
3593 pset = list_next(&ctl->zsctl_psets, pset)) {
3595 if (pset->zsp_active == B_FALSE) {
3596 zsd_warn(gettext("Internal error,inactive pset found"));
3597 continue;
3600 /* sum total used time for pset */
3601 ts.tv_sec = 0;
3602 ts.tv_nsec = 0;
3603 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_intr);
3604 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_kern);
3605 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_user);
3606 /* kernel time in pset is total time minus zone time */
3607 TIMESTRUC_DELTA(pset->zsp_usage_kern, ts,
3608 pset->zsp_usage_zones);
/* Clamp negative kernel time (can go negative from rounding) to zero. */
3609 if (pset->zsp_usage_kern.tv_sec < 0 ||
3610 pset->zsp_usage_kern.tv_nsec < 0) {
3611 pset->zsp_usage_kern.tv_sec = 0;
3612 pset->zsp_usage_kern.tv_nsec = 0;
3614 /* Total pset elapsed time is used time plus idle time */
3615 TIMESTRUC_ADD_TIMESTRUC(ts, pset->zsp_idle);
/* delta = pset time accrued since the previous sample. */
3617 TIMESTRUC_DELTA(delta, ts, pset->zsp_total_time);
3619 for (usage = list_head(&pset->zsp_usage_list); usage != NULL;
3620 usage = list_next(&pset->zsp_usage_list, usage)) {
3622 zone = usage->zsu_zone;
3623 if (usage->zsu_cpu_shares != ZS_LIMIT_NONE &&
3624 usage->zsu_cpu_shares != ZS_SHARES_UNLIMITED &&
3625 usage->zsu_cpu_shares != 0) {
3627 * Figure out how many nanoseconds of share time
3628 * to give to the zone
3630 hrtime = delta.tv_sec;
3631 hrtime *= NANOSEC;
3632 hrtime += delta.tv_nsec;
/* share time = elapsed pset nsec * (zone shares / pset shares) */
3633 hrtime *= usage->zsu_cpu_shares;
3634 hrtime /= pset->zsp_cpu_shares;
3635 TIMESTRUC_ADD_NANOSEC(zone->zsz_share_time,
3636 hrtime);
3638 /* Add pset time to each zone using pset */
3639 TIMESTRUC_ADD_TIMESTRUC(zone->zsz_pset_time, delta);
3641 zone->zsz_cpus_online += pset->zsp_online;
/* Remember this sample's total as the baseline for the next delta. */
3643 pset->zsp_total_time = ts;
3646 for (zone = list_head(&ctl->zsctl_zones); zone != NULL;
3647 zone = list_next(&ctl->zsctl_zones, zone)) {
3649 /* update cpu cap tracking if the zone has a cpu cap */
3650 if (zone->zsz_cpu_cap != ZS_LIMIT_NONE) {
3651 uint64_t elapsed;
3653 elapsed = ctl->zsctl_hrtime - ctl->zsctl_hrtime_prev;
/* entitlement for interval = elapsed * cap / 100; assumes the cap */
/* is expressed in percent-of-a-cpu units -- TODO confirm */
3654 elapsed *= zone->zsz_cpu_cap;
3655 elapsed = elapsed / 100;
3656 TIMESTRUC_ADD_NANOSEC(zone->zsz_cap_time, elapsed);
/* Repeat the pset aggregation at the whole-system level. */
3659 sys = ctl->zsctl_system;
3660 ts.tv_sec = 0;
3661 ts.tv_nsec = 0;
3662 TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_intr);
3663 TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_kern);
3664 TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_user);
3666 /* kernel time in pset is total time minus zone time */
3667 TIMESTRUC_DELTA(sys->zss_cpu_usage_kern, ts,
3668 sys->zss_cpu_usage_zones);
3669 if (sys->zss_cpu_usage_kern.tv_sec < 0 ||
3670 sys->zss_cpu_usage_kern.tv_nsec < 0) {
3671 sys->zss_cpu_usage_kern.tv_sec = 0;
3672 sys->zss_cpu_usage_kern.tv_nsec = 0;
3674 /* Total pset elapsed time is used time plus idle time */
3675 TIMESTRUC_ADD_TIMESTRUC(ts, sys->zss_idle);
3676 sys->zss_cpu_total_time = ts;
3680 * Saves current usage data to a cache that is read by libzonestat when
3681 * calling zs_usage_read().
3683 * All pointers in the cached data structure are set to NULL. When
3684 * libzonestat reads the cached data, it will set the pointers relative to
3685 * its address space.
3687 static void
3688 zsd_usage_cache_update(zsd_ctl_t *ctl)
3690 zs_usage_cache_t *cache;
3691 zs_usage_cache_t *old;
3692 zs_usage_t *usage;
3694 zs_system_t *sys;
3695 zsd_system_t *dsys;
3696 zs_zone_t *zone = NULL;
3697 zsd_zone_t *dzone;
3698 zs_pset_t *pset = NULL;
3699 zsd_pset_t *dpset;
3700 zs_pset_zone_t *pusage;
3701 zsd_pset_usage_t *dpusage;
3703 char *next;
3704 uint_t size, i, j;
/*
 * Everything is serialized into one contiguous allocation:
 * header, usage, system, then nzones zones, then for each pset the
 * pset followed by its per-zone usages.  "next" is the bump pointer.
 */
3706 size =
3707 sizeof (zs_usage_cache_t) +
3708 sizeof (zs_usage_t) +
3709 sizeof (zs_system_t) +
3710 sizeof (zs_zone_t) * ctl->zsctl_nzones +
3711 sizeof (zs_pset_t) * ctl->zsctl_npsets +
3712 sizeof (zs_pset_zone_t) * ctl->zsctl_npset_usages;
3714 cache = (zs_usage_cache_t *)malloc(size);
3715 if (cache == NULL) {
/* Best effort: keep serving the previous snapshot on allocation failure. */
3716 zsd_warn(gettext("Unable to allocate usage cache\n"));
3717 return;
3720 next = (char *)cache;
3721 cache->zsuc_size = size - sizeof (zs_usage_cache_t);
3722 next += sizeof (zs_usage_cache_t);
3724 /* LINTED */
3725 usage = cache->zsuc_usage = (zs_usage_t *)next;
3726 next += sizeof (zs_usage_t);
3727 usage->zsu_start = g_start;
3728 usage->zsu_hrstart = g_hrstart;
3729 usage->zsu_time = g_now;
3730 usage->zsu_hrtime = g_hrnow;
3731 usage->zsu_nzones = ctl->zsctl_nzones;
3732 usage->zsu_npsets = ctl->zsctl_npsets;
3733 usage->zsu_system = NULL;
/* Copy the flat system-wide counters from the daemon-side struct. */
3735 /* LINTED */
3736 sys = (zs_system_t *)next;
3737 next += sizeof (zs_system_t);
3738 dsys = ctl->zsctl_system;
3739 sys->zss_ram_total = dsys->zss_ram_total;
3740 sys->zss_ram_kern = dsys->zss_ram_kern;
3741 sys->zss_ram_zones = dsys->zss_ram_zones;
3742 sys->zss_locked_kern = dsys->zss_locked_kern;
3743 sys->zss_locked_zones = dsys->zss_locked_zones;
3744 sys->zss_vm_total = dsys->zss_vm_total;
3745 sys->zss_vm_kern = dsys->zss_vm_kern;
3746 sys->zss_vm_zones = dsys->zss_vm_zones;
3747 sys->zss_swap_total = dsys->zss_swap_total;
3748 sys->zss_swap_used = dsys->zss_swap_used;
3749 sys->zss_ncpus = dsys->zss_ncpus;
3750 sys->zss_ncpus_online = dsys->zss_ncpus_online;
3752 sys->zss_processes_max = dsys->zss_maxpid;
3753 sys->zss_lwps_max = dsys->zss_lwps_max;
3754 sys->zss_shm_max = dsys->zss_shm_max;
3755 sys->zss_shmids_max = dsys->zss_shmids_max;
3756 sys->zss_semids_max = dsys->zss_semids_max;
3757 sys->zss_msgids_max = dsys->zss_msgids_max;
3758 sys->zss_lofi_max = dsys->zss_lofi_max;
3760 sys->zss_processes = dsys->zss_processes;
3761 sys->zss_lwps = dsys->zss_lwps;
3762 sys->zss_shm = dsys->zss_shm;
3763 sys->zss_shmids = dsys->zss_shmids;
3764 sys->zss_semids = dsys->zss_semids;
3765 sys->zss_msgids = dsys->zss_msgids;
3766 sys->zss_lofi = dsys->zss_lofi;
3768 sys->zss_cpu_total_time = dsys->zss_cpu_total_time;
3769 sys->zss_cpu_usage_zones = dsys->zss_cpu_usage_zones;
3770 sys->zss_cpu_usage_kern = dsys->zss_cpu_usage_kern;
/* Serialize one zs_zone_t per tracked zone, in list order. */
3772 for (i = 0, dzone = list_head(&ctl->zsctl_zones);
3773 i < ctl->zsctl_nzones;
3774 i++, dzone = list_next(&ctl->zsctl_zones, dzone)) {
3775 /* LINTED */
3776 zone = (zs_zone_t *)next;
3777 next += sizeof (zs_zone_t);
3778 list_link_init(&zone->zsz_next);
3779 zone->zsz_system = NULL;
3781 (void) strlcpy(zone->zsz_name, dzone->zsz_name,
3782 sizeof (zone->zsz_name));
3783 (void) strlcpy(zone->zsz_pool, dzone->zsz_pool,
3784 sizeof (zone->zsz_pool));
3785 (void) strlcpy(zone->zsz_pset, dzone->zsz_pset,
3786 sizeof (zone->zsz_pset));
3787 zone->zsz_id = dzone->zsz_id;
3788 zone->zsz_cputype = dzone->zsz_cputype;
3789 zone->zsz_iptype = dzone->zsz_iptype;
3790 zone->zsz_start = dzone->zsz_start;
3791 zone->zsz_hrstart = dzone->zsz_hrstart;
3792 zone->zsz_scheds = dzone->zsz_scheds;
3793 zone->zsz_cpu_shares = dzone->zsz_cpu_shares;
3794 zone->zsz_cpu_cap = dzone->zsz_cpu_cap;
3795 zone->zsz_ram_cap = dzone->zsz_ram_cap;
3796 zone->zsz_vm_cap = dzone->zsz_vm_cap;
3797 zone->zsz_locked_cap = dzone->zsz_locked_cap;
3798 zone->zsz_cpu_usage = dzone->zsz_cpu_usage;
3799 zone->zsz_cpus_online = dzone->zsz_cpus_online;
3800 zone->zsz_pset_time = dzone->zsz_pset_time;
3801 zone->zsz_cap_time = dzone->zsz_cap_time;
3802 zone->zsz_share_time = dzone->zsz_share_time;
3803 zone->zsz_usage_ram = dzone->zsz_usage_ram;
3804 zone->zsz_usage_locked = dzone->zsz_usage_locked;
3805 zone->zsz_usage_vm = dzone->zsz_usage_vm;
3807 zone->zsz_processes_cap = dzone->zsz_processes_cap;
3808 zone->zsz_lwps_cap = dzone->zsz_lwps_cap;
3809 zone->zsz_shm_cap = dzone->zsz_shm_cap;
3810 zone->zsz_shmids_cap = dzone->zsz_shmids_cap;
3811 zone->zsz_semids_cap = dzone->zsz_semids_cap;
3812 zone->zsz_msgids_cap = dzone->zsz_msgids_cap;
3813 zone->zsz_lofi_cap = dzone->zsz_lofi_cap;
3815 zone->zsz_processes = dzone->zsz_processes;
3816 zone->zsz_lwps = dzone->zsz_lwps;
3817 zone->zsz_shm = dzone->zsz_shm;
3818 zone->zsz_shmids = dzone->zsz_shmids;
3819 zone->zsz_semids = dzone->zsz_semids;
3820 zone->zsz_msgids = dzone->zsz_msgids;
3821 zone->zsz_lofi = dzone->zsz_lofi;
/* Serialize each pset, immediately followed by its per-zone usages. */
3824 for (i = 0, dpset = list_head(&ctl->zsctl_psets);
3825 i < ctl->zsctl_npsets;
3826 i++, dpset = list_next(&ctl->zsctl_psets, dpset)) {
3827 /* LINTED */
3828 pset = (zs_pset_t *)next;
3829 next += sizeof (zs_pset_t);
3830 list_link_init(&pset->zsp_next);
3831 (void) strlcpy(pset->zsp_name, dpset->zsp_name,
3832 sizeof (pset->zsp_name));
3833 pset->zsp_id = dpset->zsp_id;
3834 pset->zsp_cputype = dpset->zsp_cputype;
3835 pset->zsp_start = dpset->zsp_start;
3836 pset->zsp_hrstart = dpset->zsp_hrstart;
3837 pset->zsp_online = dpset->zsp_online;
3838 pset->zsp_size = dpset->zsp_size;
3839 pset->zsp_min = dpset->zsp_min;
3840 pset->zsp_max = dpset->zsp_max;
3841 pset->zsp_importance = dpset->zsp_importance;
3842 pset->zsp_scheds = dpset->zsp_scheds;
3843 pset->zsp_cpu_shares = dpset->zsp_cpu_shares;
3844 pset->zsp_total_time = dpset->zsp_total_time;
3845 pset->zsp_usage_kern = dpset->zsp_usage_kern;
3846 pset->zsp_usage_zones = dpset->zsp_usage_zones;
3847 pset->zsp_nusage = dpset->zsp_nusage;
3848 /* Add pset usages for pset */
3849 for (j = 0, dpusage = list_head(&dpset->zsp_usage_list);
3850 j < dpset->zsp_nusage;
3851 j++, dpusage = list_next(&dpset->zsp_usage_list, dpusage)) {
3852 /* LINTED */
3853 pusage = (zs_pset_zone_t *)next;
3854 next += sizeof (zs_pset_zone_t);
3855 /* pointers are computed by client */
3856 pusage->zspz_pset = NULL;
3857 pusage->zspz_zone = NULL;
3858 list_link_init(&pusage->zspz_next);
3859 pusage->zspz_zoneid = dpusage->zsu_zone->zsz_id;
3860 pusage->zspz_start = dpusage->zsu_start;
3861 pusage->zspz_hrstart = dpusage->zsu_hrstart;
/* NOTE(review): duplicated assignment below -- harmless, but redundant. */
3862 pusage->zspz_hrstart = dpusage->zsu_hrstart;
3863 pusage->zspz_cpu_shares = dpusage->zsu_cpu_shares;
3864 pusage->zspz_scheds = dpusage->zsu_scheds;
3865 pusage->zspz_cpu_usage = dpusage->zsu_cpu_usage;
/*
 * Publish the new snapshot under the lock, dropping our reference on
 * the old one (freed when the last client holding it releases it).
 */
3869 /* Update the current cache pointer */
3870 (void) mutex_lock(&g_usage_cache_lock);
3871 old = g_usage_cache;
3872 cache->zsuc_ref = 1;
3873 cache->zsuc_gen = g_gen_next;
3874 usage->zsu_gen = g_gen_next;
3875 usage->zsu_size = size;
3876 g_usage_cache = cache;
3877 if (old != NULL) {
3878 old->zsuc_ref--;
3879 if (old->zsuc_ref == 0)
3880 free(old);
3882 g_gen_next++;
3883 /* Wake up any clients that are waiting for this calculation */
3884 if (g_usage_cache_kickers > 0) {
3885 (void) cond_broadcast(&g_usage_cache_wait);
3887 (void) mutex_unlock(&g_usage_cache_lock);
3890 static zs_usage_cache_t *
3891 zsd_usage_cache_hold_locked()
3893 zs_usage_cache_t *ret;
3895 ret = g_usage_cache;
3896 ret->zsuc_ref++;
3897 return (ret);
3900 void
3901 zsd_usage_cache_rele(zs_usage_cache_t *cache)
3903 (void) mutex_lock(&g_usage_cache_lock);
3904 cache->zsuc_ref--;
3905 if (cache->zsuc_ref == 0)
3906 free(cache);
3907 (void) mutex_unlock(&g_usage_cache_lock);
3910 /* Close the handles held by zsd_open() */
3911 void
3912 zsd_close(zsd_ctl_t *ctl)
3914 zsd_zone_t *zone;
3915 zsd_pset_t *pset;
3916 zsd_pset_usage_t *usage;
3917 zsd_cpu_t *cpu;
3918 int id;
3920 if (ctl->zsctl_kstat_ctl) {
3921 (void) kstat_close(ctl->zsctl_kstat_ctl);
3922 ctl->zsctl_kstat_ctl = NULL;
3924 if (ctl->zsctl_proc_open) {
3925 (void) ea_close(&ctl->zsctl_proc_eaf);
3926 ctl->zsctl_proc_open = 0;
3927 ctl->zsctl_proc_fd = -1;
3929 if (ctl->zsctl_pool_conf) {
3930 if (ctl->zsctl_pool_status == POOL_ENABLED)
3931 (void) pool_conf_close(ctl->zsctl_pool_conf);
3932 ctl->zsctl_pool_status = POOL_DISABLED;
3935 while ((zone = list_head(&ctl->zsctl_zones)) != NULL) {
3936 list_remove(&ctl->zsctl_zones, zone);
3937 free(zone);
3938 ctl->zsctl_nzones--;
3941 while ((pset = list_head(&ctl->zsctl_psets)) != NULL) {
3942 while ((usage = list_head(&pset->zsp_usage_list))
3943 != NULL) {
3944 list_remove(&pset->zsp_usage_list, usage);
3945 ctl->zsctl_npset_usages--;
3946 free(usage);
3948 list_remove(&ctl->zsctl_psets, pset);
3949 free(pset);
3950 ctl->zsctl_npsets--;
3953 /* Release all cpus being tracked */
3954 while (cpu = list_head(&ctl->zsctl_cpus)) {
3955 list_remove(&ctl->zsctl_cpus, cpu);
3956 id = cpu->zsc_id;
3957 bzero(cpu, sizeof (zsd_cpu_t));
3958 cpu->zsc_id = id;
3959 cpu->zsc_allocated = B_FALSE;
3960 cpu->zsc_psetid = ZS_PSET_ERROR;
3961 cpu->zsc_psetid_prev = ZS_PSET_ERROR;
3964 assert(ctl->zsctl_npset_usages == 0);
3965 assert(ctl->zsctl_npsets == 0);
3966 assert(ctl->zsctl_nzones == 0);
3967 (void) zsd_disable_cpu_stats();
3972 * Update the utilization data for all zones and processor sets.
3974 static int
3975 zsd_read(zsd_ctl_t *ctl, boolean_t init, boolean_t do_memory)
3977 (void) kstat_chain_update(ctl->zsctl_kstat_ctl);
3978 (void) gettimeofday(&(ctl->zsctl_timeofday), NULL);
3980 zsd_refresh_system(ctl);
3983 * Memory calculation is expensive. Only update it on sample
3984 * intervals.
3986 if (do_memory == B_TRUE)
3987 zsd_refresh_memory(ctl, init);
3988 zsd_refresh_zones(ctl);
3989 zsd_refresh_psets(ctl);
3990 zsd_refresh_procs(ctl, init);
3991 zsd_refresh_cpu_stats(ctl, init);
3994 * Delete objects that no longer exist.
3995 * Pset usages must be deleted first as they point to zone and
3996 * pset objects.
3998 zsd_mark_pset_usages_end(ctl);
3999 zsd_mark_psets_end(ctl);
4000 zsd_mark_cpus_end(ctl);
4001 zsd_mark_zones_end(ctl);
4004 * Save results for clients.
4006 zsd_usage_cache_update(ctl);
4009 * Roll process accounting file.
4011 (void) zsd_roll_exacct();
4012 return (0);
4016 * Get the system rctl, which is the upper most limit
4018 static uint64_t
4019 zsd_get_system_rctl(char *name)
4021 rctlblk_t *rblk, *rblk_last;
4023 rblk = (rctlblk_t *)alloca(rctlblk_size());
4024 rblk_last = (rctlblk_t *)alloca(rctlblk_size());
4026 if (getrctl(name, NULL, rblk_last, RCTL_FIRST) != 0)
4027 return (ZS_LIMIT_NONE);
4029 while (getrctl(name, rblk_last, rblk, RCTL_NEXT) == 0)
4030 (void) bcopy(rblk, rblk_last, rctlblk_size());
4032 return (rctlblk_get_value(rblk_last));
4036 * Open any necessary subsystems for collecting utilization data,
4037 * allocate and initialize data structures, and get initial utilization.
4039 * Errors:
4040 * ENOMEM out of memory
4041 * EINVAL other error
4043 static zsd_ctl_t *
4044 zsd_open(zsd_ctl_t *ctl)
4046 zsd_system_t *system;
4048 char path[MAXPATHLEN];
4049 long pathmax;
4050 struct statvfs svfs;
4051 int ret;
4052 int i;
4053 size_t size;
4054 int err;
4056 if (ctl == NULL && (ctl = (zsd_ctl_t *)calloc(1,
4057 sizeof (zsd_ctl_t))) == NULL) {
4058 zsd_warn(gettext("Out of Memory"));
4059 errno = ENOMEM;
4060 goto err;
4062 ctl->zsctl_proc_fd = -1;
4064 /* open kstats */
4065 if (ctl->zsctl_kstat_ctl == NULL &&
4066 (ctl->zsctl_kstat_ctl = kstat_open()) == NULL) {
4067 err = errno;
4068 zsd_warn(gettext("Unable to open kstats"));
4069 errno = err;
4070 if (errno != ENOMEM)
4071 errno = EAGAIN;
4072 goto err;
4076 * These are set when the accounting file is opened by
4077 * zsd_update_procs()
4079 ctl->zsctl_proc_fd = -1;
4080 ctl->zsctl_proc_fd_next = -1;
4081 ctl->zsctl_proc_open = 0;
4082 ctl->zsctl_proc_open_next = 0;
4084 check_exacct:
4085 (void) zsd_enable_cpu_stats();
4087 /* Create structures to track usage */
4088 if (ctl->zsctl_system == NULL && (ctl->zsctl_system = (zsd_system_t *)
4089 calloc(1, sizeof (zsd_system_t))) == NULL) {
4090 ret = -1;
4091 zsd_warn(gettext("Out of Memory"));
4092 errno = ENOMEM;
4093 goto err;
4095 system = ctl->zsctl_system;
4096 /* get the kernel bitness to know structure layout for getvmusage */
4097 ret = sysinfo(SI_ARCHITECTURE_64, path, sizeof (path));
4098 if (ret < 0)
4099 ctl->zsctl_kern_bits = 32;
4100 else
4101 ctl->zsctl_kern_bits = 64;
4102 ctl->zsctl_pagesize = sysconf(_SC_PAGESIZE);
4104 size = sysconf(_SC_CPUID_MAX);
4105 ctl->zsctl_maxcpuid = size;
4106 if (ctl->zsctl_cpu_array == NULL && (ctl->zsctl_cpu_array =
4107 (zsd_cpu_t *)calloc(size + 1, sizeof (zsd_cpu_t))) == NULL) {
4108 zsd_warn(gettext("Out of Memory"));
4109 errno = ENOMEM;
4110 goto err;
4112 for (i = 0; i <= ctl->zsctl_maxcpuid; i++) {
4113 ctl->zsctl_cpu_array[i].zsc_id = i;
4114 ctl->zsctl_cpu_array[i].zsc_allocated = B_FALSE;
4115 ctl->zsctl_cpu_array[i].zsc_psetid = ZS_PSET_ERROR;
4116 ctl->zsctl_cpu_array[i].zsc_psetid_prev = ZS_PSET_ERROR;
4118 if (statvfs("/proc", &svfs) != 0 ||
4119 strcmp("/proc", svfs.f_fstr) != 0) {
4120 zsd_warn(gettext("/proc not a procfs filesystem"));
4121 errno = EINVAL;
4122 goto err;
4125 size = sysconf(_SC_MAXPID) + 1;
4126 ctl->zsctl_maxproc = size;
4127 if (ctl->zsctl_proc_array == NULL &&
4128 (ctl->zsctl_proc_array = (zsd_proc_t *)calloc(size,
4129 sizeof (zsd_proc_t))) == NULL) {
4130 zsd_warn(gettext("Out of Memory"));
4131 errno = ENOMEM;
4132 goto err;
4134 for (i = 0; i <= ctl->zsctl_maxproc; i++) {
4135 list_link_init(&(ctl->zsctl_proc_array[i].zspr_next));
4136 ctl->zsctl_proc_array[i].zspr_psetid = ZS_PSET_ERROR;
4137 ctl->zsctl_proc_array[i].zspr_zoneid = -1;
4138 ctl->zsctl_proc_array[i].zspr_usage.tv_sec = 0;
4139 ctl->zsctl_proc_array[i].zspr_usage.tv_nsec = 0;
4140 ctl->zsctl_proc_array[i].zspr_ppid = -1;
4143 list_create(&ctl->zsctl_zones, sizeof (zsd_zone_t),
4144 offsetof(zsd_zone_t, zsz_next));
4146 list_create(&ctl->zsctl_psets, sizeof (zsd_pset_t),
4147 offsetof(zsd_pset_t, zsp_next));
4149 list_create(&ctl->zsctl_cpus, sizeof (zsd_cpu_t),
4150 offsetof(zsd_cpu_t, zsc_next));
4152 pathmax = pathconf("/proc", _PC_NAME_MAX);
4153 if (pathmax < 0) {
4154 zsd_warn(gettext("Unable to determine max path of /proc"));
4155 errno = EINVAL;
4156 goto err;
4158 size = sizeof (struct dirent) + pathmax + 1;
4160 ctl->zsctl_procfs_dent_size = size;
4161 if (ctl->zsctl_procfs_dent == NULL &&
4162 (ctl->zsctl_procfs_dent = (struct dirent *)calloc(1, size))
4163 == NULL) {
4164 zsd_warn(gettext("Out of Memory"));
4165 errno = ENOMEM;
4166 goto err;
4169 if (ctl->zsctl_pool_conf == NULL &&
4170 (ctl->zsctl_pool_conf = pool_conf_alloc()) == NULL) {
4171 zsd_warn(gettext("Out of Memory"));
4172 errno = ENOMEM;
4173 goto err;
4175 ctl->zsctl_pool_status = POOL_DISABLED;
4176 ctl->zsctl_pool_changed = 0;
4178 if (ctl->zsctl_pool_vals[0] == NULL &&
4179 (ctl->zsctl_pool_vals[0] = pool_value_alloc()) == NULL) {
4180 zsd_warn(gettext("Out of Memory"));
4181 errno = ENOMEM;
4182 goto err;
4184 if (ctl->zsctl_pool_vals[1] == NULL &&
4185 (ctl->zsctl_pool_vals[1] = pool_value_alloc()) == NULL) {
4186 zsd_warn(gettext("Out of Memory"));
4187 errno = ENOMEM;
4188 goto err;
4190 ctl->zsctl_pool_vals[2] = NULL;
4193 * get system limits
4195 system->zss_maxpid = size = sysconf(_SC_MAXPID);
4196 system->zss_processes_max = zsd_get_system_rctl("zone.max-processes");
4197 system->zss_lwps_max = zsd_get_system_rctl("zone.max-lwps");
4198 system->zss_shm_max = zsd_get_system_rctl("zone.max-shm-memory");
4199 system->zss_shmids_max = zsd_get_system_rctl("zone.max-shm-ids");
4200 system->zss_semids_max = zsd_get_system_rctl("zone.max-sem-ids");
4201 system->zss_msgids_max = zsd_get_system_rctl("zone.max-msg-ids");
4202 system->zss_lofi_max = zsd_get_system_rctl("zone.max-lofi");
4204 g_gen_next = 1;
4206 if (zsd_read(ctl, B_TRUE, B_FALSE) != 0)
4207 zsd_warn(gettext("Reading zone statistics failed"));
4209 return (ctl);
4210 err:
4211 if (ctl)
4212 zsd_close(ctl);
4214 return (NULL);
4217 /* Copy utilization data to buffer, filtering data if non-global zone. */
4218 static void
4219 zsd_usage_filter(zoneid_t zid, zs_usage_cache_t *cache, zs_usage_t *usage,
4220 boolean_t is_gz)
4222 zs_usage_t *cusage;
4223 zs_system_t *sys, *csys;
4224 zs_zone_t *zone, *czone;
4225 zs_pset_t *pset, *cpset;
4226 zs_pset_zone_t *pz, *cpz, *foundpz;
/*
 * Two parallel cursors walk the serialized snapshot: "csize"/"c*"
 * track the cached (source) buffer, "size"/unprefixed track the
 * output buffer being written for the client.
 */
4227 size_t size = 0, csize = 0;
4228 char *start, *cstart;
4229 int i, j;
4230 timestruc_t delta;
4232 /* Privileged users in the global zone get everything */
4233 if (is_gz) {
4234 cusage = cache->zsuc_usage;
4235 (void) bcopy(cusage, usage, cusage->zsu_size);
4236 return;
4239 /* Zones just get their own usage */
4240 cusage = cache->zsuc_usage;
4242 start = (char *)usage;
4243 cstart = (char *)cusage;
4244 size += sizeof (zs_usage_t);
4245 csize += sizeof (zs_usage_t);
4247 usage->zsu_start = cusage->zsu_start;
4248 usage->zsu_hrstart = cusage->zsu_hrstart;
4249 usage->zsu_time = cusage->zsu_time;
4250 usage->zsu_hrtime = cusage->zsu_hrtime;
4251 usage->zsu_gen = cusage->zsu_gen;
4252 usage->zsu_nzones = 1;
4253 usage->zsu_npsets = 0;
4255 /* LINTED */
4256 sys = (zs_system_t *)(start + size);
4257 /* LINTED */
4258 csys = (zs_system_t *)(cstart + csize);
4259 size += sizeof (zs_system_t);
4260 csize += sizeof (zs_system_t);
4262 /* Save system limits but not usage */
4263 *sys = *csys;
4264 sys->zss_ncpus = 0;
4265 sys->zss_ncpus_online = 0;
4267 /* LINTED */
4268 zone = (zs_zone_t *)(start + size);
4269 /* LINTED */
4270 czone = (zs_zone_t *)(cstart + csize);
4271 /* Find the matching zone */
4272 for (i = 0; i < cusage->zsu_nzones; i++) {
4273 if (czone->zsz_id == zid) {
4274 *zone = *czone;
4275 size += sizeof (zs_zone_t);
4277 csize += sizeof (zs_zone_t);
4278 /* LINTED */
4279 czone = (zs_zone_t *)(cstart + csize);
/*
 * Collapse system-wide aggregates so the caller only sees its own
 * zone's contribution; everything else is re-attributed to "kern".
 */
4281 sys->zss_ram_kern += (sys->zss_ram_zones - zone->zsz_usage_ram);
4282 sys->zss_ram_zones = zone->zsz_usage_ram;
4284 sys->zss_vm_kern += (sys->zss_vm_zones - zone->zsz_usage_vm);
4285 sys->zss_vm_zones = zone->zsz_usage_vm;
4287 sys->zss_locked_kern += (sys->zss_locked_zones -
4288 zone->zsz_usage_locked);
4289 sys->zss_locked_zones = zone->zsz_usage_locked;
4291 TIMESTRUC_DELTA(delta, sys->zss_cpu_usage_zones, zone->zsz_cpu_usage);
4292 TIMESTRUC_ADD_TIMESTRUC(sys->zss_cpu_usage_kern, delta);
4293 sys->zss_cpu_usage_zones = zone->zsz_cpu_usage;
/* Emit only the psets this zone uses, with only this zone's usage. */
4295 /* LINTED */
4296 pset = (zs_pset_t *)(start + size);
4297 /* LINTED */
4298 cpset = (zs_pset_t *)(cstart + csize);
4299 for (i = 0; i < cusage->zsu_npsets; i++) {
4300 csize += sizeof (zs_pset_t);
4301 /* LINTED */
4302 cpz = (zs_pset_zone_t *)(csize + cstart);
4303 foundpz = NULL;
4304 for (j = 0; j < cpset->zsp_nusage; j++) {
4305 if (cpz->zspz_zoneid == zid)
4306 foundpz = cpz;
4308 csize += sizeof (zs_pset_zone_t);
4309 /* LINTED */
4310 cpz = (zs_pset_zone_t *)(csize + cstart);
4312 if (foundpz != NULL) {
4313 size += sizeof (zs_pset_t);
4314 /* LINTED */
4315 pz = (zs_pset_zone_t *)(start + size);
4316 size += sizeof (zs_pset_zone_t);
/*
 * NOTE(review): "pset" does not appear to be re-derived between
 * emitted psets in this view; some lines were dropped during
 * extraction -- verify against the full source.
 */
4318 *pset = *cpset;
4319 *pz = *foundpz;
4321 TIMESTRUC_DELTA(delta, pset->zsp_usage_zones,
4322 pz->zspz_cpu_usage);
4323 TIMESTRUC_ADD_TIMESTRUC(pset->zsp_usage_kern, delta);
4324 pset->zsp_usage_zones = pz->zspz_cpu_usage;
4325 pset->zsp_nusage = 1;
4326 usage->zsu_npsets++;
4327 sys->zss_ncpus += pset->zsp_size;
4328 sys->zss_ncpus_online += pset->zsp_online;
4330 /* LINTED */
4331 cpset = (zs_pset_t *)(cstart + csize);
/* Record the final (filtered) size of the client's buffer. */
4333 usage->zsu_size = size;
4337 * Respond to new connections from libzonestat.so. Also respond to zoneadmd,
4338 * which reports new zones.
4340 /* ARGSUSED */
4341 static void
4342 zsd_server(void *cookie, char *argp, size_t arg_size,
4343 door_desc_t *dp, uint_t n_desc)
4345 int *args, cmd;
4346 door_desc_t door;
4347 ucred_t *ucred;
4348 const priv_set_t *eset;
/* Door was unreferenced: nothing to serve, retire this thread. */
4350 if (argp == DOOR_UNREF_DATA) {
4351 (void) door_return(NULL, 0, NULL, 0);
4352 thr_exit(NULL);
/* Protocol: exactly two ints -- [0] = command, [1] = arg/status. */
4355 if (arg_size != sizeof (cmd) * 2) {
4356 (void) door_return(NULL, 0, NULL, 0);
4357 thr_exit(NULL);
4360 /* LINTED */
4361 args = (int *)argp;
4362 cmd = args[0];
4364 /* If connection, return door to stat server */
4365 if (cmd == ZSD_CMD_CONNECT) {
4367 /* Verify client compilation version */
4368 if (args[1] != ZS_VERSION) {
4369 args[1] = ZSD_STATUS_VERSION_MISMATCH;
4370 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4371 thr_exit(NULL);
4373 ucred = alloca(ucred_size());
4374 /* Verify client permission */
4375 if (door_ucred(&ucred) != 0) {
4376 args[1] = ZSD_STATUS_INTERNAL_ERROR;
4377 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4378 thr_exit(NULL);
/* Client must hold PRIV_PROC_INFO to see usage data. */
4381 eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4382 if (eset == NULL) {
4383 args[1] = ZSD_STATUS_INTERNAL_ERROR;
4384 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4385 thr_exit(NULL);
4387 if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4388 args[1] = ZSD_STATUS_PERMISSION;
4389 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4390 thr_exit(NULL);
4393 /* Return stat server door */
4394 args[1] = ZSD_STATUS_OK;
4395 door.d_attributes = DOOR_DESCRIPTOR;
4396 door.d_data.d_desc.d_descriptor = g_stat_door;
4397 (void) door_return(argp, sizeof (cmd) * 2, &door, 1);
4398 thr_exit(NULL);
4401 /* Respond to zoneadmd informing zonestatd of a new zone */
4402 if (cmd == ZSD_CMD_NEW_ZONE) {
4403 zsd_fattach_zone(args[1], g_server_door, B_FALSE);
4404 (void) door_return(NULL, 0, NULL, 0);
4405 thr_exit(NULL);
/* Unknown command. */
4408 args[1] = ZSD_STATUS_INTERNAL_ERROR;
4409 (void) door_return(argp, sizeof (cmd) * 2, NULL, 0);
4410 thr_exit(NULL);
4414 * Respond to libzonestat.so clients with the current utlilzation data.
4416 /* ARGSUSED */
4417 static void
4418 zsd_stat_server(void *cookie, char *argp, size_t arg_size,
4419 door_desc_t *dp, uint_t n_desc)
4421 uint64_t *args, cmd;
4422 zs_usage_cache_t *cache;
4423 int ret;
4424 char *rvalp;
4425 size_t rvals;
4426 zs_usage_t *usage;
4427 ucred_t *ucred;
4428 zoneid_t zoneid;
4429 const priv_set_t *eset;
4430 boolean_t is_gz = B_FALSE;
4432 /* Tell stat thread there are no more clients */
4433 if (argp == DOOR_UNREF_DATA) {
4434 (void) mutex_lock(&g_usage_cache_lock);
4435 g_hasclient = B_FALSE;
4436 (void) cond_signal(&g_usage_cache_kick);
4437 (void) mutex_unlock(&g_usage_cache_lock);
4438 (void) door_return(NULL, 0, NULL, 0);
4439 thr_exit(NULL);
/* Protocol: exactly two uint64s -- [0] = command, [1] unused here. */
4441 if (arg_size != sizeof (cmd) * 2) {
4442 (void) door_return(NULL, 0, NULL, 0);
4443 thr_exit(NULL);
4445 /* LINTED */
4446 args = (uint64_t *)argp;
4447 cmd = args[0];
4448 if (cmd != ZSD_CMD_READ) {
4449 (void) door_return(NULL, 0, NULL, 0);
4450 thr_exit(NULL);
/* Identify the caller: zone id decides filtering, privs gate access. */
4452 ucred = alloca(ucred_size());
4453 if (door_ucred(&ucred) != 0) {
4454 (void) door_return(NULL, 0, NULL, 0);
4455 thr_exit(NULL);
4457 zoneid = ucred_getzoneid(ucred);
4459 if (zoneid == GLOBAL_ZONEID)
4460 is_gz = B_TRUE;
4462 eset = ucred_getprivset(ucred, PRIV_EFFECTIVE);
4463 if (eset == NULL) {
4464 (void) door_return(NULL, 0, NULL, 0);
4465 thr_exit(NULL);
4467 if (!priv_ismember(eset, PRIV_PROC_INFO)) {
4468 (void) door_return(NULL, 0, NULL, 0);
4469 thr_exit(NULL);
4471 (void) mutex_lock(&g_usage_cache_lock);
4472 g_hasclient = B_TRUE;
4475 * Force a new cpu calculation for client. This will force a
4476 * new memory calculation if the memory data is older than the
4477 * sample period.
4479 g_usage_cache_kickers++;
4480 (void) cond_signal(&g_usage_cache_kick);
/* Block until the stat thread publishes a fresh snapshot. */
4481 ret = cond_wait(&g_usage_cache_wait, &g_usage_cache_lock);
4482 g_usage_cache_kickers--;
4483 if (ret != 0 && errno == EINTR) {
4484 (void) mutex_unlock(&g_usage_cache_lock);
4485 zsd_warn(gettext(
4486 "Interrupted before writing usage size to client\n"));
4487 (void) door_return(NULL, 0, NULL, 0);
4488 thr_exit(NULL);
/* Hold a reference so the snapshot survives after dropping the lock. */
4490 cache = zsd_usage_cache_hold_locked();
4491 if (cache == NULL) {
4492 zsd_warn(gettext("Usage cache empty.\n"));
4493 (void) door_return(NULL, 0, NULL, 0);
4494 thr_exit(NULL);
4496 (void) mutex_unlock(&g_usage_cache_lock);
4498 /* Copy current usage data to stack to send to client */
4499 usage = (zs_usage_t *)alloca(cache->zsuc_size);
4501 /* Filter out results if caller is non-global zone */
4502 zsd_usage_filter(zoneid, cache, usage, is_gz);
4504 rvalp = (void *)usage;
4505 rvals = usage->zsu_size;
4506 zsd_usage_cache_rele(cache);
4508 (void) door_return(rvalp, rvals, NULL, 0);
4509 thr_exit(NULL);
/* Shutdown flag, set from the signal handler and polled by the threads. */
4512 static volatile boolean_t g_quit;
4514 /* ARGSUSED */
4515 static void
/* Signal handler: request a clean shutdown (async-signal-safe: one store). */
4516 zonestat_quithandler(int sig)
4518 g_quit = B_TRUE;
4522 * The stat thread generates new utilization data when clients request
4523 * it. It also manages opening and closing the subsystems used to gather
4524 * data depending on if clients exist.
4526 /* ARGSUSED */
4527 void *
4528 stat_thread(void *arg)
4530 time_t start;
4531 time_t now;
4532 time_t next_memory;
4533 boolean_t do_memory;
4534 boolean_t do_read;
4535 boolean_t do_close;
4537 start = time(NULL);
4538 if (start < 0) {
4539 if (g_quit == B_TRUE)
4540 goto quit;
4541 zsd_warn(gettext("Unable to fetch current time"));
4542 g_quit = B_TRUE;
4543 goto quit;
/* next_memory: earliest wall-clock time the memory scan may rerun. */
4546 next_memory = start;
4547 while (g_quit == B_FALSE) {
4548 for (;;) {
4550 * These are used to decide if the most recent memory
4551 * calculation was within a sample interval,
4552 * and weather or not the usage collection needs to
4553 * be opened or closed.
4555 do_memory = B_FALSE;
4556 do_read = B_FALSE;
4557 do_close = B_FALSE;
4560 * If all clients have gone, close usage collecting
4562 (void) mutex_lock(&g_usage_cache_lock);
4563 if (!g_hasclient && g_open == B_TRUE) {
4564 do_close = B_TRUE;
4565 (void) mutex_unlock(&g_usage_cache_lock);
4566 break;
4568 if (g_quit == B_TRUE) {
4569 (void) mutex_unlock(
4570 &g_usage_cache_lock);
4571 break;
4574 * Wait for a usage data request
4576 if (g_usage_cache_kickers == 0) {
4577 (void) cond_wait(&g_usage_cache_kick,
4578 &g_usage_cache_lock);
4580 now = time(NULL);
4581 if (now < 0) {
4582 if (g_quit == B_TRUE) {
4583 (void) mutex_unlock(
4584 &g_usage_cache_lock);
4585 goto quit;
4587 g_quit = B_TRUE;
4588 (void) mutex_unlock(&g_usage_cache_lock);
4589 zsd_warn(gettext(
4590 "Unable to fetch current time"));
4591 goto quit;
/* A waiting client triggers a read; memory only once per interval. */
4593 if (g_hasclient) {
4594 do_read = B_TRUE;
4595 if (now >= next_memory) {
4596 do_memory = B_TRUE;
4597 next_memory = now + g_interval;
4599 } else {
4600 do_close = B_TRUE;
4602 (void) mutex_unlock(&g_usage_cache_lock);
4603 if (do_read || do_close)
4604 break;
/* Record the sample timestamps published with the snapshot. */
4606 g_now = now;
4607 g_hrnow = gethrtime();
/* Lazily open the collection subsystems on the first client. */
4608 if (g_hasclient && g_open == B_FALSE) {
4609 g_start = g_now;
4610 g_hrstart = g_hrnow;
4611 g_ctl = zsd_open(g_ctl);
4612 if (g_ctl == NULL)
4613 zsd_warn(gettext(
4614 "Unable to open zone statistics"));
4615 else
4616 g_open = B_TRUE;
4618 if (do_read && g_ctl) {
4619 if (zsd_read(g_ctl, B_FALSE, do_memory) != 0) {
4620 zsd_warn(gettext(
4621 "Unable to read zone statistics"));
4622 g_quit = B_TRUE;
4623 return (NULL);
/* Close collection again if the last client disappeared meanwhile. */
4626 (void) mutex_lock(&g_usage_cache_lock);
4627 if (!g_hasclient && g_open == B_TRUE && g_ctl) {
4628 (void) mutex_unlock(&g_usage_cache_lock);
4629 zsd_close(g_ctl);
4630 g_open = B_FALSE;
4631 } else {
4632 (void) mutex_unlock(&g_usage_cache_lock);
4635 quit:
4636 if (g_open)
4637 zsd_close(g_ctl);
/* Nudge the main thread so it notices the shutdown. */
4639 (void) thr_kill(g_main, SIGINT);
4640 thr_exit(NULL);
4641 return (NULL);
4644 void
4645 zsd_set_fx()
4647 pcinfo_t pcinfo;
4648 pcparms_t pcparms;
4650 (void) strlcpy(pcinfo.pc_clname, "FX", sizeof (pcinfo.pc_clname));
4651 if (priocntl(0, 0, PC_GETCID, (caddr_t)&pcinfo) == -1) {
4652 zsd_warn(gettext("cannot get FX class parameters"));
4653 return;
4655 pcparms.pc_cid = pcinfo.pc_cid;
4656 ((fxparms_t *)pcparms.pc_clparms)->fx_upri = 60;
4657 ((fxparms_t *)pcparms.pc_clparms)->fx_uprilim = 60;
4658 ((fxparms_t *)pcparms.pc_clparms)->fx_tqsecs = 0;
4659 ((fxparms_t *)pcparms.pc_clparms)->fx_tqnsecs = FX_NOCHANGE;
4660 if (priocntl(P_PID, getpid(), PC_SETPARMS, (caddr_t)&pcparms) == -1)
4661 zsd_warn(gettext("cannot enter the FX class"));
/* Write end of the daemonize pipe; set in the child by daemonize_start() */
static int pipe_fd;

/*
 * Report readiness to the waiting parent: send one status byte down the
 * daemonize pipe and close our end.  The parent forwards this byte as
 * its exit status.
 */
static void
daemonize_ready(char status)
{
	char msg = status;

	(void) write(pipe_fd, &msg, sizeof (msg));
	(void) close(pipe_fd);
}
4676 static int
4677 daemonize_start(void)
4679 char data;
4680 int status;
4682 int filedes[2];
4683 pid_t pid;
4685 (void) close(0);
4686 (void) dup2(2, 1);
4688 if (pipe(filedes) < 0)
4689 return (-1);
4691 (void) fflush(NULL);
4693 if ((pid = fork1()) < 0)
4694 return (-1);
4696 if (pid != 0) {
4698 * parent
4700 struct sigaction act;
4702 act.sa_sigaction = SIG_DFL;
4703 (void) sigemptyset(&act.sa_mask);
4704 act.sa_flags = 0;
4706 (void) sigaction(SIGPIPE, &act, NULL); /* ignore SIGPIPE */
4708 (void) close(filedes[1]);
4709 if (read(filedes[0], &data, 1) == 1) {
4710 /* forward ready code via exit status */
4711 exit(data);
4713 status = -1;
4714 (void) wait4(pid, &status, 0, NULL);
4715 /* daemon process exited before becoming ready */
4716 if (WIFEXITED(status)) {
4717 /* assume daemon process printed useful message */
4718 exit(WEXITSTATUS(status));
4719 } else {
4720 zsd_warn(gettext("daemon process killed or died"));
4721 exit(1);
4726 * child
4728 pipe_fd = filedes[1];
4729 (void) close(filedes[0]);
4732 * generic Unix setup
4734 (void) setsid();
4735 (void) umask(0000);
4737 return (0);
4740 static void
4741 fattach_all_zones(boolean_t detach_only)
4743 zoneid_t *zids;
4744 uint_t nzids, nzids_last;
4745 int i;
4747 again:
4748 (void) zone_list(NULL, &nzids);
4749 nzids_last = nzids;
4750 zids = (zoneid_t *)malloc(sizeof (zoneid_t) * nzids_last);
4751 if (zids == NULL)
4752 zsd_error(gettext("Out of memory"));
4754 (void) zone_list(zids, &nzids);
4755 if (nzids > nzids_last) {
4756 free(zids);
4757 goto again;
4759 for (i = 0; i < nzids; i++)
4760 zsd_fattach_zone(zids[i], g_server_door, detach_only);
4762 free(zids);
/*
 * Daemon entry point.  Handles the -c (cleanup) option, reads the
 * sample interval from SMF, daemonizes, raises scheduling priority,
 * creates the server and statistics doors, attaches them in all zones,
 * starts the stat thread, then sleeps until a shutdown signal arrives.
 */
int
main(int argc, char *argv[])
{

	int arg;
	thread_t tid;
	scf_simple_prop_t *prop;
	uint64_t *intervalp;
	boolean_t opt_cleanup = B_FALSE;

	/* Remember the main thread so stat_thread can wake it at shutdown */
	g_main = thr_self();
	g_quit = B_FALSE;
	(void) signal(SIGINT, zonestat_quithandler);
	(void) signal(SIGTERM, zonestat_quithandler);
	(void) signal(SIGHUP, zonestat_quithandler);
/*	(void) sigignore(SIGCHLD); */
	(void) sigignore(SIGPIPE);

	if (getzoneid() != GLOBAL_ZONEID)
		zsd_error(gettext("Must be run from global zone only"));

	while ((arg = getopt(argc, argv, "c"))
	    != EOF) {
		switch (arg) {
		case 'c':
			opt_cleanup = B_TRUE;
			break;
		default:
			zsd_error(gettext("Invalid option"));
		}
	}

	/* -c: tear down cpu statistics state and exit immediately */
	if (opt_cleanup) {
		if (zsd_disable_cpu_stats() != 0)
			exit(1);
		else
			exit(0);
	}

	/* Get the configured sample interval */
	prop = scf_simple_prop_get(NULL, "svc:/system/zones-monitoring:default",
	    "config", "sample_interval");
	if (prop == NULL)
		zsd_error(gettext("Unable to fetch SMF property "
		    "\"config/sample_interval\""));

	if (scf_simple_prop_type(prop) != SCF_TYPE_COUNT)
		zsd_error(gettext("Malformed SMF property "
		    "\"config/sample_interval\". Must be of type \"count\""));

	intervalp = scf_simple_prop_next_count(prop);
	g_interval = *intervalp;
	if (g_interval == 0)
		zsd_error(gettext("Malformed SMF property "
		    "\"config/sample_interval\". Must be greater than zero"));

	scf_simple_prop_free(prop);

	if (daemonize_start() < 0)
		zsd_error(gettext("Unable to start daemon\n"));

	/* Run at high priority */
	zsd_set_fx();

	(void) mutex_init(&g_usage_cache_lock, USYNC_THREAD, NULL);
	(void) cond_init(&g_usage_cache_kick, USYNC_THREAD, NULL);
	(void) cond_init(&g_usage_cache_wait, USYNC_THREAD, NULL);

	/* Door for zonestat client requests */
	g_server_door = door_create(zsd_server, NULL,
	    DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
	if (g_server_door < 0)
		zsd_error(gettext("Unable to create server door\n"));

	/* Door for serving cached usage data; unref tracks client count */
	g_stat_door = door_create(zsd_stat_server, NULL, DOOR_UNREF_MULTI |
	    DOOR_REFUSE_DESC | DOOR_NO_CANCEL);
	if (g_stat_door < 0)
		zsd_error(gettext("Unable to create statistics door\n"));

	fattach_all_zones(B_FALSE);

	if (thr_create(NULL, 0, stat_thread, NULL, 0, &tid) != 0)
		zsd_error(gettext("Unable to create statistics thread\n"));

	/* Tell the parent of daemonize_start() we are up; it exits 0 */
	daemonize_ready(0);

	/* Wait for signal to quit */
	while (g_quit == B_FALSE)
		(void) pause();

	/* detach doors */
	fattach_all_zones(B_TRUE);

	(void) door_revoke(g_server_door);
	(void) door_revoke(g_stat_door);

	/* kick stat thread and wait for it to close the statistics */
	(void) mutex_lock(&g_usage_cache_lock);
	g_quit = B_TRUE;
	(void) cond_signal(&g_usage_cache_kick);
	(void) mutex_unlock(&g_usage_cache_lock);
	/* NOTE(review): no goto targets "end" in this file — confirm */
end:
	(void) thr_join(tid, NULL, NULL);
	return (0);
}