4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2016 by Delphix. All rights reserved.
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/thread.h>
30 #include <sys/cpuvar.h>
32 #include <sys/cmn_err.h>
33 #include <sys/policy.h>
34 #include <sys/group.h>
37 #include <sys/cpu_pm.h>
38 #include <sys/cap_util.h>
41 * Processor Groups: Hardware sharing relationship layer
43 * This file implements an extension to Processor Groups to capture
44 * hardware sharing relationships existing between logical CPUs. Examples of
45 * hardware sharing relationships include shared caches on some CMT
46 * processor architectures, or shared local memory controllers on NUMA
47 * based system architectures.
49 * The pghw_t structure represents the extended PG. The first member
50 * of the structure is the generic pg_t with the pghw specific members
51 * following. The generic pg_t *must* remain the first member of the
52 * structure as the code uses casting of structure references to access
53 * the generic pg_t structure elements.
55 * In addition to the generic CPU grouping, physical PGs have a hardware
56 * sharing relationship enumerated "type", and an instance id. The enumerated
57 * type is defined by the pghw_type_t enumeration, while the instance id
58 * uniquely identifies the sharing instance from among others of the same
59 * hardware sharing type.
61 * The physical PGs are organized into an overall hierarchy, and are tracked
62 * in a number of different per CPU, and per pghw_type_t type groups.
69 * || ============================
70 * ||\\-----------------------// \\ \\
71 * || | hwset (PGC_HW_CHIP) | ------------- -------------
72 * || | (group_t) | | pghw_t | | pghw_t |
73 * || ----------------------- | chip 0 | | chip 1 |
74 * || ------------- -------------
75 * || \\ \\ \\ \\ \\ \\ \\ \\
76 * || cpu cpu cpu cpu cpu cpu cpu cpu
78 * || ============================
79 * ||\\-----------------------// \\ \\
80 * || | hwset (PGC_HW_IPIPE)| ------------- -------------
81 * || | (group_t) | | pghw_t | | pghw_t |
82 * || ----------------------- | ipipe 0 | | ipipe 1 |
83 * || ------------- -------------
89 * The top level pg_hw is a group of "hwset" groups. Each hwset holds a group
90 * of physical PGs of the same hardware sharing type. Within each hwset, the
91 * PG's instance id uniquely identifies the grouping relationship among other
92 * groupings of the same sharing type. The instance id for a grouping is
93 * platform defined, and in some cases may be used by platform code as a handle
94 * to search for a particular relationship instance.
96 * Each physical PG (by virtue of the embedded pg_t) contains a group of CPUs
97 * that participate in the sharing relationship. Each CPU also has associated
98 * with it a grouping tracking the PGs in which the CPU belongs. This can be
99 * used to iterate over the various relationships in which the CPU participates
100 * (the CPU's chip, cache, lgroup, etc.).
102 * The hwsets are created dynamically as new hardware sharing relationship types
103 * are instantiated. They are never destroyed, as once a given relationship
104 * type appears in the system, it is quite likely that at least one instance of
105 * that relationship will always persist as long as the system is running.
108 static group_t
*pg_hw
; /* top level pg hw group */
115 kstat_named_t pg_class
;
116 kstat_named_t pg_ncpus
;
117 kstat_named_t pg_instance_id
;
119 kstat_named_t pg_policy
;
121 { "id", KSTAT_DATA_INT32
},
122 { "pg_class", KSTAT_DATA_STRING
},
123 { "ncpus", KSTAT_DATA_UINT32
},
124 { "instance_id", KSTAT_DATA_UINT32
},
125 { "hardware", KSTAT_DATA_STRING
},
126 { "policy", KSTAT_DATA_STRING
},
129 kmutex_t pghw_kstat_lock
;
132 * Capacity and Utilization PG kstats
134 * These kstats are updated one at a time, so we can have a single scratch space
139 * pg_id PG ID for PG described by this kstat
141 * pg_parent Parent PG ID. The value -1 means "no parent".
143 * pg_ncpus Number of CPUs within this PG
145 * pg_cpus String describing CPUs within this PG
147 * pg_relationship Name of sharing relationship for this PG
149 * pg_generation Generation value that increases whenever any CPU leaves
150 * or joins PG. Two kstat snapshots for the same
151 * CPU may only be compared if they have the same
154 * pg_hw_util Running value of PG utilization for the sharing
157 * pg_hw_util_time_running
158 * Total time spent collecting CU data. The time may be
159 * less than wall time if CU counters were stopped for
162 * pg_hw_util_time_stopped Total time the CU counters were stopped.
164 * pg_hw_util_rate Utilization rate, expressed in operations per second.
166 * pg_hw_util_rate_max Maximum observed value of utilization rate.
168 struct pghw_cu_kstat
{
170 kstat_named_t pg_parent_id
;
171 kstat_named_t pg_ncpus
;
172 kstat_named_t pg_generation
;
173 kstat_named_t pg_hw_util
;
174 kstat_named_t pg_hw_util_time_running
;
175 kstat_named_t pg_hw_util_time_stopped
;
176 kstat_named_t pg_hw_util_rate
;
177 kstat_named_t pg_hw_util_rate_max
;
178 kstat_named_t pg_cpus
;
179 kstat_named_t pg_relationship
;
181 { "pg_id", KSTAT_DATA_INT32
},
182 { "parent_pg_id", KSTAT_DATA_INT32
},
183 { "ncpus", KSTAT_DATA_UINT32
},
184 { "generation", KSTAT_DATA_UINT32
},
185 { "hw_util", KSTAT_DATA_UINT64
},
186 { "hw_util_time_running", KSTAT_DATA_UINT64
},
187 { "hw_util_time_stopped", KSTAT_DATA_UINT64
},
188 { "hw_util_rate", KSTAT_DATA_UINT64
},
189 { "hw_util_rate_max", KSTAT_DATA_UINT64
},
190 { "cpus", KSTAT_DATA_STRING
},
191 { "relationship", KSTAT_DATA_STRING
},
195 * Calculate the string size to represent NCPUS. Allow 5 digits for each CPU ID
196 * plus one space per CPU plus NUL byte in the end. This is only an estimate,
197 * since we try to compress CPU ranges as x-y. In the worst case the string
198 * representation of CPUs may be truncated.
200 #define CPUSTR_LEN(ncpus) ((ncpus) * 6)
203 * Maximum length of the string that represents list of CPUs
205 static int pg_cpulist_maxlen
= 0;
207 static void pghw_kstat_create(pghw_t
*);
208 static int pghw_kstat_update(kstat_t
*, int);
209 static int pghw_cu_kstat_update(kstat_t
*, int);
210 static int cpu2id(void *);
215 static group_t
*pghw_set_create(pghw_type_t
);
216 static void pghw_set_add(group_t
*, pghw_t
*);
217 static void pghw_set_remove(group_t
*, pghw_t
*);
219 static void pghw_cpulist_alloc(pghw_t
*);
220 static int cpu2id(void *);
221 static pgid_t
pghw_parent_id(pghw_t
*);
224 * Initialize the physical portion of a hardware PG
227 pghw_init(pghw_t
*pg
, cpu_t
*cp
, pghw_type_t hw
)
231 if ((hwset
= pghw_set_lookup(hw
)) == NULL
) {
233 * Haven't seen this hardware type yet
235 hwset
= pghw_set_create(hw
);
238 pghw_set_add(hwset
, pg
);
240 pg
->pghw_generation
= 0;
242 pg_plat_hw_instance_id(cp
, hw
);
243 pghw_kstat_create(pg
);
246 * Hardware sharing relationship specific initialization
248 switch (pg
->pghw_hw
) {
249 case PGHW_POW_ACTIVE
:
251 (pghw_handle_t
)cpupm_domain_init(cp
, CPUPM_DTYPE_ACTIVE
);
255 (pghw_handle_t
)cpupm_domain_init(cp
, CPUPM_DTYPE_IDLE
);
258 pg
->pghw_handle
= (pghw_handle_t
)NULL
;
263 * Teardown the physical portion of a physical PG
266 pghw_fini(pghw_t
*pg
)
272 hwset
= pghw_set_lookup(pg
->pghw_hw
);
273 ASSERT(hwset
!= NULL
);
275 pghw_set_remove(hwset
, pg
);
276 pg
->pghw_instance
= (id_t
)PGHW_INSTANCE_ANON
;
277 pg
->pghw_hw
= (pghw_type_t
)-1;
279 if (pg
->pghw_kstat
!= NULL
)
280 kstat_delete(pg
->pghw_kstat
);
285 * PG is removed from CMT hierarchy
288 pghw_cmt_fini(pghw_t
*pg
)
291 * Destroy string representation of CPUs
293 if (pg
->pghw_cpulist
!= NULL
) {
294 kmem_free(pg
->pghw_cpulist
,
295 pg
->pghw_cpulist_len
);
296 pg
->pghw_cpulist
= NULL
;
302 if (pg
->pghw_cu_kstat
!= NULL
) {
303 kstat_delete(pg
->pghw_cu_kstat
);
304 pg
->pghw_cu_kstat
= NULL
;
309 * Find an existing physical PG in which to place
310 * the given CPU for the specified hardware sharing
314 pghw_place_cpu(cpu_t
*cp
, pghw_type_t hw
)
318 if ((hwset
= pghw_set_lookup(hw
)) == NULL
) {
322 return ((pghw_t
*)pg_cpu_find_pg(cp
, hwset
));
326 * Find the pg representing the hw sharing relationship in which
330 pghw_find_pg(cpu_t
*cp
, pghw_type_t hw
)
336 while ((pg
= group_iterate(&cp
->cpu_pg
->pgs
, &i
)) != NULL
) {
337 if (pg
->pghw_hw
== hw
)
344 * Find the PG of the given hardware sharing relationship
345 * type with the given instance id
348 pghw_find_by_instance(id_t id
, pghw_type_t hw
)
354 set
= pghw_set_lookup(hw
);
359 while ((pg
= group_iterate(set
, &i
)) != NULL
) {
360 if (pg
->pghw_instance
== id
)
367 * CPUs physical ID cache creation / destruction
368 * The cache's elements are initialized to the CPU's id
371 pghw_physid_create(cpu_t
*cp
)
375 cp
->cpu_physid
= kmem_alloc(sizeof (cpu_physid_t
), KM_SLEEP
);
377 for (i
= 0; i
< (sizeof (cpu_physid_t
) / sizeof (id_t
)); i
++) {
378 ((id_t
*)cp
->cpu_physid
)[i
] = cp
->cpu_id
;
383 pghw_physid_destroy(cpu_t
*cp
)
385 if (cp
->cpu_physid
) {
386 kmem_free(cp
->cpu_physid
, sizeof (cpu_physid_t
));
387 cp
->cpu_physid
= NULL
;
392 * Create a new, empty hwset.
393 * This routine may block, and must not be called from any
394 * paused CPU context.
397 pghw_set_create(pghw_type_t hw
)
403 * Create the top level PG hw group if it doesn't already exist
404 * This is a "set" of hardware sets, that is ordered (and indexed)
405 * by the pghw_type_t enum.
408 pg_hw
= kmem_alloc(sizeof (group_t
), KM_SLEEP
);
410 group_expand(pg_hw
, (uint_t
)PGHW_NUM_COMPONENTS
);
414 * Create the new hwset
415 * Add it to the top level pg_hw group.
417 g
= kmem_alloc(sizeof (group_t
), KM_SLEEP
);
420 ret
= group_add_at(pg_hw
, g
, (uint_t
)hw
);
427 * Find the hwset associated with the given hardware sharing type
430 pghw_set_lookup(pghw_type_t hw
)
437 hwset
= GROUP_ACCESS(pg_hw
, (uint_t
)hw
);
442 * Add a PG to a hwset
445 pghw_set_add(group_t
*hwset
, pghw_t
*pg
)
447 (void) group_add(hwset
, pg
, GRP_RESIZE
);
451 * Remove a PG from a hwset
454 pghw_set_remove(group_t
*hwset
, pghw_t
*pg
)
458 result
= group_remove(hwset
, pg
, GRP_RESIZE
);
463 * Return a string name given a pg_hw sharing type
466 pghw_type_string(pghw_type_t hw
)
470 return ("Integer Pipeline");
474 return ("Floating Point Unit");
476 return ("Data Pipe to memory");
481 case PGHW_POW_ACTIVE
:
482 return ("CPU PM Active Power Domain");
484 return ("CPU PM Idle Power Domain");
491 * Create / Update routines for PG hw kstats
493 * It is the intention of these kstats to provide some level
494 * of informational / debugging observability into the types
495 * and nature of the system's detected hardware sharing relationships
498 pghw_kstat_create(pghw_t
*pg
)
500 char *sharing
= pghw_type_string(pg
->pghw_hw
);
501 char name
[KSTAT_STRLEN
+ 1];
504 * Canonify PG name to conform to kstat name rules
506 (void) strncpy(name
, pghw_type_string(pg
->pghw_hw
), KSTAT_STRLEN
+ 1);
507 strident_canon(name
, KSTAT_STRLEN
+ 1);
510 * Create a hardware performance kstat
512 if ((pg
->pghw_kstat
= kstat_create("pg", ((pg_t
*)pg
)->pg_id
,
515 sizeof (pghw_kstat
) / sizeof (kstat_named_t
),
516 KSTAT_FLAG_VIRTUAL
)) != NULL
) {
517 /* Class string, hw string, and policy string */
518 pg
->pghw_kstat
->ks_data_size
+= PG_CLASS_NAME_MAX
;
519 pg
->pghw_kstat
->ks_data_size
+= PGHW_KSTAT_STR_LEN_MAX
;
520 pg
->pghw_kstat
->ks_data_size
+= PGHW_KSTAT_STR_LEN_MAX
;
521 pg
->pghw_kstat
->ks_lock
= &pghw_kstat_lock
;
522 pg
->pghw_kstat
->ks_data
= &pghw_kstat
;
523 pg
->pghw_kstat
->ks_update
= pghw_kstat_update
;
524 pg
->pghw_kstat
->ks_private
= pg
;
525 kstat_install(pg
->pghw_kstat
);
528 if (pg_cpulist_maxlen
== 0)
529 pg_cpulist_maxlen
= CPUSTR_LEN(max_ncpus
);
532 * Create a physical pg kstat
534 if ((pg
->pghw_cu_kstat
= kstat_create("pg_hw_perf", ((pg_t
*)pg
)->pg_id
,
535 name
, "processor_group",
537 sizeof (pghw_cu_kstat
) / sizeof (kstat_named_t
),
538 KSTAT_FLAG_VIRTUAL
)) != NULL
) {
539 pg
->pghw_cu_kstat
->ks_lock
= &pghw_kstat_lock
;
540 pg
->pghw_cu_kstat
->ks_data
= &pghw_cu_kstat
;
541 pg
->pghw_cu_kstat
->ks_update
= pghw_cu_kstat_update
;
542 pg
->pghw_cu_kstat
->ks_private
= pg
;
543 pg
->pghw_cu_kstat
->ks_data_size
+= strlen(sharing
) + 1;
544 /* Allow space for CPU strings */
545 pg
->pghw_cu_kstat
->ks_data_size
+= PGHW_KSTAT_STR_LEN_MAX
;
546 pg
->pghw_cu_kstat
->ks_data_size
+= pg_cpulist_maxlen
;
547 kstat_install(pg
->pghw_cu_kstat
);
552 pghw_kstat_update(kstat_t
*ksp
, int rw
)
554 struct pghw_kstat
*pgsp
= &pghw_kstat
;
555 pghw_t
*pg
= ksp
->ks_private
;
557 if (rw
== KSTAT_WRITE
)
560 pgsp
->pg_id
.value
.ui32
= ((pg_t
*)pg
)->pg_id
;
561 pgsp
->pg_ncpus
.value
.ui32
= GROUP_SIZE(&((pg_t
*)pg
)->pg_cpus
);
562 pgsp
->pg_instance_id
.value
.ui32
= pg
->pghw_instance
;
563 kstat_named_setstr(&pgsp
->pg_class
, ((pg_t
*)pg
)->pg_class
->pgc_name
);
564 kstat_named_setstr(&pgsp
->pg_hw
, pghw_type_string(pg
->pghw_hw
));
565 kstat_named_setstr(&pgsp
->pg_policy
, pg_policy_name((pg_t
*)pg
));
570 pghw_cu_kstat_update(kstat_t
*ksp
, int rw
)
572 struct pghw_cu_kstat
*pgsp
= &pghw_cu_kstat
;
573 pghw_t
*pg
= ksp
->ks_private
;
574 pghw_util_t
*hw_util
= &pg
->pghw_stats
;
575 boolean_t has_cpc_privilege
;
577 if (rw
== KSTAT_WRITE
)
581 * Check whether the caller has priv_cpc_cpu privilege. If it doesn't,
582 * it will not get hardware utilization data.
585 has_cpc_privilege
= (secpolicy_cpc_cpu(crgetcred()) == 0);
587 pgsp
->pg_id
.value
.i32
= ((pg_t
*)pg
)->pg_id
;
588 pgsp
->pg_parent_id
.value
.i32
= (int)pghw_parent_id(pg
);
590 pgsp
->pg_ncpus
.value
.ui32
= GROUP_SIZE(&((pg_t
*)pg
)->pg_cpus
);
593 * Allocate memory for the string representing the list of CPUs in PG.
594 * This memory should persist past the call to pghw_cu_kstat_update()
595 * since the kstat snapshot routine will reference this memory.
597 pghw_cpulist_alloc(pg
);
599 if (pg
->pghw_kstat_gen
!= pg
->pghw_generation
) {
601 * PG kstat generation number is out of sync with PG's
602 * generation number. It means that some CPUs could have joined
603 * or left PG and it is not possible to compare the numbers
604 * obtained before and after the generation change.
606 * Reset the maximum utilization rate and start computing it
609 hw_util
->pghw_util
= 0;
610 hw_util
->pghw_rate_max
= 0;
611 pg
->pghw_kstat_gen
= pg
->pghw_generation
;
615 * We can't block on CPU lock because when PG is destroyed (under
616 * cpu_lock) it tries to delete this kstat and it will wait for us to
617 * complete which will never happen since we are waiting for cpu_lock to
618 * drop. Deadlocks are fun!
620 if (mutex_tryenter(&cpu_lock
)) {
621 if (pg
->pghw_cpulist
!= NULL
&&
622 *(pg
->pghw_cpulist
) == '\0') {
623 (void) group2intlist(&(((pg_t
*)pg
)->pg_cpus
),
624 pg
->pghw_cpulist
, pg
->pghw_cpulist_len
, cpu2id
);
627 if (has_cpc_privilege
)
630 mutex_exit(&cpu_lock
);
633 pgsp
->pg_generation
.value
.ui32
= pg
->pghw_kstat_gen
;
634 if (pg
->pghw_cpulist
!= NULL
)
635 kstat_named_setstr(&pgsp
->pg_cpus
, pg
->pghw_cpulist
);
637 kstat_named_setstr(&pgsp
->pg_cpus
, "");
639 kstat_named_setstr(&pgsp
->pg_relationship
,
640 pghw_type_string(pg
->pghw_hw
));
642 if (has_cpc_privilege
) {
643 pgsp
->pg_hw_util
.value
.ui64
= hw_util
->pghw_util
;
644 pgsp
->pg_hw_util_time_running
.value
.ui64
=
645 hw_util
->pghw_time_running
;
646 pgsp
->pg_hw_util_time_stopped
.value
.ui64
=
647 hw_util
->pghw_time_stopped
;
648 pgsp
->pg_hw_util_rate
.value
.ui64
= hw_util
->pghw_rate
;
649 pgsp
->pg_hw_util_rate_max
.value
.ui64
= hw_util
->pghw_rate_max
;
651 pgsp
->pg_hw_util
.value
.ui64
= 0;
652 pgsp
->pg_hw_util_time_running
.value
.ui64
= 0;
653 pgsp
->pg_hw_util_time_stopped
.value
.ui64
= 0;
654 pgsp
->pg_hw_util_rate
.value
.ui64
= 0;
655 pgsp
->pg_hw_util_rate_max
.value
.ui64
= 0;
662 * Update the string representation of CPUs in PG (pg->pghw_cpulist).
663 * The string representation is used for kstats.
665 * The string is allocated if it has not already been or if it is already
666 * allocated and PG has more CPUs now. If PG has smaller or equal number of
667 * CPUs, but the actual CPUs may have changed, the string is reset to the empty
668 * string causes the string representation to be recreated. The pghw_generation
669 * field is used to detect whether CPUs within the pg may have changed.
672 pghw_cpulist_alloc(pghw_t
*pg
)
674 uint_t ncpus
= GROUP_SIZE(&((pg_t
*)pg
)->pg_cpus
);
675 size_t len
= CPUSTR_LEN(ncpus
);
678 * If the pghw_cpulist string is already allocated we need to make sure
679 * that it has sufficient length. Also if the set of CPUs may have
680 * changed, we need to re-generate the string.
682 if (pg
->pghw_cpulist
!= NULL
&&
683 pg
->pghw_kstat_gen
!= pg
->pghw_generation
) {
684 if (len
<= pg
->pghw_cpulist_len
) {
686 * There is sufficient space in the pghw_cpulist for
687 * the new set of CPUs. Just clear the string to trigger
688 * re-generation of list of CPUs
690 *(pg
->pghw_cpulist
) = '\0';
693 * There is, potentially, insufficient space in
694 * pghw_cpulist, so reallocate the string.
696 ASSERT(strlen(pg
->pghw_cpulist
) < pg
->pghw_cpulist_len
);
697 kmem_free(pg
->pghw_cpulist
, pg
->pghw_cpulist_len
);
698 pg
->pghw_cpulist
= NULL
;
699 pg
->pghw_cpulist_len
= 0;
703 if (pg
->pghw_cpulist
== NULL
) {
705 * Allocate space to hold cpulist.
707 * Length can not be bigger that the maximum space we have
708 * allowed for the kstat buffer
710 if (len
> pg_cpulist_maxlen
)
711 len
= pg_cpulist_maxlen
;
713 pg
->pghw_cpulist
= kmem_zalloc(len
, KM_NOSLEEP
);
714 if (pg
->pghw_cpulist
!= NULL
)
715 pg
->pghw_cpulist_len
= len
;
723 cpu_t
*cp
= (cpu_t
*)v
;
731 * Return parent ID or -1 if there is no parent.
732 * All hardware PGs are currently also CMT PGs, but for safety we check the
733 * class matches cmt before we upcast the pghw pointer to pg_cmt_t.
736 pghw_parent_id(pghw_t
*pghw
)
738 pg_t
*pg
= (pg_t
*)pghw
;
739 pgid_t parent_id
= -1;
741 if (pg
!= NULL
&& strcmp(pg
->pg_class
->pgc_name
, "cmt") == 0) {
742 pg_cmt_t
*cmt
= (pg_cmt_t
*)pg
;
743 pg_t
*parent
= (pg_t
*)cmt
->cmt_parent
;
745 parent_id
= parent
->pg_id
;