kernel/os/cpu_pm.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #include <sys/cpu_pm.h>
  27 #include <sys/cmn_err.h>
  28 #include <sys/time.h>
  29 #include <sys/sdt.h>
  30
  31 /*
  32  * Solaris Event Based CPU Power Manager
  33  *
  34  * This file implements platform independent event based CPU power management.
  35  * When CPUs are configured into the system, the CMT scheduling subsystem will
  36  * query the platform to determine if the CPU belongs to any power management
  37  * domains. That is, sets of CPUs that share power management states.
  38  *
  39  * Active Power Management domains represent a group of CPUs across which the
  40  * Operating System can request speed changes (which may in turn result
  41  * in voltage changes). This allows the operating system to trade off
  42  * performance for power savings.
  43  *
  44  * Idle Power Management domains can enter power savings states when they are
  45  * unutilized. These states allow the Operating System to trade off power
  46  * for performance (in the form of latency to transition from the idle state
  47  * to an active one).
  48  *
  49  * For each active and idle power domain the CMT subsystem instantiates, a
  50  * cpupm_domain_t structure is created. As the dispatcher schedules threads
  51  * to run on the system's CPUs, it will also track the utilization of the
  52  * enumerated power domains. Significant changes in utilization will result
  53  * in the dispatcher sending the power manager events that relate to the
  54  * utilization of the power domain. The power manager receives the events,
  55  * and in the context of the policy objectives in force, may decide to request
  56  * the domain's power/performance state be changed.
  57  *
  58  * Under the "elastic" CPUPM policy, when the utilization rises, the CPU power
  59  * manager will request the CPUs in the domain run at their fastest (and most
  60  * power consuming) state. When the domain becomes idle (utilization at zero),
  61  * the power manager will request that the CPUs run at a speed that saves the
  62  * most power.
  63  *
  64  * The advantage of this scheme, is that the CPU power manager working with the
  65  * dispatcher can be extremely responsive to changes in utilization. Optimizing
  66  * for performance in the presence of utilization, and power savings in the
  67  * presence of idleness. Such close collaboration with the dispatcher has other
  68  * benefits that will play out in the form of more sophisticated power /
  69  * performance policy in the near future.
  70  *
  71  * Avoiding state thrashing in the presence of transient periods of utilization
  72  * and idleness while still being responsive to non-transient periods is key.
  73  * The power manager implements a "governor" that is used to throttle
  74  * state transitions when a significant amount of transient idle or transient
  75  * work is detected.
  76  *
  77  * Kernel background activity (e.g. taskq threads) is by far the most common
  78  * form of transient utilization. Ungoverned in the face of this utilization,
  79  * hundreds of state transitions per second would result on an idle system.
  80  *
  81  * Transient idleness is common when a thread briefly yields the CPU to
  82  * wait for an event elsewhere in the system. Where the idle period is short
  83  * enough, the overhead associated with making the state transition doesn't
  84  * justify the power savings.
  85  *
  86  * The following is the state machine for the governor implemented by
  87  * cpupm_utilization_event():
  88  *
  89  *         ----->---tw---->-----
  90  *        /                     \
  91  *      (I)-<-ti-<-     -<-ntw-<(W)
  92  *       |         \   /         |
  93  *       \          \ /          /
  94  *        >-nti/rm->(D)--->-tw->-
  95  * Key:
  96  *
  97  * States
  98  * - (D): Default (ungoverned)
  99  * - (W): Transient work governed
 100  * - (I): Transient idle governed
 101  * State Transitions
 102  * - tw: transient work
 103  * - ti: transient idleness
 104  * - ntw: non-transient work
 105  * - nti: non-transient idleness
 106  * - rm: thread remain event
 107  */
 108
 109 static cpupm_domain_t *cpupm_domains = NULL;
 110
 111 /*
 112  * Uninitialized state of CPU power management is disabled
 113  */
 114 cpupm_policy_t cpupm_policy = CPUPM_POLICY_DISABLED;
 115
 116 /*
 117  * Periods of utilization lasting less than this time interval are characterized
 118  * as transient. State changes associated with transient work are considered
 119  * to be mispredicted. That is, it's not worth raising and lower power states
 120  * where the utilization lasts for less than this interval.
 121  */
 122 hrtime_t cpupm_tw_predict_interval;
 123
 124 /*
 125  * Periods of idleness lasting less than this time interval are characterized
 126  * as transient. State changes associated with transient idle are considered
 127  * to be mispredicted. That is, it's not worth lowering and raising power
 128  * states where the idleness lasts for less than this interval.
 129  */
 130 hrtime_t cpupm_ti_predict_interval;
 131
 132 /*
 133  * Number of mispredictions after which future transitions will be governed.
 134  */
 135 int cpupm_mispredict_thresh = 4;
 136
 137 /*
 138  * Likewise, the number of mispredicted governed transitions after which the
 139  * governor will be removed.
 140  */
 141 int cpupm_mispredict_gov_thresh = 4;
 142
 143 /*
 144  * The transient work and transient idle prediction intervals are specified
 145  * here. Tuning them higher will result in the transient work, and transient
 146  * idle governors being used more aggresively, which limits the frequency of
 147  * state transitions at the expense of performance and power savings,
 148  * respectively. The intervals are specified in nanoseconds.
 149  */
 150 /*
 151  * 400 usec
 152  */
 153 #define CPUPM_DEFAULT_TI_INTERVAL       400000
 154 /*
 155  * 400 usec
 156  */
 157 #define CPUPM_DEFAULT_TW_INTERVAL       400000
 158
 159 hrtime_t cpupm_ti_gov_interval = CPUPM_DEFAULT_TI_INTERVAL;
 160 hrtime_t cpupm_tw_gov_interval = CPUPM_DEFAULT_TW_INTERVAL;
 161
 162
 163 static void     cpupm_governor_initialize(void);
 164 static void     cpupm_state_change_global(cpupm_dtype_t, cpupm_state_name_t);
 165
 166 cpupm_policy_t
 167 cpupm_get_policy(void)
 168 {
 169         return (cpupm_policy);
 170 }
 171
 172 int
 173 cpupm_set_policy(cpupm_policy_t new_policy)
 174 {
 175         static int      gov_init = 0;
 176         int             result = 0;
 177
 178         mutex_enter(&cpu_lock);
 179         if (new_policy == cpupm_policy) {
 180                 mutex_exit(&cpu_lock);
 181                 return (result);
 182         }
 183
 184         /*
 185          * Pausing CPUs causes a high priority thread to be scheduled
 186          * on all other CPUs (besides the current one). This locks out
 187          * other CPUs from making CPUPM state transitions.
 188          */
 189         switch (new_policy) {
 190         case CPUPM_POLICY_DISABLED:
 191                 pause_cpus(NULL, NULL);
 192                 cpupm_policy = CPUPM_POLICY_DISABLED;
 193                 start_cpus();
 194
 195                 result = cmt_pad_disable(PGHW_POW_ACTIVE);
 196
 197                 /*
 198                  * Once PAD has been enabled, it should always be possible
 199                  * to disable it.
 200                  */
 201                 ASSERT(result == 0);
 202
 203                 /*
 204                  * Bring all the active power domains to the maximum
 205                  * performance state.
 206                  */
 207                 cpupm_state_change_global(CPUPM_DTYPE_ACTIVE,
 208                     CPUPM_STATE_MAX_PERF);
 209
 210                 break;
 211         case CPUPM_POLICY_ELASTIC:
 212
 213                 result = cmt_pad_enable(PGHW_POW_ACTIVE);
 214                 if (result < 0) {
 215                         /*
 216                          * Failed to enable PAD across the active power
 217                          * domains, which may well be because none were
 218                          * enumerated.
 219                          */
 220                         break;
 221                 }
 222
 223                 /*
 224                  * Initialize the governor parameters the first time through.
 225                  */
 226                 if (gov_init == 0) {
 227                         cpupm_governor_initialize();
 228                         gov_init = 1;
 229                 }
 230
 231                 pause_cpus(NULL, NULL);
 232                 cpupm_policy = CPUPM_POLICY_ELASTIC;
 233                 start_cpus();
 234
 235                 break;
 236         default:
 237                 cmn_err(CE_WARN, "Attempt to set unknown CPUPM policy %d\n",
 238                     new_policy);
 239                 ASSERT(0);
 240                 break;
 241         }
 242         mutex_exit(&cpu_lock);
 243
 244         return (result);
 245 }
 246
 247 /*
 248  * Look for an existing power domain
 249  */
 250 static cpupm_domain_t *
 251 cpupm_domain_find(id_t id, cpupm_dtype_t type)
 252 {
 253         ASSERT(MUTEX_HELD(&cpu_lock));
 254
 255         cpupm_domain_t *dom;
 256
 257         dom = cpupm_domains;
 258         while (dom != NULL) {
 259                 if (id == dom->cpd_id && type == dom->cpd_type)
 260                         return (dom);
 261                 dom = dom->cpd_next;
 262         }
 263         return (NULL);
 264 }
 265
 266 /*
 267  * Create a new domain
 268  */
 269 static cpupm_domain_t *
 270 cpupm_domain_create(id_t id, cpupm_dtype_t type)
 271 {
 272         cpupm_domain_t *dom;
 273
 274         ASSERT(MUTEX_HELD(&cpu_lock));
 275
 276         dom = kmem_zalloc(sizeof (cpupm_domain_t), KM_SLEEP);
 277         dom->cpd_id = id;
 278         dom->cpd_type = type;
 279
 280         /* Link into the known domain list */
 281         dom->cpd_next = cpupm_domains;
 282         cpupm_domains = dom;
 283
 284         return (dom);
 285 }
 286
 287 static void
 288 cpupm_domain_state_enum(struct cpu *cp, cpupm_domain_t *dom)
 289 {
 290         /*
 291          * In the envent we're enumerating because the domain's state
 292          * configuration has changed, toss any existing states.
 293          */
 294         if (dom->cpd_nstates > 0) {
 295                 kmem_free(dom->cpd_states,
 296                     sizeof (cpupm_state_t) * dom->cpd_nstates);
 297                 dom->cpd_nstates = 0;
 298         }
 299
 300         /*
 301          * Query to determine the number of states, allocate storage
 302          * large enough to hold the state information, and pass it back
 303          * to the platform driver to complete the enumeration.
 304          */
 305         dom->cpd_nstates = cpupm_plat_state_enumerate(cp, dom->cpd_type, NULL);
 306
 307         if (dom->cpd_nstates == 0)
 308                 return;
 309
 310         dom->cpd_states =
 311             kmem_zalloc(dom->cpd_nstates * sizeof (cpupm_state_t), KM_SLEEP);
 312         (void) cpupm_plat_state_enumerate(cp, dom->cpd_type, dom->cpd_states);
 313 }
 314
 315 /*
 316  * Initialize the specified type of power domain on behalf of the CPU
 317  */
 318 cpupm_domain_t *
 319 cpupm_domain_init(struct cpu *cp, cpupm_dtype_t type)
 320 {
 321         cpupm_domain_t  *dom;
 322         id_t            did;
 323
 324         ASSERT(MUTEX_HELD(&cpu_lock));
 325
 326         /*
 327          * Instantiate the domain if it doesn't already exist
 328          * and enumerate its power states.
 329          */
 330         did = cpupm_domain_id(cp, type);
 331         dom = cpupm_domain_find(did, type);
 332         if (dom == NULL) {
 333                 dom = cpupm_domain_create(did, type);
 334                 cpupm_domain_state_enum(cp, dom);
 335         }
 336
 337         /*
 338          * Named state initialization
 339          */
 340         if (type == CPUPM_DTYPE_ACTIVE) {
 341                 /*
 342                  * For active power domains, the highest performance
 343                  * state is defined as first state returned from
 344                  * the domain enumeration.
 345                  */
 346                 dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
 347                     &dom->cpd_states[0];
 348                 dom->cpd_named_states[CPUPM_STATE_LOW_POWER] =
 349                     &dom->cpd_states[dom->cpd_nstates - 1];
 350
 351                 /*
 352                  * Begin by assuming CPU is running at the max perf state.
 353                  */
 354                 dom->cpd_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
 355         }
 356
 357         return (dom);
 358 }
 359
 360 /*
 361  * Return the id associated with the given type of domain
 362  * to which cp belongs
 363  */
 364 id_t
 365 cpupm_domain_id(struct cpu *cp, cpupm_dtype_t type)
 366 {
 367         return (cpupm_plat_domain_id(cp, type));
 368 }
 369
 370 /*
 371  * Initiate a state change for the specified domain on behalf of cp
 372  */
 373 int
 374 cpupm_change_state(struct cpu *cp, cpupm_domain_t *dom, cpupm_state_t *state)
 375 {
 376         if (cpupm_plat_change_state(cp, state) < 0)
 377                 return (-1);
 378
 379         DTRACE_PROBE2(cpupm__change__state,
 380             cpupm_domain_t *, dom,
 381             cpupm_state_t *, state);
 382
 383         dom->cpd_state = state;
 384         return (0);
 385 }
 386
 387 /*
 388  * Interface into the CPU power manager to indicate a significant change
 389  * in utilization of the specified active power domain
 390  */
 391 void
 392 cpupm_utilization_event(struct cpu *cp, hrtime_t now, cpupm_domain_t *dom,
 393                             cpupm_util_event_t event)
 394 {
 395         cpupm_state_t   *new_state = NULL;
 396         hrtime_t        last;
 397
 398         if (cpupm_policy == CPUPM_POLICY_DISABLED) {
 399                 return;
 400         }
 401
 402         /*
 403          * What follows is a simple elastic power state management policy.
 404          *
 405          * If the utilization has become non-zero, and the domain was
 406          * previously at it's lowest power state, then transition it
 407          * to the highest state in the spirit of "race to idle".
 408          *
 409          * If the utilization has dropped to zero, then transition the
 410          * domain to its lowest power state.
 411          *
 412          * Statistics are maintained to implement a governor to reduce state
 413          * transitions resulting from either transient work, or periods of
 414          * transient idleness on the domain.
 415          */
 416         switch (event) {
 417         case CPUPM_DOM_REMAIN_BUSY:
 418
 419                 /*
 420                  * We've received an event that the domain is running a thread
 421                  * that's made it to the end of it's time slice. If we are at
 422                  * low power, then raise it. If the transient work governor
 423                  * is engaged, then remove it.
 424                  */
 425                 if (dom->cpd_state ==
 426                     dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
 427                         new_state =
 428                             dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
 429                         if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
 430                                 dom->cpd_governor = CPUPM_GOV_DISENGAGED;
 431                                 dom->cpd_tw = 0;
 432                         }
 433                 }
 434                 break;
 435
 436         case CPUPM_DOM_BUSY_FROM_IDLE:
 437                 last = dom->cpd_last_lower;
 438                 dom->cpd_last_raise = now;
 439
 440                 DTRACE_PROBE3(cpupm__raise__req,
 441                     cpupm_domain_t *, dom,
 442                     hrtime_t, last,
 443                     hrtime_t, now);
 444
 445                 if (dom->cpd_state ==
 446                     dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
 447
 448                         /*
 449                          * There's non-zero utilization, and the domain is
 450                          * running in the lower power state. Before we
 451                          * consider raising power, check if the preceeding
 452                          * idle period was transient in duration.
 453                          *
 454                          * If the domain is already transient work governed,
 455                          * then we don't bother maintaining transient idle
 456                          * statistics, as the presence of enough transient work
 457                          * can also make the domain frequently transiently idle.
 458                          * In this case, we still want to remain transient work
 459                          * governed.
 460                          */
 461                         if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
 462                                 if ((now - last) < cpupm_ti_predict_interval) {
 463                                         /*
 464                                          * We're raising the domain power and
 465                                          * we *just* lowered it. Consider
 466                                          * this a mispredicted power state
 467                                          * transition due to a transient
 468                                          * idle period.
 469                                          */
 470                                         if (++dom->cpd_ti >=
 471                                             cpupm_mispredict_thresh) {
 472                                                 /*
 473                                                  * There's enough transient
 474                                                  * idle transitions to
 475                                                  * justify governing future
 476                                                  * lowering requests.
 477                                                  */
 478                                                 dom->cpd_governor =
 479                                                     CPUPM_GOV_TRANS_IDLE;
 480                                                 dom->cpd_ti = 0;
 481                                                 DTRACE_PROBE1(
 482                                                     cpupm__ti__governed,
 483                                                     cpupm_domain_t *, dom);
 484                                         }
 485                                 } else {
 486                                         /*
 487                                          * We correctly predicted the last
 488                                          * lowering.
 489                                          */
 490                                         dom->cpd_ti = 0;
 491                                 }
 492                         }
 493                         if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
 494                                 /*
 495                                  * Raise requests are governed due to
 496                                  * transient work.
 497                                  */
 498                                 DTRACE_PROBE1(cpupm__raise__governed,
 499                                     cpupm_domain_t *, dom);
 500
 501                                 return;
 502                         }
 503                         /*
 504                          * Prepare to transition to the higher power state
 505                          */
 506                         new_state = dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
 507
 508                 } else if (dom->cpd_state ==
 509                     dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
 510
 511                         /*
 512                          * Utilization is non-zero, and we're already running
 513                          * in the higher power state. Take this opportunity to
 514                          * perform some book keeping if the last lowering
 515                          * request was governed.
 516                          */
 517                         if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
 518
 519                                 if ((now - last) >= cpupm_ti_predict_interval) {
 520                                         /*
 521                                          * The domain is transient idle
 522                                          * governed, and we mispredicted
 523                                          * governing the last lowering request.
 524                                          */
 525                                         if (++dom->cpd_ti >=
 526                                             cpupm_mispredict_gov_thresh) {
 527                                                 /*
 528                                                  * There's enough non-transient
 529                                                  * idle periods to justify
 530                                                  * removing the governor.
 531                                                  */
 532                                                 dom->cpd_governor =
 533                                                     CPUPM_GOV_DISENGAGED;
 534                                                 dom->cpd_ti = 0;
 535                                                 DTRACE_PROBE1(
 536                                                     cpupm__ti__ungoverned,
 537                                                     cpupm_domain_t *, dom);
 538                                         }
 539                                 } else {
 540                                         /*
 541                                          * Correctly predicted governing the
 542                                          * last lowering request.
 543                                          */
 544                                         dom->cpd_ti = 0;
 545                                 }
 546                         }
 547                 }
 548                 break;
 549
 550         case CPUPM_DOM_IDLE_FROM_BUSY:
 551                 last = dom->cpd_last_raise;
 552                 dom->cpd_last_lower = now;
 553
 554                 DTRACE_PROBE3(cpupm__lower__req,
 555                     cpupm_domain_t *, dom,
 556                     hrtime_t, last,
 557                     hrtime_t, now);
 558
 559                 if (dom->cpd_state ==
 560                     dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
 561
 562                         /*
 563                          * The domain is idle, and is running in the highest
 564                          * performance state. Before we consider lowering power,
 565                          * perform some book keeping for the transient work
 566                          * governor.
 567                          */
 568                         if (dom->cpd_governor == CPUPM_GOV_DISENGAGED) {
 569                                 if ((now - last) < cpupm_tw_predict_interval) {
 570                                         /*
 571                                          * We're lowering the domain power and
 572                                          * we *just* raised it. Consider the
 573                                          * last raise mispredicted due to
 574                                          * transient work.
 575                                          */
 576                                         if (++dom->cpd_tw >=
 577                                             cpupm_mispredict_thresh) {
 578                                                 /*
 579                                                  * There's enough transient work
 580                                                  * transitions to justify
 581                                                  * governing future raise
 582                                                  * requests.
 583                                                  */
 584                                                 dom->cpd_governor =
 585                                                     CPUPM_GOV_TRANS_WORK;
 586                                                 dom->cpd_tw = 0;
 587                                                 DTRACE_PROBE1(
 588                                                     cpupm__tw__governed,
 589                                                     cpupm_domain_t *, dom);
 590                                         }
 591                                 } else {
 592                                         /*
 593                                          * We correctly predicted during the
 594                                          * last raise.
 595                                          */
 596                                         dom->cpd_tw = 0;
 597                                 }
 598                         }
 599                         if (dom->cpd_governor == CPUPM_GOV_TRANS_IDLE) {
 600                                 /*
 601                                  * Lowering requests are governed due to
 602                                  * transient idleness.
 603                                  */
 604                                 DTRACE_PROBE1(cpupm__lowering__governed,
 605                                     cpupm_domain_t *, dom);
 606
 607                                 return;
 608                         }
 609
 610                         /*
 611                          * Prepare to transition to a lower power state.
 612                          */
 613                         new_state =
 614                             dom->cpd_named_states[CPUPM_STATE_LOW_POWER];
 615
 616                 } else if (dom->cpd_state ==
 617                     dom->cpd_named_states[CPUPM_STATE_LOW_POWER]) {
 618
 619                         /*
 620                          * The domain is idle, and we're already running in
 621                          * the lower power state. Take this opportunity to
 622                          * perform some book keeping if the last raising
 623                          * request was governed.
 624                          */
 625                         if (dom->cpd_governor == CPUPM_GOV_TRANS_WORK) {
 626                                 if ((now - last) >= cpupm_tw_predict_interval) {
 627                                         /*
 628                                          * The domain is transient work
 629                                          * governed, and we mispredicted
 630                                          * governing the last raising request.
 631                                          */
 632                                         if (++dom->cpd_tw >=
 633                                             cpupm_mispredict_gov_thresh) {
 634                                                 /*
 635                                                  * There's enough non-transient
 636                                                  * work to justify removing
 637                                                  * the governor.
 638                                                  */
 639                                                 dom->cpd_governor =
 640                                                     CPUPM_GOV_DISENGAGED;
 641                                                 dom->cpd_tw = 0;
 642                                                 DTRACE_PROBE1(
 643                                                     cpupm__tw__ungoverned,
 644                                                     cpupm_domain_t *, dom);
 645                                         }
 646                                 } else {
 647                                         /*
 648                                          * We correctly predicted governing
 649                                          * the last raise.
 650                                          */
 651                                         dom->cpd_tw = 0;
 652                                 }
 653                         }
 654                 }
 655                 break;
 656         }
 657         /*
 658          * Change the power state
 659          * Not much currently done if this doesn't succeed
 660          */
 661         if (new_state)
 662                 (void) cpupm_change_state(cp, dom, new_state);
 663 }
 664
 665
 666 /*
 667  * Interface called by platforms to dynamically change the
 668  * MAX performance cpupm state
 669  */
 670 void
 671 cpupm_redefine_max_activepwr_state(struct cpu *cp, int max_perf_level)
 672 {
 673         cpupm_domain_t  *dom;
 674         id_t            did;
 675         cpupm_dtype_t   type = CPUPM_DTYPE_ACTIVE;
 676         boolean_t       change_state = B_FALSE;
 677         cpupm_state_t   *new_state = NULL;
 678
 679         did = cpupm_domain_id(cp, type);
 680         if (MUTEX_HELD(&cpu_lock)) {
 681                 dom = cpupm_domain_find(did, type);
 682         } else {
 683                 mutex_enter(&cpu_lock);
 684                 dom = cpupm_domain_find(did, type);
 685                 mutex_exit(&cpu_lock);
 686         }
 687
 688         /*
 689          * Can use a lock to avoid changing the power state of the cpu when
 690          * CPUPM_STATE_MAX_PERF is getting changed.
 691          * Since the occurance of events to change MAX_PERF is not frequent,
 692          * it may not be a good idea to overburden with locks. In the worst
 693          * case, for one cycle the power may not get changed to the required
 694          * level
 695          */
 696         if (dom != NULL) {
 697                 if (dom->cpd_state ==
 698                     dom->cpd_named_states[CPUPM_STATE_MAX_PERF]) {
 699                         change_state = B_TRUE;
 700                 }
 701
 702                 /*
 703                  * If an out of range level is passed, use the lowest supported
 704                  * speed.
 705                  */
 706                 if (max_perf_level >= dom->cpd_nstates &&
 707                     dom->cpd_nstates > 1) {
 708                         max_perf_level = dom->cpd_nstates - 1;
 709                 }
 710
 711                 dom->cpd_named_states[CPUPM_STATE_MAX_PERF] =
 712                     &dom->cpd_states[max_perf_level];
 713
 714                 /*
 715                  * If the current state is MAX_PERF, change the current state
 716                  * to the new MAX_PERF
 717                  */
 718                 if (change_state) {
 719                         new_state =
 720                             dom->cpd_named_states[CPUPM_STATE_MAX_PERF];
 721                         if (new_state) {
 722                                 (void) cpupm_change_state(cp, dom, new_state);
 723                         }
 724                 }
 725         }
 726 }
 727
 728 /*
 729  * Initialize the parameters for the transience governor state machine
 730  */
 731 static void
 732 cpupm_governor_initialize(void)
 733 {
 734         /*
 735          * The default prediction intervals are specified in nanoseconds.
 736          * Convert these to the equivalent in unscaled hrtime, which is the
 737          * format of the timestamps passed to cpupm_utilization_event()
 738          */
 739         cpupm_ti_predict_interval = unscalehrtime(cpupm_ti_gov_interval);
 740         cpupm_tw_predict_interval = unscalehrtime(cpupm_tw_gov_interval);
 741 }
 742
 743 /*
 744  * Initiate a state change in all CPUPM domain instances of the specified type
 745  */
 746 static void
 747 cpupm_state_change_global(cpupm_dtype_t type, cpupm_state_name_t state)
 748 {
 749         cpu_t           *cp;
 750         pg_cmt_t        *pwr_pg;
 751         cpupm_domain_t  *dom;
 752         group_t         *hwset;
 753         group_iter_t    giter;
 754         pg_cpu_itr_t    cpu_iter;
 755         pghw_type_t     hw;
 756
 757         ASSERT(MUTEX_HELD(&cpu_lock));
 758
 759         switch (type) {
 760         case CPUPM_DTYPE_ACTIVE:
 761                 hw = PGHW_POW_ACTIVE;
 762                 break;
 763         default:
 764                 /*
 765                  * Power domain types other than "active" unsupported.
 766                  */
 767                 ASSERT(type == CPUPM_DTYPE_ACTIVE);
 768                 return;
 769         }
 770
 771         if ((hwset = pghw_set_lookup(hw)) == NULL)
 772                 return;
 773
 774         /*
 775          * Iterate over the power domains
 776          */
 777         group_iter_init(&giter);
 778         while ((pwr_pg = group_iterate(hwset, &giter)) != NULL) {
 779
 780                 dom = (cpupm_domain_t *)pwr_pg->cmt_pg.pghw_handle;
 781
 782                 /*
 783                  * Iterate over the CPUs in each domain
 784                  */
 785                 PG_CPU_ITR_INIT(pwr_pg, cpu_iter);
 786                 while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
 787                         (void) cpupm_change_state(cp, dom,
 788                             dom->cpd_named_states[state]);
 789                 }
 790         }
 791 }