kernel/disp/cmt_policy.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 #include <sys/systm.h>
  27 #include <sys/types.h>
  28 #include <sys/param.h>
  29 #include <sys/thread.h>
  30 #include <sys/cpuvar.h>
  31 #include <sys/cpupart.h>
  32 #include <sys/cmn_err.h>
  33 #include <sys/disp.h>
  34 #include <sys/group.h>
  35 #include <sys/bitset.h>
  36 #include <sys/lgrp.h>
  37 #include <sys/cmt.h>
  38
  39 /*
  40  * CMT dispatcher policies
  41  *
  42  * This file implements CMT dispatching policies using Processor Groups.
  43  *
  44  * The scheduler/dispatcher leverages knowledge of the performance
  45  * relevant CMT sharing relationships existing between CPUs to implement
  46  * load balancing, and coalescence thread placement policies.
  47  *
  48  * Load balancing policy seeks to improve performance by minimizing
  49  * contention over shared processor resources / facilities. Coalescence
  50  * policies improve resource utilization and ultimately power efficiency.
  51  *
  52  * On NUMA systems, the dispatcher will generally perform load balancing and
  53  * coalescence within (and not across) lgroups. This is because there isn't
  54  * much sense in trying to correct an imbalance by sending a thread outside
  55  * of its home, if it would attempt to return home a short while later.
  56  * The dispatcher will implement CMT policy across lgroups however, if
  57  * it can do so with a thread homed to the root lgroup, since root homed
  58  * threads have no lgroup affinity.
  59  */
  60
  61 /*
  62  * Return non-zero if, given the policy, we should migrate from running
  63  * somewhere "here" to somewhere "there".
  64  */
  65 static int
  66 cmt_should_migrate(pg_cmt_t *here, pg_cmt_t *there, pg_cmt_policy_t policy,
  67     int self)
  68 {
  69         uint32_t here_util, there_util;
  70
  71         here_util = here->cmt_utilization;
  72         there_util = there->cmt_utilization;
  73
  74         /*
  75          * This assumes that curthread's utilization is "1"
  76          */
  77         if (self && bitset_in_set(&here->cmt_cpus_actv_set, CPU->cpu_seqid))
  78                 here_util--;    /* Ignore curthread's effect */
  79
  80         /*
  81          * Load balancing and coalescence are conflicting policies
  82          */
  83         ASSERT((policy & (CMT_BALANCE|CMT_COALESCE)) !=
  84             (CMT_BALANCE|CMT_COALESCE));
  85
  86         if (policy & CMT_BALANCE) {
  87                 /*
  88                  * Balance utilization
  89                  *
  90                  * If the target is comparatively underutilized
  91                  * (either in an absolute sense, or scaled by capacity),
  92                  * then choose to balance.
  93                  */
  94                 if ((here_util > there_util) ||
  95                     (here_util == there_util &&
  96                     (CMT_CAPACITY(there) > CMT_CAPACITY(here)))) {
  97                         return (1);
  98                 }
  99         } else if (policy & CMT_COALESCE) {
 100                 /*
 101                  * Attempt to drive group utilization up to capacity
 102                  */
 103                 if (there_util > here_util &&
 104                     there_util < CMT_CAPACITY(there))
 105                         return (1);
 106         }
 107         return (0);
 108 }
 109
 110 /*
 111  * Perform multi-level CMT load balancing of running threads.
 112  *
 113  * tp is the thread being enqueued.
 114  * cp is a hint CPU, against which CMT load balancing will be performed.
 115  *
 116  * Returns cp, or a CPU better than cp with respect to balancing
 117  * running thread load.
 118  */
 119 cpu_t *
 120 cmt_balance(kthread_t *tp, cpu_t *cp)
 121 {
 122         int             hint, i, cpu, nsiblings;
 123         int             self = 0;
 124         group_t         *cmt_pgs, *siblings;
 125         pg_cmt_t        *pg, *pg_tmp, *tpg = NULL;
 126         int             level = 0;
 127         cpu_t           *newcp;
 128         extern cmt_lgrp_t *cmt_root;
 129
 130         ASSERT(THREAD_LOCK_HELD(tp));
 131
 132         cmt_pgs = &cp->cpu_pg->cmt_pgs;
 133
 134         if (GROUP_SIZE(cmt_pgs) == 0)
 135                 return (cp);    /* nothing to do */
 136
 137         if (tp == curthread)
 138                 self = 1;
 139
 140         /*
 141          * Balance across siblings in the CPUs CMT lineage
 142          * If the thread is homed to the root lgroup, perform
 143          * top level balancing against other top level PGs
 144          * in the system. Otherwise, start with the default
 145          * top level siblings group, which is within the leaf lgroup
 146          */
 147         pg = GROUP_ACCESS(cmt_pgs, level);
 148         if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
 149                 siblings = &cmt_root->cl_pgs;
 150         else
 151                 siblings = pg->cmt_siblings;
 152
 153         /*
 154          * Traverse down the lineage until we find a level that needs
 155          * balancing, or we get to the end.
 156          */
 157         for (;;) {
 158                 nsiblings = GROUP_SIZE(siblings);       /* self inclusive */
 159                 if (nsiblings == 1)
 160                         goto next_level;
 161
 162                 hint = CPU_PSEUDO_RANDOM() % nsiblings;
 163
 164                 /*
 165                  * Find a balancing candidate from among our siblings
 166                  * "hint" is a hint for where to start looking
 167                  */
 168                 i = hint;
 169                 do {
 170                         ASSERT(i < nsiblings);
 171                         pg_tmp = GROUP_ACCESS(siblings, i);
 172
 173                         /*
 174                          * The candidate must not be us, and must
 175                          * have some CPU resources in the thread's
 176                          * partition
 177                          */
 178                         if (pg_tmp != pg &&
 179                             bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
 180                             ((pg_t *)pg_tmp)->pg_id)) {
 181                                 tpg = pg_tmp;
 182                                 break;
 183                         }
 184
 185                         if (++i >= nsiblings)
 186                                 i = 0;
 187                 } while (i != hint);
 188
 189                 if (!tpg)
 190                         goto next_level; /* no candidates at this level */
 191
 192                 /*
 193                  * Decide if we should migrate from the current PG to a
 194                  * target PG given a policy
 195                  */
 196                 if (cmt_should_migrate(pg, tpg, pg->cmt_policy, self))
 197                         break;
 198                 tpg = NULL;
 199
 200 next_level:
 201                 if (++level == GROUP_SIZE(cmt_pgs))
 202                         break;
 203
 204                 pg = GROUP_ACCESS(cmt_pgs, level);
 205                 siblings = pg->cmt_siblings;
 206         }
 207
 208         if (tpg) {
 209                 uint_t  tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);
 210
 211                 /*
 212                  * Select an idle CPU from the target
 213                  */
 214                 hint = CPU_PSEUDO_RANDOM() % tgt_size;
 215                 cpu = hint;
 216                 do {
 217                         newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
 218                         if (newcp->cpu_part == tp->t_cpupart &&
 219                             newcp->cpu_dispatch_pri == -1) {
 220                                 cp = newcp;
 221                                 break;
 222                         }
 223                         if (++cpu == tgt_size)
 224                                 cpu = 0;
 225                 } while (cpu != hint);
 226         }
 227
 228         return (cp);
 229 }