sys/kern/usched_dfly.c

   1 /*
   2  * Copyright (c) 2012-2017 The DragonFly Project.  All rights reserved.
   3  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to The DragonFly Project
   6  * by Matthew Dillon <dillon@backplane.com>,
   7  * by Mihai Carabas <mihai.carabas@gmail.com>
   8  * and many others.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  *
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in
  18  *    the documentation and/or other materials provided with the
  19  *    distribution.
  20  * 3. Neither the name of The DragonFly Project nor the names of its
  21  *    contributors may be used to endorse or promote products derived
  22  *    from this software without specific, prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  27  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  28  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  34  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35  * SUCH DAMAGE.
  36  */
  37 #include <sys/param.h>
  38 #include <sys/systm.h>
  39 #include <sys/kernel.h>
  40 #include <sys/lock.h>
  41 #include <sys/queue.h>
  42 #include <sys/proc.h>
  43 #include <sys/rtprio.h>
  44 #include <sys/uio.h>
  45 #include <sys/sysctl.h>
  46 #include <sys/resourcevar.h>
  47 #include <sys/spinlock.h>
  48 #include <sys/cpu_topology.h>
  49 #include <sys/thread2.h>
  50 #include <sys/spinlock2.h>
  51
  52 #include <sys/ktr.h>
  53
  54 #include <machine/cpu.h>
  55 #include <machine/smp.h>
  56
  57 /*
  58  * Priorities.  Note that with 32 run queues per scheduler each queue
  59  * represents four priority levels.
  60  */
  61
  62 int dfly_rebalanced;
  63
  64 #define MAXPRI                  128
  65 #define PRIMASK                 (MAXPRI - 1)
  66 #define PRIBASE_REALTIME        0
  67 #define PRIBASE_NORMAL          MAXPRI
  68 #define PRIBASE_IDLE            (MAXPRI * 2)
  69 #define PRIBASE_THREAD          (MAXPRI * 3)
  70 #define PRIBASE_NULL            (MAXPRI * 4)
  71
  72 #define NQS     32                      /* 32 run queues. */
  73 #define PPQ     (MAXPRI / NQS)          /* priorities per queue */
  74 #define PPQMASK (PPQ - 1)
  75
  76 /*
  77  * NICE_QS      - maximum queues nice can shift the process
  78  * EST_QS       - maximum queues estcpu can shift the process
  79  *
  80  * ESTCPUPPQ    - number of estcpu units per priority queue
  81  * ESTCPUMAX    - number of estcpu units
  82  *
  83  * Remember that NICE runs over the whole -20 to +20 range.
  84  */
  85 #define NICE_QS         24      /* -20 to +20 shift in whole queues */
  86 #define EST_QS          12      /* 0-MAX shift in whole queues */
  87 #define ESTCPUPPQ       512
  88 #define ESTCPUMAX       (ESTCPUPPQ * EST_QS)
  89 #define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)
  90
  91 #define ESTCPULIM(v)    min((v), ESTCPUMAX)
  92
  93 TAILQ_HEAD(rq, lwp);
  94
  95 #define lwp_priority    lwp_usdata.dfly.priority
  96 #define lwp_forked      lwp_usdata.dfly.forked
  97 #define lwp_rqindex     lwp_usdata.dfly.rqindex
  98 #define lwp_estcpu      lwp_usdata.dfly.estcpu
  99 #define lwp_estfast     lwp_usdata.dfly.estfast
 100 #define lwp_uload       lwp_usdata.dfly.uload
 101 #define lwp_rqtype      lwp_usdata.dfly.rqtype
 102 #define lwp_qcpu        lwp_usdata.dfly.qcpu
 103 #define lwp_rrcount     lwp_usdata.dfly.rrcount
 104
 105 /*
 106  * DFly scheduler pcpu structure.  Note that the pcpu uload field must
 107  * be 64-bits to avoid overflowing in the situation where more than 32768
 108  * processes are on a single cpu's queue.  Since high-end systems can
 109  * easily run 900,000+ processes, we have to deal with it.
 110  */
 111 struct usched_dfly_pcpu {
 112         struct spinlock spin;
 113         struct thread   *helper_thread;
 114         u_short         scancpu;
 115         short           upri;
 116         long            uload;          /* 64-bits to avoid overflow (1) */
 117         int             ucount;
 118         int             unused01;
 119         struct lwp      *uschedcp;
 120         struct rq       queues[NQS];
 121         struct rq       rtqueues[NQS];
 122         struct rq       idqueues[NQS];
 123         u_int32_t       queuebits;
 124         u_int32_t       rtqueuebits;
 125         u_int32_t       idqueuebits;
 126         int             runqcount;
 127         int             cpuid;
 128         cpumask_t       cpumask;
 129         cpu_node_t      *cpunode;
 130 };
 131
 132 typedef struct usched_dfly_pcpu *dfly_pcpu_t;
 133
 134 static void dfly_acquire_curproc(struct lwp *lp);
 135 static void dfly_release_curproc(struct lwp *lp);
 136 static void dfly_select_curproc(globaldata_t gd);
 137 static void dfly_setrunqueue(struct lwp *lp);
 138 static void dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp);
 139 static void dfly_schedulerclock(struct lwp *lp, sysclock_t period,
 140                                 sysclock_t cpstamp);
 141 static void dfly_recalculate_estcpu(struct lwp *lp);
 142 static void dfly_resetpriority(struct lwp *lp);
 143 static void dfly_forking(struct lwp *plp, struct lwp *lp);
 144 static void dfly_exiting(struct lwp *lp, struct proc *);
 145 static void dfly_uload_update(struct lwp *lp);
 146 static void dfly_yield(struct lwp *lp);
 147 static void dfly_changeqcpu_locked(struct lwp *lp,
 148                                 dfly_pcpu_t dd, dfly_pcpu_t rdd);
 149 static dfly_pcpu_t dfly_choose_best_queue(struct lwp *lp);
 150 static dfly_pcpu_t dfly_choose_worst_queue(dfly_pcpu_t dd);
 151 static dfly_pcpu_t dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp);
 152 static void dfly_need_user_resched_remote(void *dummy);
 153 static struct lwp *dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
 154                                           struct lwp *chklp, int worst);
 155 static void dfly_remrunqueue_locked(dfly_pcpu_t dd, struct lwp *lp);
 156 static void dfly_setrunqueue_locked(dfly_pcpu_t dd, struct lwp *lp);
 157 static void dfly_changedcpu(struct lwp *lp);
 158
 159 struct usched usched_dfly = {
 160         { NULL },
 161         "dfly", "Original DragonFly Scheduler",
 162         NULL,                   /* default registration */
 163         NULL,                   /* default deregistration */
 164         dfly_acquire_curproc,
 165         dfly_release_curproc,
 166         dfly_setrunqueue,
 167         dfly_schedulerclock,
 168         dfly_recalculate_estcpu,
 169         dfly_resetpriority,
 170         dfly_forking,
 171         dfly_exiting,
 172         dfly_uload_update,
 173         NULL,                   /* setcpumask not supported */
 174         dfly_yield,
 175         dfly_changedcpu
 176 };
 177
 178 /*
 179  * We have NQS (32) run queues per scheduling class.  For the normal
 180  * class, there are 128 priorities scaled onto these 32 queues.  New
 181  * processes are added to the last entry in each queue, and processes
 182  * are selected for running by taking them from the head and maintaining
 183  * a simple FIFO arrangement.  Realtime and Idle priority processes have
 184  * and explicit 0-31 priority which maps directly onto their class queue
 185  * index.  When a queue has something in it, the corresponding bit is
 186  * set in the queuebits variable, allowing a single read to determine
 187  * the state of all 32 queues and then a ffs() to find the first busy
 188  * queue.
 189  */
 190                                         /* currently running a user process */
 191 static cpumask_t dfly_curprocmask = CPUMASK_INITIALIZER_ALLONES;
 192 static cpumask_t dfly_rdyprocmask;      /* ready to accept a user process */
 193 static struct usched_dfly_pcpu dfly_pcpu[MAXCPU];
 194 static struct sysctl_ctx_list usched_dfly_sysctl_ctx;
 195 static struct sysctl_oid *usched_dfly_sysctl_tree;
 196
 197 /* Debug info exposed through debug.* sysctl */
 198
 199 static int usched_dfly_debug = -1;
 200 SYSCTL_INT(_debug, OID_AUTO, dfly_scdebug, CTLFLAG_RW,
 201            &usched_dfly_debug, 0,
 202            "Print debug information for this pid");
 203
 204 static int usched_dfly_pid_debug = -1;
 205 SYSCTL_INT(_debug, OID_AUTO, dfly_pid_debug, CTLFLAG_RW,
 206            &usched_dfly_pid_debug, 0,
 207            "Print KTR debug information for this pid");
 208
 209 static int usched_dfly_chooser = 0;
 210 SYSCTL_INT(_debug, OID_AUTO, dfly_chooser, CTLFLAG_RW,
 211            &usched_dfly_chooser, 0,
 212            "Print KTR debug information for this pid");
 213
 214 /*
 215  * WARNING!
 216  *
 217  * The fork bias can have a large effect on the system in the face of a
 218  * make -j N or other high-forking applications.
 219  *
 220  * Larger values are much less invasive vs other things that
 221  * might be running in the system, but can cause exec chains
 222  * such as those typically generated by make to have higher
 223  * latencies in the face of modest load.
 224  *
 225  * Lower values are more invasive but have reduced latencies
 226  * for such exec chains.
 227  *
 228  *      make -j 10 buildkernel example, build times:
 229  *
 230  *           +0 3:04
 231  *           +1 3:14    -5.2%   <-- default
 232  *           +2 3:22    -8.9%
 233  *
 234  * This issue occurs due to the way the scheduler affinity heuristics work.
 235  * There is no way to really 'fix' the affinity heuristics because when it
 236  * comes right down to it trying to instantly schedule a process on an
 237  * available cpu (even if it will become unavailable a microsecond later)
 238  * tends to cause processes to shift around between cpus and sockets too much
 239  * and breaks the affinity.
 240  *
 241  * NOTE: Heavily concurrent builds typically have enough things on the pan
 242  *       that they remain time-efficient even with a higher bias.
 243  */
 244 static int usched_dfly_forkbias = 1;
 245 SYSCTL_INT(_debug, OID_AUTO, dfly_forkbias, CTLFLAG_RW,
 246            &usched_dfly_forkbias, 0,
 247            "Fork bias for estcpu in whole queues");
 248
 249 /*
 250  * Tunning usched_dfly - configurable through kern.usched_dfly.
 251  *
 252  * weight1 - Tries to keep threads on their current cpu.  If you
 253  *           make this value too large the scheduler will not be
 254  *           able to load-balance large loads.
 255  *
 256  * weight2 - If non-zero, detects thread pairs undergoing synchronous
 257  *           communications and tries to move them closer together.
 258  *           Behavior is adjusted by bit 4 of features (0x10).
 259  *
 260  *           WARNING!  Weight2 is a ridiculously sensitive parameter,
 261  *           a small value is recommended.
 262  *
 263  * weight3 - Weighting based on the number of recently runnable threads
 264  *           on the userland scheduling queue (ignoring their loads).
 265  *           A nominal value here prevents high-priority (low-load)
 266  *           threads from accumulating on one cpu core when other
 267  *           cores are available.
 268  *
 269  *           This value should be left fairly small relative to weight1
 270  *           and weight4.
 271  *
 272  * weight4 - Weighting based on other cpu queues being available
 273  *           or running processes with higher lwp_priority's.
 274  *
 275  *           This allows a thread to migrate to another nearby cpu if it
 276  *           is unable to run on the current cpu based on the other cpu
 277  *           being idle or running a lower priority (higher lwp_priority)
 278  *           thread.  This value should be large enough to override weight1
 279  *
 280  * features - These flags can be set or cleared to enable or disable various
 281  *            features.
 282  *
 283  *            0x01      Enable idle-cpu pulling                 (default)
 284  *            0x02      Enable proactive pushing                (default)
 285  *            0x04      Enable rebalancing rover                (default)
 286  *            0x08      Enable more proactive pushing           (default)
 287  *            0x10      (flip weight2 limit on same cpu)        (default)
 288  *            0x20      choose best cpu for forked process
 289  *            0x40      choose current cpu for forked process
 290  *            0x80      choose random cpu for forked process    (default)
 291  */
 292 static int usched_dfly_smt = 0;
 293 static int usched_dfly_cache_coherent = 0;
 294 static int usched_dfly_weight1 = 200;   /* keep thread on current cpu */
 295 static int usched_dfly_weight2 = 180;   /* synchronous peer's current cpu */
 296 static int usched_dfly_weight3 = 40;    /* number of threads on queue */
 297 static int usched_dfly_weight4 = 160;   /* availability of idle cores */
 298 static int usched_dfly_features = 0x8F; /* allow pulls */
 299 static int usched_dfly_fast_resched = 0;/* delta priority / resched */
 300 static int usched_dfly_swmask = ~PPQMASK; /* allow pulls */
 301 static int usched_dfly_rrinterval = (ESTCPUFREQ + 9) / 10;
 302 static int usched_dfly_decay = 8;
 303
 304 /* KTR debug printings */
 305
 306 KTR_INFO_MASTER(usched);
 307
 308 #if !defined(KTR_USCHED_DFLY)
 309 #define KTR_USCHED_DFLY KTR_ALL
 310 #endif
 311
 312 KTR_INFO(KTR_USCHED_DFLY, usched, chooseproc, 0,
 313     "USCHED_DFLY(chooseproc: pid %d, old_cpuid %d, curr_cpuid %d)",
 314     pid_t pid, int old_cpuid, int curr);
 315
 316 /*
 317  * This function is called when the kernel intends to return to userland.
 318  * It is responsible for making the thread the current designated userland
 319  * thread for this cpu, blocking if necessary.
 320  *
 321  * The kernel will not depress our LWKT priority until after we return,
 322  * in case we have to shove over to another cpu.
 323  *
 324  * We must determine our thread's disposition before we switch away.  This
 325  * is very sensitive code.
 326  *
 327  * WARNING! THIS FUNCTION IS ALLOWED TO CAUSE THE CURRENT THREAD TO MIGRATE
 328  * TO ANOTHER CPU!  Because most of the kernel assumes that no migration will
 329  * occur, this function is called only under very controlled circumstances.
 330  */
 331 static void
 332 dfly_acquire_curproc(struct lwp *lp)
 333 {
 334         globaldata_t gd;
 335         dfly_pcpu_t dd;
 336         dfly_pcpu_t rdd;
 337         thread_t td;
 338         int force_resched;
 339
 340         /*
 341          * Make sure we aren't sitting on a tsleep queue.
 342          */
 343         td = lp->lwp_thread;
 344         crit_enter_quick(td);
 345         if (td->td_flags & TDF_TSLEEPQ)
 346                 tsleep_remove(td);
 347         dfly_recalculate_estcpu(lp);
 348
 349         gd = mycpu;
 350         dd = &dfly_pcpu[gd->gd_cpuid];
 351
 352         /*
 353          * Process any pending interrupts/ipi's, then handle reschedule
 354          * requests.  dfly_release_curproc() will try to assign a new
 355          * uschedcp that isn't us and otherwise NULL it out.
 356          */
 357         force_resched = 0;
 358         if ((td->td_mpflags & TDF_MP_BATCH_DEMARC) &&
 359             lp->lwp_rrcount >= usched_dfly_rrinterval / 2) {
 360                 force_resched = 1;
 361         }
 362
 363         if (user_resched_wanted()) {
 364                 if (dd->uschedcp == lp)
 365                         force_resched = 1;
 366                 clear_user_resched();
 367                 dfly_release_curproc(lp);
 368         }
 369
 370         /*
 371          * Loop until we are the current user thread.
 372          *
 373          * NOTE: dd spinlock not held at top of loop.
 374          */
 375         if (dd->uschedcp == lp)
 376                 lwkt_yield_quick();
 377
 378         while (dd->uschedcp != lp) {
 379                 lwkt_yield_quick();
 380
 381                 spin_lock(&dd->spin);
 382
 383                 /* This lwp is an outcast; force reschedule. */
 384                 if (__predict_false(
 385                     CPUMASK_TESTBIT(lp->lwp_cpumask, gd->gd_cpuid) == 0) &&
 386                     (rdd = dfly_choose_best_queue(lp)) != dd) {
 387                         dfly_changeqcpu_locked(lp, dd, rdd);
 388                         spin_unlock(&dd->spin);
 389                         lwkt_deschedule(lp->lwp_thread);
 390                         dfly_setrunqueue_dd(rdd, lp);
 391                         lwkt_switch();
 392                         gd = mycpu;
 393                         dd = &dfly_pcpu[gd->gd_cpuid];
 394                         continue;
 395                 }
 396
 397                 if (force_resched &&
 398                    (usched_dfly_features & 0x08) &&
 399                    (rdd = dfly_choose_best_queue(lp)) != dd) {
 400                         /*
 401                          * We are not or are no longer the current lwp and a
 402                          * forced reschedule was requested.  Figure out the
 403                          * best cpu to run on (our current cpu will be given
 404                          * significant weight).
 405                          *
 406                          * (if a reschedule was not requested we want to
 407                          *  move this step after the uschedcp tests).
 408                          */
 409                         dfly_changeqcpu_locked(lp, dd, rdd);
 410                         spin_unlock(&dd->spin);
 411                         lwkt_deschedule(lp->lwp_thread);
 412                         dfly_setrunqueue_dd(rdd, lp);
 413                         lwkt_switch();
 414                         gd = mycpu;
 415                         dd = &dfly_pcpu[gd->gd_cpuid];
 416                         continue;
 417                 }
 418
 419                 /*
 420                  * Either no reschedule was requested or the best queue was
 421                  * dd, and no current process has been selected.  We can
 422                  * trivially become the current lwp on the current cpu.
 423                  */
 424                 if (dd->uschedcp == NULL) {
 425                         atomic_clear_int(&lp->lwp_thread->td_mpflags,
 426                                          TDF_MP_DIDYIELD);
 427                         ATOMIC_CPUMASK_ORBIT(dfly_curprocmask, gd->gd_cpuid);
 428                         dd->uschedcp = lp;
 429                         dd->upri = lp->lwp_priority;
 430                         KKASSERT(lp->lwp_qcpu == dd->cpuid);
 431                         spin_unlock(&dd->spin);
 432                         break;
 433                 }
 434
 435                 /*
 436                  * Put us back on the same run queue unconditionally.
 437                  *
 438                  * Set rrinterval to force placement at end of queue.
 439                  * Select the worst queue to ensure we round-robin,
 440                  * but do not change estcpu.
 441                  */
 442                 if (lp->lwp_thread->td_mpflags & TDF_MP_DIDYIELD) {
 443                         u_int32_t tsqbits;
 444
 445                         switch(lp->lwp_rqtype) {
 446                         case RTP_PRIO_NORMAL:
 447                                 tsqbits = dd->queuebits;
 448                                 spin_unlock(&dd->spin);
 449
 450                                 lp->lwp_rrcount = usched_dfly_rrinterval;
 451                                 if (tsqbits)
 452                                         lp->lwp_rqindex = bsrl(tsqbits);
 453                                 break;
 454                         default:
 455                                 spin_unlock(&dd->spin);
 456                                 break;
 457                         }
 458                         lwkt_deschedule(lp->lwp_thread);
 459                         dfly_setrunqueue_dd(dd, lp);
 460                         atomic_clear_int(&lp->lwp_thread->td_mpflags,
 461                                          TDF_MP_DIDYIELD);
 462                         lwkt_switch();
 463                         gd = mycpu;
 464                         dd = &dfly_pcpu[gd->gd_cpuid];
 465                         continue;
 466                 }
 467
 468                 /*
 469                  * Can we steal the current designated user thread?
 470                  *
 471                  * If we do the other thread will stall when it tries to
 472                  * return to userland, possibly rescheduling elsewhere.
 473                  *
 474                  * It is important to do a masked test to avoid the edge
 475                  * case where two near-equal-priority threads are constantly
 476                  * interrupting each other.
 477                  *
 478                  * In the exact match case another thread has already gained
 479                  * uschedcp and lowered its priority, if we steal it the
 480                  * other thread will stay stuck on the LWKT runq and not
 481                  * push to another cpu.  So don't steal on equal-priority even
 482                  * though it might appear to be more beneficial due to not
 483                  * having to switch back to the other thread's context.
 484                  *
 485                  * usched_dfly_fast_resched requires that two threads be
 486                  * significantly far apart in priority in order to interrupt.
 487                  *
 488                  * If better but not sufficiently far apart, the current
 489                  * uschedcp will be interrupted at the next scheduler clock.
 490                  */
 491                 if (dd->uschedcp &&
 492                    (dd->upri & ~PPQMASK) >
 493                    (lp->lwp_priority & ~PPQMASK) + usched_dfly_fast_resched) {
 494                         dd->uschedcp = lp;
 495                         dd->upri = lp->lwp_priority;
 496                         KKASSERT(lp->lwp_qcpu == dd->cpuid);
 497                         spin_unlock(&dd->spin);
 498                         break;
 499                 }
 500                 /*
 501                  * We are not the current lwp, figure out the best cpu
 502                  * to run on (our current cpu will be given significant
 503                  * weight).  Loop on cpu change.
 504                  */
 505                 if ((usched_dfly_features & 0x02) &&
 506                     force_resched == 0 &&
 507                     (rdd = dfly_choose_best_queue(lp)) != dd) {
 508                         dfly_changeqcpu_locked(lp, dd, rdd);
 509                         spin_unlock(&dd->spin);
 510                         lwkt_deschedule(lp->lwp_thread);
 511                         dfly_setrunqueue_dd(rdd, lp);
 512                         lwkt_switch();
 513                         gd = mycpu;
 514                         dd = &dfly_pcpu[gd->gd_cpuid];
 515                         continue;
 516                 }
 517
 518                 /*
 519                  * We cannot become the current lwp, place the lp on the
 520                  * run-queue of this or another cpu and deschedule ourselves.
 521                  *
 522                  * When we are reactivated we will have another chance.
 523                  *
 524                  * Reload after a switch or setrunqueue/switch possibly
 525                  * moved us to another cpu.
 526                  */
 527                 spin_unlock(&dd->spin);
 528                 lwkt_deschedule(lp->lwp_thread);
 529                 dfly_setrunqueue_dd(dd, lp);
 530                 lwkt_switch();
 531                 gd = mycpu;
 532                 dd = &dfly_pcpu[gd->gd_cpuid];
 533         }
 534
 535         /*
 536          * Make sure upri is synchronized, then yield to LWKT threads as
 537          * needed before returning.  This could result in another reschedule.
 538          * XXX
 539          */
 540         crit_exit_quick(td);
 541
 542         KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
 543 }
 544
 545 /*
 546  * DFLY_RELEASE_CURPROC
 547  *
 548  * This routine detaches the current thread from the userland scheduler,
 549  * usually because the thread needs to run or block in the kernel (at
 550  * kernel priority) for a while.
 551  *
 552  * This routine is also responsible for selecting a new thread to
 553  * make the current thread.
 554  *
 555  * NOTE: This implementation differs from the dummy example in that
 556  * dfly_select_curproc() is able to select the current process, whereas
 557  * dummy_select_curproc() is not able to select the current process.
 558  * This means we have to NULL out uschedcp.
 559  *
 560  * Additionally, note that we may already be on a run queue if releasing
 561  * via the lwkt_switch() in dfly_setrunqueue().
 562  */
 563 static void
 564 dfly_release_curproc(struct lwp *lp)
 565 {
 566         globaldata_t gd = mycpu;
 567         dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
 568
 569         /*
 570          * Make sure td_wakefromcpu is defaulted.  This will be overwritten
 571          * by wakeup().
 572          */
 573         if (dd->uschedcp == lp) {
 574                 KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
 575                 spin_lock(&dd->spin);
 576                 if (dd->uschedcp == lp) {
 577                         dd->uschedcp = NULL;    /* don't let lp be selected */
 578                         dd->upri = PRIBASE_NULL;
 579                         ATOMIC_CPUMASK_NANDBIT(dfly_curprocmask, gd->gd_cpuid);
 580                         spin_unlock(&dd->spin);
 581                         dfly_select_curproc(gd);
 582                 } else {
 583                         spin_unlock(&dd->spin);
 584                 }
 585         }
 586 }
 587
 588 /*
 589  * DFLY_SELECT_CURPROC
 590  *
 591  * Select a new current process for this cpu and clear any pending user
 592  * reschedule request.  The cpu currently has no current process.
 593  *
 594  * This routine is also responsible for equal-priority round-robining,
 595  * typically triggered from dfly_schedulerclock().  In our dummy example
 596  * all the 'user' threads are LWKT scheduled all at once and we just
 597  * call lwkt_switch().
 598  *
 599  * The calling process is not on the queue and cannot be selected.
 600  */
 601 static
 602 void
 603 dfly_select_curproc(globaldata_t gd)
 604 {
 605         dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
 606         struct lwp *nlp;
 607         int cpuid = gd->gd_cpuid;
 608
 609         crit_enter_gd(gd);
 610
 611         spin_lock(&dd->spin);
 612         nlp = dfly_chooseproc_locked(dd, dd, dd->uschedcp, 0);
 613
 614         if (nlp) {
 615                 ATOMIC_CPUMASK_ORBIT(dfly_curprocmask, cpuid);
 616                 dd->upri = nlp->lwp_priority;
 617                 dd->uschedcp = nlp;
 618 #if 0
 619                 dd->rrcount = 0;                /* reset round robin */
 620 #endif
 621                 spin_unlock(&dd->spin);
 622                 lwkt_acquire(nlp->lwp_thread);
 623                 lwkt_schedule(nlp->lwp_thread);
 624         } else {
 625                 spin_unlock(&dd->spin);
 626         }
 627         crit_exit_gd(gd);
 628 }
 629
 630 /*
 631  * Place the specified lwp on the user scheduler's run queue.  This routine
 632  * must be called with the thread descheduled.  The lwp must be runnable.
 633  * It must not be possible for anyone else to explicitly schedule this thread.
 634  *
 635  * The thread may be the current thread as a special case.
 636  */
 637 static void
 638 dfly_setrunqueue(struct lwp *lp)
 639 {
 640         dfly_pcpu_t dd;
 641         dfly_pcpu_t rdd;
 642
 643         /*
 644          * First validate the process LWKT state.
 645          */
 646         KASSERT(lp->lwp_stat == LSRUN, ("setrunqueue: lwp not LSRUN"));
 647         KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0,
 648             ("lwp %d/%d already on runq! flag %08x/%08x", lp->lwp_proc->p_pid,
 649              lp->lwp_tid, lp->lwp_proc->p_flags, lp->lwp_flags));
 650         KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
 651
 652         /*
 653          * NOTE: dd/rdd do not necessarily represent the current cpu.
 654          *       Instead they may represent the cpu the thread was last
 655          *       scheduled on or inherited by its parent.
 656          */
 657         dd = &dfly_pcpu[lp->lwp_qcpu];
 658         rdd = dd;
 659
 660         /*
 661          * This process is not supposed to be scheduled anywhere or assigned
 662          * as the current process anywhere.  Assert the condition.
 663          */
 664         KKASSERT(rdd->uschedcp != lp);
 665
 666         /*
 667          * Ok, we have to setrunqueue some target cpu and request a reschedule
 668          * if necessary.
 669          *
 670          * We have to choose the best target cpu.  It might not be the current
 671          * target even if the current cpu has no running user thread (for
 672          * example, because the current cpu might be a hyperthread and its
 673          * sibling has a thread assigned).
 674          *
 675          * If we just forked it is most optimal to run the child on the same
 676          * cpu just in case the parent decides to wait for it (thus getting
 677          * off that cpu).  As long as there is nothing else runnable on the
 678          * cpu, that is.  If we did this unconditionally a parent forking
 679          * multiple children before waiting (e.g. make -j N) leaves other
 680          * cpus idle that could be working.
 681          */
 682         if (lp->lwp_forked) {
 683                 lp->lwp_forked = 0;
 684                 if (usched_dfly_features & 0x20)
 685                         rdd = dfly_choose_best_queue(lp);
 686                 else if (usched_dfly_features & 0x40)
 687                         rdd = &dfly_pcpu[lp->lwp_qcpu];
 688                 else if (usched_dfly_features & 0x80)
 689                         rdd = dfly_choose_queue_simple(rdd, lp);
 690                 else if (dfly_pcpu[lp->lwp_qcpu].runqcount)
 691                         rdd = dfly_choose_best_queue(lp);
 692                 else
 693                         rdd = &dfly_pcpu[lp->lwp_qcpu];
 694         } else {
 695                 rdd = dfly_choose_best_queue(lp);
 696                 /* rdd = &dfly_pcpu[lp->lwp_qcpu]; */
 697         }
 698         if (lp->lwp_qcpu != rdd->cpuid) {
 699                 spin_lock(&dd->spin);
 700                 dfly_changeqcpu_locked(lp, dd, rdd);
 701                 spin_unlock(&dd->spin);
 702         }
 703         dfly_setrunqueue_dd(rdd, lp);
 704 }
 705
 706 /*
 707  * Change qcpu to rdd->cpuid.  The dd the lp is CURRENTLY on must be
 708  * spin-locked on-call.  rdd does not have to be.
 709  */
 710 static void
 711 dfly_changeqcpu_locked(struct lwp *lp, dfly_pcpu_t dd, dfly_pcpu_t rdd)
 712 {
 713         if (lp->lwp_qcpu != rdd->cpuid) {
 714                 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
 715                         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
 716                         atomic_add_long(&dd->uload, -lp->lwp_uload);
 717                         atomic_add_int(&dd->ucount, -1);
 718                 }
 719                 lp->lwp_qcpu = rdd->cpuid;
 720         }
 721 }
 722
 723 /*
 724  * Place lp on rdd's runqueue.  Nothing is locked on call.  This function
 725  * also performs all necessary ancillary notification actions.
 726  */
 727 static void
 728 dfly_setrunqueue_dd(dfly_pcpu_t rdd, struct lwp *lp)
 729 {
 730         globaldata_t rgd;
 731
 732         /*
 733          * We might be moving the lp to another cpu's run queue, and once
 734          * on the runqueue (even if it is our cpu's), another cpu can rip
 735          * it away from us.
 736          *
 737          * TDF_MIGRATING might already be set if this is part of a
 738          * remrunqueue+setrunqueue sequence.
 739          */
 740         if ((lp->lwp_thread->td_flags & TDF_MIGRATING) == 0)
 741                 lwkt_giveaway(lp->lwp_thread);
 742
 743         rgd = globaldata_find(rdd->cpuid);
 744
 745         /*
 746          * We lose control of the lp the moment we release the spinlock
 747          * after having placed it on the queue.  i.e. another cpu could pick
 748          * it up, or it could exit, or its priority could be further
 749          * adjusted, or something like that.
 750          *
 751          * WARNING! rdd can point to a foreign cpu!
 752          */
 753         spin_lock(&rdd->spin);
 754         dfly_setrunqueue_locked(rdd, lp);
 755
 756         /*
 757          * Potentially interrupt the currently-running thread
 758          */
 759         if ((rdd->upri & ~PPQMASK) <= (lp->lwp_priority & ~PPQMASK)) {
 760                 /*
 761                  * Currently running thread is better or same, do not
 762                  * interrupt.
 763                  */
 764                 spin_unlock(&rdd->spin);
 765         } else if ((rdd->upri & ~PPQMASK) <= (lp->lwp_priority & ~PPQMASK) +
 766                    usched_dfly_fast_resched) {
 767                 /*
 768                  * Currently running thread is not better, but not so bad
 769                  * that we need to interrupt it.  Let it run for one more
 770                  * scheduler tick.
 771                  */
 772                 if (rdd->uschedcp &&
 773                     rdd->uschedcp->lwp_rrcount < usched_dfly_rrinterval) {
 774                         rdd->uschedcp->lwp_rrcount = usched_dfly_rrinterval - 1;
 775                 }
 776                 spin_unlock(&rdd->spin);
 777         } else if (rgd == mycpu) {
 778                 /*
 779                  * We should interrupt the currently running thread, which
 780                  * is on the current cpu.  However, if DIDYIELD is set we
 781                  * round-robin unconditionally and do not interrupt it.
 782                  */
 783                 spin_unlock(&rdd->spin);
 784                 if (rdd->uschedcp == NULL)
 785                         wakeup_mycpu(rdd->helper_thread); /* XXX */
 786                 if ((lp->lwp_thread->td_mpflags & TDF_MP_DIDYIELD) == 0)
 787                         need_user_resched();
 788         } else {
 789                 /*
 790                  * We should interrupt the currently running thread, which
 791                  * is on a different cpu.
 792                  */
 793                 spin_unlock(&rdd->spin);
 794                 lwkt_send_ipiq(rgd, dfly_need_user_resched_remote, NULL);
 795         }
 796 }
 797
 798 /*
 799  * This routine is called from a systimer IPI.  It MUST be MP-safe and
 800  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ on
 801  * each cpu.
 802  */
 803 static
 804 void
 805 dfly_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
 806 {
 807         globaldata_t gd = mycpu;
 808         dfly_pcpu_t dd = &dfly_pcpu[gd->gd_cpuid];
 809
 810         /*
 811          * Spinlocks also hold a critical section so there should not be
 812          * any active.
 813          */
 814         KKASSERT(gd->gd_spinlocks == 0 || dumping);
 815
 816         /*
 817          * If lp is NULL we might be contended and lwkt_switch() may have
 818          * cycled into the idle thread.  Apply the tick to the current
 819          * process on this cpu if it is contended.
 820          */
 821         if (gd->gd_curthread == &gd->gd_idlethread) {
 822                 lp = dd->uschedcp;
 823                 if (lp && (lp->lwp_thread == NULL ||
 824                            lp->lwp_thread->td_contended == 0)) {
 825                         lp = NULL;
 826                 }
 827         }
 828
 829         /*
 830          * Dock thread for tick
 831          */
 832         if (lp) {
 833                 /*
 834                  * Do we need to round-robin?  We round-robin 10 times a
 835                  * second.  This should only occur for cpu-bound batch
 836                  * processes.
 837                  */
 838                 if (++lp->lwp_rrcount >= usched_dfly_rrinterval) {
 839                         lp->lwp_thread->td_wakefromcpu = -1;
 840                         need_user_resched();
 841                 }
 842
 843                 /*
 844                  * Adjust estcpu upward using a real time equivalent
 845                  * calculation, and recalculate lp's priority.  Estcpu
 846                  * is increased such that it will cap-out over a period
 847                  * of one second.
 848                  */
 849                 lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu +
 850                                            ESTCPUMAX / ESTCPUFREQ + 1);
 851                 dfly_resetpriority(lp);
 852         }
 853
 854         /*
 855          * Rebalance two cpus every 8 ticks, pulling the worst thread
 856          * from the worst cpu's queue into a rotating cpu number.
 857          *
 858          * This mechanic is needed because the push algorithms can
 859          * steady-state in an non-optimal configuration.  We need to mix it
 860          * up a little, even if it means breaking up a paired thread, so
 861          * the push algorithms can rebalance the degenerate conditions.
 862          * This portion of the algorithm exists to ensure stability at the
 863          * selected weightings.
 864          *
 865          * Because we might be breaking up optimal conditions we do not want
 866          * to execute this too quickly, hence we only rebalance approximately
 867          * ~7-8 times per second.  The push's, on the otherhand, are capable
 868          * moving threads to other cpus at a much higher rate.
 869          *
 870          * We choose the most heavily loaded thread from the worst queue
 871          * in order to ensure that multiple heavy-weight threads on the same
 872          * queue get broken up, and also because these threads are the most
 873          * likely to be able to remain in place.  Hopefully then any pairings,
 874          * if applicable, migrate to where these threads are.
 875          */
 876         if ((usched_dfly_features & 0x04) &&
 877             ((u_int)sched_ticks & 7) == 0 &&
 878             (u_int)sched_ticks / 8 % ncpus == gd->gd_cpuid) {
 879                 /*
 880                  * Our cpu is up.
 881                  */
 882                 struct lwp *nlp;
 883                 dfly_pcpu_t rdd;
 884
 885                 rdd = dfly_choose_worst_queue(dd);
 886                 if (rdd) {
 887                         spin_lock(&dd->spin);
 888                         if (spin_trylock(&rdd->spin)) {
 889                                 nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1);
 890                                 spin_unlock(&rdd->spin);
 891                                 if (nlp == NULL)
 892                                         spin_unlock(&dd->spin);
 893                         } else {
 894                                 spin_unlock(&dd->spin);
 895                                 nlp = NULL;
 896                         }
 897                 } else {
 898                         nlp = NULL;
 899                 }
 900                 /* dd->spin held if nlp != NULL */
 901
 902                 /*
 903                  * Either schedule it or add it to our queue.
 904                  */
 905                 if (nlp &&
 906                     (nlp->lwp_priority & ~PPQMASK) < (dd->upri & ~PPQMASK)) {
 907                         ATOMIC_CPUMASK_ORMASK(dfly_curprocmask, dd->cpumask);
 908                         dd->upri = nlp->lwp_priority;
 909                         dd->uschedcp = nlp;
 910 #if 0
 911                         dd->rrcount = 0;        /* reset round robin */
 912 #endif
 913                         spin_unlock(&dd->spin);
 914                         lwkt_acquire(nlp->lwp_thread);
 915                         lwkt_schedule(nlp->lwp_thread);
 916                 } else if (nlp) {
 917                         dfly_setrunqueue_locked(dd, nlp);
 918                         spin_unlock(&dd->spin);
 919                 }
 920         }
 921 }
 922
 923 /*
 924  * Called from acquire and from kern_synch's one-second timer (one of the
 925  * callout helper threads) with a critical section held.
 926  *
 927  * Adjust p_estcpu based on our single-cpu load, p_nice, and compensate for
 928  * overall system load.
 929  *
 930  * Note that no recalculation occurs for a process which sleeps and wakes
 931  * up in the same tick.  That is, a system doing thousands of context
 932  * switches per second will still only do serious estcpu calculations
 933  * ESTCPUFREQ times per second.
 934  */
 935 static
 936 void
 937 dfly_recalculate_estcpu(struct lwp *lp)
 938 {
 939         globaldata_t gd = mycpu;
 940         sysclock_t cpbase;
 941         sysclock_t ttlticks;
 942         int estcpu;
 943         int decay_factor;
 944         int ucount;
 945
 946         /*
 947          * We have to subtract periodic to get the last schedclock
 948          * timeout time, otherwise we would get the upcoming timeout.
 949          * Keep in mind that a process can migrate between cpus and
 950          * while the scheduler clock should be very close, boundary
 951          * conditions could lead to a small negative delta.
 952          */
 953         cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
 954
 955         if (lp->lwp_slptime > 1) {
 956                 /*
 957                  * Too much time has passed, do a coarse correction.
 958                  */
 959                 lp->lwp_estcpu = lp->lwp_estcpu >> 1;
 960                 dfly_resetpriority(lp);
 961                 lp->lwp_cpbase = cpbase;
 962                 lp->lwp_cpticks = 0;
 963                 lp->lwp_estfast = 0;
 964         } else if (lp->lwp_cpbase != cpbase) {
 965                 /*
 966                  * Adjust estcpu if we are in a different tick.  Don't waste
 967                  * time if we are in the same tick.
 968                  *
 969                  * First calculate the number of ticks in the measurement
 970                  * interval.  The ttlticks calculation can wind up 0 due to
 971                  * a bug in the handling of lwp_slptime  (as yet not found),
 972                  * so make sure we do not get a divide by 0 panic.
 973                  */
 974                 ttlticks = (cpbase - lp->lwp_cpbase) /
 975                            gd->gd_schedclock.periodic;
 976                 if ((ssysclock_t)ttlticks < 0) {
 977                         ttlticks = 0;
 978                         lp->lwp_cpbase = cpbase;
 979                 }
 980                 if (ttlticks < 4)
 981                         return;
 982                 updatepcpu(lp, lp->lwp_cpticks, ttlticks);
 983
 984                 /*
 985                  * Calculate instant estcpu based percentage of (one) cpu
 986                  * used and exponentially average it into the current
 987                  * lwp_estcpu.
 988                  */
 989                 ucount = dfly_pcpu[lp->lwp_qcpu].ucount;
 990                 estcpu = lp->lwp_cpticks * ESTCPUMAX / ttlticks;
 991
 992                 /*
 993                  * The higher ttlticks gets, the more meaning the calculation
 994                  * has and the smaller our decay_factor in the exponential
 995                  * average.
 996                  *
 997                  * The uload calculation has been removed because it actually
 998                  * makes things worse, causing processes which use less cpu
 999                  * (such as a browser) to be pumped up and treated the same
1000                  * as a cpu-bound process (such as a make).  The same effect
1001                  * can occur with sufficient load without the uload
1002                  * calculation, but occurs less quickly and takes more load.
1003                  * In addition, the less cpu a process uses the smaller the
1004                  * effect of the overload.
1005                  */
1006                 if (ttlticks >= hz)
1007                         decay_factor = 1;
1008                 else
1009                         decay_factor = hz - ttlticks;
1010
1011                 lp->lwp_estcpu = ESTCPULIM(
1012                                 (lp->lwp_estcpu * ttlticks + estcpu) /
1013                                 (ttlticks + 1));
1014                 if (usched_dfly_debug == lp->lwp_proc->p_pid)
1015                         kprintf(" finalestcpu %d %d\n", estcpu, lp->lwp_estcpu);
1016
1017 #if 0
1018                 /*
1019                  * Calculate the percentage of one cpu being used then
1020                  * compensate for any system load in excess of ncpus.
1021                  *
1022                  * For example, if we have 8 cores and 16 running cpu-bound
1023                  * processes then all things being equal each process will
1024                  * get 50% of one cpu.  We need to pump this value back
1025                  * up to 100% so the estcpu calculation properly adjusts
1026                  * the process's dynamic priority.
1027                  *
1028                  * estcpu is scaled by ESTCPUMAX, pctcpu is scaled by FSCALE.
1029                  */
1030
1031                 estcpu = (lp->lwp_pctcpu * ESTCPUMAX) >> FSHIFT;
1032                 ucount = dfly_ucount;
1033                 if (ucount > ncpus) {
1034                         estcpu += estcpu * (ucount - ncpus) / ncpus;
1035                 }
1036
1037                 if (usched_dfly_debug == lp->lwp_proc->p_pid) {
1038                         kprintf("pid %d lwp %p estcpu %3d %3d cp %d/%d",
1039                                 lp->lwp_proc->p_pid, lp,
1040                                 estcpu, lp->lwp_estcpu,
1041                                 lp->lwp_cpticks, ttlticks);
1042                 }
1043
1044                 /*
1045                  * Adjust lp->lwp_esetcpu.  The decay factor determines how
1046                  * quickly lwp_estcpu collapses to its realtime calculation.
1047                  * A slower collapse gives us a more accurate number over
1048                  * the long term but can create problems with bursty threads
1049                  * or threads which become cpu hogs.
1050                  *
1051                  * To solve this problem, newly started lwps and lwps which
1052                  * are restarting after having been asleep for a while are
1053                  * given a much, much faster decay in order to quickly
1054                  * detect whether they become cpu-bound.
1055                  *
1056                  * NOTE: p_nice is accounted for in dfly_resetpriority(),
1057                  *       and not here, but we must still ensure that a
1058                  *       cpu-bound nice -20 process does not completely
1059                  *       override a cpu-bound nice +20 process.
1060                  *
1061                  * NOTE: We must use ESTCPULIM() here to deal with any
1062                  *       overshoot.
1063                  */
1064                 decay_factor = usched_dfly_decay;
1065                 if (decay_factor < 1)
1066                         decay_factor = 1;
1067                 if (decay_factor > 1024)
1068                         decay_factor = 1024;
1069
1070                 if (lp->lwp_estfast < usched_dfly_decay) {
1071                         ++lp->lwp_estfast;
1072                         lp->lwp_estcpu = ESTCPULIM(
1073                                 (lp->lwp_estcpu * lp->lwp_estfast + estcpu) /
1074                                 (lp->lwp_estfast + 1));
1075                 } else {
1076                         lp->lwp_estcpu = ESTCPULIM(
1077                                 (lp->lwp_estcpu * decay_factor + estcpu) /
1078                                 (decay_factor + 1));
1079                 }
1080
1081                 if (usched_dfly_debug == lp->lwp_proc->p_pid)
1082                         kprintf(" finalestcpu %d\n", lp->lwp_estcpu);
1083 #endif
1084                 dfly_resetpriority(lp);
1085                 lp->lwp_cpbase += ttlticks * gd->gd_schedclock.periodic;
1086                 lp->lwp_cpticks = 0;
1087         }
1088 }
1089
1090 /*
1091  * Compute the priority of a process when running in user mode.
1092  * Arrange to reschedule if the resulting priority is better
1093  * than that of the current process.
1094  *
1095  * This routine may be called with any process.
1096  *
1097  * This routine is called by fork1() for initial setup with the process of
1098  * the run queue, and also may be called normally with the process on or
1099  * off the run queue.
1100  */
1101 static void
1102 dfly_resetpriority(struct lwp *lp)
1103 {
1104         dfly_pcpu_t rdd;
1105         int newpriority;
1106         u_short newrqtype;
1107         int rcpu;
1108         int checkpri;
1109         int estcpu;
1110         int delta_uload;
1111
1112         crit_enter();
1113
1114         /*
1115          * Lock the scheduler (lp) belongs to.  This can be on a different
1116          * cpu.  Handle races.  This loop breaks out with the appropriate
1117          * rdd locked.
1118          */
1119         for (;;) {
1120                 rcpu = lp->lwp_qcpu;
1121                 cpu_ccfence();
1122                 rdd = &dfly_pcpu[rcpu];
1123                 spin_lock(&rdd->spin);
1124                 if (rcpu == lp->lwp_qcpu)
1125                         break;
1126                 spin_unlock(&rdd->spin);
1127         }
1128
1129         /*
1130          * Calculate the new priority and queue type
1131          */
1132         newrqtype = lp->lwp_rtprio.type;
1133
1134         switch(newrqtype) {
1135         case RTP_PRIO_REALTIME:
1136         case RTP_PRIO_FIFO:
1137                 newpriority = PRIBASE_REALTIME +
1138                              (lp->lwp_rtprio.prio & PRIMASK);
1139                 break;
1140         case RTP_PRIO_NORMAL:
1141                 /*
1142                  * Calculate the new priority.
1143                  *
1144                  * nice contributes up to NICE_QS queues (typ 32 - full range)
1145                  * estcpu contributes up to EST_QS queues (typ 16)
1146                  *
1147                  * A nice +20 process receives 1/10 cpu vs nice+0.  Niced
1148                  * process more than 20 apart may receive no cpu, so cpu
1149                  * bound nice -20 can prevent a nice +5 from getting any
1150                  * cpu.  A nice+0, being in the middle, always gets some cpu
1151                  * no matter what.
1152                  */
1153                 estcpu = lp->lwp_estcpu;
1154                 newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) *
1155                               (NICE_QS * PPQ) / PRIO_RANGE;
1156                 newpriority += estcpu * PPQ / ESTCPUPPQ;
1157                 if (newpriority < 0)
1158                         newpriority = 0;
1159                 if (newpriority >= MAXPRI)
1160                         newpriority = MAXPRI - 1;
1161                 newpriority += PRIBASE_NORMAL;
1162                 break;
1163         case RTP_PRIO_IDLE:
1164                 newpriority = PRIBASE_IDLE + (lp->lwp_rtprio.prio & PRIMASK);
1165                 break;
1166         case RTP_PRIO_THREAD:
1167                 newpriority = PRIBASE_THREAD + (lp->lwp_rtprio.prio & PRIMASK);
1168                 break;
1169         default:
1170                 panic("Bad RTP_PRIO %d", newrqtype);
1171                 /* NOT REACHED */
1172         }
1173
1174         /*
1175          * The LWKT scheduler doesn't dive usched structures, give it a hint
1176          * on the relative priority of user threads running in the kernel.
1177          * The LWKT scheduler will always ensure that a user thread running
1178          * in the kernel will get cpu some time, regardless of its upri,
1179          * but can decide not to instantly switch from one kernel or user
1180          * mode user thread to a kernel-mode user thread when it has a less
1181          * desireable user priority.
1182          *
1183          * td_upri has normal sense (higher values are more desireable), so
1184          * negate it (this is a different field lp->lwp_priority)
1185          */
1186         lp->lwp_thread->td_upri = -(newpriority & usched_dfly_swmask);
1187
1188         /*
1189          * The newpriority incorporates the queue type so do a simple masked
1190          * check to determine if the process has moved to another queue.  If
1191          * it has, and it is currently on a run queue, then move it.
1192          *
1193          * Since uload is ~PPQMASK masked, no modifications are necessary if
1194          * we end up in the same run queue.
1195          *
1196          * Reset rrcount if moving to a higher-priority queue, otherwise
1197          * retain rrcount.
1198          */
1199         if ((lp->lwp_priority ^ newpriority) & ~PPQMASK) {
1200                 if (lp->lwp_priority < newpriority)
1201                         lp->lwp_rrcount = 0;
1202                 if (lp->lwp_mpflags & LWP_MP_ONRUNQ) {
1203                         dfly_remrunqueue_locked(rdd, lp);
1204                         lp->lwp_priority = newpriority;
1205                         lp->lwp_rqtype = newrqtype;
1206                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1207                         dfly_setrunqueue_locked(rdd, lp);
1208                         checkpri = 1;
1209                 } else {
1210                         lp->lwp_priority = newpriority;
1211                         lp->lwp_rqtype = newrqtype;
1212                         lp->lwp_rqindex = (newpriority & PRIMASK) / PPQ;
1213                         checkpri = 0;
1214                 }
1215         } else {
1216                 /*
1217                  * In the same PPQ, uload cannot change.
1218                  */
1219                 lp->lwp_priority = newpriority;
1220                 checkpri = 1;
1221                 rcpu = -1;
1222         }
1223
1224         /*
1225          * Adjust effective load.
1226          *
1227          * Calculate load then scale up or down geometrically based on p_nice.
1228          * Processes niced up (positive) are less important, and processes
1229          * niced downard (negative) are more important.  The higher the uload,
1230          * the more important the thread.
1231          */
1232         /* 0-511, 0-100% cpu */
1233         delta_uload = lp->lwp_estcpu / NQS;
1234         delta_uload -= delta_uload * lp->lwp_proc->p_nice / (PRIO_MAX + 1);
1235         delta_uload -= lp->lwp_uload;
1236         if (lp->lwp_uload + delta_uload < -32767) {
1237                 delta_uload = -32768 - lp->lwp_uload;
1238         } else if (lp->lwp_uload + delta_uload > 32767) {
1239                 delta_uload = 32767 - lp->lwp_uload;
1240         }
1241         lp->lwp_uload += delta_uload;
1242         if (lp->lwp_mpflags & LWP_MP_ULOAD)
1243                 atomic_add_long(&dfly_pcpu[lp->lwp_qcpu].uload, delta_uload);
1244
1245         /*
1246          * Determine if we need to reschedule the target cpu.  This only
1247          * occurs if the LWP is already on a scheduler queue, which means
1248          * that idle cpu notification has already occured.  At most we
1249          * need only issue a need_user_resched() on the appropriate cpu.
1250          *
1251          * The LWP may be owned by a CPU different from the current one,
1252          * in which case dd->uschedcp may be modified without an MP lock
1253          * or a spinlock held.  The worst that happens is that the code
1254          * below causes a spurious need_user_resched() on the target CPU
1255          * and dd->pri to be wrong for a short period of time, both of
1256          * which are harmless.
1257          *
1258          * If checkpri is 0 we are adjusting the priority of the current
1259          * process, possibly higher (less desireable), so ignore the upri
1260          * check which will fail in that case.
1261          */
1262         if (rcpu >= 0) {
1263                 if (CPUMASK_TESTBIT(dfly_rdyprocmask, rcpu) &&
1264                     (checkpri == 0 ||
1265                      (rdd->upri & ~PRIMASK) >
1266                      (lp->lwp_priority & ~PRIMASK))) {
1267                         if (rcpu == mycpu->gd_cpuid) {
1268                                 spin_unlock(&rdd->spin);
1269                                 need_user_resched();
1270                         } else {
1271                                 spin_unlock(&rdd->spin);
1272                                 lwkt_send_ipiq(globaldata_find(rcpu),
1273                                                dfly_need_user_resched_remote,
1274                                                NULL);
1275                         }
1276                 } else {
1277                         spin_unlock(&rdd->spin);
1278                 }
1279         } else {
1280                 spin_unlock(&rdd->spin);
1281         }
1282         crit_exit();
1283 }
1284
1285 static
1286 void
1287 dfly_yield(struct lwp *lp)
1288 {
1289         if (lp->lwp_qcpu != mycpu->gd_cpuid)
1290                 return;
1291         KKASSERT(lp == curthread->td_lwp);
1292
1293         /*
1294          * Don't set need_user_resched() or mess with rrcount or anything.
1295          * the TDF flag will override everything as long as we release.
1296          */
1297         atomic_set_int(&lp->lwp_thread->td_mpflags, TDF_MP_DIDYIELD);
1298         dfly_release_curproc(lp);
1299 }
1300
1301 /*
1302  * Thread was forcefully migrated to another cpu.  Normally forced migrations
1303  * are used for iterations and the kernel returns to the original cpu before
1304  * returning and this is not needed.  However, if the kernel migrates a
1305  * thread to another cpu and wants to leave it there, it has to call this
1306  * scheduler helper.
1307  *
1308  * Note that the lwkt_migratecpu() function also released the thread, so
1309  * we don't have to worry about that.
1310  */
1311 static
1312 void
1313 dfly_changedcpu(struct lwp *lp)
1314 {
1315         dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
1316         dfly_pcpu_t rdd = &dfly_pcpu[mycpu->gd_cpuid];
1317
1318         if (dd != rdd) {
1319                 spin_lock(&dd->spin);
1320                 dfly_changeqcpu_locked(lp, dd, rdd);
1321                 spin_unlock(&dd->spin);
1322         }
1323 }
1324
1325 /*
1326  * Called from fork1() when a new child process is being created.
1327  *
1328  * Give the child process an initial estcpu that is more batch then
1329  * its parent and dock the parent for the fork (but do not
1330  * reschedule the parent).
1331  *
1332  * fast
1333  *
1334  * XXX lwp should be "spawning" instead of "forking"
1335  */
1336 static void
1337 dfly_forking(struct lwp *plp, struct lwp *lp)
1338 {
1339         int estcpu;
1340
1341         /*
1342          * Put the child 4 queue slots (out of 32) higher than the parent
1343          * (less desireable than the parent).
1344          */
1345         lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu +
1346                                    ESTCPUPPQ * usched_dfly_forkbias);
1347         lp->lwp_forked = 1;
1348         lp->lwp_estfast = 0;
1349
1350         /*
1351          * Even though the lp will be scheduled specially the first time
1352          * due to lp->lwp_forked, it is important to initialize lwp_qcpu
1353          * to avoid favoring a fixed cpu.
1354          */
1355 #if 0
1356         static uint16_t save_cpu;
1357         lp->lwp_qcpu = ++save_cpu % ncpus;
1358 #else
1359         lp->lwp_qcpu = plp->lwp_qcpu;
1360         if (CPUMASK_TESTBIT(lp->lwp_cpumask, lp->lwp_qcpu) == 0)
1361                 lp->lwp_qcpu = BSFCPUMASK(lp->lwp_cpumask);
1362 #endif
1363
1364         /*
1365          * Dock the parent a cost for the fork, protecting us from fork
1366          * bombs.  If the parent is forking quickly this makes both the
1367          * parent and child more batchy.
1368          */
1369         estcpu = plp->lwp_estcpu + ESTCPUPPQ / 16;
1370         plp->lwp_estcpu = ESTCPULIM(estcpu);
1371 }
1372
1373 /*
1374  * Called when a lwp is being removed from this scheduler, typically
1375  * during lwp_exit().  We have to clean out any ULOAD accounting before
1376  * we can let the lp go.  The dd->spin lock is not needed for uload
1377  * updates.
1378  *
1379  * Scheduler dequeueing has already occurred, no further action in that
1380  * regard is needed.
1381  */
1382 static void
1383 dfly_exiting(struct lwp *lp, struct proc *child_proc)
1384 {
1385         dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
1386
1387         if (lp->lwp_mpflags & LWP_MP_ULOAD) {
1388                 atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
1389                 atomic_add_long(&dd->uload, -lp->lwp_uload);
1390                 atomic_add_int(&dd->ucount, -1);
1391         }
1392 }
1393
1394 /*
1395  * This function cannot block in any way, but spinlocks are ok.
1396  *
1397  * Update the uload based on the state of the thread (whether it is going
1398  * to sleep or running again).  The uload is meant to be a longer-term
1399  * load and not an instantanious load.
1400  */
1401 static void
1402 dfly_uload_update(struct lwp *lp)
1403 {
1404         dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
1405
1406         if (lp->lwp_thread->td_flags & TDF_RUNQ) {
1407                 if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
1408                         spin_lock(&dd->spin);
1409                         if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
1410                                 atomic_set_int(&lp->lwp_mpflags,
1411                                                LWP_MP_ULOAD);
1412                                 atomic_add_long(&dd->uload, lp->lwp_uload);
1413                                 atomic_add_int(&dd->ucount, 1);
1414                         }
1415                         spin_unlock(&dd->spin);
1416                 }
1417         } else if (lp->lwp_slptime > 0) {
1418                 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
1419                         spin_lock(&dd->spin);
1420                         if (lp->lwp_mpflags & LWP_MP_ULOAD) {
1421                                 atomic_clear_int(&lp->lwp_mpflags,
1422                                                  LWP_MP_ULOAD);
1423                                 atomic_add_long(&dd->uload, -lp->lwp_uload);
1424                                 atomic_add_int(&dd->ucount, -1);
1425                         }
1426                         spin_unlock(&dd->spin);
1427                 }
1428         }
1429 }
1430
1431 /*
1432  * chooseproc() is called when a cpu needs a user process to LWKT schedule,
1433  * it selects a user process and returns it.  If chklp is non-NULL and chklp
1434  * has a better or equal priority then the process that would otherwise be
1435  * chosen, NULL is returned.
1436  *
1437  * Until we fix the RUNQ code the chklp test has to be strict or we may
1438  * bounce between processes trying to acquire the current process designation.
1439  *
1440  * Must be called with rdd->spin locked.  The spinlock is left intact through
1441  * the entire routine.  dd->spin does not have to be locked.
1442  *
1443  * If worst is non-zero this function finds the worst thread instead of the
1444  * best thread (used by the schedulerclock-based rover).
1445  */
1446 static
1447 struct lwp *
1448 dfly_chooseproc_locked(dfly_pcpu_t rdd, dfly_pcpu_t dd,
1449                        struct lwp *chklp, int worst)
1450 {
1451         struct lwp *lp;
1452         struct rq *q;
1453         u_int32_t *which;
1454         u_int32_t pri;
1455         u_int32_t rtqbits;
1456         u_int32_t tsqbits;
1457         u_int32_t idqbits;
1458
1459         rtqbits = rdd->rtqueuebits;
1460         tsqbits = rdd->queuebits;
1461         idqbits = rdd->idqueuebits;
1462
1463         if (worst) {
1464                 if (idqbits) {
1465                         pri = bsrl(idqbits);
1466                         q = &rdd->idqueues[pri];
1467                         which = &rdd->idqueuebits;
1468                 } else if (tsqbits) {
1469                         pri = bsrl(tsqbits);
1470                         q = &rdd->queues[pri];
1471                         which = &rdd->queuebits;
1472                 } else if (rtqbits) {
1473                         pri = bsrl(rtqbits);
1474                         q = &rdd->rtqueues[pri];
1475                         which = &rdd->rtqueuebits;
1476                 } else {
1477                         return (NULL);
1478                 }
1479                 lp = TAILQ_LAST(q, rq);
1480         } else {
1481                 if (rtqbits) {
1482                         pri = bsfl(rtqbits);
1483                         q = &rdd->rtqueues[pri];
1484                         which = &rdd->rtqueuebits;
1485                 } else if (tsqbits) {
1486                         pri = bsfl(tsqbits);
1487                         q = &rdd->queues[pri];
1488                         which = &rdd->queuebits;
1489                 } else if (idqbits) {
1490                         pri = bsfl(idqbits);
1491                         q = &rdd->idqueues[pri];
1492                         which = &rdd->idqueuebits;
1493                 } else {
1494                         return (NULL);
1495                 }
1496                 lp = TAILQ_FIRST(q);
1497         }
1498         KASSERT(lp, ("chooseproc: no lwp on busy queue"));
1499
1500         /*
1501          * If the passed lwp <chklp> is reasonably close to the selected
1502          * lwp <lp>, return NULL (indicating that <chklp> should be kept).
1503          *
1504          * Note that we must error on the side of <chklp> to avoid bouncing
1505          * between threads in the acquire code.
1506          */
1507         if (chklp) {
1508                 if (chklp->lwp_priority < lp->lwp_priority + PPQ)
1509                         return(NULL);
1510         }
1511
1512         KTR_COND_LOG(usched_chooseproc,
1513             lp->lwp_proc->p_pid == usched_dfly_pid_debug,
1514             lp->lwp_proc->p_pid,
1515             lp->lwp_thread->td_gd->gd_cpuid,
1516             mycpu->gd_cpuid);
1517
1518         KASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) != 0, ("not on runq6!"));
1519         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
1520         TAILQ_REMOVE(q, lp, lwp_procq);
1521         --rdd->runqcount;
1522         if (TAILQ_EMPTY(q))
1523                 *which &= ~(1 << pri);
1524
1525         /*
1526          * If we are choosing a process from rdd with the intent to
1527          * move it to dd, lwp_qcpu must be adjusted while rdd's spinlock
1528          * is still held.
1529          */
1530         if (rdd != dd) {
1531                 if (lp->lwp_mpflags & LWP_MP_ULOAD) {
1532                         atomic_add_long(&rdd->uload, -lp->lwp_uload);
1533                         atomic_add_int(&rdd->ucount, -1);
1534                 }
1535                 lp->lwp_qcpu = dd->cpuid;
1536                 atomic_add_long(&dd->uload, lp->lwp_uload);
1537                 atomic_add_int(&dd->ucount, 1);
1538                 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
1539         }
1540         return lp;
1541 }
1542
1543 /*
1544  * USED TO PUSH RUNNABLE LWPS TO THE LEAST LOADED CPU.
1545  *
1546  * Choose a cpu node to schedule lp on, hopefully nearby its current
1547  * node.
1548  *
1549  * We give the current node a modest advantage for obvious reasons.
1550  *
1551  * We also give the node the thread was woken up FROM a slight advantage
1552  * in order to try to schedule paired threads which synchronize/block waiting
1553  * for each other fairly close to each other.  Similarly in a network setting
1554  * this feature will also attempt to place a user process near the kernel
1555  * protocol thread that is feeding it data.  THIS IS A CRITICAL PART of the
1556  * algorithm as it heuristically groups synchronizing processes for locality
1557  * of reference in multi-socket systems.
1558  *
1559  * We check against running processes and give a big advantage if there
1560  * are none running.
1561  *
1562  * The caller will normally dfly_setrunqueue() lp on the returned queue.
1563  *
1564  * When the topology is known choose a cpu whos group has, in aggregate,
1565  * has the lowest weighted load.
1566  */
1567 static
1568 dfly_pcpu_t
1569 dfly_choose_best_queue(struct lwp *lp)
1570 {
1571         cpumask_t wakemask;
1572         cpumask_t mask;
1573         cpu_node_t *cpup;
1574         cpu_node_t *cpun;
1575         cpu_node_t *cpub;
1576         dfly_pcpu_t dd = &dfly_pcpu[lp->lwp_qcpu];
1577         dfly_pcpu_t rdd;
1578         int wakecpu;
1579         int cpuid;
1580         int n;
1581         int count;
1582         long load;
1583         long lowest_load;
1584
1585         /*
1586          * When the topology is unknown choose a random cpu that is hopefully
1587          * idle.
1588          */
1589         if (dd->cpunode == NULL)
1590                 return (dfly_choose_queue_simple(dd, lp));
1591
1592         /*
1593          * Pairing mask
1594          */
1595         if ((wakecpu = lp->lwp_thread->td_wakefromcpu) >= 0)
1596                 wakemask = dfly_pcpu[wakecpu].cpumask;
1597         else
1598                 CPUMASK_ASSZERO(wakemask);
1599
1600         /*
1601          * When the topology is known choose a cpu whos group has, in
1602          * aggregate, has the lowest weighted load.
1603          */
1604         cpup = root_cpu_node;
1605         rdd = dd;
1606
1607         while (cpup) {
1608                 /*
1609                  * Degenerate case super-root
1610                  */
1611                 if (cpup->child_no == 1) {
1612                         cpup = cpup->child_node[0];
1613                         continue;
1614                 }
1615
1616                 /*
1617                  * Terminal cpunode
1618                  */
1619                 if (cpup->child_no == 0) {
1620                         rdd = &dfly_pcpu[BSFCPUMASK(cpup->members)];
1621                         break;
1622                 }
1623
1624                 cpub = NULL;
1625                 lowest_load = 0x7FFFFFFFFFFFFFFFLLU;
1626
1627                 for (n = 0; n < cpup->child_no; ++n) {
1628                         /*
1629                          * Accumulate load information for all cpus
1630                          * which are members of this node.
1631                          */
1632                         cpun = cpup->child_node[n];
1633                         mask = cpun->members;
1634                         CPUMASK_ANDMASK(mask, usched_global_cpumask);
1635                         CPUMASK_ANDMASK(mask, smp_active_mask);
1636                         CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
1637                         if (CPUMASK_TESTZERO(mask))
1638                                 continue;
1639
1640                         count = 0;
1641                         load = 0;
1642
1643                         while (CPUMASK_TESTNZERO(mask)) {
1644                                 cpuid = BSFCPUMASK(mask);
1645                                 rdd = &dfly_pcpu[cpuid];
1646                                 load += rdd->uload;
1647                                 load += rdd->ucount * usched_dfly_weight3;
1648
1649                                 if (rdd->uschedcp == NULL &&
1650                                     rdd->runqcount == 0 &&
1651                                     globaldata_find(cpuid)->gd_tdrunqcount == 0
1652                                 ) {
1653                                         load -= usched_dfly_weight4;
1654                                 }
1655 #if 0
1656                                 else if (rdd->upri > lp->lwp_priority + PPQ) {
1657                                         load -= usched_dfly_weight4 / 2;
1658                                 }
1659 #endif
1660                                 CPUMASK_NANDBIT(mask, cpuid);
1661                                 ++count;
1662                         }
1663
1664                         /*
1665                          * Compensate if the lp is already accounted for in
1666                          * the aggregate uload for this mask set.  We want
1667                          * to calculate the loads as if lp were not present,
1668                          * otherwise the calculation is bogus.
1669                          */
1670                         if ((lp->lwp_mpflags & LWP_MP_ULOAD) &&
1671                             CPUMASK_TESTMASK(dd->cpumask, cpun->members)) {
1672                                 load -= lp->lwp_uload;
1673                                 load -= usched_dfly_weight3;
1674                         }
1675
1676                         load /= count;
1677
1678                         /*
1679                          * Advantage the cpu group (lp) is already on.
1680                          */
1681                         if (CPUMASK_TESTMASK(cpun->members, dd->cpumask))
1682                                 load -= usched_dfly_weight1;
1683
1684                         /*
1685                          * Advantage the cpu group we want to pair (lp) to,
1686                          * but don't let it go to the exact same cpu as
1687                          * the wakecpu target.
1688                          *
1689                          * We do this by checking whether cpun is a
1690                          * terminal node or not.  All cpun's at the same
1691                          * level will either all be terminal or all not
1692                          * terminal.
1693                          *
1694                          * If it is and we match we disadvantage the load.
1695                          * If it is and we don't match we advantage the load.
1696                          *
1697                          * Also note that we are effectively disadvantaging
1698                          * all-but-one by the same amount, so it won't effect
1699                          * the weight1 factor for the all-but-one nodes.
1700                          */
1701                         if (CPUMASK_TESTMASK(cpun->members, wakemask)) {
1702                                 if (cpun->child_no != 0) {
1703                                         /* advantage */
1704                                         load -= usched_dfly_weight2;
1705                                 } else {
1706                                         if (usched_dfly_features & 0x10)
1707                                                 load += usched_dfly_weight2;
1708                                         else
1709                                                 load -= usched_dfly_weight2;
1710                                 }
1711                         }
1712
1713                         /*
1714                          * Calculate the best load
1715                          */
1716                         if (cpub == NULL || lowest_load > load ||
1717                             (lowest_load == load &&
1718                              CPUMASK_TESTMASK(cpun->members, dd->cpumask))
1719                         ) {
1720                                 lowest_load = load;
1721                                 cpub = cpun;
1722                         }
1723                 }
1724                 cpup = cpub;
1725         }
1726         /* Dispatch this outcast to a proper CPU. */
1727         if (__predict_false(CPUMASK_TESTBIT(lp->lwp_cpumask, rdd->cpuid) == 0))
1728                 rdd = &dfly_pcpu[BSFCPUMASK(lp->lwp_cpumask)];
1729         if (usched_dfly_chooser > 0) {
1730                 --usched_dfly_chooser;          /* only N lines */
1731                 kprintf("lp %02d->%02d %s\n",
1732                         lp->lwp_qcpu, rdd->cpuid, lp->lwp_proc->p_comm);
1733         }
1734         return (rdd);
1735 }
1736
1737 /*
1738  * USED TO PULL RUNNABLE LWPS FROM THE MOST LOADED CPU.
1739  *
1740  * Choose the worst queue close to dd's cpu node with a non-empty runq
1741  * that is NOT dd.  Also require that the moving of the highest-load thread
1742  * from rdd to dd does not cause the uload's to cross each other.
1743  *
1744  * This is used by the thread chooser when the current cpu's queues are
1745  * empty to steal a thread from another cpu's queue.  We want to offload
1746  * the most heavily-loaded queue.
1747  */
1748 static
1749 dfly_pcpu_t
1750 dfly_choose_worst_queue(dfly_pcpu_t dd)
1751 {
1752         cpumask_t mask;
1753         cpu_node_t *cpup;
1754         cpu_node_t *cpun;
1755         cpu_node_t *cpub;
1756         dfly_pcpu_t rdd;
1757         int cpuid;
1758         int n;
1759         int count;
1760         long load;
1761         long highest_load;
1762 #if 0
1763         int pri;
1764         int hpri;
1765 #endif
1766
1767         /*
1768          * When the topology is unknown choose a random cpu that is hopefully
1769          * idle.
1770          */
1771         if (dd->cpunode == NULL) {
1772                 return (NULL);
1773         }
1774
1775         /*
1776          * When the topology is known choose a cpu whos group has, in
1777          * aggregate, has the highest weighted load.
1778          */
1779         cpup = root_cpu_node;
1780         rdd = dd;
1781         while (cpup) {
1782                 /*
1783                  * Degenerate case super-root
1784                  */
1785                 if (cpup->child_no == 1) {
1786                         cpup = cpup->child_node[0];
1787                         continue;
1788                 }
1789
1790                 /*
1791                  * Terminal cpunode
1792                  */
1793                 if (cpup->child_no == 0) {
1794                         rdd = &dfly_pcpu[BSFCPUMASK(cpup->members)];
1795                         break;
1796                 }
1797
1798                 cpub = NULL;
1799                 highest_load = 0;
1800
1801                 for (n = 0; n < cpup->child_no; ++n) {
1802                         /*
1803                          * Accumulate load information for all cpus
1804                          * which are members of this node.
1805                          */
1806                         cpun = cpup->child_node[n];
1807                         mask = cpun->members;
1808                         CPUMASK_ANDMASK(mask, usched_global_cpumask);
1809                         CPUMASK_ANDMASK(mask, smp_active_mask);
1810                         if (CPUMASK_TESTZERO(mask))
1811                                 continue;
1812
1813                         count = 0;
1814                         load = 0;
1815
1816                         while (CPUMASK_TESTNZERO(mask)) {
1817                                 cpuid = BSFCPUMASK(mask);
1818                                 rdd = &dfly_pcpu[cpuid];
1819                                 load += rdd->uload;
1820                                 load += (long)rdd->ucount * usched_dfly_weight3;
1821
1822                                 if (rdd->uschedcp == NULL &&
1823                                     rdd->runqcount == 0 &&
1824                                     globaldata_find(cpuid)->gd_tdrunqcount == 0
1825                                 ) {
1826                                         load -= usched_dfly_weight4;
1827                                 }
1828 #if 0
1829                                 else if (rdd->upri > dd->upri + PPQ) {
1830                                         load -= usched_dfly_weight4 / 2;
1831                                 }
1832 #endif
1833                                 CPUMASK_NANDBIT(mask, cpuid);
1834                                 ++count;
1835                         }
1836                         load /= count;
1837
1838                         /*
1839                          * Prefer candidates which are somewhat closer to
1840                          * our cpu.
1841                          */
1842                         if (CPUMASK_TESTMASK(dd->cpumask, cpun->members))
1843                                 load += usched_dfly_weight1;
1844
1845                         /*
1846                          * The best candidate is the one with the worst
1847                          * (highest) load.
1848                          */
1849                         if (cpub == NULL || highest_load < load ||
1850                             (highest_load == load &&
1851                              CPUMASK_TESTMASK(cpun->members, dd->cpumask))) {
1852                                 highest_load = load;
1853                                 cpub = cpun;
1854                         }
1855                 }
1856                 cpup = cpub;
1857         }
1858
1859         /*
1860          * We never return our own node (dd), and only return a remote
1861          * node if it's load is significantly worse than ours (i.e. where
1862          * stealing a thread would be considered reasonable).
1863          *
1864          * This also helps us avoid breaking paired threads apart which
1865          * can have disastrous effects on performance.
1866          */
1867         if (rdd == dd)
1868                 return(NULL);
1869
1870 #if 0
1871         hpri = 0;
1872         if (rdd->rtqueuebits && hpri < (pri = bsrl(rdd->rtqueuebits)))
1873                 hpri = pri;
1874         if (rdd->queuebits && hpri < (pri = bsrl(rdd->queuebits)))
1875                 hpri = pri;
1876         if (rdd->idqueuebits && hpri < (pri = bsrl(rdd->idqueuebits)))
1877                 hpri = pri;
1878         hpri *= PPQ;
1879         if (rdd->uload - hpri < dd->uload + hpri)
1880                 return(NULL);
1881 #endif
1882         return (rdd);
1883 }
1884
1885 static
1886 dfly_pcpu_t
1887 dfly_choose_queue_simple(dfly_pcpu_t dd, struct lwp *lp)
1888 {
1889         dfly_pcpu_t rdd;
1890         cpumask_t tmpmask;
1891         cpumask_t mask;
1892         int cpubase;
1893         int cpuid;
1894
1895         /*
1896          * Fallback to the original heuristic, select random cpu,
1897          * first checking the cpus not currently running a user thread.
1898          *
1899          * Use cpuid as the base cpu in our scan, first checking
1900          * cpuid...(ncpus-1), then 0...(cpuid-1).  This avoid favoring
1901          * lower-numbered cpus.
1902          */
1903         ++dd->scancpu;          /* SMP race ok */
1904         mask = dfly_rdyprocmask;
1905         CPUMASK_NANDMASK(mask, dfly_curprocmask);
1906         CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
1907         CPUMASK_ANDMASK(mask, smp_active_mask);
1908         CPUMASK_ANDMASK(mask, usched_global_cpumask);
1909
1910         cpubase = (int)(dd->scancpu % ncpus);
1911         CPUMASK_ASSBMASK(tmpmask, cpubase);
1912         CPUMASK_INVMASK(tmpmask);
1913         CPUMASK_ANDMASK(tmpmask, mask);
1914         while (CPUMASK_TESTNZERO(tmpmask)) {
1915                 cpuid = BSFCPUMASK(tmpmask);
1916                 rdd = &dfly_pcpu[cpuid];
1917
1918                 if ((rdd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK))
1919                         goto found;
1920                 CPUMASK_NANDBIT(tmpmask, cpuid);
1921         }
1922
1923         CPUMASK_ASSBMASK(tmpmask, cpubase);
1924         CPUMASK_ANDMASK(tmpmask, mask);
1925         while (CPUMASK_TESTNZERO(tmpmask)) {
1926                 cpuid = BSFCPUMASK(tmpmask);
1927                 rdd = &dfly_pcpu[cpuid];
1928
1929                 if ((rdd->upri & ~PPQMASK) >= (lp->lwp_priority & ~PPQMASK))
1930                         goto found;
1931                 CPUMASK_NANDBIT(tmpmask, cpuid);
1932         }
1933
1934         /*
1935          * Then cpus which might have a currently running lp
1936          */
1937         mask = dfly_rdyprocmask;
1938         CPUMASK_ANDMASK(mask, dfly_curprocmask);
1939         CPUMASK_ANDMASK(mask, lp->lwp_cpumask);
1940         CPUMASK_ANDMASK(mask, smp_active_mask);
1941         CPUMASK_ANDMASK(mask, usched_global_cpumask);
1942
1943         CPUMASK_ASSBMASK(tmpmask, cpubase);
1944         CPUMASK_INVMASK(tmpmask);
1945         CPUMASK_ANDMASK(tmpmask, mask);
1946         while (CPUMASK_TESTNZERO(tmpmask)) {
1947                 cpuid = BSFCPUMASK(tmpmask);
1948                 rdd = &dfly_pcpu[cpuid];
1949
1950                 if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
1951                         goto found;
1952                 CPUMASK_NANDBIT(tmpmask, cpuid);
1953         }
1954
1955         CPUMASK_ASSBMASK(tmpmask, cpubase);
1956         CPUMASK_ANDMASK(tmpmask, mask);
1957         while (CPUMASK_TESTNZERO(tmpmask)) {
1958                 cpuid = BSFCPUMASK(tmpmask);
1959                 rdd = &dfly_pcpu[cpuid];
1960
1961                 if ((rdd->upri & ~PPQMASK) > (lp->lwp_priority & ~PPQMASK))
1962                         goto found;
1963                 CPUMASK_NANDBIT(tmpmask, cpuid);
1964         }
1965
1966         /*
1967          * If we cannot find a suitable cpu we round-robin using scancpu.
1968          * Other cpus will pickup as they release their current lwps or
1969          * become ready.
1970          *
1971          * Avoid a degenerate system lockup case if usched_global_cpumask
1972          * is set to 0 or otherwise does not cover lwp_cpumask.
1973          *
1974          * We only kick the target helper thread in this case, we do not
1975          * set the user resched flag because
1976          */
1977         cpuid = cpubase;
1978         if (CPUMASK_TESTBIT(lp->lwp_cpumask, cpuid) == 0)
1979                 cpuid = BSFCPUMASK(lp->lwp_cpumask);
1980         else if (CPUMASK_TESTBIT(usched_global_cpumask, cpuid) == 0)
1981                 cpuid = 0;
1982         rdd = &dfly_pcpu[cpuid];
1983 found:
1984         return (rdd);
1985 }
1986
1987 static
1988 void
1989 dfly_need_user_resched_remote(void *dummy)
1990 {
1991         globaldata_t gd = mycpu;
1992         dfly_pcpu_t  dd = &dfly_pcpu[gd->gd_cpuid];
1993
1994         /*
1995          * Flag reschedule needed
1996          */
1997         need_user_resched();
1998
1999         /*
2000          * If no user thread is currently running we need to kick the helper
2001          * on our cpu to recover.  Otherwise the cpu will never schedule
2002          * anything again.
2003          *
2004          * We cannot schedule the process ourselves because this is an
2005          * IPI callback and we cannot acquire spinlocks in an IPI callback.
2006          *
2007          * Call wakeup_mycpu to avoid sending IPIs to other CPUs
2008          */
2009         if (dd->uschedcp == NULL &&
2010             CPUMASK_TESTBIT(dfly_rdyprocmask, gd->gd_cpuid)) {
2011                 ATOMIC_CPUMASK_NANDBIT(dfly_rdyprocmask, gd->gd_cpuid);
2012                 wakeup_mycpu(dd->helper_thread);
2013         }
2014 }
2015
2016 /*
2017  * dfly_remrunqueue_locked() removes a given process from the run queue
2018  * that it is on, clearing the queue busy bit if it becomes empty.
2019  *
2020  * Note that user process scheduler is different from the LWKT schedule.
2021  * The user process scheduler only manages user processes but it uses LWKT
2022  * underneath, and a user process operating in the kernel will often be
2023  * 'released' from our management.
2024  *
2025  * uload is NOT adjusted here.  It is only adjusted if the lwkt_thread goes
2026  * to sleep or the lwp is moved to a different runq.
2027  */
2028 static void
2029 dfly_remrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
2030 {
2031         struct rq *q;
2032         u_int32_t *which;
2033         u_int8_t pri;
2034
2035         KKASSERT(rdd->runqcount >= 0);
2036
2037         pri = lp->lwp_rqindex;
2038
2039         switch(lp->lwp_rqtype) {
2040         case RTP_PRIO_NORMAL:
2041                 q = &rdd->queues[pri];
2042                 which = &rdd->queuebits;
2043                 break;
2044         case RTP_PRIO_REALTIME:
2045         case RTP_PRIO_FIFO:
2046                 q = &rdd->rtqueues[pri];
2047                 which = &rdd->rtqueuebits;
2048                 break;
2049         case RTP_PRIO_IDLE:
2050                 q = &rdd->idqueues[pri];
2051                 which = &rdd->idqueuebits;
2052                 break;
2053         default:
2054                 panic("remrunqueue: invalid rtprio type");
2055                 /* NOT REACHED */
2056         }
2057         KKASSERT(lp->lwp_mpflags & LWP_MP_ONRUNQ);
2058         atomic_clear_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
2059         TAILQ_REMOVE(q, lp, lwp_procq);
2060         --rdd->runqcount;
2061         if (TAILQ_EMPTY(q)) {
2062                 KASSERT((*which & (1 << pri)) != 0,
2063                         ("remrunqueue: remove from empty queue"));
2064                 *which &= ~(1 << pri);
2065         }
2066 }
2067
2068 /*
2069  * dfly_setrunqueue_locked()
2070  *
2071  * Add a process whos rqtype and rqindex had previously been calculated
2072  * onto the appropriate run queue.   Determine if the addition requires
2073  * a reschedule on a cpu and return the cpuid or -1.
2074  *
2075  * NOTE:          Lower priorities are better priorities.
2076  *
2077  * NOTE ON ULOAD: This variable specifies the aggregate load on a cpu, the
2078  *                sum of the rough lwp_priority for all running and runnable
2079  *                processes.  Lower priority processes (higher lwp_priority
2080  *                values) actually DO count as more load, not less, because
2081  *                these are the programs which require the most care with
2082  *                regards to cpu selection.
2083  */
2084 static void
2085 dfly_setrunqueue_locked(dfly_pcpu_t rdd, struct lwp *lp)
2086 {
2087         u_int32_t *which;
2088         struct rq *q;
2089         int pri;
2090
2091         KKASSERT(lp->lwp_qcpu == rdd->cpuid);
2092
2093         if ((lp->lwp_mpflags & LWP_MP_ULOAD) == 0) {
2094                 atomic_set_int(&lp->lwp_mpflags, LWP_MP_ULOAD);
2095                 atomic_add_long(&dfly_pcpu[lp->lwp_qcpu].uload, lp->lwp_uload);
2096                 atomic_add_int(&dfly_pcpu[lp->lwp_qcpu].ucount, 1);
2097         }
2098
2099         pri = lp->lwp_rqindex;
2100
2101         switch(lp->lwp_rqtype) {
2102         case RTP_PRIO_NORMAL:
2103                 q = &rdd->queues[pri];
2104                 which = &rdd->queuebits;
2105                 break;
2106         case RTP_PRIO_REALTIME:
2107         case RTP_PRIO_FIFO:
2108                 q = &rdd->rtqueues[pri];
2109                 which = &rdd->rtqueuebits;
2110                 break;
2111         case RTP_PRIO_IDLE:
2112                 q = &rdd->idqueues[pri];
2113                 which = &rdd->idqueuebits;
2114                 break;
2115         default:
2116                 panic("remrunqueue: invalid rtprio type");
2117                 /* NOT REACHED */
2118         }
2119
2120         /*
2121          * Place us on the selected queue.  Determine if we should be
2122          * placed at the head of the queue or at the end.
2123          *
2124          * We are placed at the tail if our round-robin count has expired,
2125          * or is about to expire and the system thinks its a good place to
2126          * round-robin, or there is already a next thread on the queue
2127          * (it might be trying to pick up where it left off and we don't
2128          * want to interfere).
2129          */
2130         KKASSERT((lp->lwp_mpflags & LWP_MP_ONRUNQ) == 0);
2131         atomic_set_int(&lp->lwp_mpflags, LWP_MP_ONRUNQ);
2132         ++rdd->runqcount;
2133
2134         if (lp->lwp_rrcount >= usched_dfly_rrinterval ||
2135             (lp->lwp_rrcount >= usched_dfly_rrinterval / 2 &&
2136              (lp->lwp_thread->td_mpflags & TDF_MP_BATCH_DEMARC))
2137         ) {
2138                 /*
2139                  * Place on tail
2140                  */
2141                 atomic_clear_int(&lp->lwp_thread->td_mpflags,
2142                                  TDF_MP_BATCH_DEMARC);
2143                 lp->lwp_rrcount = 0;
2144                 TAILQ_INSERT_TAIL(q, lp, lwp_procq);
2145         } else {
2146                 /*
2147                  * Retain rrcount and place on head.  Count is retained
2148                  * even if the queue is empty.
2149                  */
2150                 TAILQ_INSERT_HEAD(q, lp, lwp_procq);
2151         }
2152         *which |= 1 << pri;
2153 }
2154
2155 /*
2156  * For SMP systems a user scheduler helper thread is created for each
2157  * cpu and is used to allow one cpu to wakeup another for the purposes of
2158  * scheduling userland threads from setrunqueue().
2159  *
2160  * UP systems do not need the helper since there is only one cpu.
2161  *
2162  * We can't use the idle thread for this because we might block.
2163  * Additionally, doing things this way allows us to HLT idle cpus
2164  * on MP systems.
2165  */
2166 static void
2167 dfly_helper_thread(void *dummy)
2168 {
2169     globaldata_t gd;
2170     dfly_pcpu_t dd;
2171     dfly_pcpu_t rdd;
2172     struct lwp *nlp;
2173     cpumask_t mask;
2174     int cpuid;
2175
2176     gd = mycpu;
2177     cpuid = gd->gd_cpuid;       /* doesn't change */
2178     mask = gd->gd_cpumask;      /* doesn't change */
2179     dd = &dfly_pcpu[cpuid];
2180
2181     /*
2182      * Since we only want to be woken up only when no user processes
2183      * are scheduled on a cpu, run at an ultra low priority.
2184      */
2185     lwkt_setpri_self(TDPRI_USER_SCHEDULER);
2186
2187     tsleep(dd->helper_thread, 0, "schslp", 0);
2188
2189     for (;;) {
2190         /*
2191          * We use the LWKT deschedule-interlock trick to avoid racing
2192          * dfly_rdyprocmask.  This means we cannot block through to the
2193          * manual lwkt_switch() call we make below.
2194          */
2195         crit_enter_gd(gd);
2196         tsleep_interlock(dd->helper_thread, 0);
2197
2198         spin_lock(&dd->spin);
2199
2200         ATOMIC_CPUMASK_ORMASK(dfly_rdyprocmask, mask);
2201         clear_user_resched();   /* This satisfied the reschedule request */
2202 #if 0
2203         dd->rrcount = 0;        /* Reset the round-robin counter */
2204 #endif
2205
2206         if (dd->runqcount || dd->uschedcp != NULL) {
2207                 /*
2208                  * Threads are available.  A thread may or may not be
2209                  * currently scheduled.  Get the best thread already queued
2210                  * to this cpu.
2211                  */
2212                 nlp = dfly_chooseproc_locked(dd, dd, dd->uschedcp, 0);
2213                 if (nlp) {
2214                         ATOMIC_CPUMASK_ORMASK(dfly_curprocmask, mask);
2215                         dd->upri = nlp->lwp_priority;
2216                         dd->uschedcp = nlp;
2217 #if 0
2218                         dd->rrcount = 0;        /* reset round robin */
2219 #endif
2220                         spin_unlock(&dd->spin);
2221                         lwkt_acquire(nlp->lwp_thread);
2222                         lwkt_schedule(nlp->lwp_thread);
2223                 } else {
2224                         /*
2225                          * This situation should not occur because we had
2226                          * at least one thread available.
2227                          */
2228                         spin_unlock(&dd->spin);
2229                 }
2230         } else if (usched_dfly_features & 0x01) {
2231                 /*
2232                  * This cpu is devoid of runnable threads, steal a thread
2233                  * from another cpu.  Since we're stealing, might as well
2234                  * load balance at the same time.
2235                  *
2236                  * We choose the highest-loaded thread from the worst queue.
2237                  *
2238                  * NOTE! This function only returns a non-NULL rdd when
2239                  *       another cpu's queue is obviously overloaded.  We
2240                  *       do not want to perform the type of rebalancing
2241                  *       the schedclock does here because it would result
2242                  *       in insane process pulling when 'steady' state is
2243                  *       partially unbalanced (e.g. 6 runnables and only
2244                  *       4 cores).
2245                  */
2246                 rdd = dfly_choose_worst_queue(dd);
2247                 if (rdd && spin_trylock(&rdd->spin)) {
2248                         nlp = dfly_chooseproc_locked(rdd, dd, NULL, 1);
2249                         spin_unlock(&rdd->spin);
2250                 } else {
2251                         nlp = NULL;
2252                 }
2253                 if (nlp) {
2254                         ATOMIC_CPUMASK_ORMASK(dfly_curprocmask, mask);
2255                         dd->upri = nlp->lwp_priority;
2256                         dd->uschedcp = nlp;
2257 #if 0
2258                         dd->rrcount = 0;        /* reset round robin */
2259 #endif
2260                         spin_unlock(&dd->spin);
2261                         lwkt_acquire(nlp->lwp_thread);
2262                         lwkt_schedule(nlp->lwp_thread);
2263                 } else {
2264                         /*
2265                          * Leave the thread on our run queue.  Another
2266                          * scheduler will try to pull it later.
2267                          */
2268                         spin_unlock(&dd->spin);
2269                 }
2270         } else {
2271                 /*
2272                  * devoid of runnable threads and not allowed to steal
2273                  * any.
2274                  */
2275                 spin_unlock(&dd->spin);
2276         }
2277
2278         /*
2279          * We're descheduled unless someone scheduled us.  Switch away.
2280          * Exiting the critical section will cause splz() to be called
2281          * for us if interrupts and such are pending.
2282          */
2283         crit_exit_gd(gd);
2284         tsleep(dd->helper_thread, PINTERLOCKED, "schslp", 0);
2285     }
2286 }
2287
2288 #if 0
2289 static int
2290 sysctl_usched_dfly_stick_to_level(SYSCTL_HANDLER_ARGS)
2291 {
2292         int error, new_val;
2293
2294         new_val = usched_dfly_stick_to_level;
2295
2296         error = sysctl_handle_int(oidp, &new_val, 0, req);
2297         if (error != 0 || req->newptr == NULL)
2298                 return (error);
2299         if (new_val > cpu_topology_levels_number - 1 || new_val < 0)
2300                 return (EINVAL);
2301         usched_dfly_stick_to_level = new_val;
2302         return (0);
2303 }
2304 #endif
2305
2306 /*
2307  * Setup the queues and scheduler helpers (scheduler helpers are SMP only).
2308  * Note that curprocmask bit 0 has already been cleared by rqinit() and
2309  * we should not mess with it further.
2310  */
2311 static void
2312 usched_dfly_cpu_init(void)
2313 {
2314         int i;
2315         int j;
2316         int smt_not_supported = 0;
2317         int cache_coherent_not_supported = 0;
2318
2319         if (bootverbose)
2320                 kprintf("Start usched_dfly helpers on cpus:\n");
2321
2322         sysctl_ctx_init(&usched_dfly_sysctl_ctx);
2323         usched_dfly_sysctl_tree =
2324                 SYSCTL_ADD_NODE(&usched_dfly_sysctl_ctx,
2325                                 SYSCTL_STATIC_CHILDREN(_kern), OID_AUTO,
2326                                 "usched_dfly", CTLFLAG_RD, 0, "");
2327
2328         for (i = 0; i < ncpus; ++i) {
2329                 dfly_pcpu_t dd = &dfly_pcpu[i];
2330                 cpumask_t mask;
2331
2332                 CPUMASK_ASSBIT(mask, i);
2333                 if (CPUMASK_TESTMASK(mask, smp_active_mask) == 0)
2334                     continue;
2335
2336                 spin_init(&dd->spin, "uschedcpuinit");
2337                 dd->cpunode = get_cpu_node_by_cpuid(i);
2338                 dd->cpuid = i;
2339                 CPUMASK_ASSBIT(dd->cpumask, i);
2340                 for (j = 0; j < NQS; j++) {
2341                         TAILQ_INIT(&dd->queues[j]);
2342                         TAILQ_INIT(&dd->rtqueues[j]);
2343                         TAILQ_INIT(&dd->idqueues[j]);
2344                 }
2345                 ATOMIC_CPUMASK_NANDBIT(dfly_curprocmask, 0);
2346
2347                 if (dd->cpunode == NULL) {
2348                         smt_not_supported = 1;
2349                         cache_coherent_not_supported = 1;
2350                         if (bootverbose)
2351                                 kprintf ("    cpu%d - WARNING: No CPU NODE "
2352                                          "found for cpu\n", i);
2353                 } else {
2354                         switch (dd->cpunode->type) {
2355                         case THREAD_LEVEL:
2356                                 if (bootverbose)
2357                                         kprintf ("    cpu%d - HyperThreading "
2358                                                  "available. Core siblings: ",
2359                                                  i);
2360                                 break;
2361                         case CORE_LEVEL:
2362                                 smt_not_supported = 1;
2363
2364                                 if (bootverbose)
2365                                         kprintf ("    cpu%d - No HT available, "
2366                                                  "multi-core/physical "
2367                                                  "cpu. Physical siblings: ",
2368                                                  i);
2369                                 break;
2370                         case CHIP_LEVEL:
2371                                 smt_not_supported = 1;
2372
2373                                 if (bootverbose)
2374                                         kprintf ("    cpu%d - No HT available, "
2375                                                  "single-core/physical cpu. "
2376                                                  "Package siblings: ",
2377                                                  i);
2378                                 break;
2379                         default:
2380                                 /* Let's go for safe defaults here */
2381                                 smt_not_supported = 1;
2382                                 cache_coherent_not_supported = 1;
2383                                 if (bootverbose)
2384                                         kprintf ("    cpu%d - Unknown cpunode->"
2385                                                  "type=%u. siblings: ",
2386                                                  i,
2387                                                  (u_int)dd->cpunode->type);
2388                                 break;
2389                         }
2390
2391                         if (bootverbose) {
2392                                 if (dd->cpunode->parent_node != NULL) {
2393                                         kprint_cpuset(&dd->cpunode->
2394                                                         parent_node->members);
2395                                         kprintf("\n");
2396                                 } else {
2397                                         kprintf(" no siblings\n");
2398                                 }
2399                         }
2400                 }
2401
2402                 lwkt_create(dfly_helper_thread, NULL, &dd->helper_thread, NULL,
2403                             0, i, "usched %d", i);
2404
2405                 /*
2406                  * Allow user scheduling on the target cpu.  cpu #0 has already
2407                  * been enabled in rqinit().
2408                  */
2409                 if (i)
2410                         ATOMIC_CPUMASK_NANDMASK(dfly_curprocmask, mask);
2411                 ATOMIC_CPUMASK_ORMASK(dfly_rdyprocmask, mask);
2412                 dd->upri = PRIBASE_NULL;
2413
2414         }
2415
2416         /* usched_dfly sysctl configurable parameters */
2417
2418         SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2419                        SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2420                        OID_AUTO, "rrinterval", CTLFLAG_RW,
2421                        &usched_dfly_rrinterval, 0, "");
2422         SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2423                        SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2424                        OID_AUTO, "decay", CTLFLAG_RW,
2425                        &usched_dfly_decay, 0, "Extra decay when not running");
2426
2427         /* Add enable/disable option for SMT scheduling if supported */
2428         if (smt_not_supported) {
2429                 usched_dfly_smt = 0;
2430                 SYSCTL_ADD_STRING(&usched_dfly_sysctl_ctx,
2431                                   SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2432                                   OID_AUTO, "smt", CTLFLAG_RD,
2433                                   "NOT SUPPORTED", 0, "SMT NOT SUPPORTED");
2434         } else {
2435                 usched_dfly_smt = 1;
2436                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2437                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2438                                OID_AUTO, "smt", CTLFLAG_RW,
2439                                &usched_dfly_smt, 0, "Enable SMT scheduling");
2440         }
2441
2442         /*
2443          * Add enable/disable option for cache coherent scheduling
2444          * if supported
2445          */
2446         if (cache_coherent_not_supported) {
2447                 usched_dfly_cache_coherent = 0;
2448                 SYSCTL_ADD_STRING(&usched_dfly_sysctl_ctx,
2449                                   SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2450                                   OID_AUTO, "cache_coherent", CTLFLAG_RD,
2451                                   "NOT SUPPORTED", 0,
2452                                   "Cache coherence NOT SUPPORTED");
2453         } else {
2454                 usched_dfly_cache_coherent = 1;
2455                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2456                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2457                                OID_AUTO, "cache_coherent", CTLFLAG_RW,
2458                                &usched_dfly_cache_coherent, 0,
2459                                "Enable/Disable cache coherent scheduling");
2460
2461                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2462                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2463                                OID_AUTO, "weight1", CTLFLAG_RW,
2464                                &usched_dfly_weight1, 200,
2465                                "Weight selection for current cpu");
2466
2467                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2468                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2469                                OID_AUTO, "weight2", CTLFLAG_RW,
2470                                &usched_dfly_weight2, 180,
2471                                "Weight selection for wakefrom cpu");
2472
2473                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2474                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2475                                OID_AUTO, "weight3", CTLFLAG_RW,
2476                                &usched_dfly_weight3, 40,
2477                                "Weight selection for num threads on queue");
2478
2479                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2480                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2481                                OID_AUTO, "weight4", CTLFLAG_RW,
2482                                &usched_dfly_weight4, 160,
2483                                "Availability of other idle cpus");
2484
2485                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2486                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2487                                OID_AUTO, "fast_resched", CTLFLAG_RW,
2488                                &usched_dfly_fast_resched, 0,
2489                                "Availability of other idle cpus");
2490
2491                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2492                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2493                                OID_AUTO, "features", CTLFLAG_RW,
2494                                &usched_dfly_features, 0x8F,
2495                                "Allow pulls into empty queues");
2496
2497                 SYSCTL_ADD_INT(&usched_dfly_sysctl_ctx,
2498                                SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2499                                OID_AUTO, "swmask", CTLFLAG_RW,
2500                                &usched_dfly_swmask, ~PPQMASK,
2501                                "Queue mask to force thread switch");
2502
2503 #if 0
2504                 SYSCTL_ADD_PROC(&usched_dfly_sysctl_ctx,
2505                                 SYSCTL_CHILDREN(usched_dfly_sysctl_tree),
2506                                 OID_AUTO, "stick_to_level",
2507                                 CTLTYPE_INT | CTLFLAG_RW,
2508                                 NULL, sizeof usched_dfly_stick_to_level,
2509                                 sysctl_usched_dfly_stick_to_level, "I",
2510                                 "Stick a process to this level. See sysctl"
2511                                 "paremter hw.cpu_topology.level_description");
2512 #endif
2513         }
2514 }
2515 SYSINIT(uschedtd, SI_BOOT2_USCHED, SI_ORDER_SECOND,
2516         usched_dfly_cpu_init, NULL);