// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifdef HAVE_DL_ITERATE_PHDR

#ifdef USING_SPLIT_STACK

/* FIXME: These are not declared anywhere.  */
extern void __splitstack_getcontext(void *context[10]);
extern void __splitstack_setcontext(void *context[10]);
extern void *__splitstack_makecontext(size_t, void *context[10], size_t *);
extern void *__splitstack_resetcontext(void *context[10], size_t *);
extern void *__splitstack_find(void *, void *, size_t *, void **, void **,
                               void **);
extern void __splitstack_block_signals(int *, int *);
extern void __splitstack_block_signals_context(void *context[10], int *, int *);

#ifndef PTHREAD_STACK_MIN
# define PTHREAD_STACK_MIN 8192

#if defined(USING_SPLIT_STACK) && defined(LINKER_SUPPORTS_SPLIT_STACK)
# define StackMin PTHREAD_STACK_MIN
#else
# define StackMin 2 * 1024 * 1024
uintptr runtime_stacks_sys;

static void gtraceback(G*);
#ifndef SETCONTEXT_CLOBBERS_TLS

fixcontext(ucontext_t *c __attribute__ ((unused)))
# if defined(__x86_64__) && defined(__sun__)

// x86_64 Solaris 10 and 11 have a bug: setcontext switches the %fs
// register to that of the thread which called getcontext.  The effect
// is that the address of all __thread variables changes.  This bug
// also affects pthread_self() and pthread_getspecific.  We work
// around it by clobbering the context field directly to keep %fs the
// same.

static __thread greg_t fs;

    fs = c.uc_mcontext.gregs[REG_FSBASE];

fixcontext(ucontext_t* c)
{
    c->uc_mcontext.gregs[REG_FSBASE] = fs;
}
# elif defined(__NetBSD__)

// NetBSD has a bug: setcontext clobbers tlsbase; we need to save
// and restore it ourselves.

static __thread __greg_t tlsbase;

    tlsbase = c.uc_mcontext._mc_tlsbase;

fixcontext(ucontext_t* c)
{
    c->uc_mcontext._mc_tlsbase = tlsbase;
}

# else
#  error unknown case for SETCONTEXT_CLOBBERS_TLS
# endif
// We cannot always refer to the TLS variables directly.  The
// compiler will call tls_get_addr to get the address of the variable,
// and it may hold it in a register across a call to schedule.  When
// we get back from the call we may be running in a different thread,
// in which case the register now points to the TLS variable for a
// different thread.  We use non-inlinable functions to avoid this
// when necessary.

G* runtime_g(void) __attribute__ ((noinline, no_split_stack));

M* runtime_m(void) __attribute__ ((noinline, no_split_stack));
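// Illustrative sketch, not part of the runtime: this is the shape of the
// non-inlinable accessor pattern described above.  The names example_tls_g
// and example_g are hypothetical; the real runtime_g and runtime_m are
// defined elsewhere in this file.
static __thread G* example_tls_g;

static G* __attribute__ ((noinline))
example_g(void)
{
    // Because this call is opaque to the optimizer, the address of the
    // TLS variable is re-derived after every call that might have
    // migrated us to another thread, instead of being cached in a register.
    return example_tls_g;
}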
runtime_setmg(M* mp, G* gp)
// The static TLS size.  See runtime_newm.

// Start a new thread.
runtime_newosproc(M *mp)

    if(pthread_attr_init(&attr) != 0)
        runtime_throw("pthread_attr_init");
    if(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0)
        runtime_throw("pthread_attr_setdetachstate");

    stacksize = PTHREAD_STACK_MIN;

    // With glibc before version 2.16 the static TLS size is taken
    // out of the stack size, and we get an error or a crash if
    // there is not enough stack space left.  Add it back in if we
    // can, in case the program uses a lot of TLS space.  FIXME:
    // This can be disabled in glibc 2.16 and later, if the bug is
    // indeed fixed then.
    stacksize += tlssize;

    if(pthread_attr_setstacksize(&attr, stacksize) != 0)
        runtime_throw("pthread_attr_setstacksize");

    // Block signals during pthread_create so that the new thread
    // starts with signals disabled.  It will enable them in minit.

    // Blocking SIGTRAP reportedly breaks gdb on Alpha GNU/Linux.
    sigdelset(&clear, SIGTRAP);

    pthread_sigmask(SIG_BLOCK, &clear, &old);
    ret = pthread_create(&tid, &attr, runtime_mstart, mp);
    pthread_sigmask(SIG_SETMASK, &old, nil);

        runtime_throw("pthread_create");
// First function run by a new goroutine.  This replaces gogocall.

    if(g->traceback != nil)
        gtraceback(g);

    fn = (void (*)(void*))(g->entry);
// Switch context to a different goroutine.  This is like longjmp.
void runtime_gogo(G*) __attribute__ ((noinline));

runtime_gogo(G* newg)

#ifdef USING_SPLIT_STACK
    __splitstack_setcontext(&newg->stack_context[0]);

    newg->fromgogo = true;
    fixcontext(&newg->context);
    setcontext(&newg->context);
    runtime_throw("gogo setcontext returned");
// Save context and call fn passing g as a parameter.  This is like
// setjmp.  Because getcontext always returns 0, unlike setjmp, we use
// g->fromgogo as a code.  It will be true if we got here via
// setcontext.  g == nil the first time this is called in a new m.
void runtime_mcall(void (*)(G*)) __attribute__ ((noinline));

runtime_mcall(void (*pfn)(G*))

#ifndef USING_SPLIT_STACK

    // Ensure that all registers are on the stack for the garbage
    // collector.
    __builtin_unwind_init();

        runtime_throw("runtime: mcall called on m->g0 stack");

#ifdef USING_SPLIT_STACK
        __splitstack_getcontext(&g->stack_context[0]);

        gp->fromgogo = false;
        getcontext(&gp->context);

        // When we return from getcontext, we may be running
        // in a new thread.  That means that m and g may have
        // changed.  They are global variables so we will
        // reload them, but the addresses of m and g may be
        // cached in our local stack frame, and those
        // addresses may be wrong.  Call functions to reload
        // the values for this thread.

        if(gp->traceback != nil)
            gtraceback(gp);

    if (gp == nil || !gp->fromgogo) {
#ifdef USING_SPLIT_STACK
        __splitstack_setcontext(&mp->g0->stack_context[0]);

        mp->g0->entry = (byte*)pfn;

        // It's OK to set g directly here because this case
        // cannot occur if we got here via a setcontext to
        // the getcontext call just above.

        fixcontext(&mp->g0->context);
        setcontext(&mp->g0->context);
        runtime_throw("runtime: mcall function returned");
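// Illustrative sketch, not part of the runtime: getcontext/setcontext behave
// like setjmp/longjmp except that getcontext always "returns 0", so the code
// above needs a flag (g->fromgogo) to tell the initial return apart from the
// return caused by a later setcontext.  The names example_* are hypothetical.
static ucontext_t example_ctx;
static bool example_fromjump;

static void
example_save_then_jump(void)
{
    example_fromjump = false;
    getcontext(&example_ctx);    // execution also resumes here after setcontext
    if(!example_fromjump) {
        example_fromjump = true;
        setcontext(&example_ctx);    // "longjmp": control reappears at getcontext
    }
    // Reached only on the second pass, i.e. via setcontext.
}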
#ifdef HAVE_DL_ITERATE_PHDR

// Called via dl_iterate_phdr.
addtls(struct dl_phdr_info* info, size_t size __attribute__ ((unused)), void *data)

    size_t *total = (size_t *)data;

    for(i = 0; i < info->dlpi_phnum; ++i) {
        if(info->dlpi_phdr[i].p_type == PT_TLS)
            *total += info->dlpi_phdr[i].p_memsz;
    }

// Set the total TLS size.
    dl_iterate_phdr(addtls, (void *)&total);
// Goroutine scheduler
// The scheduler's job is to distribute ready-to-run goroutines over worker threads.
//
// The main concepts are:
// G - goroutine.
// M - worker thread, or machine.
// P - processor, a resource that is required to execute Go code.
//     M must have an associated P to execute Go code, however it can be
//     blocked or in a syscall w/o an associated P.
//
// Design doc at http://golang.org/s/go11sched.
typedef struct Sched Sched;

    M* midle;            // idle m's waiting for work
    int32 nmidle;        // number of idle m's waiting for work
    int32 nmidlelocked;  // number of locked m's waiting for work
    int32 mcount;        // number of m's that have been created
    int32 maxmcount;     // maximum number of m's allowed (or die)

    P* pidle;  // idle P's

    // Global runnable queue.

    // Global cache of dead G's.

    uint32 gcwaiting;  // gc is waiting to run

    int32 profilehz;   // cpu profiling rate

// The max value of GOMAXPROCS.
// There are no fundamental restrictions on the value.
enum { MaxGomaxprocs = 1<<8 };
int32 runtime_gomaxprocs;
uint32 runtime_needextram = 1;
bool runtime_iscgo = true;

G runtime_g0;  // idle goroutine for m0

bool runtime_precisestack;
static int32 newprocs;
void* runtime_mstart(void*);
static void runqput(P*, G*);
static G* runqget(P*);
static void runqgrow(P*);
static G* runqsteal(P*, P*);
static void mput(M*);
static M* mget(void);
static void mcommoninit(M*);
static void schedule(void);
static void procresize(int32);
static void acquirep(P*);
static P* releasep(void);
static void newm(void(*)(void), P*);
static void stopm(void);
static void startm(P*, bool);
static void handoffp(P*);
static void wakep(void);
static void stoplockedm(void);
static void startlockedm(G*);
static void sysmon(void);
static uint32 retake(int64);
static void incidlelocked(int32);
static void checkdead(void);
static void exitsyscall0(G*);
static void park0(G*);
static void goexit0(G*);
static void gfput(P*, G*);
static void gfpurge(P*);
static void globrunqput(G*);
static G* globrunqget(P*, int32);
static P* pidleget(void);
static void pidleput(P*);
static void injectglist(G*);
static bool preemptall(void);
static bool exitsyscallfast(void);
// The bootstrap sequence is:
//
//	make & queue new G
//	call runtime_mstart
//
// The new G calls runtime_main.
runtime_schedinit(void)

    runtime_sched.maxmcount = 10000;
    runtime_precisestack = 0;

    runtime_mallocinit();

    // Initialize the itable value for newErrorCString,
    // so that the next time it gets called, possibly
    // in a fault during a garbage collection, it will not
    // need to allocate memory.
    runtime_newErrorCString(0, &i);

    runtime_parsedebugvars();
    runtime_sched.lastpoll = runtime_nanotime();

    p = runtime_getenv("GOMAXPROCS");
    if(p != nil && (n = runtime_atoi(p)) > 0) {
        if(n > MaxGomaxprocs)
            n = MaxGomaxprocs;
    }
    runtime_allp = runtime_malloc((MaxGomaxprocs+1)*sizeof(runtime_allp[0]));

    // Cannot enable GC until all roots are registered.
    // mstats.enablegc = 1;

    // g->racectx = runtime_raceinit();
extern void main_init(void) __asm__ (GOSYM_PREFIX "__go_init_main");
extern void main_main(void) __asm__ (GOSYM_PREFIX "main.main");

initDone(void *arg __attribute__ ((unused))) {
    runtime_unlockOSThread();
// The main goroutine.
runtime_main(void* dummy __attribute__((unused)))

    // Lock the main goroutine onto this, the main OS thread,
    // during initialization.  Most programs won't care, but a few
    // do require certain calls to be made by the main thread.
    // Those can arrange for main.main to run in the main thread
    // by calling runtime.LockOSThread during initialization
    // to preserve the lock.
    runtime_lockOSThread();

    // Defer unlock so that runtime.Goexit during init does the unlock too.
    d.__panic = g->panic;
    d.__makefunc_can_recover = 0;

        runtime_throw("runtime_main not on m0");
    __go_go(runtime_MHeap_Scavenger, nil);

    if(g->defer != &d || d.__pfn != initDone)
        runtime_throw("runtime: bad defer entry after init");
    runtime_unlockOSThread();

    // For gccgo we have to wait until after main is initialized
    // to enable GC, because initializing main registers the GC roots.

    // Make racy client program work: if panicking on
    // another goroutine at the same time as main returns,
    // let the other goroutine finish printing the panic trace.
    // Once it does, it will exit.  See issue 3934.
    if(runtime_panicking)
        runtime_park(nil, nil, "panicwait");
runtime_goroutineheader(G *gp)

    status = gp->waitreason;

    runtime_printf("goroutine %D [%s]:\n", gp->goid, status);

runtime_printcreatedby(G *g)

    if(g != nil && g->gopc != 0 && g->goid != 1) {
        if(__go_file_line(g->gopc - 1, &fn, &file, &line)) {
            runtime_printf("created by %S\n", fn);
            runtime_printf("\t%S:%D\n", file, (int64) line);
    Location locbuf[100];

runtime_tracebackothers(G* volatile me)

    traceback = runtime_gotraceback(nil);

    // Show the current goroutine first, if we haven't already.
    if((gp = m->curg) != nil && gp != me) {
        runtime_printf("\n");
        runtime_goroutineheader(gp);

#ifdef USING_SPLIT_STACK
        __splitstack_getcontext(&me->stack_context[0]);

        getcontext(&me->context);

        if(gp->traceback != nil) {

        runtime_printtrace(tb.locbuf, tb.c, false);
        runtime_printcreatedby(gp);
    for(gp = runtime_allg; gp != nil; gp = gp->alllink) {
        if(gp == me || gp == m->curg || gp->status == Gdead)
            continue;
        if(gp->issystem && traceback < 2)
            continue;
        runtime_printf("\n");
        runtime_goroutineheader(gp);

        // Our only mechanism for doing a stack trace is
        // _Unwind_Backtrace.  And that only works for the
        // current thread, not for other random goroutines.
        // So we need to switch context to the goroutine, get
        // the backtrace, and then switch back.
        //
        // This means that if g is running or in a syscall, we
        // can't reliably print a stack trace.  FIXME.

        if(gp->status == Grunning) {
            runtime_printf("\tgoroutine running on other thread; stack unavailable\n");
            runtime_printcreatedby(gp);
        } else if(gp->status == Gsyscall) {
            runtime_printf("\tgoroutine in C code; stack unavailable\n");
            runtime_printcreatedby(gp);
#ifdef USING_SPLIT_STACK
            __splitstack_getcontext(&me->stack_context[0]);

            getcontext(&me->context);

            if(gp->traceback != nil) {

            runtime_printtrace(tb.locbuf, tb.c, false);
            runtime_printcreatedby(gp);
    // sched lock is held
    if(runtime_sched.mcount > runtime_sched.maxmcount) {
        runtime_printf("runtime: program exceeds %d-thread limit\n", runtime_sched.maxmcount);
        runtime_throw("thread exhaustion");
// Do a stack trace of gp, and then restore the context to
// gp->traceback->gp.

    Traceback* traceback;

    traceback = gp->traceback;
    traceback->c = runtime_callers(1, traceback->locbuf,
        sizeof traceback->locbuf / sizeof traceback->locbuf[0]);
    runtime_gogo(traceback->gp);
    // If there is no mcache runtime_callers() will crash,
    // and we are most likely in sysmon thread so the stack is senseless anyway.
    runtime_callers(1, mp->createstack, nelem(mp->createstack));

    mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks();

    runtime_lock(&runtime_sched);
    mp->id = runtime_sched.mcount++;

    runtime_mpreinit(mp);

    // Add to runtime_allm so garbage collector doesn't free m
    // when it is just in a register or thread-local storage.
    mp->alllink = runtime_allm;
    // runtime_NumCgoCall() iterates over allm w/o schedlock,
    // so we need to publish it safely.
    runtime_atomicstorep(&runtime_allm, mp);
    runtime_unlock(&runtime_sched);
// Mark gp ready to run.

    m->locks++;  // disable preemption because it can be holding p in a local var
    if(gp->status != Gwaiting) {
        runtime_printf("goroutine %D has status %d\n", gp->goid, gp->status);
        runtime_throw("bad g->status in ready");
    }
    gp->status = Grunnable;

    if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0)  // TODO: fast atomic
runtime_gcprocs(void)

    // Figure out how many CPUs to use during GC.
    // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
    runtime_lock(&runtime_sched);
    n = runtime_gomaxprocs;
    n = runtime_ncpu > 0 ? runtime_ncpu : 1;
    if(n > runtime_sched.nmidle+1) // one M is currently running
        n = runtime_sched.nmidle+1;
    runtime_unlock(&runtime_sched);

    runtime_lock(&runtime_sched);
    n = runtime_gomaxprocs;
    n -= runtime_sched.nmidle+1; // one M is currently running
    runtime_unlock(&runtime_sched);
runtime_helpgc(int32 nproc)

    runtime_lock(&runtime_sched);
    for(n = 1; n < nproc; n++) { // one M is currently running
        if(runtime_allp[pos]->mcache == m->mcache)

            runtime_throw("runtime_gcprocs inconsistency");
        mp->mcache = runtime_allp[pos]->mcache;
        runtime_notewakeup(&mp->park);

    runtime_unlock(&runtime_sched);
// Similar to stoptheworld but best-effort and can be called several times.
// There is no reverse operation, used during crashing.
// This function must not lock any mutexes.
runtime_freezetheworld(void)

    if(runtime_gomaxprocs == 1)
        return;
    // stopwait and preemption requests can be lost
    // due to races with concurrently executing threads,
    // so try several times
    for(i = 0; i < 5; i++) {
        // this should tell the scheduler to not start any new goroutines
        runtime_sched.stopwait = 0x7fffffff;
        runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
        // this should stop running goroutines
        if(!preemptall())
            break;  // no running goroutines
        runtime_usleep(1000);

    runtime_usleep(1000);

    runtime_usleep(1000);
runtime_stoptheworld(void)

    runtime_lock(&runtime_sched);
    runtime_sched.stopwait = runtime_gomaxprocs;
    runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);

    m->p->status = Pgcstop;
    runtime_sched.stopwait--;
    // try to retake all P's in Psyscall status
    for(i = 0; i < runtime_gomaxprocs; i++) {
        if(s == Psyscall && runtime_cas(&p->status, s, Pgcstop))
            runtime_sched.stopwait--;

    while((p = pidleget()) != nil) {
        runtime_sched.stopwait--;

    wait = runtime_sched.stopwait > 0;
    runtime_unlock(&runtime_sched);

    // wait for remaining P's to stop voluntarily
        runtime_notesleep(&runtime_sched.stopnote);
        runtime_noteclear(&runtime_sched.stopnote);

    if(runtime_sched.stopwait)
        runtime_throw("stoptheworld: not stopped");
    for(i = 0; i < runtime_gomaxprocs; i++) {
        if(p->status != Pgcstop)
            runtime_throw("stoptheworld: not stopped");
runtime_starttheworld(void)

    m->locks++;  // disable preemption because it can be holding p in a local var
    gp = runtime_netpoll(false);  // non-blocking

    add = needaddgcproc();
    runtime_lock(&runtime_sched);
        procresize(newprocs);

        procresize(runtime_gomaxprocs);
    runtime_sched.gcwaiting = 0;

    while((p = pidleget()) != nil) {
        // procresize() puts p's with work at the beginning of the list.
        // Once we reach a p without a run queue, the rest don't have one either.
        if(p->runqhead == p->runqtail) {

    if(runtime_sched.sysmonwait) {
        runtime_sched.sysmonwait = false;
        runtime_notewakeup(&runtime_sched.sysmonnote);

    runtime_unlock(&runtime_sched);

        runtime_throw("starttheworld: inconsistent mp->nextp");
        runtime_notewakeup(&mp->park);

    // Start M to run P.  Do not start another M below.

    // If GC could have used another helper proc, start one now,
    // in the hope that it will be available next time.
    // It would have been even better to start it before the collection,
    // but doing so requires allocating memory, so it's tricky to
    // coordinate.  This lazy approach works out in practice:
    // we don't mind if the first couple gc rounds don't have quite
    // the maximum number of procs.
// Called to start an M.
runtime_mstart(void* mp)

    // Record top of stack for use by mcall.
    // Once we call schedule we're never coming back,
    // so other calls can reuse this stack space.
#ifdef USING_SPLIT_STACK
    __splitstack_getcontext(&g->stack_context[0]);

    g->gcinitial_sp = &mp;
    // Setting gcstack_size to 0 is a marker meaning that gcinitial_sp
    // is the top of the stack, not the bottom.

    getcontext(&g->context);

    if(g->entry != nil) {
        // Got here from mcall.
        void (*pfn)(G*) = (void (*)(G*))g->entry;
        G* gp = (G*)g->param;
#ifdef USING_SPLIT_STACK
    int dont_block_signals = 0;
    __splitstack_block_signals(&dont_block_signals, nil);

    // Install signal handlers; after minit so that minit can
    // prepare the thread to be able to handle the signals.
    if(m == &runtime_m0)

    } else if(m != &runtime_m0) {

    // TODO(brainman): This point is never reached, because scheduler
    // does not release os threads at the moment.  But once this path
    // is enabled, we must remove our seh here.
typedef struct CgoThreadStart CgoThreadStart;
struct CgoThreadStart

// Allocate a new m unassociated with any thread.
// Can use p for allocation context if needed.
runtime_allocm(P *p, int32 stacksize, byte** ret_g0_stack, size_t* ret_g0_stacksize)

    m->locks++;  // disable GC because it can be called from sysmon
    acquirep(p);  // temporarily borrow p for mallocs in this function

    runtime_gc_m_ptr(&e);
    mtype = ((const PtrType*)e.__type_descriptor)->__element_type;

    mp = runtime_mal(sizeof *mp);
    mp->g0 = runtime_malg(stacksize, ret_g0_stack, ret_g0_stacksize);

static M* lockextra(bool nilokay);
static void unlockextra(M*);
// needm is called when a cgo callback happens on a
// thread without an m (a thread not created by Go).
// In this case, needm is expected to find an m to use
// and return with m, g initialized correctly.
// Since m and g are not set now (likely nil, but see below)
// needm is limited in what routines it can call.  In particular
// it can only call nosplit functions (textflag 7) and cannot
// do any scheduling that requires an m.
//
// In order to avoid needing heavy lifting here, we adopt
// the following strategy: there is a stack of available m's
// that can be stolen.  Using compare-and-swap
// to pop from the stack has ABA races, so we simulate
// a lock by doing an exchange (via casp) to steal the stack
// head and replace the top pointer with MLOCKED (1).
// This serves as a simple spin lock that we can use even
// without an m.  The thread that locks the stack in this way
// unlocks the stack by storing a valid stack head pointer.
//
// In order to make sure that there is always an m structure
// available to be stolen, we maintain the invariant that there
// is always one more than needed.  At the beginning of the
// program (if cgo is in use) the list is seeded with a single m.
// If needm finds that it has taken the last m off the list, its job
// is - once it has installed its own m so that it can do things like
// allocate memory - to create a spare m and put it on the list.
//
// Each of these extra m's also has a g0 and a curg that are
// pressed into service as the scheduling stack and current
// goroutine for the duration of the cgo callback.
//
// When the callback is done with the m, it calls dropm to
// put the m back on the list.
//
// Unlike the gc toolchain, we start running on curg, since we are
// just going to return and let the caller continue.
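// Illustrative sketch, not part of the runtime: the "lock by exchange"
// described above.  A hypothetical singly linked list example_head is locked
// by CAS-ing its head to a sentinel and unlocked by storing a real head back,
// so it is usable even before m and g are set up.  The names example_* are
// hypothetical; the real code is lockextra/unlockextra below.
#define EXAMPLE_LOCKED ((M*)1)
static M* example_head;

static M*
example_pop(void)
{
    M *mp;

    for(;;) {
        mp = runtime_atomicloadp(&example_head);
        if(mp == EXAMPLE_LOCKED) {
            runtime_osyield();    // another thread holds the "lock"; spin
            continue;
        }
        if(runtime_casp(&example_head, mp, EXAMPLE_LOCKED))
            break;    // we now own the list; mp is the old head
    }
    // Unlock by publishing the remainder of the list as the new head.
    runtime_atomicstorep(&example_head, mp != nil ? mp->schedlink : nil);
    return mp;
}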
    if(runtime_needextram) {
        // Can happen if C/C++ code calls Go from a global ctor.
        // Cannot throw, because scheduler is not initialized yet.
        runtime_write(2, "fatal error: cgo callback before cgo call\n",
            sizeof("fatal error: cgo callback before cgo call\n")-1);

    // Lock extra list, take head, unlock popped list.
    // nilokay=false is safe here because of the invariant above,
    // that the extra list always contains or will soon contain
    // at least one m.
    mp = lockextra(false);

    // Set needextram when we've just emptied the list,
    // so that the eventual call into cgocallbackg will
    // allocate a new m for the extra list.  We delay the
    // allocation until then so that it can be done
    // after exitsyscall makes sure it is okay to be
    // running at all (that is, there's no garbage collection
    // running right now).
    mp->needextram = mp->schedlink == nil;
    unlockextra(mp->schedlink);

    // Install m and g (= m->curg).
    runtime_setmg(mp, mp->curg);
    // Initialize g's context as in mstart.
    g->status = Gsyscall;

#ifdef USING_SPLIT_STACK
    __splitstack_getcontext(&g->stack_context[0]);

    g->gcinitial_sp = &mp;
    g->gcstack_size = 0;

    getcontext(&g->context);

    if(g->entry != nil) {
        // Got here from mcall.
        void (*pfn)(G*) = (void (*)(G*))g->entry;
        G* gp = (G*)g->param;

    // Initialize this thread to use the m.

#ifdef USING_SPLIT_STACK
    int dont_block_signals = 0;
    __splitstack_block_signals(&dont_block_signals, nil);
// newextram allocates an m and puts it on the extra list.
// It is called with a working local m, so that it can do things
// like call schedlock and allocate.
runtime_newextram(void)

    size_t g0_spsize, spsize;

    // Create extra goroutine locked to extra m.
    // The goroutine is the context in which the cgo callback will run.
    // The sched.pc will never be returned to, but setting it to
    // runtime.goexit makes clear to the traceback routines where
    // the goroutine stack ends.
    mp = runtime_allocm(nil, StackMin, &g0_sp, &g0_spsize);
    gp = runtime_malg(StackMin, &sp, &spsize);

    mp->locked = LockInternal;

    gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
    // put on allg for garbage collector
    runtime_lock(&runtime_sched);
    if(runtime_lastg == nil)
        runtime_allg = gp;
    else
        runtime_lastg->alllink = gp;
    runtime_unlock(&runtime_sched);
    gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);

    // The context for gp will be set up in runtime_needm.  But
    // here we need to set up the context for g0.
    getcontext(&mp->g0->context);
    mp->g0->context.uc_stack.ss_sp = g0_sp;
#ifdef MAKECONTEXT_STACK_TOP
    mp->g0->context.uc_stack.ss_sp += g0_spsize;

    mp->g0->context.uc_stack.ss_size = g0_spsize;
    makecontext(&mp->g0->context, kickoff, 0);

    // Add m to the extra list.
    mnext = lockextra(true);
    mp->schedlink = mnext;
// dropm is called when a cgo callback has called needm but is now
// done with the callback and returning back into the non-Go thread.
// It puts the current m back onto the extra list.
//
// The main expense here is the call to signalstack to release the
// m's signal stack, and then the call to needm on the next callback
// from this thread.  It is tempting to try to save the m for next time,
// which would eliminate both these costs, but there might not be
// a next time: the current thread (which Go does not control) might exit.
// If we saved the m for that thread, there would be an m leak each time
// such a thread exited.  Instead, we acquire and release an m on each
// call.  These should typically not be scheduling operations, just a few
// atomics, so the cost should be small.
//
// TODO(rsc): An alternative would be to allocate a dummy pthread per-thread
// variable using pthread_key_create.  Unlike the pthread keys we already use
// on OS X, this dummy key would never be read by Go code.  It would exist
// only so that we could register a thread-exit-time destructor.
// That destructor would put the m back onto the extra list.
// This is purely a performance optimization.  The current version,
// in which dropm happens on each cgo call, is still correct too.
// We may have to keep the current version on systems with cgo
// but without pthreads, like Windows.
    // Undo whatever initialization minit did during needm.

    // Clear m and g, and return m to the extra list.
    // After the call to setmg we can only call nosplit functions.
    runtime_setmg(nil, nil);

    mp->curg->status = Gdead;

    mnext = lockextra(true);
    mp->schedlink = mnext;
#define MLOCKED ((M*)1)

// lockextra locks the extra list and returns the list head.
// The caller must unlock the list by storing a new list head
// to runtime.extram.  If nilokay is true, then lockextra will
// return a nil list head if that's what it finds.  If nilokay is false,
// lockextra will keep waiting until the list head is no longer nil.
lockextra(bool nilokay)

    void (*yield)(void);

    mp = runtime_atomicloadp(&runtime_extram);

        yield = runtime_osyield;

    if(mp == nil && !nilokay) {

    if(!runtime_casp(&runtime_extram, mp, MLOCKED)) {
        yield = runtime_osyield;

    runtime_atomicstorep(&runtime_extram, mp);

    mp = runtime_atomicloadp(&runtime_extram);

    if(!runtime_casp(&runtime_extram, mp, MLOCKED)) {

    for(mc = mp; mc != nil; mc = mc->schedlink)
        c++;
    runtime_atomicstorep(&runtime_extram, mp);
// Create a new m.  It will start off with a call to fn, or else the scheduler.
newm(void(*fn)(void), P *p)

    mp = runtime_allocm(p, -1, nil, nil);

    runtime_newosproc(mp);
// Stops execution of the current m until new work is available.
// Returns with acquired P.

        runtime_throw("stopm holding locks");
        runtime_throw("stopm holding p");

    m->spinning = false;
    runtime_xadd(&runtime_sched.nmspinning, -1);

    runtime_lock(&runtime_sched);

    runtime_unlock(&runtime_sched);
    runtime_notesleep(&m->park);
    runtime_noteclear(&m->park);
// Schedules some M to run the p (creates an M if necessary).
// If p==nil, tries to get an idle P; if there are no idle P's, returns false.
startm(P *p, bool spinning)

    runtime_lock(&runtime_sched);

        runtime_unlock(&runtime_sched);
            runtime_xadd(&runtime_sched.nmspinning, -1);

    runtime_unlock(&runtime_sched);

        runtime_throw("startm: m is spinning");
        runtime_throw("startm: m has p");
    mp->spinning = spinning;

    runtime_notewakeup(&mp->park);
// Hands off P from syscall or locked M.

    // if it has local work, start it straight away
    if(p->runqhead != p->runqtail || runtime_sched.runqsize) {

    // no local work, check that there are no spinning/idle M's,
    // otherwise our help is not required
    if(runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) == 0 &&  // TODO: fast atomic
       runtime_cas(&runtime_sched.nmspinning, 0, 1)) {

    runtime_lock(&runtime_sched);
    if(runtime_sched.gcwaiting) {
        p->status = Pgcstop;
        if(--runtime_sched.stopwait == 0)
            runtime_notewakeup(&runtime_sched.stopnote);
        runtime_unlock(&runtime_sched);

    if(runtime_sched.runqsize) {
        runtime_unlock(&runtime_sched);

    // If this is the last running P and nobody is polling network,
    // need to wakeup another M to poll network.
    if(runtime_sched.npidle == (uint32)runtime_gomaxprocs-1 && runtime_atomicload64(&runtime_sched.lastpoll) != 0) {
        runtime_unlock(&runtime_sched);

    runtime_unlock(&runtime_sched);
// Tries to add one more P to execute G's.
// Called when a G is made runnable (newproc, ready).

    // be conservative about spinning threads
    if(!runtime_cas(&runtime_sched.nmspinning, 0, 1))
// Stops execution of the current m that is locked to a g until the g is runnable again.
// Returns with acquired P.

    if(m->lockedg == nil || m->lockedg->lockedm != m)
        runtime_throw("stoplockedm: inconsistent locking");

    // Schedule another M to run this p.

    // Wait until another thread schedules lockedg again.
    runtime_notesleep(&m->park);
    runtime_noteclear(&m->park);
    if(m->lockedg->status != Grunnable)
        runtime_throw("stoplockedm: not runnable");
// Schedules the locked m to run the locked gp.

        runtime_throw("startlockedm: locked to me");
        runtime_throw("startlockedm: m has p");
    // directly handoff current P to the locked m

    runtime_notewakeup(&mp->park);
// Stops the current m for stoptheworld.
// Returns when the world is restarted.

    if(!runtime_sched.gcwaiting)
        runtime_throw("gcstopm: not waiting for gc");

    m->spinning = false;
    runtime_xadd(&runtime_sched.nmspinning, -1);

    runtime_lock(&runtime_sched);
    p->status = Pgcstop;
    if(--runtime_sched.stopwait == 0)
        runtime_notewakeup(&runtime_sched.stopnote);
    runtime_unlock(&runtime_sched);
// Schedules gp to run on the current M.

    if(gp->status != Grunnable) {
        runtime_printf("execute: bad g status %d\n", gp->status);
        runtime_throw("execute: bad g status");
    }
    gp->status = Grunning;

    // Check whether the profiler needs to be turned on or off.
    hz = runtime_sched.profilehz;
    if(m->profilehz != hz)
        runtime_resetcpuprofiler(hz);
// Finds a runnable goroutine to execute.
// Tries to steal from other P's, get g from global queue, poll network.

    if(runtime_sched.gcwaiting) {

    if(runtime_sched.runqsize) {
        runtime_lock(&runtime_sched);
        gp = globrunqget(m->p, 0);
        runtime_unlock(&runtime_sched);

    gp = runtime_netpoll(false);  // non-blocking
        injectglist(gp->schedlink);
        gp->status = Grunnable;

    // If number of spinning M's >= number of busy P's, block.
    // This is necessary to prevent excessive CPU consumption
    // when GOMAXPROCS>>1 but the program parallelism is low.
    if(!m->spinning && 2 * runtime_atomicload(&runtime_sched.nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched.npidle))  // TODO: fast atomic

        runtime_xadd(&runtime_sched.nmspinning, 1);
    // random steal from other P's
    for(i = 0; i < 2*runtime_gomaxprocs; i++) {
        if(runtime_sched.gcwaiting)
            goto top;
        p = runtime_allp[runtime_fastrand1()%runtime_gomaxprocs];
        gp = runqsteal(m->p, p);

    // return P and block
    runtime_lock(&runtime_sched);
    if(runtime_sched.gcwaiting) {
        runtime_unlock(&runtime_sched);

    if(runtime_sched.runqsize) {
        gp = globrunqget(m->p, 0);
        runtime_unlock(&runtime_sched);

    runtime_unlock(&runtime_sched);

    m->spinning = false;
    runtime_xadd(&runtime_sched.nmspinning, -1);

    // check all runqueues once again
    for(i = 0; i < runtime_gomaxprocs; i++) {
        p = runtime_allp[i];
        if(p && p->runqhead != p->runqtail) {
            runtime_lock(&runtime_sched);

            runtime_unlock(&runtime_sched);

    if(runtime_xchg64(&runtime_sched.lastpoll, 0) != 0) {
        if(m->p)
            runtime_throw("findrunnable: netpoll with p");
        if(m->spinning)
            runtime_throw("findrunnable: netpoll with spinning");
        gp = runtime_netpoll(true);  // block until new work is available
        runtime_atomicstore64(&runtime_sched.lastpoll, runtime_nanotime());

            runtime_lock(&runtime_sched);

            runtime_unlock(&runtime_sched);

            injectglist(gp->schedlink);
            gp->status = Grunnable;
    m->spinning = false;
    nmspinning = runtime_xadd(&runtime_sched.nmspinning, -1);
    if(nmspinning < 0)
        runtime_throw("findrunnable: negative nmspinning");

    nmspinning = runtime_atomicload(&runtime_sched.nmspinning);

    // M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
    // so see if we need to wakeup another P here.
    if (nmspinning == 0 && runtime_atomicload(&runtime_sched.npidle) > 0)
        wakep();
// Injects the list of runnable G's into the scheduler.
// Can run concurrently with GC.
injectglist(G *glist)

    runtime_lock(&runtime_sched);
    for(n = 0; glist; n++) {
        gp = glist;
        glist = gp->schedlink;
        gp->status = Grunnable;
        globrunqput(gp);
    }
    runtime_unlock(&runtime_sched);

    for(; n && runtime_sched.npidle; n--)
        startm(nil, false);
// One round of scheduler: find a runnable goroutine and execute it.

        runtime_throw("schedule: holding locks");

    if(runtime_sched.gcwaiting) {

    // Check the global runnable queue once in a while to ensure fairness.
    // Otherwise two goroutines can completely occupy the local runqueue
    // by constantly respawning each other.
    tick = m->p->schedtick;
    // This is a fancy way to say tick%61==0;
    // it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors.
    if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime_sched.runqsize > 0) {
        runtime_lock(&runtime_sched);
        gp = globrunqget(m->p, 1);
        runtime_unlock(&runtime_sched);

    if(gp && m->spinning)
        runtime_throw("schedule: spinning with local work");

    gp = findrunnable();  // blocks until work is available
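// Illustrative sketch, not part of the runtime: a direct check of the
// strength-reduced modulo used in the fairness test above.  0x4325c53f is a
// fixed-point reciprocal of 61 scaled by 2^36, so the multiply-and-shift
// computes tick/61 and the full expression computes tick%61 without a DIV.
// The name example_mod61_matches is hypothetical.
static bool
example_mod61_matches(uint32 tick)
{
    uint32 fast;

    // Same expression as in schedule() above.
    fast = tick - (((uint64)tick*0x4325c53fu)>>36)*61;
    return fast == tick%61;
}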
// Hands off own p to the locked m,
// then blocks waiting for a new p.

// Puts the current goroutine into a waiting state and unlocks the lock.
// The goroutine can be made runnable again by calling runtime_ready(gp).
runtime_park(void(*unlockf)(Lock*), Lock *lock, const char *reason)

    m->waitunlockf = unlockf;
    g->waitreason = reason;
    runtime_mcall(park0);
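// Illustrative sketch, not part of the runtime: the typical park/ready
// pairing.  A goroutine that must wait registers itself under a lock and
// parks; runtime_park invokes unlockf(lock) on the g0 stack after the
// goroutine has been queued, which avoids a lost wakeup.  Another goroutine
// later calls runtime_ready on the recorded G.  The names example_lock and
// example_waiter are hypothetical.
static Lock example_lock;
static G *example_waiter;

static void
example_wait(void)
{
    runtime_lock(&example_lock);
    example_waiter = g;
    // Sleeps until runtime_ready(g); the lock is released by park via runtime_unlock.
    runtime_park(runtime_unlock, &example_lock, "example wait");
}

static void
example_wake(void)
{
    G *gp;

    runtime_lock(&example_lock);
    gp = example_waiter;
    example_waiter = nil;
    runtime_unlock(&example_lock);
    if(gp != nil)
        runtime_ready(gp);
}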
// runtime_park continuation on g0.

    gp->status = Gwaiting;

    if(m->waitunlockf) {
        m->waitunlockf(m->waitlock);
        m->waitunlockf = nil;
    }

        execute(gp);  // Never returns.
runtime_gosched(void)

    runtime_mcall(runtime_gosched0);

// runtime_gosched continuation on g0.
runtime_gosched0(G *gp)

    gp->status = Grunnable;

    runtime_lock(&runtime_sched);

    runtime_unlock(&runtime_sched);

        execute(gp);  // Never returns.
// Finishes execution of the current goroutine.
// Need to mark it as nosplit, because it runs with sp > stackbase (as runtime_lessstack).
// Since it does not return it does not matter.  But if it is preempted
// at the split stack check, GC will complain about inconsistent sp.
runtime_goexit(void)

    runtime_racegoend();
    runtime_mcall(goexit0);

// runtime_goexit continuation on g0.

    if(m->locked & ~LockExternal) {
        runtime_printf("invalid m->locked = %d\n", m->locked);
        runtime_throw("internal lockOSThread error");
// The goroutine g is about to enter a system call.
// Record that it's not using the cpu anymore.
// This is called only from the go syscall library and cgocall,
// not from the low-level system calls used by the runtime.
//
// Entersyscall cannot split the stack: the runtime_gosave must
// make g->sched refer to the caller's stack segment, because
// entersyscall is going to return immediately after.

void runtime_entersyscall(void) __attribute__ ((no_split_stack));
static void doentersyscall(void) __attribute__ ((no_split_stack, noinline));

runtime_entersyscall()

    // Save the registers in the g structure so that any pointers
    // held in registers will be seen by the garbage collector.
    getcontext(&g->gcregs);

    // Do the work in a separate function, so that this function
    // doesn't save any registers on its own stack.  If this
    // function does save any registers, we might store the wrong
    // value in the call to getcontext.
    //
    // FIXME: This assumes that we do not need to save any
    // callee-saved registers to access the TLS variable g.  We
    // don't want to put the ucontext_t on the stack because it is
    // large and we cannot split the stack here.

    // Disable preemption because during this function g is in Gsyscall status,
    // but can have inconsistent g->sched; do not let GC observe it.
    // Leave SP around for GC and traceback.
#ifdef USING_SPLIT_STACK
    g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size,
                                   &g->gcnext_segment, &g->gcnext_sp,
                                   &g->gcinitial_sp);

    g->gcnext_sp = (byte *) &v;

    g->status = Gsyscall;

    if(runtime_atomicload(&runtime_sched.sysmonwait)) {  // TODO: fast atomic
        runtime_lock(&runtime_sched);
        if(runtime_atomicload(&runtime_sched.sysmonwait)) {
            runtime_atomicstore(&runtime_sched.sysmonwait, 0);
            runtime_notewakeup(&runtime_sched.sysmonnote);
        }
        runtime_unlock(&runtime_sched);
    }

    runtime_atomicstore(&m->p->status, Psyscall);
    if(runtime_sched.gcwaiting) {
        runtime_lock(&runtime_sched);
        if (runtime_sched.stopwait > 0 && runtime_cas(&m->p->status, Psyscall, Pgcstop)) {
            if(--runtime_sched.stopwait == 0)
                runtime_notewakeup(&runtime_sched.stopnote);
        }
        runtime_unlock(&runtime_sched);
    }
// The same as runtime_entersyscall(), but with a hint that the syscall is blocking.
runtime_entersyscallblock(void)

    m->locks++;  // see comment in entersyscall

    // Leave SP around for GC and traceback.
#ifdef USING_SPLIT_STACK
    g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size,
                                   &g->gcnext_segment, &g->gcnext_sp,
                                   &g->gcinitial_sp);

    g->gcnext_sp = (byte *) &p;

    // Save the registers in the g structure so that any pointers
    // held in registers will be seen by the garbage collector.
    getcontext(&g->gcregs);

    g->status = Gsyscall;

    if(g->isbackground)  // do not consider blocked scavenger for deadlock detection
        incidlelocked(1);
// The goroutine g exited its system call.
// Arrange for it to run on a cpu again.
// This is called only from the go syscall library, not
// from the low-level system calls used by the runtime.
runtime_exitsyscall(void)

    m->locks++;  // see comment in entersyscall

    if(gp->isbackground)  // do not consider blocked scavenger for deadlock detection
        incidlelocked(-1);

    if(exitsyscallfast()) {
        // There's a cpu for us, so we can run.
        m->p->syscalltick++;
        gp->status = Grunning;
        // Garbage collector isn't running (since we are),
        // so okay to clear gcstack and gcsp.
#ifdef USING_SPLIT_STACK
        gp->gcnext_sp = nil;
        runtime_memclr(&gp->gcregs, sizeof gp->gcregs);

    // Call the scheduler.
    runtime_mcall(exitsyscall0);

    // Scheduler returned, so we're allowed to run now.
    // Delete the gcstack information that we left for
    // the garbage collector during the system call.
    // Must wait until now because until gosched returns
    // we don't know for sure that the garbage collector
    // is not running.
#ifdef USING_SPLIT_STACK
    gp->gcnext_sp = nil;
    runtime_memclr(&gp->gcregs, sizeof gp->gcregs);

    // Don't refer to m again; we might be running on a different
    // thread after returning from runtime_mcall.
    runtime_m()->p->syscalltick++;
exitsyscallfast(void)

    // Freezetheworld sets stopwait but does not retake P's.
    if(runtime_sched.stopwait) {

    // Try to re-acquire the last P.
    if(m->p && m->p->status == Psyscall && runtime_cas(&m->p->status, Psyscall, Prunning)) {
        // There's a cpu for us, so we can run.
        m->mcache = m->p->mcache;

    // Try to get any other idle P.
    if(runtime_sched.pidle) {
        runtime_lock(&runtime_sched);
        p = pidleget();
        if(p && runtime_atomicload(&runtime_sched.sysmonwait)) {
            runtime_atomicstore(&runtime_sched.sysmonwait, 0);
            runtime_notewakeup(&runtime_sched.sysmonnote);
        }
        runtime_unlock(&runtime_sched);
// runtime_exitsyscall slow path on g0.
// Failed to acquire P, enqueue gp as runnable.

    gp->status = Grunnable;

    runtime_lock(&runtime_sched);

    else if(runtime_atomicload(&runtime_sched.sysmonwait)) {
        runtime_atomicstore(&runtime_sched.sysmonwait, 0);
        runtime_notewakeup(&runtime_sched.sysmonnote);

    runtime_unlock(&runtime_sched);

        execute(gp);  // Never returns.

    // Wait until another thread schedules gp and so m again.
        execute(gp);  // Never returns.

    schedule();  // Never returns.
// Called from syscall package before fork.
void syscall_runtime_BeforeFork(void)
  __asm__(GOSYM_PREFIX "syscall.runtime_BeforeFork");

syscall_runtime_BeforeFork(void)

    // Fork can hang if preempted with signals frequently enough (see issue 5517).
    // Ensure that we stay on the same M where we disable profiling.
    if(m->profilehz != 0)
        runtime_resetcpuprofiler(0);

// Called from syscall package after fork in parent.
void syscall_runtime_AfterFork(void)
  __asm__(GOSYM_PREFIX "syscall.runtime_AfterFork");

syscall_runtime_AfterFork(void)

    hz = runtime_sched.profilehz;
        runtime_resetcpuprofiler(hz);
// Allocate a new g, with a stack big enough for stacksize bytes.
runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize)

    newg = runtime_malloc(sizeof(G));
    if(stacksize >= 0) {
#if USING_SPLIT_STACK
        int dont_block_signals = 0;

        *ret_stack = __splitstack_makecontext(stacksize,
                                              &newg->stack_context[0],
                                              ret_stacksize);
        __splitstack_block_signals_context(&newg->stack_context[0],
                                           &dont_block_signals, nil);
#else
        *ret_stack = runtime_mallocgc(stacksize, 0, FlagNoProfiling|FlagNoGC);
        *ret_stacksize = stacksize;
        newg->gcinitial_sp = *ret_stack;
        newg->gcstack_size = stacksize;
        runtime_xadd(&runtime_stacks_sys, stacksize);
#endif
/* For runtime package testing.  */

// Create a new g running fn with siz bytes of arguments.
// Put it on the queue of g's waiting to run.
// The compiler turns a go statement into a call to this.
// Cannot split the stack because it assumes that the arguments
// are available sequentially after &fn; they would not be
// copied if a stack split occurred.  It's OK for this to call
// functions that split the stack.
void runtime_testing_entersyscall(void)
  __asm__ (GOSYM_PREFIX "runtime.entersyscall");

runtime_testing_entersyscall()

    runtime_entersyscall();

void runtime_testing_exitsyscall(void)
  __asm__ (GOSYM_PREFIX "runtime.exitsyscall");

runtime_testing_exitsyscall()

    runtime_exitsyscall();
__go_go(void (*fn)(void*), void* arg)

    //runtime_printf("newproc1 %p %p narg=%d nret=%d\n", fn->fn, argp, narg, nret);
    m->locks++;  // disable preemption because it can be holding p in a local var

    if((newg = gfget(m->p)) != nil) {
#ifdef USING_SPLIT_STACK
        int dont_block_signals = 0;

        sp = __splitstack_resetcontext(&newg->stack_context[0],
                                       &spsize);
        __splitstack_block_signals_context(&newg->stack_context[0],
                                           &dont_block_signals, nil);
#else
        sp = newg->gcinitial_sp;
        spsize = newg->gcstack_size;
        if(spsize == 0)
            runtime_throw("bad spsize in __go_go");
        newg->gcnext_sp = sp;
#endif
    } else {
        newg = runtime_malg(StackMin, &sp, &spsize);
        runtime_lock(&runtime_sched);
        if(runtime_lastg == nil)
            runtime_allg = newg;
        else
            runtime_lastg->alllink = newg;
        runtime_lastg = newg;
        runtime_unlock(&runtime_sched);
    }

    newg->entry = (byte*)fn;
    newg->gopc = (uintptr)__builtin_return_address(0);
    newg->status = Grunnable;
    newg->goid = runtime_xadd64(&runtime_sched.goidgen, 1);

    // Avoid warnings about variables clobbered by
    // longjmp.
    byte * volatile vsp = sp;
    size_t volatile vspsize = spsize;
    G * volatile vnewg = newg;

    getcontext(&vnewg->context);
    vnewg->context.uc_stack.ss_sp = vsp;
#ifdef MAKECONTEXT_STACK_TOP
    vnewg->context.uc_stack.ss_sp += vspsize;
#endif
    vnewg->context.uc_stack.ss_size = vspsize;
    makecontext(&vnewg->context, kickoff, 0);

    runqput(m->p, vnewg);

    if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main)  // TODO: fast atomic
// Put on gfree list.
// If local list is too long, transfer a batch to the global list.

    gp->schedlink = p->gfree;

    if(p->gfreecnt >= 64) {
        runtime_lock(&runtime_sched.gflock);
        while(p->gfreecnt >= 32) {

            p->gfree = gp->schedlink;
            gp->schedlink = runtime_sched.gfree;
            runtime_sched.gfree = gp;
        }
        runtime_unlock(&runtime_sched.gflock);
// Get from gfree list.
// If local list is empty, grab a batch from global list.

    if(gp == nil && runtime_sched.gfree) {
        runtime_lock(&runtime_sched.gflock);
        while(p->gfreecnt < 32 && runtime_sched.gfree) {
            gp = runtime_sched.gfree;
            runtime_sched.gfree = gp->schedlink;
            gp->schedlink = p->gfree;

        runtime_unlock(&runtime_sched.gflock);

    p->gfree = gp->schedlink;
// Purge all cached G's from gfree list to the global list.

    runtime_lock(&runtime_sched.gflock);
    while(p->gfreecnt) {
        p->gfree = gp->schedlink;
        gp->schedlink = runtime_sched.gfree;
        runtime_sched.gfree = gp;

    runtime_unlock(&runtime_sched.gflock);
runtime_Breakpoint(void)

    runtime_breakpoint();

void runtime_Gosched (void) __asm__ (GOSYM_PREFIX "runtime.Gosched");

runtime_Gosched(void)

// Implementation of runtime.GOMAXPROCS.
// delete when scheduler is even stronger
runtime_gomaxprocsfunc(int32 n)

    if(n > MaxGomaxprocs)
        n = MaxGomaxprocs;
    runtime_lock(&runtime_sched);
    ret = runtime_gomaxprocs;
    if(n <= 0 || n == ret) {
        runtime_unlock(&runtime_sched);

    runtime_unlock(&runtime_sched);

    runtime_semacquire(&runtime_worldsema, false);
    runtime_stoptheworld();

    runtime_semrelease(&runtime_worldsema);
    runtime_starttheworld();
// lockOSThread is called by runtime.LockOSThread and runtime.lockOSThread below
// after they modify m->locked.  Do not allow preemption during this call,
// or else the m might be different in this function than in the caller.

void runtime_LockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.LockOSThread");

runtime_LockOSThread(void)

    m->locked |= LockExternal;

runtime_lockOSThread(void)

    m->locked += LockInternal;

// unlockOSThread is called by runtime.UnlockOSThread and runtime.unlockOSThread below
// after they update m->locked.  Do not allow preemption during this call,
// or else the m might be different in this function than in the caller.

unlockOSThread(void)

void runtime_UnlockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.UnlockOSThread");

runtime_UnlockOSThread(void)

    m->locked &= ~LockExternal;

runtime_unlockOSThread(void)

    if(m->locked < LockInternal)
        runtime_throw("runtime: internal error: misuse of lockOSThread/unlockOSThread");
    m->locked -= LockInternal;
runtime_lockedOSThread(void)

    return g->lockedm != nil && m->lockedg != nil;

// for testing of callbacks

_Bool runtime_golockedOSThread(void)
  __asm__ (GOSYM_PREFIX "runtime.golockedOSThread");

runtime_golockedOSThread(void)

    return runtime_lockedOSThread();

intgo runtime_NumGoroutine (void)
  __asm__ (GOSYM_PREFIX "runtime.NumGoroutine");

runtime_NumGoroutine()

    return runtime_gcount();
runtime_gcount(void)

    runtime_lock(&runtime_sched);
    // TODO(dvyukov): runtime.NumGoroutine() is O(N).
    // We do not want to increment/decrement centralized counter in newproc/goexit,
    // just to make runtime.NumGoroutine() faster.
    // Compromise solution is to introduce per-P counters of active goroutines.
    for(gp = runtime_allg; gp; gp = gp->alllink) {
        s = gp->status;
        if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting)
            n++;
    }
    runtime_unlock(&runtime_sched);

runtime_mcount(void)

    return runtime_sched.mcount;
    void (*fn)(uintptr*, int32);
    Location locbuf[100];
// Called if we receive a SIGPROF signal.

    if(prof.fn == nil || prof.hz == 0)
        return;

    // Windows does profiling in a dedicated thread w/o m.
    if(!Windows && (m == nil || m->mcache == nil))
        return;

    runtime_lock(&prof);
    if(prof.fn == nil) {
        runtime_unlock(&prof);
        return;
    }

    if(runtime_atomicload(&runtime_in_callers) > 0) {
        // If SIGPROF arrived while already fetching runtime
        // callers we can have trouble on older systems
        // because the unwind library calls dl_iterate_phdr
        // which was not recursive in the past.

    n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf));
    for(i = 0; i < n; i++)
        prof.pcbuf[i] = prof.locbuf[i].pc;

    if (!traceback || n <= 0) {
        prof.pcbuf[0] = (uintptr)runtime_getcallerpc(&n);
        prof.pcbuf[1] = (uintptr)System + 1;
    }
    prof.fn(prof.pcbuf, n);
    runtime_unlock(&prof);
// Arrange to call fn with a traceback hz times a second.
runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz)

    // Force sane arguments.

    // Disable preemption; otherwise we can be rescheduled to another thread
    // that has profiling enabled.

    // Stop profiler on this thread so that it is safe to lock prof.
    // If a profiling signal came in while we had prof locked,
    // it would deadlock.
    runtime_resetcpuprofiler(0);

    runtime_lock(&prof);
    runtime_unlock(&prof);
    runtime_lock(&runtime_sched);
    runtime_sched.profilehz = hz;
    runtime_unlock(&runtime_sched);

        runtime_resetcpuprofiler(hz);
// Change number of processors.  The world is stopped, sched is locked.
procresize(int32 new)

    old = runtime_gomaxprocs;
    if(old < 0 || old > MaxGomaxprocs || new <= 0 || new > MaxGomaxprocs)
        runtime_throw("procresize: invalid arg");
    // initialize new P's
    for(i = 0; i < new; i++) {
        p = runtime_allp[i];
            p = (P*)runtime_mallocgc(sizeof(*p), 0, FlagNoInvokeGC);
            p->status = Pgcstop;
            runtime_atomicstorep(&runtime_allp[i], p);

        if(p->mcache == nil) {
                p->mcache = m->mcache;  // bootstrap

                p->mcache = runtime_allocmcache();

        if(p->runq == nil) {
            p->runq = (G**)runtime_mallocgc(p->runqsize*sizeof(G*), 0, FlagNoInvokeGC);

    // redistribute runnable G's evenly
    for(i = 0; i < old; i++) {
        p = runtime_allp[i];
        while((gp = runqget(p)) != nil)
            globrunqput(gp);
    }
    // start at 1 because current M already executes some G and will acquire allp[0] below,
    // so if we have a spare G we want to put it into allp[1].
    for(i = 1; runtime_sched.runqhead; i++) {
        gp = runtime_sched.runqhead;
        runtime_sched.runqhead = gp->schedlink;
        runqput(runtime_allp[i%new], gp);
    }
    runtime_sched.runqtail = nil;
    runtime_sched.runqsize = 0;

    for(i = new; i < old; i++) {
        p = runtime_allp[i];
        runtime_freemcache(p->mcache);
        // can't free P itself because it can be referenced by an M in syscall

    p = runtime_allp[0];

    for(i = new-1; i > 0; i--) {
        p = runtime_allp[i];

    runtime_atomicstore((uint32*)&runtime_gomaxprocs, new);
// Associate p and the current m.

    if(m->p || m->mcache)
        runtime_throw("acquirep: already in go");
    if(p->m || p->status != Pidle) {
        runtime_printf("acquirep: p->m=%p(%d) p->status=%d\n", p->m, p->m ? p->m->id : 0, p->status);
        runtime_throw("acquirep: invalid p state");
    }
    m->mcache = p->mcache;
    p->status = Prunning;
// Disassociate p and the current m.

    if(m->p == nil || m->mcache == nil)
        runtime_throw("releasep: invalid arg");
    if(p->m != m || p->mcache != m->mcache || p->status != Prunning) {
        runtime_printf("releasep: m=%p m->p=%p p->m=%p m->mcache=%p p->mcache=%p p->status=%d\n",
            m, m->p, p->m, m->mcache, p->mcache, p->status);
        runtime_throw("releasep: invalid p state");
incidlelocked(int32 v)

    runtime_lock(&runtime_sched);
    runtime_sched.nmidlelocked += v;

    runtime_unlock(&runtime_sched);
// Check for deadlock situation.
// The check is based on the number of running M's; if 0 -> deadlock.

    int32 run, grunning, s;

    run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.nmidlelocked - 1 - countextra();

        runtime_printf("checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
            runtime_sched.nmidle, runtime_sched.nmidlelocked, runtime_sched.mcount);
        runtime_throw("checkdead: inconsistent counts");

    for(gp = runtime_allg; gp; gp = gp->alllink) {
        if(gp->isbackground)
            continue;

        else if(s == Grunnable || s == Grunning || s == Gsyscall) {
            runtime_printf("checkdead: find g %D in status %d\n", gp->goid, s);
            runtime_throw("checkdead: runnable g");
        }
    }
    if(grunning == 0)  // possible if main goroutine calls runtime_Goexit()

    m->throwing = -1;  // do not dump full stacks
    runtime_throw("all goroutines are asleep - deadlock!");
    int64 now, lastpoll, lasttrace;

    idle = 0;  // how many cycles in succession we have not woken anybody up

    if(idle == 0)  // start with 20us sleep...
    else if(idle > 50)  // start doubling the sleep after 1ms...
    if(delay > 10*1000)  // up to 10ms
    runtime_usleep(delay);
    if(runtime_debug.schedtrace <= 0 &&
       (runtime_sched.gcwaiting || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs)) {  // TODO: fast atomic
        runtime_lock(&runtime_sched);
        if(runtime_atomicload(&runtime_sched.gcwaiting) || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) {
            runtime_atomicstore(&runtime_sched.sysmonwait, 1);
            runtime_unlock(&runtime_sched);
            runtime_notesleep(&runtime_sched.sysmonnote);
            runtime_noteclear(&runtime_sched.sysmonnote);

        runtime_unlock(&runtime_sched);
		// poll network if not polled for more than 10ms
		lastpoll = runtime_atomicload64(&runtime_sched.lastpoll);
		now = runtime_nanotime();
		if(lastpoll != 0 && lastpoll + 10*1000*1000 < now) {
			runtime_cas64(&runtime_sched.lastpoll, lastpoll, now);
			gp = runtime_netpoll(false);  // non-blocking
			if(gp) {
				// Need to decrement number of idle locked M's
				// (pretending that one more is running) before injectglist.
				// Otherwise it can lead to the following situation:
				// injectglist grabs all P's but before it starts M's to run the P's,
				// another M returns from syscall, finishes running its G,
				// observes that there is no work to do and no other running M's
				// and reports deadlock.
				incidlelocked(-1);
				injectglist(gp);
				incidlelocked(1);
			}
		}
		// retake P's blocked in syscalls
		// and preempt long running G's
		if(retake(now))
			idle = 0;
		else
			idle++;
		if(runtime_debug.schedtrace > 0 && lasttrace + runtime_debug.schedtrace*1000000ll <= now) {
			lasttrace = now;
			runtime_schedtrace(runtime_debug.scheddetail);
		}
	}
}
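// A self-contained sketch (not from the original source) of sysmon's sleep
// schedule above, simulated in isolation: 20us per cycle while work keeps
// turning up, then after ~1ms worth of idle cycles the delay doubles each
// cycle, capped at 10ms, so a completely idle program costs at most about a
// hundred sysmon wakeups per second.
#include <stdio.h>

int
main(void)
{
	unsigned idle, delay = 0;

	for(idle = 0; idle <= 60; idle++) {
		if(idle == 0)		// start with 20us sleep...
			delay = 20;
		else if(idle > 50)	// start doubling the sleep after 1ms...
			delay *= 2;
		if(delay > 10*1000)	// up to 10ms
			delay = 10*1000;
		if(idle == 0 || idle >= 50)
			printf("idle=%u delay=%uus\n", idle, delay);
	}
	return 0;
}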
typedef struct Pdesc Pdesc;
static Pdesc pdesc[MaxGomaxprocs];
	for(i = 0; i < (uint32)runtime_gomaxprocs; i++) {
		p = runtime_allp[i];
		pd = &pdesc[i];
		s = p->status;
		if(s == Psyscall) {
			// Retake P from syscall if it's there for more than 1 sysmon tick (20us).
			// But only if there is other work to do.
			t = p->syscalltick;
			if(pd->syscalltick != t) {
				pd->syscalltick = t;
				pd->syscallwhen = now;
				continue;
			}
			if(p->runqhead == p->runqtail &&
			   runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0)
				continue;
			// Need to decrement number of idle locked M's
			// (pretending that one more is running) before the CAS.
			// Otherwise the M from which we retake can exit the syscall,
			// increment nmidle and report deadlock.
			incidlelocked(-1);
			if(runtime_cas(&p->status, s, Pidle)) {
				n++;
				handoffp(p);
			}
			incidlelocked(1);
		} else if(s == Prunning) {
			// Preempt G if it's running for more than 10ms.
			t = p->schedtick;
			if(pd->schedtick != t) {
				pd->schedtick = t;
				pd->schedwhen = now;
				continue;
			}
			if(pd->schedwhen + 10*1000*1000 > now)
				continue;
		}
	}
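// A self-contained sketch (not from the original source) of the tick/when
// pattern retake() uses above.  The watched side only increments a counter
// when it makes progress (starts a new syscall, schedules a new G); the
// watcher remembers the last counter value it saw and when it saw it, and
// intervenes only if the counter is unchanged AND enough wall time has passed.
// This keeps timestamps off the watched side's hot path.
#include <stdio.h>

struct watchdog { unsigned lasttick; long long lastwhen; };

// Returns 1 if tick has stayed unchanged for at least threshold ns, else 0.
static int
stuck(struct watchdog *wd, unsigned tick, long long now, long long threshold)
{
	if(wd->lasttick != tick) {	// progress since the last observation
		wd->lasttick = tick;
		wd->lastwhen = now;
		return 0;
	}
	return wd->lastwhen + threshold <= now;
}

int
main(void)
{
	struct watchdog wd = {0, 0};
	unsigned tick = 7;

	printf("%d\n", stuck(&wd, tick, 0, 10));	// 0: first sighting of tick 7
	printf("%d\n", stuck(&wd, tick, 5, 10));	// 0: unchanged, but only 5ns passed
	printf("%d\n", stuck(&wd, tick, 12, 10));	// 1: unchanged for >= 10ns
	tick++;
	printf("%d\n", stuck(&wd, tick, 13, 10));	// 0: progress observed again
	return 0;
}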
// Tell all goroutines that they have been preempted and they should stop.
// This function is purely best-effort.  It can fail to inform a goroutine if a
// processor just started running it.
// No locks need to be held.
// Returns true if preemption request was issued to at least one goroutine.
void
runtime_schedtrace(bool detailed)
{
	static int64 starttime;
	int64 id1, id2, id3;
	int32 i, q, t, h, s;

	now = runtime_nanotime();
	runtime_lock(&runtime_sched);
	runtime_printf("SCHED %Dms: gomaxprocs=%d idleprocs=%d threads=%d idlethreads=%d runqueue=%d",
		(now-starttime)/1000000, runtime_gomaxprocs, runtime_sched.npidle, runtime_sched.mcount,
		runtime_sched.nmidle, runtime_sched.runqsize);
	if(detailed) {
		runtime_printf(" gcwaiting=%d nmidlelocked=%d nmspinning=%d stopwait=%d sysmonwait=%d\n",
			runtime_sched.gcwaiting, runtime_sched.nmidlelocked, runtime_sched.nmspinning,
			runtime_sched.stopwait, runtime_sched.sysmonwait);
	}
	// We must be careful while reading data from P's, M's and G's.
	// Even if we hold schedlock, most data can be changed concurrently.
	// E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil.
	for(i = 0; i < runtime_gomaxprocs; i++) {
		p = runtime_allp[i];
		runtime_printf(" P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d/%d gfreecnt=%d\n",
			i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, q, s, p->gfreecnt);
		// In non-detailed mode format lengths of per-P run queues as:
		// [len1 len2 len3 len4]
		if(runtime_gomaxprocs == 1)
			fmt = " [%d]\n";
		else if(i == runtime_gomaxprocs-1)
			fmt = " %d]\n";
		runtime_printf(fmt, q);
	}
	if(!detailed) {
		runtime_unlock(&runtime_sched);
		return;
	}
	for(mp = runtime_allm; mp; mp = mp->alllink) {
		lockedg = mp->lockedg;
		id3 = -1;
		if(lockedg)
			id3 = lockedg->goid;
		runtime_printf(" M%d: p=%D curg=%D mallocing=%d throwing=%d gcing=%d"
			" locks=%d dying=%d helpgc=%d spinning=%d lockedg=%D\n",
			mp->id, id1, id2,
			mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc,
			mp->spinning, id3);
	}
	for(gp = runtime_allg; gp; gp = gp->alllink) {
		mp = gp->m;
		lockedm = gp->lockedm;
		runtime_printf(" G%D: status=%d(%s) m=%d lockedm=%d\n",
			gp->goid, gp->status, gp->waitreason, mp ? mp->id : -1,
			lockedm ? lockedm->id : -1);
	}
	runtime_unlock(&runtime_sched);
}
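// Aside (an assumption, not from this excerpt): runtime_debug.schedtrace and
// runtime_debug.scheddetail are the knobs read above; they are normally filled
// in from the GODEBUG environment variable by the runtime's debug-variable
// parsing, which is outside this excerpt.  Assuming the usual spelling of
// those options, an invocation such as
//
//	GODEBUG=schedtrace=1000 ./prog
//
// prints one SCHED summary line per second, and adding scheddetail=1 also
// emits the per-P, per-M and per-G dump produced by runtime_schedtrace.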
// Put mp on midle list.
// Sched must be locked.
static void
mput(M *mp)
{
	mp->schedlink = runtime_sched.midle;
	runtime_sched.midle = mp;
	runtime_sched.nmidle++;
	checkdead();
}
// Try to get an m from midle list.
// Sched must be locked.
static M*
mget(void)
{
	if((mp = runtime_sched.midle) != nil){
		runtime_sched.midle = mp->schedlink;
		runtime_sched.nmidle--;
	}
	return mp;
}
// Put gp on the global runnable queue.
// Sched must be locked.
static void
globrunqput(G *gp)
{
	gp->schedlink = nil;
	if(runtime_sched.runqtail)
		runtime_sched.runqtail->schedlink = gp;
	else
		runtime_sched.runqhead = gp;
	runtime_sched.runqtail = gp;
	runtime_sched.runqsize++;
}
// Try to get a batch of G's from the global runnable queue.
// Sched must be locked.
static G*
globrunqget(P *p, int32 max)
{
	G *gp, *gp1;
	int32 n;

	if(runtime_sched.runqsize == 0)
		return nil;
	n = runtime_sched.runqsize/runtime_gomaxprocs+1;
	if(n > runtime_sched.runqsize)
		n = runtime_sched.runqsize;
	if(max > 0 && n > max)
		n = max;
	runtime_sched.runqsize -= n;
	if(runtime_sched.runqsize == 0)
		runtime_sched.runqtail = nil;
	gp = runtime_sched.runqhead;
	runtime_sched.runqhead = gp->schedlink;
	n--;
	while(n--) {
		gp1 = runtime_sched.runqhead;
		runtime_sched.runqhead = gp1->schedlink;
		runqput(p, gp1);
	}
	return gp;
}
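// A self-contained sketch (not from the original source) of the batch size
// globrunqget() computes above: each P takes roughly a 1/gomaxprocs share of
// the global queue, rounded up, optionally clamped by the caller's max, so no
// single P can drain the whole global queue in one visit.
#include <stdio.h>

static int
batch(int runqsize, int gomaxprocs, int max)
{
	int n;

	if(runqsize == 0)
		return 0;
	n = runqsize/gomaxprocs + 1;
	if(n > runqsize)
		n = runqsize;
	if(max > 0 && n > max)
		n = max;
	return n;
}

int
main(void)
{
	printf("%d\n", batch(100, 4, 0));	// 26: a quarter share, rounded up
	printf("%d\n", batch(3, 8, 0));		// 1
	printf("%d\n", batch(100, 1, 0));	// 100: a single P takes everything
	printf("%d\n", batch(100, 4, 10));	// 10: clamped by the caller's max
	return 0;
}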
// Put p on the pidle list.
// Sched must be locked.
static void
pidleput(P *p)
{
	p->link = runtime_sched.pidle;
	runtime_sched.pidle = p;
	runtime_xadd(&runtime_sched.npidle, 1);  // TODO: fast atomic
}
// Try to get a p from the pidle list.
// Sched must be locked.
static P*
pidleget(void)
{
	p = runtime_sched.pidle;
	if(p) {
		runtime_sched.pidle = p->link;
		runtime_xadd(&runtime_sched.npidle, -1);  // TODO: fast atomic
	}
	return p;
}
// Put g on local runnable queue.
// TODO(dvyukov): consider using lock-free queue.
static void
runqput(P *p, G *gp)
{
	if(t == h-1 || (h == 0 && t == s-1)) {
// Get g from local runnable queue.
static G*
runqget(P *p)
{
	if(p->runqhead == p->runqtail)
		return nil;
// Grow local runnable queue.
// TODO(dvyukov): consider using fixed-size array
// and transfer excess to the global list (local queue can grow way too big).
static void
runqgrow(P *p)
{
	q = runtime_malloc(2*s*sizeof(*q));
	while(t != h) {
		q[t2++] = p->runq[h++];
		if(h == s)
			h = 0;
	}
	runtime_free(p->runq);
	p->runq = q;
// Steal half of elements from local runnable queue of p2
// and put onto local runnable queue of p.
// Returns one of the stolen elements (or nil if failed).
static G*
runqsteal(P *p, P *p2)
{
	G *gp, *gp1;
	int32 t, h, s, t2, h2, s2, c, i;

	if(p2->runqhead == p2->runqtail)
		return nil;
	// sort locks to prevent deadlocks
	if(p < p2)
		runtime_lock(p);
	runtime_lock(p2);
	if(p2->runqhead == p2->runqtail) {
		runtime_unlock(p2);
		if(p < p2)
			runtime_unlock(p);
		return nil;
	}
	if(p >= p2)
		runtime_lock(p);
	// now we've locked both queues and know the victim is not empty
	gp = p2->runq[h2++];  // return value
	// steal roughly half
	if(t2 > h2)
		c = (t2 - h2) / 2;
	else
		c = (s2 - h2 + t2) / 2;
	for(i = 0; i != c; i++) {
		// the target queue is full?
		if(t == h-1 || (h == 0 && t == s-1))
			break;
		// the victim queue is empty?
		if(t2 == h2)
			break;
		gp1 = p2->runq[h2++];
	}
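// A self-contained sketch (not from the original source) of the two index
// computations the locked run queue above relies on.  The queue is a circular
// buffer of size s with head h (next slot to pop) and tail t (next free slot);
// one slot is always left empty, so "full" is t == h-1 with wraparound, and
// the number of queued elements is (t - h) mod s -- half of which runqsteal
// takes from the victim.
#include <stdio.h>

static int
full(int h, int t, int s)
{
	return t == h-1 || (h == 0 && t == s-1);
}

static int
count(int h, int t, int s)
{
	return t >= h ? t - h : s - h + t;
}

int
main(void)
{
	// size 8, head 6, tail 2: slots 6,7,0,1 are occupied -> 4 queued, steal 2.
	printf("full=%d count=%d steal=%d\n", full(6, 2, 8), count(6, 2, 8), count(6, 2, 8)/2);
	// size 8, head 3, tail 2: 7 queued, only slot 2 free -> full.
	printf("full=%d count=%d\n", full(3, 2, 8), count(3, 2, 8));
	return 0;
}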
void runtime_testSchedLocalQueue(void)
	__asm__("runtime.testSchedLocalQueue");

void
runtime_testSchedLocalQueue(void)
{
	runtime_memclr((byte*)&p, sizeof(p));
	p.runq = runtime_malloc(p.runqsize*sizeof(*p.runq));

	for(i = 0; i < (int32)nelem(gs); i++) {
		if(runqget(&p) != nil)
			runtime_throw("runq is not empty initially");
		for(j = 0; j < i; j++)
			runqput(&p, &gs[i]);
		for(j = 0; j < i; j++) {
			if(runqget(&p) != &gs[i]) {
				runtime_printf("bad element at iter %d/%d\n", i, j);
				runtime_throw("bad element");
			}
		}
		if(runqget(&p) != nil)
			runtime_throw("runq is not empty afterwards");
	}
}
void runtime_testSchedLocalQueueSteal(void)
	__asm__("runtime.testSchedLocalQueueSteal");

void
runtime_testSchedLocalQueueSteal(void)
{
	runtime_memclr((byte*)&p1, sizeof(p1));
	p1.runq = runtime_malloc(p1.runqsize*sizeof(*p1.runq));

	runtime_memclr((byte*)&p2, sizeof(p2));
	p2.runqsize = nelem(gs);
	p2.runq = runtime_malloc(p2.runqsize*sizeof(*p2.runq));

	for(i = 0; i < (int32)nelem(gs); i++) {
		for(j = 0; j < i; j++) {
			gs[j].sig = 0;
			runqput(&p1, &gs[j]);
		}
		gp = runqsteal(&p2, &p1);
		s = 0;
		if(gp) {
			s++;
			gp->sig++;
		}
		while((gp = runqget(&p2)) != nil) {
			s++;
			gp->sig++;
		}
		while((gp = runqget(&p1)) != nil)
			gp->sig++;
		for(j = 0; j < i; j++) {
			if(gs[j].sig != 1) {
				runtime_printf("bad element %d(%d) at iter %d\n", j, gs[j].sig, i);
				runtime_throw("bad element");
			}
		}
		if(s != i/2 && s != i/2+1) {
			runtime_printf("bad steal %d, want %d or %d, iter %d\n",
				s, i/2, i/2+1, i);
			runtime_throw("bad steal");
		}
	}
}
intgo runtime_debug_setMaxThreads(intgo)
	__asm__(GOSYM_PREFIX "runtime_debug.setMaxThreads");

intgo
runtime_debug_setMaxThreads(intgo in)
{
	intgo out;

	runtime_lock(&runtime_sched);
	out = runtime_sched.maxmcount;
	runtime_sched.maxmcount = in;
	checkmcount();
	runtime_unlock(&runtime_sched);
	return out;
}
void
runtime_proc_scan(void (*addroot)(Obj))
{
	addroot((Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
}
// When a function calls a closure, it passes the closure value to
// __go_set_closure immediately before the function call.  When a
// function uses a closure, it calls __go_get_closure immediately on
// function entry.  This is a hack, but it will work on any system.
// It would be better to use the static chain register when there is
// one.  It is also worth considering expanding these functions
// directly in the compiler.

void
__go_set_closure(void* v)

void*
__go_get_closure(void)
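// The bodies of the two functions above are elided in this excerpt.  A minimal
// self-contained sketch (an assumption, not the original implementation) of the
// mechanism the comment describes -- parking the closure value in per-thread
// state between the caller's store and the callee's load -- could look like
// this; the real functions keep the value in the scheduler's own per-thread
// data rather than in a bare __thread variable.
#include <stdio.h>

static __thread void *closure_slot;

static void
set_closure(void *v)
{
	closure_slot = v;
}

static void *
get_closure(void)
{
	return closure_slot;
}

int
main(void)
{
	int env = 42;

	set_closure(&env);			// caller: immediately before the call
	printf("%d\n", *(int*)get_closure());	// callee: immediately on entry
	return 0;
}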
// Return whether we are waiting for a GC.  This gc toolchain uses
// preemption instead.
bool
runtime_gcwaiting(void)
{
	return runtime_sched.gcwaiting;
}