libgo/runtime/proc.c

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 #include <limits.h>
   6 #include <signal.h>
   7 #include <stdlib.h>
   8 #include <pthread.h>
   9 #include <unistd.h>
  10
  11 #include "config.h"
  12
  13 #ifdef HAVE_DL_ITERATE_PHDR
  14 #include <link.h>
  15 #endif
  16
  17 #include "runtime.h"
  18 #include "arch.h"
  19 #include "defs.h"
  20 #include "malloc.h"
  21 #include "race.h"
  22 #include "go-type.h"
  23 #include "go-defer.h"
  24
  25 #ifdef USING_SPLIT_STACK
  26 // Prototypes for the __splitstack_* routines called in this file.  Each
     // "context" parameter is an array of 10 pointers; the callers below pass
     // the stack_context member of a G.  NOTE(review): the routines are
     // presumably provided by the split-stack support library -- confirm.
  27 /* FIXME: These are not declared anywhere.  */
  28
  29 extern void __splitstack_getcontext(void *context[10]);
  30
  31 extern void __splitstack_setcontext(void *context[10]);
  32
  33 extern void *__splitstack_makecontext(size_t, void *context[10], size_t *);
  34
  35 extern void * __splitstack_resetcontext(void *context[10], size_t *);
  36
  37 extern void *__splitstack_find(void *, void *, size_t *, void **, void **,
  38                                void **);
  39
  40 extern void __splitstack_block_signals (int *, int *);
  41
  42 extern void __splitstack_block_signals_context (void *context[10], int *,
  43                                                 int *);
  44
  45 #endif
  46 // Fallback for systems whose headers do not define PTHREAD_STACK_MIN.
  47 #ifndef PTHREAD_STACK_MIN
  48 # define PTHREAD_STACK_MIN 8192
  49 #endif
  50 // StackMin is PTHREAD_STACK_MIN when split stacks are in use and supported
     // by the linker, and 2 MiB otherwise.  The arithmetic form is parenthesized
     // so that it stays one operand inside a larger expression (x / StackMin).
  51 #if defined(USING_SPLIT_STACK) && defined(LINKER_SUPPORTS_SPLIT_STACK)
  52 # define StackMin PTHREAD_STACK_MIN
  53 #else
  54 # define StackMin (2 * 1024 * 1024)
  55 #endif
  56 // NOTE(review): presumably the amount of stack memory obtained from the
     // system; nothing in the code visible here updates it -- confirm at its users.
  57 uintptr runtime_stacks_sys;
  58 // Forward declaration; gtraceback is defined after checkmcount below.
  59 static void gtraceback(G*);
  60 // On RTEMS __thread is defined away, so g and m below become ordinary
     // file-scope statics rather than thread-local variables.
  61 #ifdef __rtems__
  62 #define __thread
  63 #endif
  64 // Per-thread pointers to the current goroutine (g) and the current M (m).
     // Code that may have changed threads re-reads them through runtime_g()
     // and runtime_m() instead of using the variables directly.
  65 static __thread G *g;
  66 static __thread M *m;
  67
  68 #ifndef SETCONTEXT_CLOBBERS_TLS
  69 // SETCONTEXT_CLOBBERS_TLS is not defined, so both hooks are empty.
  70 static inline void
  71 initcontext(void)
  72 {
  73 }
  74
  75 static inline void
  76 fixcontext(ucontext_t *c __attribute__ ((unused)))
  77 {
  78 }
  79
  80 #else
  81 // setcontext clobbers the TLS base on these systems.  initcontext saves the
     // calling thread's value; fixcontext stores the saved value into a context
     // that is about to be passed to setcontext.
  82 # if defined(__x86_64__) && defined(__sun__)
  83
  84 // x86_64 Solaris 10 and 11 have a bug: setcontext switches the %fs
  85 // register to that of the thread which called getcontext.  The effect
  86 // is that the address of all __thread variables changes.  This bug
  87 // also affects pthread_self() and pthread_getspecific.  We work
  88 // around it by clobbering the context field directly to keep %fs the
  89 // same.
  90
  91 static __thread greg_t fs;
  92
  93 static inline void
  94 initcontext(void)
  95 {
  96         ucontext_t c;
  97
  98         getcontext(&c);
  99         fs = c.uc_mcontext.gregs[REG_FSBASE];
 100 }
 101
 102 static inline void
 103 fixcontext(ucontext_t* c)
 104 {
 105         c->uc_mcontext.gregs[REG_FSBASE] = fs;
 106 }
 107
 108 # elif defined(__NetBSD__)
 109
 110 // NetBSD has a bug: setcontext clobbers tlsbase, we need to save
 111 // and restore it ourselves.
 112
 113 static __thread __greg_t tlsbase;
 114
 115 static inline void
 116 initcontext(void)
 117 {
 118         ucontext_t c;
 119
 120         getcontext(&c);
 121         tlsbase = c.uc_mcontext._mc_tlsbase;
 122 }
 123
 124 static inline void
 125 fixcontext(ucontext_t* c)
 126 {
 127         c->uc_mcontext._mc_tlsbase = tlsbase;
 128 }
 129
 130 # else
 131 // Any other platform that defines SETCONTEXT_CLOBBERS_TLS needs its own workaround.
 132 #  error unknown case for SETCONTEXT_CLOBBERS_TLS
 133
 134 # endif
 135
 136 #endif
 137
 138 // We can not always refer to the TLS variables directly.  The
 139 // compiler will call tls_get_addr to get the address of the variable,
 140 // and it may hold it in a register across a call to schedule.  When
 141 // we get back from the call we may be running in a different thread,
 142 // in which case the register now points to the TLS variable for a
 143 // different thread.  We use non-inlinable functions to avoid this
 144 // when necessary.
 145 // runtime_g returns the g of the thread it is called on.
 146 G* runtime_g(void) __attribute__ ((noinline, no_split_stack));
 147
 148 G*
 149 runtime_g(void)
 150 {
 151         return g;
 152 }
 153 // runtime_m returns the m of the thread it is called on.  Like runtime_g
     // it is kept out of line so that callers never reuse a stale TLS address.
 154 M* runtime_m(void) __attribute__ ((noinline, no_split_stack));
 155
 156 M*
 157 runtime_m(void)
 158 {
 159         return m;
 160 }
 161
 162 // Set the calling thread's TLS variables m and g to mp and gp.
 163 void
 164 runtime_setmg(M* mp, G* gp)
 165 {
 166         m = mp;
 167         g = gp;
 168 }
 169
 170 // The static TLS size.  Set by inittlssize and added to the stack size
     // of new threads in runtime_newosproc.
 171 static int tlssize;
 172
 173 // Start a new detached thread that runs runtime_mstart with mp as its argument.
 174 static void
 175 runtime_newosproc(M *mp)
 176 {
 177         pthread_attr_t attr;
 178         size_t stacksize;
 179         sigset_t clear, old;
 180         pthread_t tid;
 181         int ret;
 182
 183         if(pthread_attr_init(&attr) != 0)
 184                 runtime_throw("pthread_attr_init");
 185         if(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0)
 186                 runtime_throw("pthread_attr_setdetachstate");
 187
 188         stacksize = PTHREAD_STACK_MIN;
 189
 190         // With glibc before version 2.16 the static TLS size is taken
 191         // out of the stack size, and we get an error or a crash if
 192         // there is not enough stack space left.  Add it back in if we
 193         // can, in case the program uses a lot of TLS space.  FIXME:
 194         // This can be disabled in glibc 2.16 and later, if the bug is
 195         // indeed fixed then.
 196         stacksize += tlssize;
 197
 198         if(pthread_attr_setstacksize(&attr, stacksize) != 0)
 199                 runtime_throw("pthread_attr_setstacksize");
 200
 201         // Block signals during pthread_create so that the new thread
 202         // starts with signals disabled.  It will enable them in minit.
 203         sigfillset(&clear);
 204
 205 #ifdef SIGTRAP
 206         // Blocking SIGTRAP reportedly breaks gdb on Alpha GNU/Linux.
 207         sigdelset(&clear, SIGTRAP);
 208 #endif
 209
 210         sigemptyset(&old);
 211         pthread_sigmask(SIG_BLOCK, &clear, &old);
 212         ret = pthread_create(&tid, &attr, runtime_mstart, mp);
 213         pthread_sigmask(SIG_SETMASK, &old, nil);
 214         pthread_attr_destroy(&attr);  // release attr; pthread_create no longer needs it
 215         if (ret != 0)
 216                 runtime_throw("pthread_create");
 217 }
 218
 219 // First function run by a new goroutine.  This replaces gogocall.
     // Serves a pending traceback request if g->traceback is set, then calls
     // g->entry with g->param and ends the goroutine through runtime_goexit.
 220 static void
 221 kickoff(void)
 222 {
 223         void (*fn)(void*);
 224
 225         if(g->traceback != nil)
 226                 gtraceback(g);
 227
 228         fn = (void (*)(void*))(g->entry);
 229         fn(g->param);
 230         runtime_goexit();
 231 }
 232
 233 // Switch context to a different goroutine.  This is like longjmp.
     // Makes newg the current g and sets newg->fromgogo so that the resumed
     // code can tell it was reached through setcontext.  Does not return
     // normally: if setcontext comes back, runtime_throw is called.
 234 void runtime_gogo(G*) __attribute__ ((noinline));
 235 void
 236 runtime_gogo(G* newg)
 237 {
 238 #ifdef USING_SPLIT_STACK
 239         __splitstack_setcontext(&newg->stack_context[0]);
 240 #endif
 241         g = newg;
 242         newg->fromgogo = true;
 243         fixcontext(&newg->context);  // empty unless SETCONTEXT_CLOBBERS_TLS is defined
 244         setcontext(&newg->context);
 245         runtime_throw("gogo setcontext returned");
 246 }
 247
 248 // Save context and call pfn passing g as a parameter.  This is like
 249 // setjmp.  Because getcontext always returns 0, unlike setjmp, we use
 250 // g->fromgogo as a code.  It will be true if we got here via
 251 // setcontext.  g == nil the first time this is called in a new m.
     // pfn is stored in m->g0->entry and the saved g in m->g0->param before
     // control moves to m->g0's saved context; what that context executes is
     // set up elsewhere.  Calling this while already on m->g0 throws.
 252 void runtime_mcall(void (*)(G*)) __attribute__ ((noinline));
 253 void
 254 runtime_mcall(void (*pfn)(G*))
 255 {
 256         M *mp;
 257         G *gp;
 258
 259         // Ensure that all registers are on the stack for the garbage
 260         // collector.
 261         __builtin_unwind_init();
 262
 263         mp = m;
 264         gp = g;
 265         if(gp == mp->g0)
 266                 runtime_throw("runtime: mcall called on m->g0 stack");
 267
 268         if(gp != nil) {
 269
 270 #ifdef USING_SPLIT_STACK
 271                 __splitstack_getcontext(&g->stack_context[0]);
 272 #else
 273                 gp->gcnext_sp = &pfn;  // NOTE(review): presumably the stack bound used by the GC -- confirm
 274 #endif
 275                 gp->fromgogo = false;
 276                 getcontext(&gp->context);
 277
 278                 // When we return from getcontext, we may be running
 279                 // in a new thread.  That means that m and g may have
 280                 // changed.  They are global variables so we will
 281                 // reload them, but the addresses of m and g may be
 282                 // cached in our local stack frame, and those
 283                 // addresses may be wrong.  Call functions to reload
 284                 // the values for this thread.
 285                 mp = runtime_m();
 286                 gp = runtime_g();
 287
 288                 if(gp->traceback != nil)
 289                         gtraceback(gp);
 290         }
         // fromgogo is false right after the save above and true when this
         // point is reached again through runtime_gogo; only the first case
         // (or a nil g) switches to g0.
 291         if (gp == nil || !gp->fromgogo) {
 292 #ifdef USING_SPLIT_STACK
 293                 __splitstack_setcontext(&mp->g0->stack_context[0]);
 294 #endif
 295                 mp->g0->entry = (byte*)pfn;
 296                 mp->g0->param = gp;
 297
 298                 // It's OK to set g directly here because this case
 299                 // can not occur if we got here via a setcontext to
 300                 // the getcontext call just above.
 301                 g = mp->g0;
 302
 303                 fixcontext(&mp->g0->context);
 304                 setcontext(&mp->g0->context);
 305                 runtime_throw("runtime: mcall function returned");
 306         }
 307 }
 308
 309 #ifdef HAVE_DL_ITERATE_PHDR
 310
 311 // Callback for dl_iterate_phdr.  Adds the p_memsz of every PT_TLS program
 312 // header of one object to the size_t that data points at; always returns 0.
 313 static int
 314 addtls(struct dl_phdr_info* info, size_t size __attribute__ ((unused)), void *data)
 315 {
 316         size_t *sum = (size_t *)data;
 317         unsigned int idx = 0;
 318         while(idx < info->dlpi_phnum) {
 319                 if(info->dlpi_phdr[idx].p_type == PT_TLS)
 320                         *sum += info->dlpi_phdr[idx].p_memsz;
 321                 idx++;
 322         }
 323         return 0;
 324 }
 325
 326 // Set tlssize to the combined size of the PT_TLS segments that addtls
 327 // sums up while dl_iterate_phdr walks the loaded objects.
 328 static void
 329 inittlssize(void)
 330 {
 331         size_t total = 0;
 332
 333         dl_iterate_phdr(addtls, (void *)&total);
 334         tlssize = (int)total;  // tlssize is an int; the narrowing is made explicit
 335 }
 336
 337 #else
 338 // Without dl_iterate_phdr nothing is measured and tlssize keeps its initial 0.
 339 static void
 340 inittlssize(void)
 341 {
 342 }
 343
 344 #endif
 345
 346 // Goroutine scheduler
 347 // The scheduler's job is to distribute ready-to-run goroutines over worker threads.
 348 //
 349 // The main concepts are:
 350 // G - goroutine.
 351 // M - worker thread, or machine.
 352 // P - processor, a resource that is required to execute Go code.
 353 //     M must have an associated P to execute Go code, however it can be
 354 //     blocked or in a syscall w/o an associated P.
 355 //
 356 // Design doc at http://golang.org/s/go11sched.
 357 // Scheduler-wide state.  The only instance is runtime_sched.
 358 typedef struct Sched Sched;
 359 struct Sched {
 360         Lock;   // embedded lock, taken with runtime_lock(&runtime_sched)
 361
 362         uint64  goidgen;        // source of goroutine ids (see GoidCacheBatch)
 363         M*      midle;   // idle m's waiting for work
 364         int32   nmidle;  // number of idle m's waiting for work
 365         int32   nmidlelocked; // number of locked m's waiting for work
 366         int32   mcount;  // number of m's that have been created
 367         int32   maxmcount;      // maximum number of m's allowed (or die)
 368
 369         P*      pidle;  // idle P's
 370         uint32  npidle; // NOTE(review): presumably the length of pidle -- confirm
 371         uint32  nmspinning;     // read together with npidle in runtime_ready to decide on wakep()
 372
 373         // Global runnable queue.
 374         G*      runqhead;
 375         G*      runqtail;
 376         int32   runqsize;
 377
 378         // Global cache of dead G's.
 379         Lock    gflock;
 380         G*      gfree;
 381
 382         uint32  gcwaiting;      // gc is waiting to run
 383         int32   stopwait;       // number of P's runtime_stoptheworld is still waiting for
 384         Note    stopnote;       // runtime_stoptheworld sleeps on this while stopwait > 0
 385         uint32  sysmonwait;     // cleared by runtime_starttheworld, which then wakes sysmonnote
 386         Note    sysmonnote;
 387         uint64  lastpoll;       // initialized from runtime_nanotime() in runtime_schedinit
 388
 389         int32   profilehz;      // cpu profiling rate
 390 };
 391
 392 enum
 393 {
 394         // Upper limit for GOMAXPROCS; runtime_schedinit clamps to it.
 395         // The particular value is not a fundamental restriction.
 396         MaxGomaxprocs = 256,
 397
 398         // How many goroutine ids are moved from runtime_sched.goidgen into a
 399         // per-P cache at once.  16 amortizes well enough; otherwise arbitrary.
 400         GoidCacheBatch = 16,
 401 };
 402 // Scheduler globals.
 403 Sched   runtime_sched;
 404 int32   runtime_gomaxprocs;
 405 uint32  runtime_needextram = 1;
 406 bool    runtime_iscgo = true;
 407 M       runtime_m0;     // the M that runtime_schedinit sets up; runtime_main must run on it
 408 G       runtime_g0;     // idle goroutine for m0
 409 G*      runtime_lastg;
 410 M*      runtime_allm;   // head of the list of all M's, linked through alllink (see mcommoninit)
 411 P**     runtime_allp;   // MaxGomaxprocs+1 entries, allocated in runtime_schedinit
 412 M*      runtime_extram;
 413 int8*   runtime_goos;
 414 int32   runtime_ncpu;   // CPU count consulted by runtime_gcprocs and needaddgcproc
 415 bool    runtime_precisestack;   // set to 0 in runtime_schedinit
 416 static int32    newprocs;       // when non-zero, runtime_starttheworld passes it to procresize and resets it
 417
 418 static  Lock allglock;  // the following vars are protected by this lock or by stoptheworld
 419 G**     runtime_allg;
 420 uintptr runtime_allglen;        // loop bound over runtime_allg in runtime_tracebackothers
 421 static  uintptr allgcap;
 422 // Prototypes for functions that are used before their definitions.
 423 void* runtime_mstart(void*);
 424 static void runqput(P*, G*);
 425 static G* runqget(P*);
 426 static bool runqputslow(P*, G*, uint32, uint32);
 427 static G* runqsteal(P*, P*);
 428 static void mput(M*);
 429 static M* mget(void);
 430 static void mcommoninit(M*);
 431 static void schedule(void);
 432 static void procresize(int32);
 433 static void acquirep(P*);
 434 static P* releasep(void);
 435 static void newm(void(*)(void), P*);
 436 static void stopm(void);
 437 static void startm(P*, bool);
 438 static void handoffp(P*);
 439 static void wakep(void);
 440 static void stoplockedm(void);
 441 static void startlockedm(G*);
 442 static void sysmon(void);
 443 static uint32 retake(int64);
 444 static void incidlelocked(int32);
 445 static void checkdead(void);
 446 static void exitsyscall0(G*);
 447 static void park0(G*);
 448 static void goexit0(G*);
 449 static void gfput(P*, G*);
 450 static G* gfget(P*);
 451 static void gfpurge(P*);
 452 static void globrunqput(G*);
 453 static void globrunqputbatch(G*, G*, int32);
 454 static G* globrunqget(P*, int32);
 455 static P* pidleget(void);
 456 static void pidleput(P*);
 457 static void injectglist(G*);
 458 static bool preemptall(void);
 459 static bool exitsyscallfast(void);
 460 static void allgadd(G*);
 461
 462 // The bootstrap sequence is:
 463 //
 464 //      call osinit
 465 //      call schedinit
 466 //      make & queue new G
 467 //      call runtime_mstart
 468 //
 469 // The new G calls runtime_main.
     //
     // runtime_schedinit binds the calling thread to runtime_m0/runtime_g0,
     // initializes the allocator and m0, reads the arguments, environment and
     // debug variables, and creates the initial P's from GOMAXPROCS.
 470 void
 471 runtime_schedinit(void)
 472 {
 473         int32 n, procs;
 474         const byte *p;
 475         Eface i;
 476
 477         m = &runtime_m0;
 478         g = &runtime_g0;
 479         m->g0 = g;
 480         m->curg = g;
 481         g->m = m;
 482
 483         initcontext();
 484         inittlssize();
 485
 486         runtime_sched.maxmcount = 10000;
 487         runtime_precisestack = 0;
 488
 489         // runtime_symtabinit();
 490         runtime_mallocinit();
 491         mcommoninit(m);
 492
 493         // Initialize the itable value for newErrorCString,
 494         // so that the next time it gets called, possibly
 495         // in a fault during a garbage collection, it will not
 496         // need to allocate memory.
 497         runtime_newErrorCString(0, &i);
 498
 499         // Initialize the cached gotraceback value, since
 500         // gotraceback calls getenv, which mallocs on Plan 9.
 501         runtime_gotraceback(nil);
 502
 503         runtime_goargs();
 504         runtime_goenvs();
 505         runtime_parsedebugvars();
 506
 507         runtime_sched.lastpoll = runtime_nanotime();
         // GOMAXPROCS: a positive value is used, clamped to MaxGomaxprocs;
         // anything else leaves the default of 1.
 508         procs = 1;
 509         p = runtime_getenv("GOMAXPROCS");
 510         if(p != nil && (n = runtime_atoi(p)) > 0) {
 511                 if(n > MaxGomaxprocs)
 512                         n = MaxGomaxprocs;
 513                 procs = n;
 514         }
 515         runtime_allp = runtime_malloc((MaxGomaxprocs+1)*sizeof(runtime_allp[0]));
 516         procresize(procs);
 517
 518         // Can not enable GC until all roots are registered.
 519         // mstats.enablegc = 1;
 520
 521         // if(raceenabled)
 522         //      g->racectx = runtime_raceinit();
 523 }
 524 // Entry points of the Go program, bound to their assembler names.  Both
     // are called from runtime_main: main_init first, then main_main.
 525 extern void main_init(void) __asm__ (GOSYM_PREFIX "__go_init_main");
 526 extern void main_main(void) __asm__ (GOSYM_PREFIX "main.main");
 527 // Deferred function that runtime_main installs during init: unlocks the OS thread.
 528 static void
 529 initDone(void *arg __attribute__ ((unused))) {
 530         runtime_unlockOSThread();
 531 }
 532
 533 // The main goroutine.
 534 // Note: C frames in general are not copyable during stack growth, for two reasons:
 535 //   1) We don't know where in a frame to find pointers to other stack locations.
 536 //   2) There's no guarantee that globals or heap values do not point into the frame.
 537 //
 538 // The C frame for runtime.main is copyable, because:
 539 //   1) There are no pointers to other stack locations in the frame
 540 //      (d.fn points at a global, d.link is nil, d.argp is -1).
 541 //   2) The only pointer into this frame is from the defer chain,
 542 //      which is explicitly handled during stack copying.
     //
     // NOTE(review): the note above uses field names that do not occur in this
     // function.  Here the Defer fields are d.__pfn (initDone), d.__next
     // (g->defer) and d.__arg (-1), and d.__frame additionally points at the
     // local variable frame -- confirm whether the note still applies.
 543 void
 544 runtime_main(void* dummy __attribute__((unused)))
 545 {
 546         Defer d;
 547         _Bool frame;
 548
 549         newm(sysmon, nil);
 550
 551         // Lock the main goroutine onto this, the main OS thread,
 552         // during initialization.  Most programs won't care, but a few
 553         // do require certain calls to be made by the main thread.
 554         // Those can arrange for main.main to run in the main thread
 555         // by calling runtime.LockOSThread during initialization
 556         // to preserve the lock.
 557         runtime_lockOSThread();
 558
 559         // Defer unlock so that runtime.Goexit during init does the unlock too.
 560         d.__pfn = initDone;
 561         d.__next = g->defer;
 562         d.__arg = (void*)-1;
 563         d.__panic = g->panic;
 564         d.__retaddr = nil;
 565         d.__makefunc_can_recover = 0;
 566         d.__frame = &frame;
 567         d.__special = true;
 568         g->defer = &d;
 569
 570         if(m != &runtime_m0)
 571                 runtime_throw("runtime_main not on m0");
 572         __go_go(runtime_MHeap_Scavenger, nil);
 573         main_init();
 574
         // Init must leave our defer entry on top of the chain; pop it and
         // do the unlock it would have done.
 575         if(g->defer != &d || d.__pfn != initDone)
 576                 runtime_throw("runtime: bad defer entry after init");
 577         g->defer = d.__next;
 578         runtime_unlockOSThread();
 579
 580         // For gccgo we have to wait until after main is initialized
 581         // to enable GC, because initializing main registers the GC
 582         // roots.
 583         mstats.enablegc = 1;
 584
 585         main_main();
 586         if(raceenabled)
 587                 runtime_racefini();
 588
 589         // Make racy client program work: if panicking on
 590         // another goroutine at the same time as main returns,
 591         // let the other goroutine finish printing the panic trace.
 592         // Once it does, it will exit. See issue 3934.
 593         if(runtime_panicking)
 594                 runtime_park(nil, nil, "panicwait");
 595
 596         runtime_exit(0);
         // Should runtime_exit ever return, crash by storing through a nil pointer.
 597         for(;;)
 598                 *(int32*)0 = 0;
 599 }
 600 // Print the header line for gp in a goroutine dump: its id, a state
 601 // label, and the whole minutes it has been blocked when that is at least 1.
 602 void
 603 runtime_goroutineheader(G *gp)
 604 {
 605         const char *label = "???";
 606         int64 minutes = 0;
 607
 608         // Map the status to its label; unknown values keep "???".
 609         switch(gp->status) {
 610         case Gidle:
 611                 label = "idle";
 612                 break;
 613         case Grunnable:
 614                 label = "runnable";
 615                 break;
 616         case Grunning:
 617                 label = "running";
 618                 break;
 619         case Gsyscall:
 620                 label = "syscall";
 621                 break;
 622         case Gwaiting:
 623                 // A wait reason, when set, replaces the generic label.
 624                 label = gp->waitreason ? gp->waitreason : "waiting";
 625                 break;
 626         default:
 627                 break;
 628         }
 629
 630         // Only waiting or syscall goroutines with a recorded start time
 631         // get a duration, truncated to whole minutes.
 632         if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince != 0)
 633                 minutes = (runtime_nanotime() - gp->waitsince) / (60LL*1000*1000*1000);
 634
 635         if(minutes >= 1) {
 636                 runtime_printf("goroutine %D [%s, %D minutes]:\n", gp->goid, label, minutes);
 637                 return;
 638         }
 639         runtime_printf("goroutine %D [%s]:\n", gp->goid, label);
 640 }
 641 // Print where g was created, unless g is nil, has no gopc, or has goid 1.
 642 void
 643 runtime_printcreatedby(G *g)
 644 {
 645         String fn;
 646         String file;
 647         intgo line;
 648
 649         if(g == nil || g->gopc == 0 || g->goid == 1)
 650                 return;
 651         if(!__go_file_line(g->gopc - 1, &fn, &file, &line))
 652                 return;
 653         runtime_printf("created by %S\n", fn);
 654         runtime_printf("\t%S:%D\n", file, (int64) line);
 655 }
 656 // State shared between runtime_tracebackothers and gtraceback.
 657 struct Traceback
 658 {
 659         G* gp;  // goroutine that gtraceback switches back to when it is done
 660         Location locbuf[TracebackMaxFrames];    // filled in by runtime_callers
 661         int32 c;        // result of runtime_callers; passed on to runtime_printtrace
 662 };
 663 // Print headers and stack traces of the goroutines other than me.
     // For each goroutine that can be traced, gp->traceback is pointed at tb and
     // control is switched to gp; gp then runs gtraceback, which fills tb,
     // clears gp->traceback and switches back to the context saved in me.
 664 void
 665 runtime_tracebackothers(G * volatile me)
 666 {
 667         G * volatile gp;
 668         Traceback tb;
 669         int32 traceback;
 670         volatile uintptr i;
 671
 672         tb.gp = me;
 673         traceback = runtime_gotraceback(nil);
 674
 675         // Show the current goroutine first, if we haven't already.
 676         if((gp = m->curg) != nil && gp != me) {
 677                 runtime_printf("\n");
 678                 runtime_goroutineheader(gp);
 679                 gp->traceback = &tb;
 680
 681 #ifdef USING_SPLIT_STACK
 682                 __splitstack_getcontext(&me->stack_context[0]);
 683 #endif
 684                 getcontext(&me->context);
 685
                 // Execution passes here twice: once after saving the context
                 // (traceback still set, so switch to gp) and once more when
                 // gtraceback has cleared it and switched back.
 686                 if(gp->traceback != nil) {
 687                   runtime_gogo(gp);
 688                 }
 689
 690                 runtime_printtrace(tb.locbuf, tb.c, false);
 691                 runtime_printcreatedby(gp);
 692         }
 693
 694         runtime_lock(&allglock);
 695         for(i = 0; i < runtime_allglen; i++) {
 696                 gp = runtime_allg[i];
 697                 if(gp == me || gp == m->curg || gp->status == Gdead)
 698                         continue;
                 // System goroutines are listed only at traceback level 2 and above.
 699                 if(gp->issystem && traceback < 2)
 700                         continue;
 701                 runtime_printf("\n");
 702                 runtime_goroutineheader(gp);
 703
 704                 // Our only mechanism for doing a stack trace is
 705                 // _Unwind_Backtrace.  And that only works for the
 706                 // current thread, not for other random goroutines.
 707                 // So we need to switch context to the goroutine, get
 708                 // the backtrace, and then switch back.
 709
 710                 // This means that if g is running or in a syscall, we
 711                 // can't reliably print a stack trace.  FIXME.
 712
 713                 if(gp->status == Grunning) {
 714                         runtime_printf("\tgoroutine running on other thread; stack unavailable\n");
 715                         runtime_printcreatedby(gp);
 716                 } else if(gp->status == Gsyscall) {
 717                         runtime_printf("\tgoroutine in C code; stack unavailable\n");
 718                         runtime_printcreatedby(gp);
 719                 } else {
 720                         gp->traceback = &tb;
 721
 722 #ifdef USING_SPLIT_STACK
 723                         __splitstack_getcontext(&me->stack_context[0]);
 724 #endif
 725                         getcontext(&me->context);
 726
                         // Same two-pass scheme as for the current goroutine above.
 727                         if(gp->traceback != nil) {
 728                                 runtime_gogo(gp);
 729                         }
 730
 731                         runtime_printtrace(tb.locbuf, tb.c, false);
 732                         runtime_printcreatedby(gp);
 733                 }
 734         }
 735         runtime_unlock(&allglock);
 736 }
 737 // Abort the program once more M's exist than runtime_sched.maxmcount allows.
 738 static void
 739 checkmcount(void)
 740 {
 741         // The caller holds the scheduler lock.
 742         if(runtime_sched.mcount <= runtime_sched.maxmcount)
 743                 return;
 744         runtime_printf("runtime: program exceeds %d-thread limit\n", runtime_sched.maxmcount);
 745         runtime_throw("thread exhaustion");
 746 }
 747
 748 // Do a stack trace of gp into the Traceback that gp->traceback points at,
 749 // clear gp->traceback, and then switch to the goroutine stored in that
     // Traceback (traceback->gp).  Does not return to its caller.
 750
 751 static void
 752 gtraceback(G* gp)
 753 {
 754         Traceback* traceback;
 755
 756         traceback = gp->traceback;
 757         gp->traceback = nil;
 758         traceback->c = runtime_callers(1, traceback->locbuf,
 759                 sizeof traceback->locbuf / sizeof traceback->locbuf[0], false);
 760         runtime_gogo(traceback->gp);
 761 }
 762 // Initialization common to every M: records where mp was created, assigns
     // its id (enforcing the thread limit), seeds its random state, runs
     // runtime_mpreinit and publishes mp on the runtime_allm list.
 763 static void
 764 mcommoninit(M *mp)
 765 {
 766         // If there is no mcache runtime_callers() will crash,
 767         // and we are most likely in sysmon thread so the stack is senseless anyway.
 768         if(m->mcache)
 769                 runtime_callers(1, mp->createstack, nelem(mp->createstack), false);
 770
 771         runtime_lock(&runtime_sched);
 772         mp->id = runtime_sched.mcount++;
 773         checkmcount();
 774         // Seed only after mp->id is assigned, so that the id really contributes.
 775         mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks();
 776         runtime_mpreinit(mp);
 777
 778         // Add to runtime_allm so garbage collector doesn't free m
 779         // when it is just in a register or thread-local storage.
 780         mp->alllink = runtime_allm;
 781         // runtime_NumCgoCall() iterates over allm w/o schedlock,
 782         // so we need to publish it safely.
 783         runtime_atomicstorep(&runtime_allm, mp);
 784         runtime_unlock(&runtime_sched);
 785 }
 786
 787 // Mark gp ready to run.
     // gp must be in Gwaiting, otherwise this throws.  It becomes Grunnable and
     // is put on the current P's run queue; wakep() is called when npidle is
     // non-zero and nmspinning is zero.
 788 void
 789 runtime_ready(G *gp)
 790 {
 791         // Mark runnable.
 792         m->locks++;  // disable preemption because it can be holding p in a local var
 793         if(gp->status != Gwaiting) {
 794                 runtime_printf("goroutine %D has status %d\n", gp->goid, gp->status);
 795                 runtime_throw("bad g->status in ready");
 796         }
 797         gp->status = Grunnable;
 798         runqput(m->p, gp);
 799         if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0)  // TODO: fast atomic
 800                 wakep();
 801         m->locks--;
 802 }
 803 // Return how many threads GC may use right now: gomaxprocs, limited by the
 804 // CPU count, by MaxGcproc, and by the number of idle M's plus this one.
 805 int32
 806 runtime_gcprocs(void)
 807 {
 808         int32 procs, avail;
 809
 810         runtime_lock(&runtime_sched);
 811         procs = runtime_gomaxprocs;
 812         if(procs > runtime_ncpu)
 813                 procs = runtime_ncpu > 0 ? runtime_ncpu : 1;
 814         if(procs > MaxGcproc)
 815                 procs = MaxGcproc;
 816         avail = runtime_sched.nmidle+1;  // one M is currently running
 817         if(procs > avail)
 818                 procs = avail;
 819         runtime_unlock(&runtime_sched);
 820         return procs;
 821 }
 822 // Report whether GC wants more threads than the idle M's plus this one supply.
 823 static bool
 824 needaddgcproc(void)
 825 {
 826         int32 want, have;
 827
 828         runtime_lock(&runtime_sched);
 829         want = runtime_gomaxprocs;
 830         if(want > runtime_ncpu)
 831                 want = runtime_ncpu;
 832         if(want > MaxGcproc)
 833                 want = MaxGcproc;
 834         have = runtime_sched.nmidle+1;  // one M is currently running
 835         runtime_unlock(&runtime_sched);
 836         return want - have > 0;
 837 }
 838 // Wake nproc-1 idle M's to help with GC.  Helper k (1 .. nproc-1) gets
 839 // helpgc = k and the mcache of a P other than the one the caller is using.
 840 void
 841 runtime_helpgc(int32 nproc)
 842 {
 843         M *helper;
 844         int32 k = 1, slot = 0;  // k starts at 1: one M is currently running
 845
 846         runtime_lock(&runtime_sched);
 847         while(k < nproc) {
 848                 // Skip the P whose mcache the current M holds.
 849                 if(runtime_allp[slot]->mcache == m->mcache)
 850                         slot++;
 851                 helper = mget();
 852                 if(helper == nil)
 853                         runtime_throw("runtime_gcprocs inconsistency");
 854                 helper->helpgc = k++;
 855                 helper->mcache = runtime_allp[slot++]->mcache;
 856                 runtime_notewakeup(&helper->park);
 857         }
 858         runtime_unlock(&runtime_sched);
 859 }
 860
 861 // Best-effort variant of stoptheworld for use while crashing.  It may be
 862 // called repeatedly and has no counterpart that restarts the world.
 863 // It must not lock any mutexes.
 864 void
 865 runtime_freezetheworld(void)
 866 {
 867         int32 tries;
 868
 869         if(runtime_gomaxprocs == 1)
 870                 return;
 871         // Updates of stopwait and preemption requests can get lost in
 872         // races with threads that are still executing, so make up to
 873         // five attempts.
 874         for(tries = 5; tries > 0; tries--) {
 875                 // Tell the scheduler not to start any new goroutines.
 876                 runtime_sched.stopwait = 0x7fffffff;
 877                 runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
 878                 // Ask running goroutines to stop; finished when none is running.
 879                 if(!preemptall())
 880                         break;
 881                 runtime_usleep(1000);
 882         }
 883         // One more round, to be sure.
 884         runtime_usleep(1000);
 885         preemptall();
 886         runtime_usleep(1000);
 887 }
 888 // Bring every P to Pgcstop.  Sets gcwaiting and requests preemption, stops
     // the current P, the P's in Psyscall and the idle P's directly, then sleeps
     // on stopnote until the rest have stopped.  Throws if any P is not in
     // Pgcstop afterwards.
 889 void
 890 runtime_stoptheworld(void)
 891 {
 892         int32 i;
 893         uint32 s;
 894         P *p;
 895         bool wait;
 896
 897         runtime_lock(&runtime_sched);
 898         runtime_sched.stopwait = runtime_gomaxprocs;
 899         runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
 900         preemptall();
 901         // stop current P
 902         m->p->status = Pgcstop;
 903         runtime_sched.stopwait--;
 904         // try to retake all P's in Psyscall status
 905         for(i = 0; i < runtime_gomaxprocs; i++) {
 906                 p = runtime_allp[i];
 907                 s = p->status;
 908                 if(s == Psyscall && runtime_cas(&p->status, s, Pgcstop))
 909                         runtime_sched.stopwait--;
 910         }
 911         // stop idle P's
 912         while((p = pidleget()) != nil) {
 913                 p->status = Pgcstop;
 914                 runtime_sched.stopwait--;
 915         }
 916         wait = runtime_sched.stopwait > 0;
 917         runtime_unlock(&runtime_sched);
 918
 919         // wait for remaining P's to stop voluntarily
 920         if(wait) {
 921                 runtime_notesleep(&runtime_sched.stopnote);
 922                 runtime_noteclear(&runtime_sched.stopnote);
 923         }
         // sanity checks: nothing may be left to wait for and every P must be stopped
 924         if(runtime_sched.stopwait)
 925                 runtime_throw("stoptheworld: not stopped");
 926         for(i = 0; i < runtime_gomaxprocs; i++) {
 927                 p = runtime_allp[i];
 928                 if(p->status != Pgcstop)
 929                         runtime_throw("stoptheworld: not stopped");
 930         }
 931 }
 932 // Function that runtime_starttheworld passes to newm for an extra GC helper M.
 933 static void
 934 mhelpgc(void)
 935 {
 936         m->helpgc = -1;  // NOTE(review): the meaning of -1 is defined by code not visible here
 937 }
 938 // Restart the scheduler: injects the goroutines returned by a non-blocking
     // netpoll, resizes the P's, clears gcwaiting, hands every idle P that has
     // queued work to an M (an idle one or a new one), wakes sysmon if it is
     // waiting, and optionally starts one more M as a future GC helper.
 939 void
 940 runtime_starttheworld(void)
 941 {
 942         P *p, *p1;
 943         M *mp;
 944         G *gp;
 945         bool add;
 946
 947         m->locks++;  // disable preemption because it can be holding p in a local var
 948         gp = runtime_netpoll(false);  // non-blocking
 949         injectglist(gp);
 950         add = needaddgcproc();
 951         runtime_lock(&runtime_sched);
 952         if(newprocs) {
 953                 procresize(newprocs);
 954                 newprocs = 0;
 955         } else
 956                 procresize(runtime_gomaxprocs);
 957         runtime_sched.gcwaiting = 0;
 958
         // Under the lock, collect the P's that have work into the list p1,
         // pairing each with an idle M when mget() has one.
 959         p1 = nil;
 960         while((p = pidleget()) != nil) {
 961                 // procresize() puts p's with work at the beginning of the list.
 962                 // Once we reach a p without a run queue, the rest don't have one either.
 963                 if(p->runqhead == p->runqtail) {
 964                         pidleput(p);
 965                         break;
 966                 }
 967                 p->m = mget();
 968                 p->link = p1;
 969                 p1 = p;
 970         }
 971         if(runtime_sched.sysmonwait) {
 972                 runtime_sched.sysmonwait = false;
 973                 runtime_notewakeup(&runtime_sched.sysmonnote);
 974         }
 975         runtime_unlock(&runtime_sched);
 976
         // With the lock released, wake the paired M's and start new M's for
         // the P's that did not get one.
 977         while(p1) {
 978                 p = p1;
 979                 p1 = p1->link;
 980                 if(p->m) {
 981                         mp = p->m;
 982                         p->m = nil;
 983                         if(mp->nextp)
 984                                 runtime_throw("starttheworld: inconsistent mp->nextp");
 985                         mp->nextp = p;
 986                         runtime_notewakeup(&mp->park);
 987                 } else {
 988                         // Start M to run P.  Do not start another M below.
 989                         newm(nil, p);
 990                         add = false;
 991                 }
 992         }
 993
 994         if(add) {
 995                 // If GC could have used another helper proc, start one now,
 996                 // in the hope that it will be available next time.
 997                 // It would have been even better to start it before the collection,
 998                 // but doing so requires allocating memory, so it's tricky to
 999                 // coordinate.  This lazy approach works out in practice:
1000                 // we don't mind if the first couple gc rounds don't have quite
1001                 // the maximum number of procs.
1002                 newm(mhelpgc, nil);
1003         }
1004         m->locks--;
1005 }
1006
1007 // Called to start an M.
     // mp is the M* for the calling thread, passed as void* (presumably because
     // this is the thread entry point used by runtime_newosproc -- confirm).
     // Installs m and g (= m->g0) for the thread, records g0's stack/context,
     // runs the per-thread initialisation and then enters the scheduler.
     // The nil return is formal only: schedule() does not return.
1008 void*
1009 runtime_mstart(void* mp)
1010 {
1011         m = (M*)mp;
1012         g = m->g0;
1013
1014         initcontext();
1015
             // Cleared so that the g->entry test after getcontext() is false on
             // the initial pass through this function.
1016         g->entry = nil;
1017         g->param = nil;
1018
1019         // Record top of stack for use by mcall.
1020         // Once we call schedule we're never coming back,
1021         // so other calls can reuse this stack space.
1022 #ifdef USING_SPLIT_STACK
1023         __splitstack_getcontext(&g->stack_context[0]);
1024 #else
1025         g->gcinitial_sp = &mp;
1026         // Setting gcstack_size to 0 is a marker meaning that gcinitial_sp
1027         // is the top of the stack, not the bottom.
1028         g->gcstack_size = 0;
1029         g->gcnext_sp = &mp;
1030 #endif
             // Saves a resume point in g0's context.  Execution can arrive at
             // the next statement a second time when that context is resumed
             // with g->entry set (see the "Got here from mcall" branch).
1031         getcontext(&g->context);
1032
1033         if(g->entry != nil) {
1034                 // Got here from mcall.
1035                 void (*pfn)(G*) = (void (*)(G*))g->entry;
1036                 G* gp = (G*)g->param;
1037                 pfn(gp);
                     // pfn is not expected to return; this store to an invalid
                     // address is a deliberate fault if it does (runtime_needm
                     // uses 0x22 for the equivalent site).
1038                 *(int*)0x21 = 0x21;
1039         }
1040         runtime_minit();
1041
1042 #ifdef USING_SPLIT_STACK
1043         {
1044                 int dont_block_signals = 0;
1045                 __splitstack_block_signals(&dont_block_signals, nil);
1046         }
1047 #endif
1048
1049         // Install signal handlers; after minit so that minit can
1050         // prepare the thread to be able to handle the signals.
1051         if(m == &runtime_m0)
1052                 runtime_initsig();
1053
1054         if(m->mstartfn)
1055                 m->mstartfn();
1056
             // An M flagged as GC helper (mhelpgc sets helpgc) parks right away
             // without a P; any other M except m0 takes the P that its creator
             // stored in nextp.
1057         if(m->helpgc) {
1058                 m->helpgc = 0;
1059                 stopm();
1060         } else if(m != &runtime_m0) {
1061                 acquirep(m->nextp);
1062                 m->nextp = nil;
1063         }
1064         schedule();
1065
1066         // TODO(brainman): This point is never reached, because scheduler
1067         // does not release os threads at the moment. But once this path
1068         // is enabled, we must remove our seh here.
1069
1070         return nil;
1071 }
1072
1073 typedef struct CgoThreadStart CgoThreadStart;
     // NOTE(review): CgoThreadStart is not referenced anywhere in this part of
     // the file.  Going by the field names it bundles the m, g, TLS pointer and
     // start function for a new cgo thread -- confirm against its users before
     // relying on that.
1074 struct CgoThreadStart
1075 {
1076         M *m;
1077         G *g;
1078         uintptr *tls;
1079         void (*fn)(void);
1080 };
1081
1082 // Allocate a new m unassociated with any thread.
1083 // Can use p for allocation context if needed.
     // stacksize, ret_g0_stack and ret_g0_stacksize are forwarded unchanged to
     // runtime_malg() when creating the new m's g0.
     // Returns the new M after mcommoninit(); attaching it to a thread (newm)
     // or to the extra list (runtime_newextram) is left to the caller.
1084 M*
1085 runtime_allocm(P *p, int32 stacksize, byte** ret_g0_stack, size_t* ret_g0_stacksize)
1086 {
1087         M *mp;
1088
1089         m->locks++;  // disable GC because it can be called from sysmon
1090         if(m->p == nil)
1091                 acquirep(p);  // temporarily borrow p for mallocs in this function
     // The typed-allocation setup below is compiled out.
1092 #if 0
1093         if(mtype == nil) {
1094                 Eface e;
1095                 runtime_gc_m_ptr(&e);
1096                 mtype = ((const PtrType*)e.__type_descriptor)->__element_type;
1097         }
1098 #endif
1099
1100         mp = runtime_mal(sizeof *mp);
1101         mcommoninit(mp);
1102         mp->g0 = runtime_malg(stacksize, ret_g0_stack, ret_g0_stacksize);
1103
             // Give back the borrowed p.  NOTE(review): the test is only
             // "p == m->p", so a caller that already held this same p on entry
             // would lose it here too -- confirm no caller does that.
1104         if(p == m->p)
1105                 releasep();
1106         m->locks--;
1107
1108         return mp;
1109 }
1110
1111 static G*
1112 allocg(void)
1113 {
1114         G *gp;
1115         // static Type *gtype;
1116
1117         // if(gtype == nil) {
1118         //      Eface e;
1119         //      runtime_gc_g_ptr(&e);
1120         //      gtype = ((PtrType*)e.__type_descriptor)->__element_type;
1121         // }
1122         // gp = runtime_cnew(gtype);
1123         gp = runtime_malloc(sizeof(G));
1124         return gp;
1125 }
1126
1127 static M* lockextra(bool nilokay);
1128 static void unlockextra(M*);
1129
1130 // needm is called when a cgo callback happens on a
1131 // thread without an m (a thread not created by Go).
1132 // In this case, needm is expected to find an m to use
1133 // and return with m, g initialized correctly.
1134 // Since m and g are not set now (likely nil, but see below)
1135 // needm is limited in what routines it can call. In particular
1136 // it can only call nosplit functions (textflag 7) and cannot
1137 // do any scheduling that requires an m.
1138 //
1139 // In order to avoid needing heavy lifting here, we adopt
1140 // the following strategy: there is a stack of available m's
1141 // that can be stolen. Using compare-and-swap
1142 // to pop from the stack has ABA races, so we simulate
1143 // a lock by doing an exchange (via casp) to steal the stack
1144 // head and replace the top pointer with MLOCKED (1).
1145 // This serves as a simple spin lock that we can use even
1146 // without an m. The thread that locks the stack in this way
1147 // unlocks the stack by storing a valid stack head pointer.
1148 //
1149 // In order to make sure that there is always an m structure
1150 // available to be stolen, we maintain the invariant that there
1151 // is always one more than needed. At the beginning of the
1152 // program (if cgo is in use) the list is seeded with a single m.
1153 // If needm finds that it has taken the last m off the list, its job
1154 // is - once it has installed its own m so that it can do things like
1155 // allocate memory - to create a spare m and put it on the list.
1156 //
1157 // Each of these extra m's also has a g0 and a curg that are
1158 // pressed into service as the scheduling stack and current
1159 // goroutine for the duration of the cgo callback.
1160 //
1161 // When the callback is done with the m, it calls dropm to
1162 // put the m back on the list.
1163 //
1164 // Unlike the gc toolchain, we start running on curg, since we are
1165 // just going to return and let the caller continue.
1166 void
1167 runtime_needm(void)
1168 {
1169         M *mp;
1170
             // A still-set runtime_needextram means the extra list has not been
             // seeded yet, so there is no m to take: report and exit.
1171         if(runtime_needextram) {
1172                 // Can happen if C/C++ code calls Go from a global ctor.
1173                 // Can not throw, because scheduler is not initialized yet.
1174                 int rv __attribute__((unused));
1175                 rv = runtime_write(2, "fatal error: cgo callback before cgo call\n",
1176                         sizeof("fatal error: cgo callback before cgo call\n")-1);
1177                 runtime_exit(1);
1178         }
1179
1180         // Lock extra list, take head, unlock popped list.
1181         // nilokay=false is safe here because of the invariant above,
1182         // that the extra list always contains or will soon contain
1183         // at least one m.
1184         mp = lockextra(false);
1185
1186         // Set needextram when we've just emptied the list,
1187         // so that the eventual call into cgocallbackg will
1188         // allocate a new m for the extra list. We delay the
1189         // allocation until then so that it can be done
1190         // after exitsyscall makes sure it is okay to be
1191         // running at all (that is, there's no garbage collection
1192         // running right now).
1193         mp->needextram = mp->schedlink == nil;
             // Storing the successor as the new head both pops mp and releases
             // the list lock.
1194         unlockextra(mp->schedlink);
1195
1196         // Install m and g (= m->curg).
1197         runtime_setmg(mp, mp->curg);
1198
1199         // Initialize g's context as in mstart.
1200         initcontext();
1201         g->status = Gsyscall;
1202         g->entry = nil;
1203         g->param = nil;
1204 #ifdef USING_SPLIT_STACK
1205         __splitstack_getcontext(&g->stack_context[0]);
1206 #else
1207         g->gcinitial_sp = &mp;
1208         g->gcstack_size = 0;
1209         g->gcnext_sp = &mp;
1210 #endif
1211         getcontext(&g->context);
1212
1213         if(g->entry != nil) {
1214                 // Got here from mcall.
1215                 void (*pfn)(G*) = (void (*)(G*))g->entry;
1216                 G* gp = (G*)g->param;
1217                 pfn(gp);
                     // pfn is not expected to return; deliberate fault if it
                     // does (runtime_mstart uses 0x21 for its equivalent site).
1218                 *(int*)0x22 = 0x22;
1219         }
1220
1221         // Initialize this thread to use the m.
1222         runtime_minit();
1223
1224 #ifdef USING_SPLIT_STACK
1225         {
1226                 int dont_block_signals = 0;
1227                 __splitstack_block_signals(&dont_block_signals, nil);
1228         }
1229 #endif
1230 }
1231
1232 // newextram allocates an m and puts it on the extra list.
1233 // It is called with a working local m, so that it can do things
1234 // like call schedlock and allocate.
     // The new m gets a g0 (created by runtime_allocm) and a dead curg that is
     // locked to it; both stacks are StackMin in size.
1235 void
1236 runtime_newextram(void)
1237 {
1238         M *mp, *mnext;
1239         G *gp;
1240         byte *g0_sp, *sp;
1241         size_t g0_spsize, spsize;
1242
1243         // Create extra goroutine locked to extra m.
1244         // The goroutine is the context in which the cgo callback will run.
1245         // The sched.pc will never be returned to, but setting it to
1246         // runtime.goexit makes clear to the traceback routines where
1247         // the goroutine stack ends.
             // NOTE(review): nothing in this function assigns sched.pc or refers
             // to runtime.goexit; the sentence above appears to describe the gc
             // toolchain version of this code -- confirm.
1248         mp = runtime_allocm(nil, StackMin, &g0_sp, &g0_spsize);
1249         gp = runtime_malg(StackMin, &sp, &spsize);
1250         gp->status = Gdead;
1251         mp->curg = gp;
1252         mp->locked = LockInternal;
1253         mp->lockedg = gp;
1254         gp->lockedm = mp;
1255         gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
1256         // put on allg for garbage collector
1257         allgadd(gp);
1258
1259         // The context for gp will be set up in runtime_needm.  But
1260         // here we need to set up the context for g0.
1261         getcontext(&mp->g0->context);
1262         mp->g0->context.uc_stack.ss_sp = g0_sp;
1263         mp->g0->context.uc_stack.ss_size = g0_spsize;
1264         makecontext(&mp->g0->context, kickoff, 0);
1265
1266         // Add m to the extra list.
             // nilokay=true: the list may legitimately be empty at this point.
             // Publishing mp as the new head also releases the list lock.
1267         mnext = lockextra(true);
1268         mp->schedlink = mnext;
1269         unlockextra(mp);
1270 }
1271
1272 // dropm is called when a cgo callback has called needm but is now
1273 // done with the callback and returning back into the non-Go thread.
1274 // It puts the current m back onto the extra list.
1275 //
1276 // The main expense here is the call to signalstack to release the
1277 // m's signal stack, and then the call to needm on the next callback
1278 // from this thread. It is tempting to try to save the m for next time,
1279 // which would eliminate both these costs, but there might not be
1280 // a next time: the current thread (which Go does not control) might exit.
1281 // If we saved the m for that thread, there would be an m leak each time
1282 // such a thread exited. Instead, we acquire and release an m on each
1283 // call. These should typically not be scheduling operations, just a few
1284 // atomics, so the cost should be small.
1285 //
1286 // TODO(rsc): An alternative would be to allocate a dummy pthread per-thread
1287 // variable using pthread_key_create. Unlike the pthread keys we already use
1288 // on OS X, this dummy key would never be read by Go code. It would exist
1289 // only so that we could register a thread-exit-time destructor.
1290 // That destructor would put the m back onto the extra list.
1291 // This is purely a performance optimization. The current version,
1292 // in which dropm happens on each cgo call, is still correct too.
1293 // We may have to keep the current version on systems with cgo
1294 // but without pthreads, like Windows.
1295 void
1296 runtime_dropm(void)
1297 {
1298         M *mp, *mnext;
1299
1300         // Undo whatever initialization minit did during needm.
1301         runtime_unminit();
1302
1303         // Clear m and g, and return m to the extra list.
1304         // After the call to setmg we can only call nosplit functions.
             // The m is saved in mp first because it is still needed after
             // runtime_setmg(nil, nil) has cleared the thread's m and g.
1305         mp = m;
1306         runtime_setmg(nil, nil);
1307
1308         mp->curg->status = Gdead;
1309
             // Push mp as the new list head; the store in unlockextra() also
             // releases the list lock taken by lockextra().
1310         mnext = lockextra(true);
1311         mp->schedlink = mnext;
1312         unlockextra(mp);
1313 }
1314
1315 #define MLOCKED ((M*)1)
1316
1317 // lockextra locks the extra list and returns the list head.
1318 // The caller must unlock the list by storing a new list head
1319 // to runtime.extram. If nilokay is true, then lockextra will
1320 // return a nil list head if that's what it finds. If nilokay is false,
1321 // lockextra will keep waiting until the list head is no longer nil.
1322 static M*
1323 lockextra(bool nilokay)
1324 {
1325         M *mp;
1326         void (*yield)(void);
1327
1328         for(;;) {
1329                 mp = runtime_atomicloadp(&runtime_extram);
1330                 if(mp == MLOCKED) {
1331                         yield = runtime_osyield;
1332                         yield();
1333                         continue;
1334                 }
1335                 if(mp == nil && !nilokay) {
1336                         runtime_usleep(1);
1337                         continue;
1338                 }
1339                 if(!runtime_casp(&runtime_extram, mp, MLOCKED)) {
1340                         yield = runtime_osyield;
1341                         yield();
1342                         continue;
1343                 }
1344                 break;
1345         }
1346         return mp;
1347 }
1348
1349 static void
1350 unlockextra(M *mp)
1351 {
             // Publishes mp as the new head of the extra list.  Because the
             // lock is the MLOCKED value sitting in runtime_extram, this single
             // atomic store is also what releases the lock taken by lockextra().
1352         runtime_atomicstorep(&runtime_extram, mp);
1353 }
1354
1355 static int32
1356 countextra()
1357 {
1358         M *mp, *mc;
1359         int32 c;
1360
1361         for(;;) {
1362                 mp = runtime_atomicloadp(&runtime_extram);
1363                 if(mp == MLOCKED) {
1364                         runtime_osyield();
1365                         continue;
1366                 }
1367                 if(!runtime_casp(&runtime_extram, mp, MLOCKED)) {
1368                         runtime_osyield();
1369                         continue;
1370                 }
1371                 c = 0;
1372                 for(mc = mp; mc != nil; mc = mc->schedlink)
1373                         c++;
1374                 runtime_atomicstorep(&runtime_extram, mp);
1375                 return c;
1376         }
1377 }
1378
1379 // Create a new m.  It will start off with a call to fn, or else the scheduler.
1380 static void
1381 newm(void(*fn)(void), P *p)
1382 {
1383         M *mp;
1384
1385         mp = runtime_allocm(p, -1, nil, nil);
1386         mp->nextp = p;
1387         mp->mstartfn = fn;
1388
1389         runtime_newosproc(mp);
1390 }
1391
1392 // Stops execution of the current m until new work is available.
1393 // Returns with acquired P.
     // Must be called without locks held and without a P (throws otherwise).
     // A spinning M gives up its spinning status before parking.  If the M is
     // woken with m->helpgc set it runs runtime_gchelper() and parks again
     // rather than returning.
1394 static void
1395 stopm(void)
1396 {
1397         if(m->locks)
1398                 runtime_throw("stopm holding locks");
1399         if(m->p)
1400                 runtime_throw("stopm holding p");
1401         if(m->spinning) {
1402                 m->spinning = false;
1403                 runtime_xadd(&runtime_sched.nmspinning, -1);
1404         }
1405
1406 retry:
             // Register this M via mput() (presumably the idle-M list that
             // mget() draws from -- confirm), then sleep on its park note until
             // some other thread wakes it.
1407         runtime_lock(&runtime_sched);
1408         mput(m);
1409         runtime_unlock(&runtime_sched);
1410         runtime_notesleep(&m->park);
1411         runtime_noteclear(&m->park);
1412         if(m->helpgc) {
1413                 runtime_gchelper();
1414                 m->helpgc = 0;
1415                 m->mcache = nil;
1416                 goto retry;
1417         }
             // The waker stored the P to run in m->nextp before the wakeup
             // (see startm / runtime_starttheworld).
1418         acquirep(m->nextp);
1419         m->nextp = nil;
1420 }
1421
1422 static void
1423 mspinning(void)
1424 {
             // Start function that startm() passes to newm() when it creates a
             // new M on behalf of a spinning request: the new M marks itself
             // spinning.  It does not touch runtime_sched.nmspinning; startm's
             // callers have already incremented that counter.
1425         m->spinning = true;
1426 }
1427
1428 // Schedules some M to run the p (creates an M if necessary).
1429 // If p==nil, tries to get an idle P, if no idle P's does nothing.
     // spinning: when true, the caller has already counted the M being started
     // in runtime_sched.nmspinning (wakep and handoffp do this with a 0->1 cas).
     // That count is undone here if no idle P can be found.
1430 static void
1431 startm(P *p, bool spinning)
1432 {
1433         M *mp;
1434         void (*fn)(void);
1435
1436         runtime_lock(&runtime_sched);
1437         if(p == nil) {
1438                 p = pidleget();
1439                 if(p == nil) {
1440                         runtime_unlock(&runtime_sched);
1441                         if(spinning)
1442                                 runtime_xadd(&runtime_sched.nmspinning, -1);
1443                         return;
1444                 }
1445         }
1446         mp = mget();
1447         runtime_unlock(&runtime_sched);
1448         if(mp == nil) {
                     // No idle M available: create one.  mspinning makes the
                     // new M set its own spinning flag when it starts.
1449                 fn = nil;
1450                 if(spinning)
1451                         fn = mspinning;
1452                 newm(fn, p);
1453                 return;
1454         }
1455         if(mp->spinning)
1456                 runtime_throw("startm: m is spinning");
1457         if(mp->nextp)
1458                 runtime_throw("startm: m has p");
             // Hand the P over through nextp, then wake the parked M; stopm()
             // picks the P up after its notesleep returns.
1459         mp->spinning = spinning;
1460         mp->nextp = p;
1461         runtime_notewakeup(&mp->park);
1462 }
1463
1464 // Hands off P from syscall or locked M.
     // On return p has been disposed of in exactly one way: started on an M
     // (startm), parked as Pgcstop for a pending stop-the-world, or put on the
     // idle P list.
1465 static void
1466 handoffp(P *p)
1467 {
1468         // if it has local work, start it straight away
1469         if(p->runqhead != p->runqtail || runtime_sched.runqsize) {
1470                 startm(p, false);
1471                 return;
1472         }
1473         // no local work, check that there are no spinning/idle M's,
1474         // otherwise our help is not required
1475         if(runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) == 0 &&  // TODO: fast atomic
1476                 runtime_cas(&runtime_sched.nmspinning, 0, 1)) {
1477                 startm(p, true);
1478                 return;
1479         }
1480         runtime_lock(&runtime_sched);
             // A stop-the-world is in progress: count this P as stopped and wake
             // the stopper if it was the last one outstanding.
1481         if(runtime_sched.gcwaiting) {
1482                 p->status = Pgcstop;
1483                 if(--runtime_sched.stopwait == 0)
1484                         runtime_notewakeup(&runtime_sched.stopnote);
1485                 runtime_unlock(&runtime_sched);
1486                 return;
1487         }
             // Re-check the global queue now that the scheduler lock is held.
1488         if(runtime_sched.runqsize) {
1489                 runtime_unlock(&runtime_sched);
1490                 startm(p, false);
1491                 return;
1492         }
1493         // If this is the last running P and nobody is polling network,
1494         // need to wakeup another M to poll network.
1495         if(runtime_sched.npidle == (uint32)runtime_gomaxprocs-1 && runtime_atomicload64(&runtime_sched.lastpoll) != 0) {
1496                 runtime_unlock(&runtime_sched);
1497                 startm(p, false);
1498                 return;
1499         }
1500         pidleput(p);
1501         runtime_unlock(&runtime_sched);
1502 }
1503
1504 // Tries to add one more P to execute G's.
1505 // Called when a G is made runnable (newproc, ready).
1506 static void
1507 wakep(void)
1508 {
1509         // be conservative about spinning threads
1510         if(!runtime_cas(&runtime_sched.nmspinning, 0, 1))
1511                 return;
1512         startm(nil, true);
1513 }
1514
1515 // Stops execution of the current m that is locked to a g until the g is runnable again.
1516 // Returns with acquired P.
     // Throws if m and its lockedg do not point at each other, or if the M is
     // woken while lockedg is not Grunnable.
1517 static void
1518 stoplockedm(void)
1519 {
1520         P *p;
1521
1522         if(m->lockedg == nil || m->lockedg->lockedm != m)
1523                 runtime_throw("stoplockedm: inconsistent locking");
1524         if(m->p) {
1525                 // Schedule another M to run this p.
1526                 p = releasep();
1527                 handoffp(p);
1528         }
1529         incidlelocked(1);
1530         // Wait until another thread schedules lockedg again.
1531         runtime_notesleep(&m->park);
1532         runtime_noteclear(&m->park);
1533         if(m->lockedg->status != Grunnable)
1534                 runtime_throw("stoplockedm: not runnable");
             // startlockedm() stored the P to use in m->nextp before waking us.
1535         acquirep(m->nextp);
1536         m->nextp = nil;
1537 }
1538
1539 // Schedules the locked m to run the locked gp.
     // The current M gives its own P to gp->lockedm, wakes that M, and then
     // parks in stopm() until it is handed a P again.  Throws if gp is locked
     // to the current M or if the target M already has a pending nextp.
1540 static void
1541 startlockedm(G *gp)
1542 {
1543         M *mp;
1544         P *p;
1545
1546         mp = gp->lockedm;
1547         if(mp == m)
1548                 runtime_throw("startlockedm: locked to me");
1549         if(mp->nextp)
1550                 runtime_throw("startlockedm: m has p");
1551         // directly handoff current P to the locked m
             // incidlelocked(-1) undoes the incidlelocked(1) that the target M
             // performed in stoplockedm() before it went to sleep.
1552         incidlelocked(-1);
1553         p = releasep();
1554         mp->nextp = p;
1555         runtime_notewakeup(&mp->park);
1556         stopm();
1557 }
1558
1559 // Stops the current m for stoptheworld.
1560 // Returns when the world is restarted.
     // Throws if no stop-the-world is pending (runtime_sched.gcwaiting unset).
1561 static void
1562 gcstopm(void)
1563 {
1564         P *p;
1565
1566         if(!runtime_sched.gcwaiting)
1567                 runtime_throw("gcstopm: not waiting for gc");
1568         if(m->spinning) {
1569                 m->spinning = false;
1570                 runtime_xadd(&runtime_sched.nmspinning, -1);
1571         }
             // Give up the P, mark it stopped, and wake the stopper if this was
             // the last P it was waiting for.
1572         p = releasep();
1573         runtime_lock(&runtime_sched);
1574         p->status = Pgcstop;
1575         if(--runtime_sched.stopwait == 0)
1576                 runtime_notewakeup(&runtime_sched.stopnote);
1577         runtime_unlock(&runtime_sched);
             // Park; stopm() returns once this M has been given a P again.
1578         stopm();
1579 }
1580
1581 // Schedules gp to run on the current M.
1582 // Never returns.
     // gp must be Grunnable (throws otherwise).  It is marked Grunning and
     // bound to the current M, the P's schedtick is bumped, and control is
     // transferred with runtime_gogo().
1583 static void
1584 execute(G *gp)
1585 {
1586         int32 hz;
1587
1588         if(gp->status != Grunnable) {
1589                 runtime_printf("execute: bad g status %d\n", gp->status);
1590                 runtime_throw("execute: bad g status");
1591         }
1592         gp->status = Grunning;
1593         gp->waitsince = 0;
1594         m->p->schedtick++;
1595         m->curg = gp;
1596         gp->m = m;
1597
1598         // Check whether the profiler needs to be turned on or off.
1599         hz = runtime_sched.profilehz;
1600         if(m->profilehz != hz)
1601                 runtime_resetcpuprofiler(hz);
1602
1603         runtime_gogo(gp);
1604 }
1605
1606 // Finds a runnable goroutine to execute.
1607 // Tries to steal from other P's, get g from global queue, poll network.
     // Does not return until it has a G: when nothing is found the M gives up
     // its P and blocks (in a blocking netpoll or in stopm) before retrying.
     // Search order: local run queue, global run queue, non-blocking netpoll,
     // random stealing from other P's.
1608 static G*
1609 findrunnable(void)
1610 {
1611         G *gp;
1612         P *p;
1613         int32 i;
1614
1615 top:
1616         if(runtime_sched.gcwaiting) {
1617                 gcstopm();
1618                 goto top;
1619         }
1620         if(runtime_fingwait && runtime_fingwake && (gp = runtime_wakefing()) != nil)
1621                 runtime_ready(gp);
1622         // local runq
1623         gp = runqget(m->p);
1624         if(gp)
1625                 return gp;
1626         // global runq
1627         if(runtime_sched.runqsize) {
1628                 runtime_lock(&runtime_sched);
1629                 gp = globrunqget(m->p, 0);
1630                 runtime_unlock(&runtime_sched);
1631                 if(gp)
1632                         return gp;
1633         }
1634         // poll network
1635         gp = runtime_netpoll(false);  // non-blocking
1636         if(gp) {
                     // Keep the first G for ourselves and queue the rest of
                     // the list globally.
1637                 injectglist(gp->schedlink);
1638                 gp->status = Grunnable;
1639                 return gp;
1640         }
1641         // If number of spinning M's >= number of busy P's, block.
1642         // This is necessary to prevent excessive CPU consumption
1643         // when GOMAXPROCS>>1 but the program parallelism is low.
1644         if(!m->spinning && 2 * runtime_atomicload(&runtime_sched.nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched.npidle))  // TODO: fast atomic
1645                 goto stop;
1646         if(!m->spinning) {
1647                 m->spinning = true;
1648                 runtime_xadd(&runtime_sched.nmspinning, 1);
1649         }
1650         // random steal from other P's
1651         for(i = 0; i < 2*runtime_gomaxprocs; i++) {
1652                 if(runtime_sched.gcwaiting)
1653                         goto top;
1654                 p = runtime_allp[runtime_fastrand1()%runtime_gomaxprocs];
1655                 if(p == m->p)
1656                         gp = runqget(p);
1657                 else
1658                         gp = runqsteal(m->p, p);
1659                 if(gp)
1660                         return gp;
1661         }
1662 stop:
1663         // return P and block
1664         runtime_lock(&runtime_sched);
1665         if(runtime_sched.gcwaiting) {
1666                 runtime_unlock(&runtime_sched);
1667                 goto top;
1668         }
1669         if(runtime_sched.runqsize) {
1670                 gp = globrunqget(m->p, 0);
1671                 runtime_unlock(&runtime_sched);
1672                 return gp;
1673         }
1674         p = releasep();
1675         pidleput(p);
1676         runtime_unlock(&runtime_sched);
1677         if(m->spinning) {
1678                 m->spinning = false;
1679                 runtime_xadd(&runtime_sched.nmspinning, -1);
1680         }
1681         // check all runqueues once again
             // If any P has queued work, try to take an idle P back and restart
             // the search; the scan stops at the first non-empty queue either way.
1682         for(i = 0; i < runtime_gomaxprocs; i++) {
1683                 p = runtime_allp[i];
1684                 if(p && p->runqhead != p->runqtail) {
1685                         runtime_lock(&runtime_sched);
1686                         p = pidleget();
1687                         runtime_unlock(&runtime_sched);
1688                         if(p) {
1689                                 acquirep(p);
1690                                 goto top;
1691                         }
1692                         break;
1693                 }
1694         }
1695         // poll network
             // The exchange claims the poller role: lastpoll is 0 while an M is
             // blocked in netpoll (handoffp tests lastpoll != 0 for "nobody is
             // polling") and is set back to the current time afterwards.
1696         if(runtime_xchg64(&runtime_sched.lastpoll, 0) != 0) {
1697                 if(m->p)
1698                         runtime_throw("findrunnable: netpoll with p");
1699                 if(m->spinning)
1700                         runtime_throw("findrunnable: netpoll with spinning");
1701                 gp = runtime_netpoll(true);  // block until new work is available
1702                 runtime_atomicstore64(&runtime_sched.lastpoll, runtime_nanotime());
1703                 if(gp) {
1704                         runtime_lock(&runtime_sched);
1705                         p = pidleget();
1706                         runtime_unlock(&runtime_sched);
1707                         if(p) {
1708                                 acquirep(p);
1709                                 injectglist(gp->schedlink);
1710                                 gp->status = Grunnable;
1711                                 return gp;
1712                         }
                             // No P to run on: queue the whole list globally and
                             // fall through to park.
1713                         injectglist(gp);
1714                 }
1715         }
1716         stopm();
1717         goto top;
1718 }
1719
1720 static void
1721 resetspinning(void)
1722 {
1723         int32 nmspinning;
1724
1725         if(m->spinning) {
1726                 m->spinning = false;
1727                 nmspinning = runtime_xadd(&runtime_sched.nmspinning, -1);
1728                 if(nmspinning < 0)
1729                         runtime_throw("findrunnable: negative nmspinning");
1730         } else
1731                 nmspinning = runtime_atomicload(&runtime_sched.nmspinning);
1732
1733         // M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
1734         // so see if we need to wakeup another P here.
1735         if (nmspinning == 0 && runtime_atomicload(&runtime_sched.npidle) > 0)
1736                 wakep();
1737 }
1738
1739 // Injects the list of runnable G's into the scheduler.
1740 // Can run concurrently with GC.
1741 static void
1742 injectglist(G *glist)
1743 {
1744         int32 n;
1745         G *gp;
1746
1747         if(glist == nil)
1748                 return;
1749         runtime_lock(&runtime_sched);
1750         for(n = 0; glist; n++) {
1751                 gp = glist;
1752                 glist = gp->schedlink;
1753                 gp->status = Grunnable;
1754                 globrunqput(gp);
1755         }
1756         runtime_unlock(&runtime_sched);
1757
1758         for(; n && runtime_sched.npidle; n--)
1759                 startm(nil, false);
1760 }
1761
1762 // One round of scheduler: find a runnable goroutine and execute it.
1763 // Never returns.
     // Must be entered without locks held (throws otherwise).  Sources are
     // tried in order: the global queue (only on every 61st schedtick), the
     // local run queue, then findrunnable(), which blocks until work exists.
1764 static void
1765 schedule(void)
1766 {
1767         G *gp;
1768         uint32 tick;
1769
1770         if(m->locks)
1771                 runtime_throw("schedule: holding locks");
1772
1773 top:
1774         if(runtime_sched.gcwaiting) {
1775                 gcstopm();
1776                 goto top;
1777         }
1778
1779         gp = nil;
1780         // Check the global runnable queue once in a while to ensure fairness.
1781         // Otherwise two goroutines can completely occupy the local runqueue
1782         // by constantly respawning each other.
1783         tick = m->p->schedtick;
1784         // This is a fancy way to say tick%61==0,
1785         // it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors.
1786         if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime_sched.runqsize > 0) {
1787                 runtime_lock(&runtime_sched);
1788                 gp = globrunqget(m->p, 1);
1789                 runtime_unlock(&runtime_sched);
1790                 if(gp)
1791                         resetspinning();
1792         }
1793         if(gp == nil) {
1794                 gp = runqget(m->p);
1795                 if(gp && m->spinning)
1796                         runtime_throw("schedule: spinning with local work");
1797         }
1798         if(gp == nil) {
1799                 gp = findrunnable();  // blocks until work is available
1800                 resetspinning();
1801         }
1802
             // A G that is locked to an M is not run here; it is handed to its
             // own M and this M starts over once it has a P again.
1803         if(gp->lockedm) {
1804                 // Hands off own p to the locked m,
1805                 // then blocks waiting for a new p.
1806                 startlockedm(gp);
1807                 goto top;
1808         }
1809
1810         execute(gp);
1811 }
1812
1813 // Puts the current goroutine into a waiting state and calls unlockf.
1814 // If unlockf returns false, the goroutine is resumed.
     // unlockf may be nil, in which case park0 skips the call.  lock is passed
     // to unlockf as its second argument; reason is recorded in g->waitreason.
     // Throws if the current g is not Grunning.
1815 void
1816 runtime_park(bool(*unlockf)(G*, void*), void *lock, const char *reason)
1817 {
1818         if(g->status != Grunning)
1819                 runtime_throw("bad g status");
             // Stash the arguments on the M; park0 reads them back after the
             // switch made by runtime_mcall (onto g0, per park0's header comment).
1820         m->waitlock = lock;
1821         m->waitunlockf = unlockf;
1822         g->waitreason = reason;
1823         runtime_mcall(park0);
1824 }
1825
1826 static bool
1827 parkunlock(G *gp, void *lock)
1828 {
             // unlockf callback used by runtime_parkunlock: releases lock and
             // returns true so that the goroutine stays parked.  gp is unused.
1829         USED(gp);
1830         runtime_unlock(lock);
1831         return true;
1832 }
1833
1834 // Puts the current goroutine into a waiting state and unlocks the lock.
1835 // The goroutine can be made runnable again by calling runtime_ready(gp).
1836 void
1837 runtime_parkunlock(Lock *lock, const char *reason)
1838 {
1839         runtime_park(parkunlock, lock, reason);
1840 }
1841
1842 // runtime_park continuation on g0.
1843 static void
1844 park0(G *gp)
1845 {
1846         bool ok;
1847
1848         gp->status = Gwaiting;
1849         gp->m = nil;
1850         m->curg = nil;
1851         if(m->waitunlockf) {
1852                 ok = m->waitunlockf(gp, m->waitlock);
1853                 m->waitunlockf = nil;
1854                 m->waitlock = nil;
1855                 if(!ok) {
1856                         gp->status = Grunnable;
1857                         execute(gp);  // Schedule it back, never returns.
1858                 }
1859         }
1860         if(m->lockedg) {
1861                 stoplockedm();
1862                 execute(gp);  // Never returns.
1863         }
1864         schedule();
1865 }
1866
1867 // Scheduler yield.
1868 void
1869 runtime_gosched(void)
1870 {
1871         if(g->status != Grunning)
1872                 runtime_throw("bad g status");
1873         runtime_mcall(runtime_gosched0);
1874 }
1875
1876 // runtime_gosched continuation on g0.
1877 void
1878 runtime_gosched0(G *gp)
1879 {
1880         gp->status = Grunnable;
1881         gp->m = nil;
1882         m->curg = nil;
1883         runtime_lock(&runtime_sched);
1884         globrunqput(gp);
1885         runtime_unlock(&runtime_sched);
1886         if(m->lockedg) {
1887                 stoplockedm();
1888                 execute(gp);  // Never returns.
1889         }
1890         schedule();
1891 }
1892
1893 // Finishes execution of the current goroutine.
1894 // Need to mark it as nosplit, because it runs with sp > stackbase (as runtime_lessstack).
1895 // Since it does not return it does not matter.  But if it is preempted
1896 // at the split stack check, GC will complain about inconsistent sp.
1897 void
1898 runtime_goexit(void)
1899 {
1900         if(g->status != Grunning)
1901                 runtime_throw("bad g status");
1902         if(raceenabled)
1903                 runtime_racegoend();
1904         runtime_mcall(goexit0);
1905 }
1906
1907 // runtime_goexit continuation on g0.
1908 static void
1909 goexit0(G *gp)
1910 {
1911         gp->status = Gdead;
1912         gp->entry = nil;
1913         gp->m = nil;
1914         gp->lockedm = nil;
1915         gp->paniconfault = 0;
1916         gp->defer = nil; // should be true already but just in case.
1917         gp->panic = nil; // non-nil for Goexit during panic. points at stack-allocated data.
1918         gp->writenbuf = 0;
1919         gp->writebuf = nil;
1920         gp->waitreason = nil;
1921         gp->param = nil;
1922         m->curg = nil;
1923         m->lockedg = nil;
1924         if(m->locked & ~LockExternal) {
1925                 runtime_printf("invalid m->locked = %d\n", m->locked);
1926                 runtime_throw("internal lockOSThread error");
1927         }
1928         m->locked = 0;
1929         gfput(m->p, gp);
1930         schedule();
1931 }
1932
// The goroutine g is about to enter a system call.
// Record that it's not using the cpu anymore.
// This is called only from the go syscall library and cgocall,
// not from the low-level system calls used by the runtime.
//
// Entersyscall cannot split the stack: the runtime_gosave must
// make g->sched refer to the caller's stack segment, because
// entersyscall is going to return immediately after.
//
// The work is split in two: this function only captures the register
// state into g->gcregs, and doentersyscall (noinline) does the rest.

void runtime_entersyscall(void) __attribute__ ((no_split_stack));
static void doentersyscall(void) __attribute__ ((no_split_stack, noinline));

void
runtime_entersyscall()
{
        // Save the registers in the g structure so that any pointers
        // held in registers will be seen by the garbage collector.
        getcontext(&g->gcregs);

        // Do the work in a separate function, so that this function
        // doesn't save any registers on its own stack.  If this
        // function does save any registers, we might store the wrong
        // value in the call to getcontext.
        //
        // FIXME: This assumes that we do not need to save any
        // callee-saved registers to access the TLS variable g.  We
        // don't want to put the ucontext_t on the stack because it is
        // large and we can not split the stack here.
        doentersyscall();
}
1963
// Body of runtime_entersyscall, kept in a separate noinline function (see
// the comment in runtime_entersyscall).  Records stack information in g,
// switches g to Gsyscall and moves the m's P to Psyscall.  m->p itself is
// left pointing at the P.
static void
doentersyscall()
{
        // Disable preemption because during this function g is in Gsyscall status,
        // but can have inconsistent g->sched, do not let GC observe it.
        m->locks++;

        // Leave SP around for GC and traceback.
#ifdef USING_SPLIT_STACK
        g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size,
                                       &g->gcnext_segment, &g->gcnext_sp,
                                       &g->gcinitial_sp);
#else
        {
                void *v;

                // The address of a local stands in for the current stack pointer.
                g->gcnext_sp = (byte *) &v;
        }
#endif

        g->status = Gsyscall;

        // If sysmonwait is set, clear it and wake the waiter on sysmonnote.
        // The flag is checked once without the lock and again under it.
        if(runtime_atomicload(&runtime_sched.sysmonwait)) {  // TODO: fast atomic
                runtime_lock(&runtime_sched);
                if(runtime_atomicload(&runtime_sched.sysmonwait)) {
                        runtime_atomicstore(&runtime_sched.sysmonwait, 0);
                        runtime_notewakeup(&runtime_sched.sysmonnote);
                }
                runtime_unlock(&runtime_sched);
        }

        // Drop the mcache and the P's back-pointer to this m, then mark the
        // P as being in a syscall.
        m->mcache = nil;
        m->p->m = nil;
        runtime_atomicstore(&m->p->status, Psyscall);
        // When gcwaiting is set, try to move the P from Psyscall to Pgcstop.
        // On success, decrement stopwait and wake stopnote once it reaches zero.
        if(runtime_sched.gcwaiting) {
                runtime_lock(&runtime_sched);
                if (runtime_sched.stopwait > 0 && runtime_cas(&m->p->status, Psyscall, Pgcstop)) {
                        if(--runtime_sched.stopwait == 0)
                                runtime_notewakeup(&runtime_sched.stopnote);
                }
                runtime_unlock(&runtime_sched);
        }

        m->locks--;
}
2009
// The same as runtime_entersyscall(), but with a hint that the syscall is blocking.
// Instead of leaving the P in Psyscall, this releases the P from the m
// and passes it to handoffp.
void
runtime_entersyscallblock(void)
{
        P *p;

        m->locks++;  // see comment in entersyscall

        // Leave SP around for GC and traceback.
#ifdef USING_SPLIT_STACK
        g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size,
                                       &g->gcnext_segment, &g->gcnext_sp,
                                       &g->gcinitial_sp);
#else
        // The address of the local p stands in for the current stack pointer.
        g->gcnext_sp = (byte *) &p;
#endif

        // Save the registers in the g structure so that any pointers
        // held in registers will be seen by the garbage collector.
        getcontext(&g->gcregs);

        g->status = Gsyscall;

        // Give up the P right away.
        p = releasep();
        handoffp(p);
        if(g->isbackground)  // do not consider blocked scavenger for deadlock detection
                incidlelocked(1);

        m->locks--;
}
2040
// The goroutine g exited its system call.
// Arrange for it to run on a cpu again.
// This is called only from the go syscall library, not
// from the low-level system calls used by the runtime.
//
// Fast path: exitsyscallfast() obtains a P and the goroutine keeps running.
// Slow path: switch to g0 and run exitsyscall0, which returns here only
// once the goroutine has been scheduled again.
void
runtime_exitsyscall(void)
{
        G *gp;

        m->locks++;  // see comment in entersyscall

        gp = g;
        // Undo the incidlelocked(1) done for background goroutines in
        // runtime_entersyscallblock.
        if(gp->isbackground)  // do not consider blocked scavenger for deadlock detection
                incidlelocked(-1);

        g->waitsince = 0;
        if(exitsyscallfast()) {
                // There's a cpu for us, so we can run.
                m->p->syscalltick++;
                gp->status = Grunning;
                // Garbage collector isn't running (since we are),
                // so okay to clear gcstack and gcsp.
#ifdef USING_SPLIT_STACK
                gp->gcstack = nil;
#endif
                gp->gcnext_sp = nil;
                runtime_memclr(&gp->gcregs, sizeof gp->gcregs);
                m->locks--;
                return;
        }

        m->locks--;

        // Call the scheduler.
        runtime_mcall(exitsyscall0);

        // Scheduler returned, so we're allowed to run now.
        // Delete the gcstack information that we left for
        // the garbage collector during the system call.
        // Must wait until now because until gosched returns
        // we don't know for sure that the garbage collector
        // is not running.
#ifdef USING_SPLIT_STACK
        gp->gcstack = nil;
#endif
        gp->gcnext_sp = nil;
        runtime_memclr(&gp->gcregs, sizeof gp->gcregs);

        // Don't refer to m again, we might be running on a different
        // thread after returning from runtime_mcall.
        runtime_m()->p->syscalltick++;
}
2093
2094 static bool
2095 exitsyscallfast(void)
2096 {
2097         P *p;
2098
2099         // Freezetheworld sets stopwait but does not retake P's.
2100         if(runtime_sched.stopwait) {
2101                 m->p = nil;
2102                 return false;
2103         }
2104
2105         // Try to re-acquire the last P.
2106         if(m->p && m->p->status == Psyscall && runtime_cas(&m->p->status, Psyscall, Prunning)) {
2107                 // There's a cpu for us, so we can run.
2108                 m->mcache = m->p->mcache;
2109                 m->p->m = m;
2110                 return true;
2111         }
2112         // Try to get any other idle P.
2113         m->p = nil;
2114         if(runtime_sched.pidle) {
2115                 runtime_lock(&runtime_sched);
2116                 p = pidleget();
2117                 if(p && runtime_atomicload(&runtime_sched.sysmonwait)) {
2118                         runtime_atomicstore(&runtime_sched.sysmonwait, 0);
2119                         runtime_notewakeup(&runtime_sched.sysmonnote);
2120                 }
2121                 runtime_unlock(&runtime_sched);
2122                 if(p) {
2123                         acquirep(p);
2124                         return true;
2125                 }
2126         }
2127         return false;
2128 }
2129
2130 // runtime_exitsyscall slow path on g0.
2131 // Failed to acquire P, enqueue gp as runnable.
2132 static void
2133 exitsyscall0(G *gp)
2134 {
2135         P *p;
2136
2137         gp->status = Grunnable;
2138         gp->m = nil;
2139         m->curg = nil;
2140         runtime_lock(&runtime_sched);
2141         p = pidleget();
2142         if(p == nil)
2143                 globrunqput(gp);
2144         else if(runtime_atomicload(&runtime_sched.sysmonwait)) {
2145                 runtime_atomicstore(&runtime_sched.sysmonwait, 0);
2146                 runtime_notewakeup(&runtime_sched.sysmonnote);
2147         }
2148         runtime_unlock(&runtime_sched);
2149         if(p) {
2150                 acquirep(p);
2151                 execute(gp);  // Never returns.
2152         }
2153         if(m->lockedg) {
2154                 // Wait until another thread schedules gp and so m again.
2155                 stoplockedm();
2156                 execute(gp);  // Never returns.
2157         }
2158         stopm();
2159         schedule();  // Never returns.
2160 }
2161
2162 // Called from syscall package before fork.
2163 void syscall_runtime_BeforeFork(void)
2164   __asm__(GOSYM_PREFIX "syscall.runtime_BeforeFork");
2165 void
2166 syscall_runtime_BeforeFork(void)
2167 {
2168         // Fork can hang if preempted with signals frequently enough (see issue 5517).
2169         // Ensure that we stay on the same M where we disable profiling.
2170         runtime_m()->locks++;
2171         if(runtime_m()->profilehz != 0)
2172                 runtime_resetcpuprofiler(0);
2173 }
2174
2175 // Called from syscall package after fork in parent.
2176 void syscall_runtime_AfterFork(void)
2177   __asm__(GOSYM_PREFIX "syscall.runtime_AfterFork");
2178 void
2179 syscall_runtime_AfterFork(void)
2180 {
2181         int32 hz;
2182
2183         hz = runtime_sched.profilehz;
2184         if(hz != 0)
2185                 runtime_resetcpuprofiler(hz);
2186         runtime_m()->locks--;
2187 }
2188
2189 // Allocate a new g, with a stack big enough for stacksize bytes.
2190 G*
2191 runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize)
2192 {
2193         G *newg;
2194
2195         newg = allocg();
2196         if(stacksize >= 0) {
2197 #if USING_SPLIT_STACK
2198                 int dont_block_signals = 0;
2199
2200                 *ret_stack = __splitstack_makecontext(stacksize,
2201                                                       &newg->stack_context[0],
2202                                                       ret_stacksize);
2203                 __splitstack_block_signals_context(&newg->stack_context[0],
2204                                                    &dont_block_signals, nil);
2205 #else
2206                 *ret_stack = runtime_mallocgc(stacksize, 0, FlagNoProfiling|FlagNoGC);
2207                 *ret_stacksize = stacksize;
2208                 newg->gcinitial_sp = *ret_stack;
2209                 newg->gcstack_size = stacksize;
2210                 runtime_xadd(&runtime_stacks_sys, stacksize);
2211 #endif
2212         }
2213         return newg;
2214 }
2215
2216 /* For runtime package testing.  */
2217
2218
// NOTE: the comment below documents __go_go, which is defined after the
// two runtime package testing wrappers that follow.
// Create a new g running fn with siz bytes of arguments.
// Put it on the queue of g's waiting to run.
// The compiler turns a go statement into a call to this.
// Cannot split the stack because it assumes that the arguments
// are available sequentially after &fn; they would not be
// copied if a stack split occurred.  It's OK for this to call
// functions that split the stack.
2226 void runtime_testing_entersyscall(void)
2227   __asm__ (GOSYM_PREFIX "runtime.entersyscall");
2228 void
2229 runtime_testing_entersyscall()
2230 {
2231         runtime_entersyscall();
2232 }
2233
2234 void runtime_testing_exitsyscall(void)
2235   __asm__ (GOSYM_PREFIX "runtime.exitsyscall");
2236
2237 void
2238 runtime_testing_exitsyscall()
2239 {
2240         runtime_exitsyscall();
2241 }
2242
// Creates a new goroutine and queues it on the current P.
//
// fn   stored as the new g's entry; must not be nil (throws otherwise).
// arg  stored as the new g's param.
//
// The g is taken from the P's free list when possible, otherwise it is
// freshly allocated with a StackMin stack and registered with allgadd.
// Its context is set up to start in kickoff (presumably kickoff calls
// entry with param -- not visible here, confirm against kickoff).
// Returns the new g.
G*
__go_go(void (*fn)(void*), void* arg)
{
        byte *sp;
        size_t spsize;
        G *newg;
        P *p;

//runtime_printf("newproc1 %p %p narg=%d nret=%d\n", fn->fn, argp, narg, nret);
        if(fn == nil) {
                m->throwing = -1;  // do not dump full stacks
                runtime_throw("go of nil func value");
        }
        m->locks++;  // disable preemption because it can be holding p in a local var

        p = m->p;
        if((newg = gfget(p)) != nil) {
                // Reusing a dead g: recover its stack.
#ifdef USING_SPLIT_STACK
                int dont_block_signals = 0;

                sp = __splitstack_resetcontext(&newg->stack_context[0],
                                               &spsize);
                __splitstack_block_signals_context(&newg->stack_context[0],
                                                   &dont_block_signals, nil);
#else
                // These fields were filled in by runtime_malg.
                sp = newg->gcinitial_sp;
                spsize = newg->gcstack_size;
                if(spsize == 0)
                        runtime_throw("bad spsize in __go_go");
                newg->gcnext_sp = sp;
#endif
        } else {
                newg = runtime_malg(StackMin, &sp, &spsize);
                allgadd(newg);
        }

        newg->entry = (byte*)fn;
        newg->param = arg;
        newg->gopc = (uintptr)__builtin_return_address(0);
        newg->status = Grunnable;
        // Goroutine ids are handed out from a per-P cache that is refilled
        // from runtime_sched.goidgen in batches of GoidCacheBatch.
        if(p->goidcache == p->goidcacheend) {
                p->goidcache = runtime_xadd64(&runtime_sched.goidgen, GoidCacheBatch);
                p->goidcacheend = p->goidcache + GoidCacheBatch;
        }
        newg->goid = p->goidcache++;

        {
                // Avoid warnings about variables clobbered by
                // longjmp.
                byte * volatile vsp = sp;
                size_t volatile vspsize = spsize;
                G * volatile vnewg = newg;

                // Build a context that runs kickoff on the new stack.
                getcontext(&vnewg->context);
                vnewg->context.uc_stack.ss_sp = vsp;
#ifdef MAKECONTEXT_STACK_TOP
                vnewg->context.uc_stack.ss_sp += vspsize;
#endif
                vnewg->context.uc_stack.ss_size = vspsize;
                makecontext(&vnewg->context, kickoff, 0);

                runqput(p, vnewg);

                // Call wakep() when there are idle P's and no spinning M's,
                // except when starting runtime_main.
                if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main)  // TODO: fast atomic
                        wakep();
                m->locks--;
                return vnewg;
        }
}
2312
2313 static void
2314 allgadd(G *gp)
2315 {
2316         G **new;
2317         uintptr cap;
2318
2319         runtime_lock(&allglock);
2320         if(runtime_allglen >= allgcap) {
2321                 cap = 4096/sizeof(new[0]);
2322                 if(cap < 2*allgcap)
2323                         cap = 2*allgcap;
2324                 new = runtime_malloc(cap*sizeof(new[0]));
2325                 if(new == nil)
2326                         runtime_throw("runtime: cannot allocate memory");
2327                 if(runtime_allg != nil) {
2328                         runtime_memmove(new, runtime_allg, runtime_allglen*sizeof(new[0]));
2329                         runtime_free(runtime_allg);
2330                 }
2331                 runtime_allg = new;
2332                 allgcap = cap;
2333         }
2334         runtime_allg[runtime_allglen++] = gp;
2335         runtime_unlock(&allglock);
2336 }
2337
2338 // Put on gfree list.
2339 // If local list is too long, transfer a batch to the global list.
2340 static void
2341 gfput(P *p, G *gp)
2342 {
2343         gp->schedlink = p->gfree;
2344         p->gfree = gp;
2345         p->gfreecnt++;
2346         if(p->gfreecnt >= 64) {
2347                 runtime_lock(&runtime_sched.gflock);
2348                 while(p->gfreecnt >= 32) {
2349                         p->gfreecnt--;
2350                         gp = p->gfree;
2351                         p->gfree = gp->schedlink;
2352                         gp->schedlink = runtime_sched.gfree;
2353                         runtime_sched.gfree = gp;
2354                 }
2355                 runtime_unlock(&runtime_sched.gflock);
2356         }
2357 }
2358
2359 // Get from gfree list.
2360 // If local list is empty, grab a batch from global list.
2361 static G*
2362 gfget(P *p)
2363 {
2364         G *gp;
2365
2366 retry:
2367         gp = p->gfree;
2368         if(gp == nil && runtime_sched.gfree) {
2369                 runtime_lock(&runtime_sched.gflock);
2370                 while(p->gfreecnt < 32 && runtime_sched.gfree) {
2371                         p->gfreecnt++;
2372                         gp = runtime_sched.gfree;
2373                         runtime_sched.gfree = gp->schedlink;
2374                         gp->schedlink = p->gfree;
2375                         p->gfree = gp;
2376                 }
2377                 runtime_unlock(&runtime_sched.gflock);
2378                 goto retry;
2379         }
2380         if(gp) {
2381                 p->gfree = gp->schedlink;
2382                 p->gfreecnt--;
2383         }
2384         return gp;
2385 }
2386
2387 // Purge all cached G's from gfree list to the global list.
2388 static void
2389 gfpurge(P *p)
2390 {
2391         G *gp;
2392
2393         runtime_lock(&runtime_sched.gflock);
2394         while(p->gfreecnt) {
2395                 p->gfreecnt--;
2396                 gp = p->gfree;
2397                 p->gfree = gp->schedlink;
2398                 gp->schedlink = runtime_sched.gfree;
2399                 runtime_sched.gfree = gp;
2400         }
2401         runtime_unlock(&runtime_sched.gflock);
2402 }
2403
// Thin wrapper that forwards to runtime_breakpoint.
void
runtime_Breakpoint(void)
{
        runtime_breakpoint();
}
2409
2410 void runtime_Gosched (void) __asm__ (GOSYM_PREFIX "runtime.Gosched");
2411
2412 void
2413 runtime_Gosched(void)
2414 {
2415         runtime_gosched();
2416 }
2417
2418 // Implementation of runtime.GOMAXPROCS.
2419 // delete when scheduler is even stronger
2420 int32
2421 runtime_gomaxprocsfunc(int32 n)
2422 {
2423         int32 ret;
2424
2425         if(n > MaxGomaxprocs)
2426                 n = MaxGomaxprocs;
2427         runtime_lock(&runtime_sched);
2428         ret = runtime_gomaxprocs;
2429         if(n <= 0 || n == ret) {
2430                 runtime_unlock(&runtime_sched);
2431                 return ret;
2432         }
2433         runtime_unlock(&runtime_sched);
2434
2435         runtime_semacquire(&runtime_worldsema, false);
2436         m->gcing = 1;
2437         runtime_stoptheworld();
2438         newprocs = n;
2439         m->gcing = 0;
2440         runtime_semrelease(&runtime_worldsema);
2441         runtime_starttheworld();
2442
2443         return ret;
2444 }
2445
2446 // lockOSThread is called by runtime.LockOSThread and runtime.lockOSThread below
2447 // after they modify m->locked. Do not allow preemption during this call,
2448 // or else the m might be different in this function than in the caller.
2449 static void
2450 lockOSThread(void)
2451 {
2452         m->lockedg = g;
2453         g->lockedm = m;
2454 }
2455
2456 void    runtime_LockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.LockOSThread");
2457 void
2458 runtime_LockOSThread(void)
2459 {
2460         m->locked |= LockExternal;
2461         lockOSThread();
2462 }
2463
2464 void
2465 runtime_lockOSThread(void)
2466 {
2467         m->locked += LockInternal;
2468         lockOSThread();
2469 }
2470
2471
2472 // unlockOSThread is called by runtime.UnlockOSThread and runtime.unlockOSThread below
2473 // after they update m->locked. Do not allow preemption during this call,
2474 // or else the m might be in different in this function than in the caller.
2475 static void
2476 unlockOSThread(void)
2477 {
2478         if(m->locked != 0)
2479                 return;
2480         m->lockedg = nil;
2481         g->lockedm = nil;
2482 }
2483
2484 void    runtime_UnlockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.UnlockOSThread");
2485
2486 void
2487 runtime_UnlockOSThread(void)
2488 {
2489         m->locked &= ~LockExternal;
2490         unlockOSThread();
2491 }
2492
2493 void
2494 runtime_unlockOSThread(void)
2495 {
2496         if(m->locked < LockInternal)
2497                 runtime_throw("runtime: internal error: misuse of lockOSThread/unlockOSThread");
2498         m->locked -= LockInternal;
2499         unlockOSThread();
2500 }
2501
2502 bool
2503 runtime_lockedOSThread(void)
2504 {
2505         return g->lockedm != nil && m->lockedg != nil;
2506 }
2507
2508 int32
2509 runtime_gcount(void)
2510 {
2511         G *gp;
2512         int32 n, s;
2513         uintptr i;
2514
2515         n = 0;
2516         runtime_lock(&allglock);
2517         // TODO(dvyukov): runtime.NumGoroutine() is O(N).
2518         // We do not want to increment/decrement centralized counter in newproc/goexit,
2519         // just to make runtime.NumGoroutine() faster.
2520         // Compromise solution is to introduce per-P counters of active goroutines.
2521         for(i = 0; i < runtime_allglen; i++) {
2522                 gp = runtime_allg[i];
2523                 s = gp->status;
2524                 if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting)
2525                         n++;
2526         }
2527         runtime_unlock(&allglock);
2528         return n;
2529 }
2530
2531 int32
2532 runtime_mcount(void)
2533 {
2534         return runtime_sched.mcount;
2535 }
2536
// CPU profiling state shared by runtime_setcpuprofilerate (writer) and
// runtime_sigprof (reader).  The embedded Lock is what
// runtime_lock(&prof) / runtime_unlock(&prof) operate on.
static struct {
        Lock;
        void (*fn)(uintptr*, int32);  // receives the sampled pcs and their count; nil when profiling is off
        int32 hz;  // requested sampling rate; 0 when profiling is off
        uintptr pcbuf[TracebackMaxFrames];  // pcs handed to fn
        Location locbuf[TracebackMaxFrames];  // filled by runtime_callers, then copied into pcbuf
} prof;

// Placeholder functions: runtime_sigprof reports the address of one of
// them as the second pc when it does not take a real traceback.
static void System(void) {}
static void GC(void) {}
2547
2548 // Called if we receive a SIGPROF signal.
2549 void
2550 runtime_sigprof()
2551 {
2552         M *mp = m;
2553         int32 n, i;
2554         bool traceback;
2555
2556         if(prof.fn == nil || prof.hz == 0)
2557                 return;
2558
2559         if(mp == nil)
2560                 return;
2561
2562         // Profiling runs concurrently with GC, so it must not allocate.
2563         mp->mallocing++;
2564
2565         traceback = true;
2566
2567         if(mp->mcache == nil)
2568                 traceback = false;
2569
2570         runtime_lock(&prof);
2571         if(prof.fn == nil) {
2572                 runtime_unlock(&prof);
2573                 mp->mallocing--;
2574                 return;
2575         }
2576         n = 0;
2577
2578         if(runtime_atomicload(&runtime_in_callers) > 0) {
2579                 // If SIGPROF arrived while already fetching runtime
2580                 // callers we can have trouble on older systems
2581                 // because the unwind library calls dl_iterate_phdr
2582                 // which was not recursive in the past.
2583                 traceback = false;
2584         }
2585
2586         if(traceback) {
2587                 n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf), false);
2588                 for(i = 0; i < n; i++)
2589                         prof.pcbuf[i] = prof.locbuf[i].pc;
2590         }
2591         if(!traceback || n <= 0) {
2592                 n = 2;
2593                 prof.pcbuf[0] = (uintptr)runtime_getcallerpc(&n);
2594                 if(mp->gcing || mp->helpgc)
2595                         prof.pcbuf[1] = (uintptr)GC;
2596                 else
2597                         prof.pcbuf[1] = (uintptr)System;
2598         }
2599         prof.fn(prof.pcbuf, n);
2600         runtime_unlock(&prof);
2601         mp->mallocing--;
2602 }
2603
2604 // Arrange to call fn with a traceback hz times a second.
2605 void
2606 runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz)
2607 {
2608         // Force sane arguments.
2609         if(hz < 0)
2610                 hz = 0;
2611         if(hz == 0)
2612                 fn = nil;
2613         if(fn == nil)
2614                 hz = 0;
2615
2616         // Disable preemption, otherwise we can be rescheduled to another thread
2617         // that has profiling enabled.
2618         m->locks++;
2619
2620         // Stop profiler on this thread so that it is safe to lock prof.
2621         // if a profiling signal came in while we had prof locked,
2622         // it would deadlock.
2623         runtime_resetcpuprofiler(0);
2624
2625         runtime_lock(&prof);
2626         prof.fn = fn;
2627         prof.hz = hz;
2628         runtime_unlock(&prof);
2629         runtime_lock(&runtime_sched);
2630         runtime_sched.profilehz = hz;
2631         runtime_unlock(&runtime_sched);
2632
2633         if(hz != 0)
2634                 runtime_resetcpuprofiler(hz);
2635
2636         m->locks--;
2637 }
2638
// Change number of processors.  The world is stopped, sched is locked.
//
// new is the requested number of P's; it must be in 1..MaxGomaxprocs.
// Steps: make sure allp[0..new) exist and have an mcache, drain every
// local run queue into the global queue, refill the local queues, retire
// the P's beyond new, bind the current m to allp[0] and put the remaining
// P's on the idle list.  Finally publishes new as runtime_gomaxprocs.
static void
procresize(int32 new)
{
        int32 i, old;
        bool empty;
        G *gp;
        P *p;

        old = runtime_gomaxprocs;
        if(old < 0 || old > MaxGomaxprocs || new <= 0 || new >MaxGomaxprocs)
                runtime_throw("procresize: invalid arg");
        // initialize new P's
        for(i = 0; i < new; i++) {
                p = runtime_allp[i];
                if(p == nil) {
                        p = (P*)runtime_mallocgc(sizeof(*p), 0, FlagNoInvokeGC);
                        p->id = i;
                        p->status = Pgcstop;
                        runtime_atomicstorep(&runtime_allp[i], p);
                }
                if(p->mcache == nil) {
                        // On the very first call (old == 0) allp[0] takes over
                        // the mcache the current m already has.
                        if(old==0 && i==0)
                                p->mcache = m->mcache;  // bootstrap
                        else
                                p->mcache = runtime_allocmcache();
                }
        }

        // redistribute runnable G's evenly
        // collect all runnable goroutines in global queue preserving FIFO order
        // FIFO order is required to ensure fairness even during frequent GCs
        // see http://golang.org/issue/7126
        // Each pass takes one g from the tail of every non-empty local queue;
        // the loop ends after a pass that found all queues empty.
        empty = false;
        while(!empty) {
                empty = true;
                for(i = 0; i < old; i++) {
                        p = runtime_allp[i];
                        if(p->runqhead == p->runqtail)
                                continue;
                        empty = false;
                        // pop from tail of local queue
                        p->runqtail--;
                        gp = p->runq[p->runqtail%nelem(p->runq)];
                        // push onto head of global queue
                        gp->schedlink = runtime_sched.runqhead;
                        runtime_sched.runqhead = gp;
                        if(runtime_sched.runqtail == nil)
                                runtime_sched.runqtail = gp;
                        runtime_sched.runqsize++;
                }
        }
        // fill local queues with at most nelem(p->runq)/2 goroutines
        // start at 1 because current M already executes some G and will acquire allp[0] below,
        // so if we have a spare G we want to put it into allp[1].
        for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched.runqsize > 0; i++) {
                // Pop from the head of the global queue ...
                gp = runtime_sched.runqhead;
                runtime_sched.runqhead = gp->schedlink;
                if(runtime_sched.runqhead == nil)
                        runtime_sched.runqtail = nil;
                runtime_sched.runqsize--;
                // ... and deal it out round-robin over the P's.
                runqput(runtime_allp[i%new], gp);
        }

        // free unused P's
        for(i = new; i < old; i++) {
                p = runtime_allp[i];
                runtime_freemcache(p->mcache);
                p->mcache = nil;
                gfpurge(p);
                p->status = Pdead;
                // can't free P itself because it can be referenced by an M in syscall
        }

        // Detach the current m from whatever P it had, then bind it to allp[0].
        // acquirep requires the P to be Pidle with no m attached.
        if(m->p)
                m->p->m = nil;
        m->p = nil;
        m->mcache = nil;
        p = runtime_allp[0];
        p->m = nil;
        p->status = Pidle;
        acquirep(p);
        // All other P's start out idle.
        for(i = new-1; i > 0; i--) {
                p = runtime_allp[i];
                p->status = Pidle;
                pidleput(p);
        }
        runtime_atomicstore((uint32*)&runtime_gomaxprocs, new);
}
2728
2729 // Associate p and the current m.
2730 static void
2731 acquirep(P *p)
2732 {
2733         if(m->p || m->mcache)
2734                 runtime_throw("acquirep: already in go");
2735         if(p->m || p->status != Pidle) {
2736                 runtime_printf("acquirep: p->m=%p(%d) p->status=%d\n", p->m, p->m ? p->m->id : 0, p->status);
2737                 runtime_throw("acquirep: invalid p state");
2738         }
2739         m->mcache = p->mcache;
2740         m->p = p;
2741         p->m = m;
2742         p->status = Prunning;
2743 }
2744
2745 // Disassociate p and the current m.
2746 static P*
2747 releasep(void)
2748 {
2749         P *p;
2750
2751         if(m->p == nil || m->mcache == nil)
2752                 runtime_throw("releasep: invalid arg");
2753         p = m->p;
2754         if(p->m != m || p->mcache != m->mcache || p->status != Prunning) {
2755                 runtime_printf("releasep: m=%p m->p=%p p->m=%p m->mcache=%p p->mcache=%p p->status=%d\n",
2756                         m, m->p, p->m, m->mcache, p->mcache, p->status);
2757                 runtime_throw("releasep: invalid p state");
2758         }
2759         m->p = nil;
2760         m->mcache = nil;
2761         p->m = nil;
2762         p->status = Pidle;
2763         return p;
2764 }
2765
2766 static void
2767 incidlelocked(int32 v)
2768 {
2769         runtime_lock(&runtime_sched);
2770         runtime_sched.nmidlelocked += v;
2771         if(v > 0)
2772                 checkdead();
2773         runtime_unlock(&runtime_sched);
2774 }
2775
2776 // Check for deadlock situation.
2777 // The check is based on number of running M's, if 0 -> deadlock.
2778 static void
2779 checkdead(void)
2780 {
2781         G *gp;
2782         int32 run, grunning, s;
2783         uintptr i;
2784
2785         // -1 for sysmon
2786         run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.nmidlelocked - 1 - countextra();
2787         if(run > 0)
2788                 return;
2789         // If we are dying because of a signal caught on an already idle thread,
2790         // freezetheworld will cause all running threads to block.
2791         // And runtime will essentially enter into deadlock state,
2792         // except that there is a thread that will call runtime_exit soon.
2793         if(runtime_panicking > 0)
2794                 return;
2795         if(run < 0) {
2796                 runtime_printf("runtime: checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
2797                         runtime_sched.nmidle, runtime_sched.nmidlelocked, runtime_sched.mcount);
2798                 runtime_throw("checkdead: inconsistent counts");
2799         }
2800         grunning = 0;
2801         runtime_lock(&allglock);
2802         for(i = 0; i < runtime_allglen; i++) {
2803                 gp = runtime_allg[i];
2804                 if(gp->isbackground)
2805                         continue;
2806                 s = gp->status;
2807                 if(s == Gwaiting)
2808                         grunning++;
2809                 else if(s == Grunnable || s == Grunning || s == Gsyscall) {
2810                         runtime_unlock(&allglock);
2811                         runtime_printf("runtime: checkdead: find g %D in status %d\n", gp->goid, s);
2812                         runtime_throw("checkdead: runnable g");
2813                 }
2814         }
2815         runtime_unlock(&allglock);
2816         if(grunning == 0)  // possible if main goroutine calls runtime_Goexit()
2817                 runtime_throw("no goroutines (main called runtime.Goexit) - deadlock!");
2818         m->throwing = -1;  // do not dump full stacks
2819         runtime_throw("all goroutines are asleep - deadlock!");
2820 }
2821
// System monitor loop. It never returns: the for(;;) below has no exit.
// On every iteration it sleeps for an adaptive delay and then:
//   - parks itself on sysmonnote when schedtrace is disabled and either
//     a GC is waiting or the idle-P count equals gomaxprocs;
//   - polls the network if the last poll is more than 10ms old;
//   - calls retake to take back P's that are blocked in syscalls;
//   - prints a scheduler trace when runtime_debug.schedtrace is enabled
//     and the trace interval has elapsed.
static void
sysmon(void)
{
	uint32 idle, delay;
	int64 now, lastpoll, lasttrace;
	G *gp;

	lasttrace = 0;
	idle = 0;  // number of consecutive cycles in which retake did not retake any P
	delay = 0;
	for(;;) {
		if(idle == 0)  // start with 20us sleep...
			delay = 20;
		else if(idle > 50)  // start doubling the sleep after 1ms...
			delay *= 2;
		if(delay > 10*1000)  // up to 10ms
			delay = 10*1000;
		runtime_usleep(delay);
		// Unlocked pre-check first; the condition is re-evaluated under
		// the scheduler lock before sysmon actually goes to sleep.
		if(runtime_debug.schedtrace <= 0 &&
			(runtime_sched.gcwaiting || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs)) {  // TODO: fast atomic
			runtime_lock(&runtime_sched);
			if(runtime_atomicload(&runtime_sched.gcwaiting) || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) {
				// Publish that sysmon is waiting, then block on the note
				// until it is signalled. NOTE(review): the wakeup side is
				// not in this part of the file.
				runtime_atomicstore(&runtime_sched.sysmonwait, 1);
				runtime_unlock(&runtime_sched);
				runtime_notesleep(&runtime_sched.sysmonnote);
				runtime_noteclear(&runtime_sched.sysmonnote);
				// Restart with the shortest sleep after being woken.
				idle = 0;
				delay = 20;
			} else
				runtime_unlock(&runtime_sched);
		}
		// poll network if not polled for more than 10ms
		lastpoll = runtime_atomicload64(&runtime_sched.lastpoll);
		now = runtime_nanotime();
		if(lastpoll != 0 && lastpoll + 10*1000*1000 < now) {
			// The result of the CAS is ignored; the poll happens either way.
			runtime_cas64(&runtime_sched.lastpoll, lastpoll, now);
			gp = runtime_netpoll(false);  // non-blocking
			if(gp) {
				// Need to decrement number of idle locked M's
				// (pretending that one more is running) before injectglist.
				// Otherwise it can lead to the following situation:
				// injectglist grabs all P's but before it starts M's to run the P's,
				// another M returns from syscall, finishes running its G,
				// observes that there is no work to do and no other running M's
				// and reports deadlock.
				incidlelocked(-1);
				injectglist(gp);
				incidlelocked(1);
			}
		}
		// retake P's blocked in syscalls
		// and preempt long running G's
		if(retake(now))
			idle = 0;
		else
			idle++;

		// Emit a trace once at least schedtrace*1000000 units of
		// runtime_nanotime have passed since the previous one.
		if(runtime_debug.schedtrace > 0 && lasttrace + runtime_debug.schedtrace*1000000ll <= now) {
			lasttrace = now;
			runtime_schedtrace(runtime_debug.scheddetail);
		}
	}
}
2885
// Per-P bookkeeping used by retake to notice P's whose tick counters
// have not moved between two observations.
typedef struct Pdesc Pdesc;
struct Pdesc
{
	uint32	schedtick;	// last value of p->schedtick seen by retake
	int64	schedwhen;	// value of "now" when schedtick was last seen to change
	uint32	syscalltick;	// last value of p->syscalltick seen by retake
	int64	syscallwhen;	// value of "now" when syscalltick was last seen to change
};
// One entry per possible P; retake indexes it with the same index as runtime_allp.
static Pdesc pdesc[MaxGomaxprocs];
2895
// Scan the first runtime_gomaxprocs entries of runtime_allp and take back
// P's that have stayed in Psyscall across two observations.
// now is the current time as obtained by the caller from runtime_nanotime.
// Returns the number of P's that were switched to Pidle and handed off.
// For P's in Prunning only the bookkeeping in pdesc is updated; the
// preemption call itself is commented out.
static uint32
retake(int64 now)
{
	uint32 i, s, n;
	int64 t;
	P *p;
	Pdesc *pd;

	n = 0;
	for(i = 0; i < (uint32)runtime_gomaxprocs; i++) {
		p = runtime_allp[i];
		if(p==nil)
			continue;
		pd = &pdesc[i];
		s = p->status;
		if(s == Psyscall) {
			// Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
			t = p->syscalltick;
			if(pd->syscalltick != t) {
				// The tick moved since the last scan: record the new
				// observation and give this P another round.
				pd->syscalltick = t;
				pd->syscallwhen = now;
				continue;
			}
			// On the one hand we don't want to retake Ps if there is no other work to do,
			// but on the other hand we want to retake them eventually
			// because they can prevent the sysmon thread from deep sleep.
			if(p->runqhead == p->runqtail &&
				runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0 &&
				pd->syscallwhen + 10*1000*1000 > now)
				continue;
			// Need to decrement number of idle locked M's
			// (pretending that one more is running) before the CAS.
			// Otherwise the M from which we retake can exit the syscall,
			// increment nmidle and report deadlock.
			incidlelocked(-1);
			// Only count and hand off the P if we won the status transition.
			if(runtime_cas(&p->status, s, Pidle)) {
				n++;
				handoffp(p);
			}
			incidlelocked(1);
		} else if(s == Prunning) {
			// Preempt G if it's running for more than 10ms.
			t = p->schedtick;
			if(pd->schedtick != t) {
				pd->schedtick = t;
				pd->schedwhen = now;
				continue;
			}
			if(pd->schedwhen + 10*1000*1000 > now)
				continue;
			// Preemption is disabled here, so nothing is done for this P.
			// preemptone(p);
		}
	}
	return n;
}
2951
// Ask every goroutine to stop because it has been preempted.
// The contract is best-effort: a goroutine that a processor has just
// started running may not be informed. No locks need to be held.
// Returns true if a preemption request was issued to at least one
// goroutine. This implementation issues no requests, so the result is
// always false.
static bool
preemptall(void)
{
	bool issued;

	issued = false;  // no preemption request is ever sent here
	return issued;
}
2962
2963 void
2964 runtime_schedtrace(bool detailed)
2965 {
2966         static int64 starttime;
2967         int64 now;
2968         int64 id1, id2, id3;
2969         int32 i, t, h;
2970         uintptr gi;
2971         const char *fmt;
2972         M *mp, *lockedm;
2973         G *gp, *lockedg;
2974         P *p;
2975
2976         now = runtime_nanotime();
2977         if(starttime == 0)
2978                 starttime = now;
2979
2980         runtime_lock(&runtime_sched);
2981         runtime_printf("SCHED %Dms: gomaxprocs=%d idleprocs=%d threads=%d idlethreads=%d runqueue=%d",
2982                 (now-starttime)/1000000, runtime_gomaxprocs, runtime_sched.npidle, runtime_sched.mcount,
2983                 runtime_sched.nmidle, runtime_sched.runqsize);
2984         if(detailed) {
2985                 runtime_printf(" gcwaiting=%d nmidlelocked=%d nmspinning=%d stopwait=%d sysmonwait=%d\n",
2986                         runtime_sched.gcwaiting, runtime_sched.nmidlelocked, runtime_sched.nmspinning,
2987                         runtime_sched.stopwait, runtime_sched.sysmonwait);
2988         }
2989         // We must be careful while reading data from P's, M's and G's.
2990         // Even if we hold schedlock, most data can be changed concurrently.
2991         // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil.
2992         for(i = 0; i < runtime_gomaxprocs; i++) {
2993                 p = runtime_allp[i];
2994                 if(p == nil)
2995                         continue;
2996                 mp = p->m;
2997                 h = runtime_atomicload(&p->runqhead);
2998                 t = runtime_atomicload(&p->runqtail);
2999                 if(detailed)
3000                         runtime_printf("  P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d gfreecnt=%d\n",
3001                                 i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, t-h, p->gfreecnt);
3002                 else {
3003                         // In non-detailed mode format lengths of per-P run queues as:
3004                         // [len1 len2 len3 len4]
3005                         fmt = " %d";
3006                         if(runtime_gomaxprocs == 1)
3007                                 fmt = " [%d]\n";
3008                         else if(i == 0)
3009                                 fmt = " [%d";
3010                         else if(i == runtime_gomaxprocs-1)
3011                                 fmt = " %d]\n";
3012                         runtime_printf(fmt, t-h);
3013                 }
3014         }
3015         if(!detailed) {
3016                 runtime_unlock(&runtime_sched);
3017                 return;
3018         }
3019         for(mp = runtime_allm; mp; mp = mp->alllink) {
3020                 p = mp->p;
3021                 gp = mp->curg;
3022                 lockedg = mp->lockedg;
3023                 id1 = -1;
3024                 if(p)
3025                         id1 = p->id;
3026                 id2 = -1;
3027                 if(gp)
3028                         id2 = gp->goid;
3029                 id3 = -1;
3030                 if(lockedg)
3031                         id3 = lockedg->goid;
3032                 runtime_printf("  M%d: p=%D curg=%D mallocing=%d throwing=%d gcing=%d"
3033                         " locks=%d dying=%d helpgc=%d spinning=%d blocked=%d lockedg=%D\n",
3034                         mp->id, id1, id2,
3035                         mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc,
3036                         mp->spinning, m->blocked, id3);
3037         }
3038         runtime_lock(&allglock);
3039         for(gi = 0; gi < runtime_allglen; gi++) {
3040                 gp = runtime_allg[gi];
3041                 mp = gp->m;
3042                 lockedm = gp->lockedm;
3043                 runtime_printf("  G%D: status=%d(%s) m=%d lockedm=%d\n",
3044                         gp->goid, gp->status, gp->waitreason, mp ? mp->id : -1,
3045                         lockedm ? lockedm->id : -1);
3046         }
3047         runtime_unlock(&allglock);
3048         runtime_unlock(&runtime_sched);
3049 }
3050
3051 // Put mp on midle list.
3052 // Sched must be locked.
3053 static void
3054 mput(M *mp)
3055 {
3056         mp->schedlink = runtime_sched.midle;
3057         runtime_sched.midle = mp;
3058         runtime_sched.nmidle++;
3059         checkdead();
3060 }
3061
3062 // Try to get an m from midle list.
3063 // Sched must be locked.
3064 static M*
3065 mget(void)
3066 {
3067         M *mp;
3068
3069         if((mp = runtime_sched.midle) != nil){
3070                 runtime_sched.midle = mp->schedlink;
3071                 runtime_sched.nmidle--;
3072         }
3073         return mp;
3074 }
3075
3076 // Put gp on the global runnable queue.
3077 // Sched must be locked.
3078 static void
3079 globrunqput(G *gp)
3080 {
3081         gp->schedlink = nil;
3082         if(runtime_sched.runqtail)
3083                 runtime_sched.runqtail->schedlink = gp;
3084         else
3085                 runtime_sched.runqhead = gp;
3086         runtime_sched.runqtail = gp;
3087         runtime_sched.runqsize++;
3088 }
3089
3090 // Put a batch of runnable goroutines on the global runnable queue.
3091 // Sched must be locked.
3092 static void
3093 globrunqputbatch(G *ghead, G *gtail, int32 n)
3094 {
3095         gtail->schedlink = nil;
3096         if(runtime_sched.runqtail)
3097                 runtime_sched.runqtail->schedlink = ghead;
3098         else
3099                 runtime_sched.runqhead = ghead;
3100         runtime_sched.runqtail = gtail;
3101         runtime_sched.runqsize += n;
3102 }
3103
3104 // Try get a batch of G's from the global runnable queue.
3105 // Sched must be locked.
3106 static G*
3107 globrunqget(P *p, int32 max)
3108 {
3109         G *gp, *gp1;
3110         int32 n;
3111
3112         if(runtime_sched.runqsize == 0)
3113                 return nil;
3114         n = runtime_sched.runqsize/runtime_gomaxprocs+1;
3115         if(n > runtime_sched.runqsize)
3116                 n = runtime_sched.runqsize;
3117         if(max > 0 && n > max)
3118                 n = max;
3119         if((uint32)n > nelem(p->runq)/2)
3120                 n = nelem(p->runq)/2;
3121         runtime_sched.runqsize -= n;
3122         if(runtime_sched.runqsize == 0)
3123                 runtime_sched.runqtail = nil;
3124         gp = runtime_sched.runqhead;
3125         runtime_sched.runqhead = gp->schedlink;
3126         n--;
3127         while(n--) {
3128                 gp1 = runtime_sched.runqhead;
3129                 runtime_sched.runqhead = gp1->schedlink;
3130                 runqput(p, gp1);
3131         }
3132         return gp;
3133 }
3134
3135 // Put p to on pidle list.
3136 // Sched must be locked.
3137 static void
3138 pidleput(P *p)
3139 {
3140         p->link = runtime_sched.pidle;
3141         runtime_sched.pidle = p;
3142         runtime_xadd(&runtime_sched.npidle, 1);  // TODO: fast atomic
3143 }
3144
3145 // Try get a p from pidle list.
3146 // Sched must be locked.
3147 static P*
3148 pidleget(void)
3149 {
3150         P *p;
3151
3152         p = runtime_sched.pidle;
3153         if(p) {
3154                 runtime_sched.pidle = p->link;
3155                 runtime_xadd(&runtime_sched.npidle, -1);  // TODO: fast atomic
3156         }
3157         return p;
3158 }
3159
3160 // Try to put g on local runnable queue.
3161 // If it's full, put onto global queue.
3162 // Executed only by the owner P.
3163 static void
3164 runqput(P *p, G *gp)
3165 {
3166         uint32 h, t;
3167
3168 retry:
3169         h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with consumers
3170         t = p->runqtail;
3171         if(t - h < nelem(p->runq)) {
3172                 p->runq[t%nelem(p->runq)] = gp;
3173                 runtime_atomicstore(&p->runqtail, t+1);  // store-release, makes the item available for consumption
3174                 return;
3175         }
3176         if(runqputslow(p, gp, h, t))
3177                 return;
3178         // the queue is not full, now the put above must suceed
3179         goto retry;
3180 }
3181
3182 // Put g and a batch of work from local runnable queue on global queue.
3183 // Executed only by the owner P.
3184 static bool
3185 runqputslow(P *p, G *gp, uint32 h, uint32 t)
3186 {
3187         G *batch[nelem(p->runq)/2+1];
3188         uint32 n, i;
3189
3190         // First, grab a batch from local queue.
3191         n = t-h;
3192         n = n/2;
3193         if(n != nelem(p->runq)/2)
3194                 runtime_throw("runqputslow: queue is not full");
3195         for(i=0; i<n; i++)
3196                 batch[i] = p->runq[(h+i)%nelem(p->runq)];
3197         if(!runtime_cas(&p->runqhead, h, h+n))  // cas-release, commits consume
3198                 return false;
3199         batch[n] = gp;
3200         // Link the goroutines.
3201         for(i=0; i<n; i++)
3202                 batch[i]->schedlink = batch[i+1];
3203         // Now put the batch on global queue.
3204         runtime_lock(&runtime_sched);
3205         globrunqputbatch(batch[0], batch[n], n+1);
3206         runtime_unlock(&runtime_sched);
3207         return true;
3208 }
3209
3210 // Get g from local runnable queue.
3211 // Executed only by the owner P.
3212 static G*
3213 runqget(P *p)
3214 {
3215         G *gp;
3216         uint32 t, h;
3217
3218         for(;;) {
3219                 h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with other consumers
3220                 t = p->runqtail;
3221                 if(t == h)
3222                         return nil;
3223                 gp = p->runq[h%nelem(p->runq)];
3224                 if(runtime_cas(&p->runqhead, h, h+1))  // cas-release, commits consume
3225                         return gp;
3226         }
3227 }
3228
3229 // Grabs a batch of goroutines from local runnable queue.
3230 // batch array must be of size nelem(p->runq)/2. Returns number of grabbed goroutines.
3231 // Can be executed by any P.
3232 static uint32
3233 runqgrab(P *p, G **batch)
3234 {
3235         uint32 t, h, n, i;
3236
3237         for(;;) {
3238                 h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with other consumers
3239                 t = runtime_atomicload(&p->runqtail);  // load-acquire, synchronize with the producer
3240                 n = t-h;
3241                 n = n - n/2;
3242                 if(n == 0)
3243                         break;
3244                 if(n > nelem(p->runq)/2)  // read inconsistent h and t
3245                         continue;
3246                 for(i=0; i<n; i++)
3247                         batch[i] = p->runq[(h+i)%nelem(p->runq)];
3248                 if(runtime_cas(&p->runqhead, h, h+n))  // cas-release, commits consume
3249                         break;
3250         }
3251         return n;
3252 }
3253
3254 // Steal half of elements from local runnable queue of p2
3255 // and put onto local runnable queue of p.
3256 // Returns one of the stolen elements (or nil if failed).
3257 static G*
3258 runqsteal(P *p, P *p2)
3259 {
3260         G *gp;
3261         G *batch[nelem(p->runq)/2];
3262         uint32 t, h, n, i;
3263
3264         n = runqgrab(p2, batch);
3265         if(n == 0)
3266                 return nil;
3267         n--;
3268         gp = batch[n];
3269         if(n == 0)
3270                 return gp;
3271         h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with consumers
3272         t = p->runqtail;
3273         if(t - h + n >= nelem(p->runq))
3274                 runtime_throw("runqsteal: runq overflow");
3275         for(i=0; i<n; i++, t++)
3276                 p->runq[t%nelem(p->runq)] = batch[i];
3277         runtime_atomicstore(&p->runqtail, t);  // store-release, makes the item available for consumption
3278         return gp;
3279 }
3280
3281 void runtime_testSchedLocalQueue(void)
3282   __asm__("runtime.testSchedLocalQueue");
3283
3284 void
3285 runtime_testSchedLocalQueue(void)
3286 {
3287         P p;
3288         G gs[nelem(p.runq)];
3289         int32 i, j;
3290
3291         runtime_memclr((byte*)&p, sizeof(p));
3292
3293         for(i = 0; i < (int32)nelem(gs); i++) {
3294                 if(runqget(&p) != nil)
3295                         runtime_throw("runq is not empty initially");
3296                 for(j = 0; j < i; j++)
3297                         runqput(&p, &gs[i]);
3298                 for(j = 0; j < i; j++) {
3299                         if(runqget(&p) != &gs[i]) {
3300                                 runtime_printf("bad element at iter %d/%d\n", i, j);
3301                                 runtime_throw("bad element");
3302                         }
3303                 }
3304                 if(runqget(&p) != nil)
3305                         runtime_throw("runq is not empty afterwards");
3306         }
3307 }
3308
3309 void runtime_testSchedLocalQueueSteal(void)
3310   __asm__("runtime.testSchedLocalQueueSteal");
3311
3312 void
3313 runtime_testSchedLocalQueueSteal(void)
3314 {
3315         P p1, p2;
3316         G gs[nelem(p1.runq)], *gp;
3317         int32 i, j, s;
3318
3319         runtime_memclr((byte*)&p1, sizeof(p1));
3320         runtime_memclr((byte*)&p2, sizeof(p2));
3321
3322         for(i = 0; i < (int32)nelem(gs); i++) {
3323                 for(j = 0; j < i; j++) {
3324                         gs[j].sig = 0;
3325                         runqput(&p1, &gs[j]);
3326                 }
3327                 gp = runqsteal(&p2, &p1);
3328                 s = 0;
3329                 if(gp) {
3330                         s++;
3331                         gp->sig++;
3332                 }
3333                 while((gp = runqget(&p2)) != nil) {
3334                         s++;
3335                         gp->sig++;
3336                 }
3337                 while((gp = runqget(&p1)) != nil)
3338                         gp->sig++;
3339                 for(j = 0; j < i; j++) {
3340                         if(gs[j].sig != 1) {
3341                                 runtime_printf("bad element %d(%d) at iter %d\n", j, gs[j].sig, i);
3342                                 runtime_throw("bad element");
3343                         }
3344                 }
3345                 if(s != i/2 && s != i/2+1) {
3346                         runtime_printf("bad steal %d, want %d or %d, iter %d\n",
3347                                 s, i/2, i/2+1, i);
3348                         runtime_throw("bad steal");
3349                 }
3350         }
3351 }
3352
3353 int32
3354 runtime_setmaxthreads(int32 in)
3355 {
3356         int32 out;
3357
3358         runtime_lock(&runtime_sched);
3359         out = runtime_sched.maxmcount;
3360         runtime_sched.maxmcount = in;
3361         checkmcount();
3362         runtime_unlock(&runtime_sched);
3363         return out;
3364 }
3365
3366 void
3367 runtime_proc_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj))
3368 {
3369         enqueue1(wbufp, (Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
3370 }
3371
3372 // When a function calls a closure, it passes the closure value to
3373 // __go_set_closure immediately before the function call.  When a
3374 // function uses a closure, it calls __go_get_closure immediately on
3375 // function entry.  This is a hack, but it will work on any system.
3376 // It would be better to use the static chain register when there is
3377 // one.  It is also worth considering expanding these functions
3378 // directly in the compiler.
3379
3380 void
3381 __go_set_closure(void* v)
3382 {
3383         g->closure = v;
3384 }
3385
3386 void *
3387 __go_get_closure(void)
3388 {
3389         return g->closure;
3390 }
3391
// Report whether the scheduler is waiting for a GC, by returning the
// current value of runtime_sched.gcwaiting. The field is read directly,
// without taking the scheduler lock.
// The gc toolchain uses preemption instead.
bool
runtime_gcwaiting(void)
{
	return runtime_sched.gcwaiting;
}