libgo/runtime/proc.c

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 #include <limits.h>
   6 #include <signal.h>
   7 #include <stdlib.h>
   8 #include <pthread.h>
   9 #include <unistd.h>
  10
  11 #include "config.h"
  12
  13 #ifdef HAVE_DL_ITERATE_PHDR
  14 #include <link.h>
  15 #endif
  16
  17 #include "runtime.h"
  18 #include "arch.h"
  19 #include "defs.h"
  20 #include "malloc.h"
  21 #include "go-type.h"
  22 #include "go-defer.h"
  23
  24 #ifdef USING_SPLIT_STACK
  25
  26 /* FIXME: These are not declared anywhere.  */
     /* Hand-written declarations of the split-stack support routines.
        Within this file, __splitstack_getcontext and
        __splitstack_setcontext are called next to getcontext/setcontext
        (see runtime_mcall, runtime_gogo, runtime_tracebackothers), always
        with the address of a goroutine's stack_context array.  */
  27
  28 extern void __splitstack_getcontext(void *context[10]);
  29
  30 extern void __splitstack_setcontext(void *context[10]);
  31
  32 extern void *__splitstack_makecontext(size_t, void *context[10], size_t *);
  33
  34 extern void * __splitstack_resetcontext(void *context[10], size_t *);
  35
  36 extern void *__splitstack_find(void *, void *, size_t *, void **, void **,
  37                                void **);
  38
  39 extern void __splitstack_block_signals (int *, int *);
  40
  41 extern void __splitstack_block_signals_context (void *context[10], int *,
  42                                                 int *);
  43
  44 #endif
  45
     // Fallback for systems whose headers do not define PTHREAD_STACK_MIN.
  46 #ifndef PTHREAD_STACK_MIN
  47 # define PTHREAD_STACK_MIN 8192
  48 #endif
  49
     // StackMin is PTHREAD_STACK_MIN when split stacks are in use and the
     // linker supports them.  Otherwise it is a fixed size picked by pointer
     // width: 2 MiB when sizeof(char *) < 8, else 4 MiB.
  50 #if defined(USING_SPLIT_STACK) && defined(LINKER_SUPPORTS_SPLIT_STACK)
  51 # define StackMin PTHREAD_STACK_MIN
  52 #else
  53 # define StackMin ((sizeof(char *) < 8) ? 2 * 1024 * 1024 : 4 * 1024 * 1024)
  54 #endif
  55
     // NOTE(review): not referenced in the visible part of this file;
     // presumably counts stack memory obtained from the system -- confirm.
  56 uintptr runtime_stacks_sys;
  57
     // Forward declaration; gtraceback is defined further down in this file.
  58 static void gtraceback(G*);
  59
     // On RTEMS __thread expands to nothing, so g and m below become
     // ordinary file-scope statics instead of thread-local variables.
  60 #ifdef __rtems__
  61 #define __thread
  62 #endif
  63
     // The goroutine and the M (worker thread) current on this thread.
     // Code that may resume on a different thread must re-read them through
     // runtime_g() / runtime_m() rather than use these names directly.
  64 static __thread G *g;
  65 static __thread M *m;
  66
  67 #ifndef SETCONTEXT_CLOBBERS_TLS
  68
     // Variant used when SETCONTEXT_CLOBBERS_TLS is not defined:
     // there is no per-thread state to record, so this is a no-op.
  69 static inline void
  70 initcontext(void)
  71 {
  72 }
  73
     // Variant used when SETCONTEXT_CLOBBERS_TLS is not defined:
     // the context c needs no adjustment before setcontext, so c is unused.
  74 static inline void
  75 fixcontext(ucontext_t *c __attribute__ ((unused)))
  76 {
  77 }
  78
  79 #else
  80
  81 # if defined(__x86_64__) && defined(__sun__)
  82
  83 // x86_64 Solaris 10 and 11 have a bug: setcontext switches the %fs
  84 // register to that of the thread which called getcontext.  The effect
  85 // is that the address of all __thread variables changes.  This bug
  86 // also affects pthread_self() and pthread_getspecific.  We work
  87 // around it by clobbering the context field directly to keep %fs the
  88 // same.
  89
     // Per-thread copy of the REG_FSBASE value captured by initcontext.
  90 static __thread greg_t fs;
  91
     // Take a context snapshot on the calling thread and remember its
     // REG_FSBASE register value in fs.
  92 static inline void
  93 initcontext(void)
  94 {
  95         ucontext_t c;
  96
  97         getcontext(&c);
  98         fs = c.uc_mcontext.gregs[REG_FSBASE];
  99 }
 100
     // Overwrite REG_FSBASE in c with the value recorded for the calling
     // thread, so that a following setcontext(c) keeps this thread's %fs.
 101 static inline void
 102 fixcontext(ucontext_t* c)
 103 {
 104         c->uc_mcontext.gregs[REG_FSBASE] = fs;
 105 }
 106
 107 # elif defined(__NetBSD__)
 108
 109 // NetBSD has a bug: setcontext clobbers tlsbase, we need to save
 110 // and restore it ourselves.
 111
     // Per-thread copy of the _mc_tlsbase value captured by initcontext.
 112 static __thread __greg_t tlsbase;
 113
     // Take a context snapshot on the calling thread and remember its
     // _mc_tlsbase field in tlsbase.
 114 static inline void
 115 initcontext(void)
 116 {
 117         ucontext_t c;
 118
 119         getcontext(&c);
 120         tlsbase = c.uc_mcontext._mc_tlsbase;
 121 }
 122
     // Put the calling thread's recorded tlsbase into c before it is
     // handed to setcontext.
 123 static inline void
 124 fixcontext(ucontext_t* c)
 125 {
 126         c->uc_mcontext._mc_tlsbase = tlsbase;
 127 }
 128
 129 # elif defined(__sparc__)
 130
     // SPARC variant: nothing is recorded up front; fixcontext reads the
     // live %g7 register each time instead.
 131 static inline void
 132 initcontext(void)
 133 {
 134 }
 135
     // Store the current value of register %g7 into the REG_G7 slot of c so
     // that setcontext(c) resumes with this thread's %g7.  (%g7 is presumably
     // the thread pointer -- the commented-out code below names it "thread".)
 136 static inline void
 137 fixcontext(ucontext_t *c)
 138 {
 139         /* ??? Using
 140              register unsigned long thread __asm__("%g7");
 141              c->uc_mcontext.gregs[REG_G7] = thread;
 142            results in
 143              error: variable ‘thread’ might be clobbered by \
 144                 ‘longjmp’ or ‘vfork’ [-Werror=clobbered]
 145            which ought to be false, as %g7 is a fixed register.  */
 146
              // Pick the store instruction by the width of the register slot:
              // stx when the slot is 8 bytes wide, st otherwise.
 147         if (sizeof (c->uc_mcontext.gregs[REG_G7]) == 8)
 148                 asm ("stx %%g7, %0" : "=m"(c->uc_mcontext.gregs[REG_G7]));
 149         else
 150                 asm ("st %%g7, %0" : "=m"(c->uc_mcontext.gregs[REG_G7]));
 151 }
 152
 153 # else
 154
 155 #  error unknown case for SETCONTEXT_CLOBBERS_TLS
 156
 157 # endif
 158
 159 #endif
 160
 161 // We can not always refer to the TLS variables directly.  The
 162 // compiler will call tls_get_addr to get the address of the variable,
 163 // and it may hold it in a register across a call to schedule.  When
 164 // we get back from the call we may be running in a different thread,
 165 // in which case the register now points to the TLS variable for a
 166 // different thread.  We use non-inlinable functions to avoid this
 167 // when necessary.
 168
     // Return the g of the thread that is executing the call.  The noinline
     // attribute keeps the TLS access inside this function.
 169 G* runtime_g(void) __attribute__ ((noinline, no_split_stack));
 170
 171 G*
 172 runtime_g(void)
 173 {
 174         return g;
 175 }
 176
     // Return the m of the thread that is executing the call.  Declared
     // noinline so the thread-local variable is read inside this function
     // on every call rather than through an address cached by the caller.
 177 M* runtime_m(void) __attribute__ ((noinline, no_split_stack));
 178
 179 M*
 180 runtime_m(void)
 181 {
 182         return m;
 183 }
 184
 185 // Set m and g.
     // Stores mp and gp into the calling thread's m and g variables.
 186 void
 187 runtime_setmg(M* mp, G* gp)
 188 {
 189         m = mp;
 190         g = gp;
 191 }
 192
 193 // Start a new thread.
     // Creates a detached pthread that runs runtime_mstart(mp).  Any pthread
     // setup or creation failure is fatal (runtime_throw).
 194 static void
 195 runtime_newosproc(M *mp)
 196 {
 197         pthread_attr_t attr;
 198         sigset_t clear, old;
 199         pthread_t tid;
 200         int ret;
 201
 202         if(pthread_attr_init(&attr) != 0)
 203                 runtime_throw("pthread_attr_init");
 204         if(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0)
 205                 runtime_throw("pthread_attr_setdetachstate");
 206
 207         // Block signals during pthread_create so that the new thread
 208         // starts with signals disabled.  It will enable them in minit.
 209         sigfillset(&clear);
 210
 211 #ifdef SIGTRAP
 212         // Blocking SIGTRAP reportedly breaks gdb on Alpha GNU/Linux.
 213         sigdelset(&clear, SIGTRAP);
 214 #endif
 215
 216         sigemptyset(&old);
 217         pthread_sigmask(SIG_BLOCK, &clear, &old);
 218         ret = pthread_create(&tid, &attr, runtime_mstart, mp);
 219         pthread_sigmask(SIG_SETMASK, &old, nil);
 220
              // The attribute object is only needed for the pthread_create
              // call; destroy it so an initialised attr is never leaked.
              // Destroying it does not affect the thread already created.
              pthread_attr_destroy(&attr);

 221         if (ret != 0)
 222                 runtime_throw("pthread_create");
 223 }
 224
 225 // First function run by a new goroutine.  This replaces gogocall.
     // Calls g->entry with g->param and then ends the goroutine through
     // runtime_goexit.
 226 static void
 227 kickoff(void)
 228 {
 229         void (*fn)(void*);
 230
              // A pending traceback request is serviced first.  gtraceback
              // finishes with runtime_gogo to the requesting goroutine, so
              // control leaves this frame here.
 231         if(g->traceback != nil)
 232                 gtraceback(g);
 233
 234         fn = (void (*)(void*))(g->entry);
 235         fn(g->param);
 236         runtime_goexit();
 237 }
 238
 239 // Switch context to a different goroutine.  This is like longjmp.
     // Makes newg the current g, marks it as resumed via gogo (fromgogo),
     // and resumes newg->context.  Never returns to the caller.
 240 void runtime_gogo(G*) __attribute__ ((noinline));
 241 void
 242 runtime_gogo(G* newg)
 243 {
 244 #ifdef USING_SPLIT_STACK
 245         __splitstack_setcontext(&newg->stack_context[0]);
 246 #endif
 247         g = newg;
              // runtime_mcall checks this flag after its getcontext call to
              // tell a resume apart from the initial save.
 248         newg->fromgogo = true;
              // Platform fix-up of the saved context (a no-op unless
              // SETCONTEXT_CLOBBERS_TLS is defined).
 249         fixcontext(&newg->context);
 250         setcontext(&newg->context);
              // Reached only if setcontext failed and returned.
 251         runtime_throw("gogo setcontext returned");
 252 }
 253
 254 // Save context and call fn passing g as a parameter.  This is like
 255 // setjmp.  Because getcontext always returns 0, unlike setjmp, we use
 256 // g->fromgogo as a code.  It will be true if we got here via
 257 // setcontext.  g == nil the first time this is called in a new m.
     //
     // pfn is handed to m->g0 through g0->entry, with the calling g in
     // g0->param.  The function returns to its caller only when the saved
     // context is later resumed by runtime_gogo.
 258 void runtime_mcall(void (*)(G*)) __attribute__ ((noinline));
 259 void
 260 runtime_mcall(void (*pfn)(G*))
 261 {
 262         M *mp;
 263         G *gp;
 264
 265         // Ensure that all registers are on the stack for the garbage
 266         // collector.
 267         __builtin_unwind_init();
 268
 269         mp = m;
 270         gp = g;
 271         if(gp == mp->g0)
 272                 runtime_throw("runtime: mcall called on m->g0 stack");
 273
 274         if(gp != nil) {
 275
 276 #ifdef USING_SPLIT_STACK
 277                 __splitstack_getcontext(&g->stack_context[0]);
 278 #else
                      // Without split stacks, record the address of the
                      // parameter pfn in gcnext_sp.
 279                 gp->gcnext_sp = &pfn;
 280 #endif
                      // Cleared before the snapshot; runtime_gogo sets it to
                      // true before resuming this context.
 281                 gp->fromgogo = false;
 282                 getcontext(&gp->context);
 283
 284                 // When we return from getcontext, we may be running
 285                 // in a new thread.  That means that m and g may have
 286                 // changed.  They are global variables so we will
 287                 // reload them, but the addresses of m and g may be
 288                 // cached in our local stack frame, and those
 289                 // addresses may be wrong.  Call functions to reload
 290                 // the values for this thread.
 291                 mp = runtime_m();
 292                 gp = runtime_g();
 293
                      // A traceback request left on this g is serviced now;
                      // gtraceback switches away to the requester.
 294                 if(gp->traceback != nil)
 295                         gtraceback(gp);
 296         }
              // First pass through (or no current g): switch to g0.
              // When resumed via runtime_gogo, fromgogo is true and we
              // simply fall off the end and return to the caller.
 297         if (gp == nil || !gp->fromgogo) {
 298 #ifdef USING_SPLIT_STACK
 299                 __splitstack_setcontext(&mp->g0->stack_context[0]);
 300 #endif
 301                 mp->g0->entry = (byte*)pfn;
 302                 mp->g0->param = gp;
 303
 304                 // It's OK to set g directly here because this case
 305                 // can not occur if we got here via a setcontext to
 306                 // the getcontext call just above.
 307                 g = mp->g0;
 308
 309                 fixcontext(&mp->g0->context);
 310                 setcontext(&mp->g0->context);
                      // Reached only if setcontext returned.
 311                 runtime_throw("runtime: mcall function returned");
 312         }
 313 }
 314
 315 // Goroutine scheduler
 316 // The scheduler's job is to distribute ready-to-run goroutines over worker threads.
 317 //
 318 // The main concepts are:
 319 // G - goroutine.
 320 // M - worker thread, or machine.
 321 // P - processor, a resource that is required to execute Go code.
 322 //     M must have an associated P to execute Go code, however it can be
 323 //     blocked or in a syscall w/o an associated P.
 324 //
 325 // Design doc at http://golang.org/s/go11sched.
 326
     // Global scheduler state; the single instance is runtime_sched.
 327 typedef struct Sched Sched;
 328 struct Sched {
 329         Lock;   // unnamed lock member; callers use runtime_lock(&runtime_sched)
 330
 331         uint64  goidgen;        // goroutine id source (see GoidCacheBatch)
 332         M*      midle;   // idle m's waiting for work
 333         int32   nmidle;  // number of idle m's waiting for work
 334         int32   nmidlelocked; // number of locked m's waiting for work
 335         int32   mcount;  // number of m's that have been created
 336         int32   maxmcount;      // maximum number of m's allowed (or die)
 337
 338         P*      pidle;  // idle P's
 339         uint32  npidle;         // presumably the length of pidle; read atomically in runtime_ready
 340         uint32  nmspinning;     // read atomically in runtime_ready before calling wakep
 341
 342         // Global runnable queue.
 343         G*      runqhead;
 344         G*      runqtail;
 345         int32   runqsize;
 346
 347         // Global cache of dead G's.
 348         Lock    gflock;
 349         G*      gfree;
 350
 351         uint32  gcwaiting;      // gc is waiting to run
 352         int32   stopwait;       // P's still to stop; counted down in runtime_stoptheworld
 353         Note    stopnote;       // runtime_stoptheworld sleeps here until the rest have stopped
 354         uint32  sysmonwait;     // cleared by runtime_starttheworld before it wakes sysmonnote
 355         Note    sysmonnote;
 356         uint64  lastpoll;       // set from runtime_nanotime() in runtime_schedinit
 357
 358         int32   profilehz;      // cpu profiling rate
 359 };
 360
     // Scheduler-wide constants.
 361 enum
 362 {
 363         // The max value of GOMAXPROCS.
 364         // There are no fundamental restrictions on the value.
              // runtime_schedinit clamps the GOMAXPROCS setting to this and
              // sizes runtime_allp as MaxGomaxprocs+1 entries.
 365         MaxGomaxprocs = 1<<8,
 366
 367         // Number of goroutine ids to grab from runtime_sched.goidgen to local per-P cache at once.
 368         // 16 seems to provide enough amortization, but other than that it's mostly arbitrary number.
 369         GoidCacheBatch = 16,
 370 };
 371
     // Scheduler globals.
 372 Sched   runtime_sched;
 373 int32   runtime_gomaxprocs;     // upper bound when iterating runtime_allp (see runtime_stoptheworld)
 374 uint32  runtime_needextram = 1;
 375 M       runtime_m0;             // bootstrap M; installed as m in runtime_schedinit
 376 G       runtime_g0;     // idle goroutine for m0
 377 G*      runtime_lastg;
 378 M*      runtime_allm;           // head of the list of all M's, linked through alllink (see mcommoninit)
 379 P**     runtime_allp;           // allocated in runtime_schedinit
 380 M*      runtime_extram;
 381 int8*   runtime_goos;
 382 int32   runtime_ncpu;
 383 bool    runtime_precisestack;
 384 static int32    newprocs;       // if non-zero, runtime_starttheworld resizes to this and resets it
 385
 386 static  Lock allglock;  // the following vars are protected by this lock or by stoptheworld
 387 G**     runtime_allg;
 388 uintptr runtime_allglen;
 389 static  uintptr allgcap;
 390
     // When set, runtime_main returns after initialization instead of
     // calling main.main.
 391 bool    runtime_isarchive;
 392
     // Forward declarations of routines that are referenced before their
     // definitions (for example mcommoninit, defined further down).
 393 void* runtime_mstart(void*);
 394 static void runqput(P*, G*);
 395 static G* runqget(P*);
 396 static bool runqputslow(P*, G*, uint32, uint32);
 397 static G* runqsteal(P*, P*);
 398 static void mput(M*);
 399 static M* mget(void);
 400 static void mcommoninit(M*);
 401 static void schedule(void);
 402 static void procresize(int32);
 403 static void acquirep(P*);
 404 static P* releasep(void);
 405 static void newm(void(*)(void), P*);
 406 static void stopm(void);
 407 static void startm(P*, bool);
 408 static void handoffp(P*);
 409 static void wakep(void);
 410 static void stoplockedm(void);
 411 static void startlockedm(G*);
 412 static void sysmon(void);
 413 static uint32 retake(int64);
 414 static void incidlelocked(int32);
 415 static void checkdead(void);
 416 static void exitsyscall0(G*);
 417 static void park0(G*);
 418 static void goexit0(G*);
 419 static void gfput(P*, G*);
 420 static G* gfget(P*);
 421 static void gfpurge(P*);
 422 static void globrunqput(G*);
 423 static void globrunqputbatch(G*, G*, int32);
 424 static G* globrunqget(P*, int32);
 425 static P* pidleget(void);
 426 static void pidleput(P*);
 427 static void injectglist(G*);
 428 static bool preemptall(void);
 429 static bool exitsyscallfast(void);
 430 static void allgadd(G*);
 431
     // NOTE(review): not written or read in the visible part of this file.
 432 bool runtime_isstarted;
 433
 434 // The bootstrap sequence is:
 435 //
 436 //      call osinit
 437 //      call schedinit
 438 //      make & queue new G
 439 //      call runtime_mstart
 440 //
 441 // The new G calls runtime_main.
     //
     // runtime_schedinit installs runtime_m0/runtime_g0 as the current m and
     // g, initializes the allocator and m0, reads the program arguments,
     // environment and debug variables, and sizes the P set from the
     // GOMAXPROCS environment variable (default 1, capped at MaxGomaxprocs).
 442 void
 443 runtime_schedinit(void)
 444 {
 445         int32 n, procs;
 446         const byte *p;
 447         Eface i;
 448
              // Wire up the bootstrap M and its g0 on this thread.
 449         m = &runtime_m0;
 450         g = &runtime_g0;
 451         m->g0 = g;
 452         m->curg = g;
 453         g->m = m;
 454
 455         initcontext();
 456
 457         runtime_sched.maxmcount = 10000;
 458         runtime_precisestack = 0;
 459
 460         // runtime_symtabinit();
 461         runtime_mallocinit();
 462         mcommoninit(m);
 463
 464         // Initialize the itable value for newErrorCString,
 465         // so that the next time it gets called, possibly
 466         // in a fault during a garbage collection, it will not
 467         // need to allocate memory.
 468         runtime_newErrorCString(0, &i);
 469
 470         // Initialize the cached gotraceback value, since
 471         // gotraceback calls getenv, which mallocs on Plan 9.
 472         runtime_gotraceback(nil);
 473
 474         runtime_goargs();
 475         runtime_goenvs();
 476         runtime_parsedebugvars();
 477
 478         runtime_sched.lastpoll = runtime_nanotime();
              // GOMAXPROCS is honoured only when it parses to a positive
              // number; values above MaxGomaxprocs are clamped.
 479         procs = 1;
 480         p = runtime_getenv("GOMAXPROCS");
 481         if(p != nil && (n = runtime_atoi(p)) > 0) {
 482                 if(n > MaxGomaxprocs)
 483                         n = MaxGomaxprocs;
 484                 procs = n;
 485         }
 486         runtime_allp = runtime_malloc((MaxGomaxprocs+1)*sizeof(runtime_allp[0]));
 487         procresize(procs);
 488
 489         // Can not enable GC until all roots are registered.
 490         // mstats.enablegc = 1;
 491 }
 492
     // Entry points of the program's main package, bound to their
     // assembler-level names: main_init is "__go_init_main" and main_main is
     // "main.main" (each prefixed with GOSYM_PREFIX).  Both are called from
     // runtime_main.
 493 extern void main_init(void) __asm__ (GOSYM_PREFIX "__go_init_main");
 494 extern void main_main(void) __asm__ (GOSYM_PREFIX "main.main");
 495
 496 // Used to determine the field alignment.
     // offsetof (struct field_align, p) is the offset at which a Hchan *
     // lands after a single char; chan_bool_type_descriptor uses it for its
     // __field_align entry.
 497
 498 struct field_align
 499 {
 500   char c;
 501   Hchan *p;
 502 };
 503
 504 // main_init_done is a signal used by cgocallbackg that initialization
 505 // has been completed.  It is made before _cgo_notify_runtime_init_done,
 506 // so all cgo calls can rely on it existing.  When main_init is
 507 // complete, it is closed, meaning cgocallbackg can reliably receive
 508 // from it.
     // Both the creation and the close happen in runtime_main.
 509 Hchan *runtime_main_init_done;
 510
 511 // The chan bool type, for runtime_main_init_done.
     // A hand-built type descriptor for a bidirectional channel whose
     // element type is bool; runtime_main passes it to __go_new_channel.
 512
     // The bool type descriptor, bound to the symbol "__go_tdn_bool".
 513 extern const struct __go_type_descriptor bool_type_descriptor
 514   __asm__ (GOSYM_PREFIX "__go_tdn_bool");
 515
 516 static struct __go_channel_type chan_bool_type_descriptor =
 517   {
 518     /* __common */
 519     {
 520       /* __code */
 521       GO_CHAN,
 522       /* __align */
 523       __alignof (Hchan *),
 524       /* __field_align */
           /* NOTE(review): the "- 1" makes this differ from __align above;
              confirm that this is the intended encoding.  */
 525       offsetof (struct field_align, p) - 1,
 526       /* __size */
 527       sizeof (Hchan *),
 528       /* __hash */
 529       0, /* This value doesn't matter.  */
 530       /* __hashfn */
 531       __go_type_hash_error,
 532       /* __equalfn */
 533       __go_type_equal_error,
 534       /* __gc */
 535       NULL, /* This value doesn't matter */
 536       /* __reflection */
 537       NULL, /* This value doesn't matter */
 538       /* __uncommon */
 539       NULL,
 540       /* __pointer_to_this */
 541       NULL,
 542       /* __zero */
 543       NULL /* This value doesn't matter */
 544     },
 545     /* __element_type */
 546     &bool_type_descriptor,
 547     /* __dir */
 548     CHANNEL_BOTH_DIR
 549   };
 550
     // Channel helpers used by runtime_main: __go_new_channel creates
     // runtime_main_init_done and closechan (symbol "runtime.closechan")
     // closes it once main_init has finished.
 551 extern Hchan *__go_new_channel (ChanType *, uintptr);
 552 extern void closechan(Hchan *) __asm__ (GOSYM_PREFIX "runtime.closechan");
 553
     // Deferred function that runtime_main installs for the duration of
     // initialization: it releases the OS-thread lock taken by
     // runtime_lockOSThread.  The argument is unused.
 554 static void
 555 initDone(void *arg __attribute__ ((unused))) {
 556         runtime_unlockOSThread();
 557 }
 558
 559 // The main goroutine.
 560 // Note: C frames in general are not copyable during stack growth, for two reasons:
 561 //   1) We don't know where in a frame to find pointers to other stack locations.
 562 //   2) There's no guarantee that globals or heap values do not point into the frame.
 563 //
 564 // The C frame for runtime.main is copyable, because:
 565 //   1) There are no pointers to other stack locations in the frame
 566 //      (d.fn points at a global, d.link is nil, d.argp is -1).
 567 //   2) The only pointer into this frame is from the defer chain,
 568 //      which is explicitly handled during stack copying.
     //
     // Sequence: start sysmon on a new M, lock to the main OS thread, run
     // main_init under a hand-built defer record, enable GC, then run
     // main_main and exit.  The dummy argument is unused.
 569 void
 570 runtime_main(void* dummy __attribute__((unused)))
 571 {
 572         Defer d;
 573         _Bool frame;    // only its address is used, as d.__frame
 574
 575         newm(sysmon, nil);
 576
 577         // Lock the main goroutine onto this, the main OS thread,
 578         // during initialization.  Most programs won't care, but a few
 579         // do require certain calls to be made by the main thread.
 580         // Those can arrange for main.main to run in the main thread
 581         // by calling runtime.LockOSThread during initialization
 582         // to preserve the lock.
 583         runtime_lockOSThread();
 584
 585         // Defer unlock so that runtime.Goexit during init does the unlock too.
 586         d.__pfn = initDone;
 587         d.__next = g->defer;
 588         d.__arg = (void*)-1;
 589         d.__panic = g->panic;
 590         d.__retaddr = nil;
 591         d.__makefunc_can_recover = 0;
 592         d.__frame = &frame;
 593         d.__special = true;
 594         g->defer = &d;
 595
 596         if(m != &runtime_m0)
 597                 runtime_throw("runtime_main not on m0");
 598         __go_go(runtime_MHeap_Scavenger, nil);
 599
 600         runtime_main_init_done = __go_new_channel(&chan_bool_type_descriptor, 0);
 601
 602         _cgo_notify_runtime_init_done();
 603
 604         main_init();
 605
 606         closechan(runtime_main_init_done);
 607
              // The defer record pushed above must still be on top and
              // untouched; pop it by hand and do the unlock it would have done.
 608         if(g->defer != &d || d.__pfn != initDone)
 609                 runtime_throw("runtime: bad defer entry after init");
 610         g->defer = d.__next;
 611         runtime_unlockOSThread();
 612
 613         // For gccgo we have to wait until after main is initialized
 614         // to enable GC, because initializing main registers the GC
 615         // roots.
 616         mstats.enablegc = 1;
 617
 618         if(runtime_isarchive) {
 619                 // This is not a complete program, but is instead a
 620                 // library built using -buildmode=c-archive or
 621                 // c-shared.  Now that we are initialized, there is
 622                 // nothing further to do.
 623                 return;
 624         }
 625
 626         main_main();
 627
 628         // Make racy client program work: if panicking on
 629         // another goroutine at the same time as main returns,
 630         // let the other goroutine finish printing the panic trace.
 631         // Once it does, it will exit. See issue 3934.
 632         if(runtime_panicking)
 633                 runtime_park(nil, nil, "panicwait");
 634
 635         runtime_exit(0);
              // Should runtime_exit ever return, keep writing through a
              // null pointer so that control does not continue past here.
 636         for(;;)
 637                 *(int32*)0 = 0;
 638 }
 639
     // Print the header line of a goroutine dump:
     // "goroutine <id> [<state>]:" or, when the goroutine has been blocked
     // for at least a minute, "goroutine <id> [<state>, <n> minutes]:".
 640 void
 641 runtime_goroutineheader(G *gp)
 642 {
              const char *label = "???";      // shown for any unlisted status
              int64 minutes = 0;

              // Map the scheduler status onto the bracketed label.
              switch(gp->status) {
              case Gidle:
                      label = "idle";
                      break;
              case Grunnable:
                      label = "runnable";
                      break;
              case Grunning:
                      label = "running";
                      break;
              case Gsyscall:
                      label = "syscall";
                      break;
              case Gwaiting:
                      // Prefer the recorded wait reason when there is one.
                      label = gp->waitreason ? gp->waitreason : "waiting";
                      break;
              default:
                      break;
              }

              // Approximate whole minutes the G has been blocked; only
              // computed for waiting/syscall goroutines with a timestamp.
              if(gp->status == Gwaiting || gp->status == Gsyscall) {
                      if(gp->waitsince != 0)
                              minutes = (runtime_nanotime() - gp->waitsince) / (60LL*1000*1000*1000);
              }

              if(minutes >= 1)
                      runtime_printf("goroutine %D [%s, %D minutes]:\n", gp->goid, label, minutes);
              else
                      runtime_printf("goroutine %D [%s]:\n", gp->goid, label);
 679 }
 680
     // Print "created by <function>" and the file:line of the go statement
     // that started g.  Prints nothing for a nil g, an unknown creation pc,
     // goroutine 1, or when the pc cannot be resolved.
 681 void
 682 runtime_printcreatedby(G *g)
 683 {
              String fn;
              String file;
              intgo line;

              if(g == nil || g->gopc == 0 || g->goid == 1)
                      return;

              // Look up gopc - 1 rather than gopc itself.
              if(!__go_file_line(g->gopc - 1, &fn, &file, &line))
                      return;

              runtime_printf("created by %S\n", fn);
              runtime_printf("\t%S:%D\n", file, (int64) line);
 694 }
 695
     // Request/result record shared between runtime_tracebackothers and
     // gtraceback.
 696 struct Traceback
 697 {
 698         G* gp;          // goroutine that gtraceback switches back to when done
 699         Location locbuf[TracebackMaxFrames];    // frames filled in by runtime_callers
 700         int32 c;        // value returned by runtime_callers for locbuf
 701 };
 702
     // Print a header and stack trace for every goroutine other than me.
     //
     // A trace of another goroutine gp is obtained by a round trip: set
     // gp->traceback, save our own context with getcontext, and switch to gp
     // with runtime_gogo.  gp's resume path (kickoff or runtime_mcall) sees
     // the request and calls gtraceback, which clears gp->traceback, fills
     // tb and switches back to tb.gp (== me).  Execution then continues
     // after the getcontext call a second time, now with gp->traceback nil.
 703 void
 704 runtime_tracebackothers(G * volatile me)
 705 {
 706         G * volatile gp;
 707         Traceback tb;
 708         int32 traceback;
 709         volatile uintptr i;
 710
 711         tb.gp = me;
 712         traceback = runtime_gotraceback(nil);
 713
 714         // Show the current goroutine first, if we haven't already.
 715         if((gp = m->curg) != nil && gp != me) {
 716                 runtime_printf("\n");
 717                 runtime_goroutineheader(gp);
 718                 gp->traceback = &tb;
 719
 720 #ifdef USING_SPLIT_STACK
 721                 __splitstack_getcontext(&me->stack_context[0]);
 722 #endif
 723                 getcontext(&me->context);
 724
                      // Non-nil only on the first pass; gtraceback clears it
                      // before switching back here.
 725                 if(gp->traceback != nil) {
 726                   runtime_gogo(gp);
 727                 }
 728
 729                 runtime_printtrace(tb.locbuf, tb.c, false);
 730                 runtime_printcreatedby(gp);
 731         }
 732
 733         runtime_lock(&allglock);
 734         for(i = 0; i < runtime_allglen; i++) {
 735                 gp = runtime_allg[i];
                      // Skip ourselves, the goroutine already shown above,
                      // and dead goroutines.
 736                 if(gp == me || gp == m->curg || gp->status == Gdead)
 737                         continue;
                      // System goroutines are shown only when the traceback
                      // level is 2 or higher.
 738                 if(gp->issystem && traceback < 2)
 739                         continue;
 740                 runtime_printf("\n");
 741                 runtime_goroutineheader(gp);
 742
 743                 // Our only mechanism for doing a stack trace is
 744                 // _Unwind_Backtrace.  And that only works for the
 745                 // current thread, not for other random goroutines.
 746                 // So we need to switch context to the goroutine, get
 747                 // the backtrace, and then switch back.
 748
 749                 // This means that if g is running or in a syscall, we
 750                 // can't reliably print a stack trace.  FIXME.
 751
 752                 if(gp->status == Grunning) {
 753                         runtime_printf("\tgoroutine running on other thread; stack unavailable\n");
 754                         runtime_printcreatedby(gp);
 755                 } else if(gp->status == Gsyscall) {
 756                         runtime_printf("\tgoroutine in C code; stack unavailable\n");
 757                         runtime_printcreatedby(gp);
 758                 } else {
 759                         gp->traceback = &tb;
 760
 761 #ifdef USING_SPLIT_STACK
 762                         __splitstack_getcontext(&me->stack_context[0]);
 763 #endif
 764                         getcontext(&me->context);
 765
                              // Same round trip as for the current goroutine above.
 766                         if(gp->traceback != nil) {
 767                                 runtime_gogo(gp);
 768                         }
 769
 770                         runtime_printtrace(tb.locbuf, tb.c, false);
 771                         runtime_printcreatedby(gp);
 772                 }
 773         }
 774         runtime_unlock(&allglock);
 775 }
 776
     // Abort the program once more M's have been created than
     // runtime_sched.maxmcount allows.
 777 static void
 778 checkmcount(void)
 779 {
              // sched lock is held
              if(runtime_sched.mcount <= runtime_sched.maxmcount)
                      return;

              runtime_printf("runtime: program exceeds %d-thread limit\n", runtime_sched.maxmcount);
              runtime_throw("thread exhaustion");
 785 }
 786
 787 // Do a stack trace of gp, and then switch back to the goroutine
 788 // recorded in the request (gp->traceback->gp).
     // The request is cleared first, so the requester sees
     // gp->traceback == nil once it is running again.  Does not return.
 789
 790 static void
 791 gtraceback(G* gp)
 792 {
 793         Traceback* traceback;
 794
 795         traceback = gp->traceback;
 796         gp->traceback = nil;
              // Capture up to the capacity of locbuf; the return value of
              // runtime_callers is stored in c.
 797         traceback->c = runtime_callers(1, traceback->locbuf,
 798                 sizeof traceback->locbuf / sizeof traceback->locbuf[0], false);
 799         runtime_gogo(traceback->gp);
 800 }
 801
     // Initialization shared by every new M: record the creation stack when
     // possible, assign the M an id, seed its fastrand state, run the
     // per-M pre-initialization hook and publish the M on runtime_allm.
 802 static void
 803 mcommoninit(M *mp)
 804 {
 805         // If there is no mcache runtime_callers() will crash,
 806         // and we are most likely in sysmon thread so the stack is senseless anyway.
 807         if(m->mcache)
 808                 runtime_callers(1, mp->createstack, nelem(mp->createstack), false);
 809
 812         runtime_lock(&runtime_sched);
 813         mp->id = runtime_sched.mcount++;
              // Seed only after the id has been assigned, so that the id
              // actually contributes to the seed.  (Before this point mp->id
              // has not been given its value yet.)
              mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks();
 814         checkmcount();
 815         runtime_mpreinit(mp);
 816
 817         // Add to runtime_allm so garbage collector doesn't free m
 818         // when it is just in a register or thread-local storage.
 819         mp->alllink = runtime_allm;
 820         // runtime_NumCgoCall() iterates over allm w/o schedlock,
 821         // so we need to publish it safely.
 822         runtime_atomicstorep(&runtime_allm, mp);
 823         runtime_unlock(&runtime_sched);
 824 }
 825
 826 // Mark gp ready to run.
     // gp must be in Gwaiting; any other status is fatal.  gp is made
     // Grunnable and put on the run queue of the current M's P.
 827 void
 828 runtime_ready(G *gp)
 829 {
 830         // Mark runnable.
 831         m->locks++;  // disable preemption because it can be holding p in a local var
 832         if(gp->status != Gwaiting) {
 833                 runtime_printf("goroutine %D has status %d\n", gp->goid, gp->status);
 834                 runtime_throw("bad g->status in ready");
 835         }
 836         gp->status = Grunnable;
 837         runqput(m->p, gp);
              // Call wakep only when npidle is non-zero and nmspinning is zero.
 838         if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0)  // TODO: fast atomic
 839                 wakep();
 840         m->locks--;
 841 }
 842
     // Return how many threads to use during GC.  The value is computed
     // under the scheduler lock.
 843 int32
 844 runtime_gcprocs(void)
 845 {
              int32 procs, avail;

              // Figure out how many CPUs to use during GC.
              // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
              runtime_lock(&runtime_sched);
              procs = runtime_gomaxprocs;
              if(procs > runtime_ncpu) {
                      // Never drop below one, even if ncpu is not positive.
                      if(runtime_ncpu > 0)
                              procs = runtime_ncpu;
                      else
                              procs = 1;
              }
              if(procs > MaxGcproc)
                      procs = MaxGcproc;
              avail = runtime_sched.nmidle+1; // one M is currently running
              if(procs > avail)
                      procs = avail;
              runtime_unlock(&runtime_sched);
              return procs;
 860 }
 861
     // Report whether the GC could use more threads than are currently
     // available (the idle M's plus the running one).  The count is taken
     // under the scheduler lock.
 862 static bool
 863 needaddgcproc(void)
 864 {
              int32 shortfall;

              runtime_lock(&runtime_sched);
              // Desired count: gomaxprocs, capped by ncpu and MaxGcproc.
              shortfall = runtime_gomaxprocs;
              if(shortfall > runtime_ncpu)
                      shortfall = runtime_ncpu;
              if(shortfall > MaxGcproc)
                      shortfall = MaxGcproc;
              // Subtract what is already there; one M is currently running.
              shortfall -= runtime_sched.nmidle+1;
              runtime_unlock(&runtime_sched);

              return shortfall > 0;
 876 }
 877
     // Wake nproc-1 idle M's to help with garbage collection.  Helper number
     // n (1 .. nproc-1) is stored in the M's helpgc field, and each helper is
     // handed the mcache of a P from runtime_allp, skipping the P whose
     // mcache the calling M is using.  Running out of idle M's is fatal.
 878 void
 879 runtime_helpgc(int32 nproc)
 880 {
 881         M *mp;
 882         int32 n, pos;
 883
 884         runtime_lock(&runtime_sched);
 885         pos = 0;
 886         for(n = 1; n < nproc; n++) {  // one M is currently running
                      // Do not hand out the mcache we are using ourselves.
 887                 if(runtime_allp[pos]->mcache == m->mcache)
 888                         pos++;
 889                 mp = mget();
 890                 if(mp == nil)
 891                         runtime_throw("runtime_gcprocs inconsistency");
 892                 mp->helpgc = n;
 893                 mp->mcache = runtime_allp[pos]->mcache;
 894                 pos++;
 895                 runtime_notewakeup(&mp->park);
 896         }
 897         runtime_unlock(&runtime_sched);
 898 }
 899
 900 // Similar to stoptheworld but best-effort and can be called several times.
 901 // There is no reverse operation, used during crashing.
 902 // This function must not lock any mutexes.
     // Returns immediately when runtime_gomaxprocs is 1.
 903 void
 904 runtime_freezetheworld(void)
 905 {
 906         int32 i;
 907
 908         if(runtime_gomaxprocs == 1)
 909                 return;
 910         // stopwait and preemption requests can be lost
 911         // due to races with concurrently executing threads,
 912         // so try several times
 913         for(i = 0; i < 5; i++) {
 914                 // this should tell the scheduler to not start any new goroutines
 915                 runtime_sched.stopwait = 0x7fffffff;
 916                 runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
 917                 // this should stop running goroutines
 918                 if(!preemptall())
 919                         break;  // no running goroutines
                      // Sleep 1000 microseconds before the next attempt.
 920                 runtime_usleep(1000);
 921         }
 922         // to be sure
 923         runtime_usleep(1000);
 924         preemptall();
 925         runtime_usleep(1000);
 926 }
 927
     // Stop every P.  Sets gcwaiting, requests preemption, moves the current
     // P, the P's in Psyscall and the idle P's to Pgcstop, then sleeps on
     // stopnote until the remaining P's have stopped.  Throws if any P is
     // not in Pgcstop afterwards.
 928 void
 929 runtime_stoptheworld(void)
 930 {
 931         int32 i;
 932         uint32 s;
 933         P *p;
 934         bool wait;
 935
 936         runtime_lock(&runtime_sched);
              // stopwait counts the P's that still have to stop.
 937         runtime_sched.stopwait = runtime_gomaxprocs;
 938         runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
 939         preemptall();
 940         // stop current P
 941         m->p->status = Pgcstop;
 942         runtime_sched.stopwait--;
 943         // try to retake all P's in Psyscall status
 944         for(i = 0; i < runtime_gomaxprocs; i++) {
 945                 p = runtime_allp[i];
 946                 s = p->status;
                      // The compare-and-swap only counts the P if its status
                      // is still Psyscall at the moment of the swap.
 947                 if(s == Psyscall && runtime_cas(&p->status, s, Pgcstop))
 948                         runtime_sched.stopwait--;
 949         }
 950         // stop idle P's
 951         while((p = pidleget()) != nil) {
 952                 p->status = Pgcstop;
 953                 runtime_sched.stopwait--;
 954         }
 955         wait = runtime_sched.stopwait > 0;
 956         runtime_unlock(&runtime_sched);
 957
 958         // wait for remaining P's to stop voluntarily
 959         if(wait) {
 960                 runtime_notesleep(&runtime_sched.stopnote);
 961                 runtime_noteclear(&runtime_sched.stopnote);
 962         }
              // Sanity checks: the counter must be zero and every P stopped.
 963         if(runtime_sched.stopwait)
 964                 runtime_throw("stoptheworld: not stopped");
 965         for(i = 0; i < runtime_gomaxprocs; i++) {
 966                 p = runtime_allp[i];
 967                 if(p->status != Pgcstop)
 968                         runtime_throw("stoptheworld: not stopped");
 969         }
 970 }
 971
 972 static void
 973 mhelpgc(void)
 974 {
 975         m->helpgc = -1;
 976 }
 977
 978 void
 979 runtime_starttheworld(void)
 980 {
 981         P *p, *p1;
 982         M *mp;
 983         G *gp;
 984         bool add;
 985
 986         m->locks++;  // disable preemption because it can be holding p in a local var
 987         gp = runtime_netpoll(false);  // non-blocking
 988         injectglist(gp);
 989         add = needaddgcproc();
 990         runtime_lock(&runtime_sched);
 991         if(newprocs) {
 992                 procresize(newprocs);
 993                 newprocs = 0;
 994         } else
 995                 procresize(runtime_gomaxprocs);
 996         runtime_sched.gcwaiting = 0;
 997
 998         p1 = nil;
 999         while((p = pidleget()) != nil) {
1000                 // procresize() puts p's with work at the beginning of the list.
1001                 // Once we reach a p without a run queue, the rest don't have one either.
1002                 if(p->runqhead == p->runqtail) {
1003                         pidleput(p);
1004                         break;
1005                 }
1006                 p->m = mget();
1007                 p->link = p1;
1008                 p1 = p;
1009         }
1010         if(runtime_sched.sysmonwait) {
1011                 runtime_sched.sysmonwait = false;
1012                 runtime_notewakeup(&runtime_sched.sysmonnote);
1013         }
1014         runtime_unlock(&runtime_sched);
1015
1016         while(p1) {
1017                 p = p1;
1018                 p1 = p1->link;
1019                 if(p->m) {
1020                         mp = p->m;
1021                         p->m = nil;
1022                         if(mp->nextp)
1023                                 runtime_throw("starttheworld: inconsistent mp->nextp");
1024                         mp->nextp = p;
1025                         runtime_notewakeup(&mp->park);
1026                 } else {
1027                         // Start M to run P.  Do not start another M below.
1028                         newm(nil, p);
1029                         add = false;
1030                 }
1031         }
1032
1033         if(add) {
1034                 // If GC could have used another helper proc, start one now,
1035                 // in the hope that it will be available next time.
1036                 // It would have been even better to start it before the collection,
1037                 // but doing so requires allocating memory, so it's tricky to
1038                 // coordinate.  This lazy approach works out in practice:
1039                 // we don't mind if the first couple gc rounds don't have quite
1040                 // the maximum number of procs.
1041                 newm(mhelpgc, nil);
1042         }
1043         m->locks--;
1044 }
1045
1046 // Called to start an M.
1047 void*
1048 runtime_mstart(void* mp)
1049 {
1050         m = (M*)mp;
1051         g = m->g0;
1052
1053         initcontext();
1054
1055         g->entry = nil;
1056         g->param = nil;
1057
1058         // Record top of stack for use by mcall.
1059         // Once we call schedule we're never coming back,
1060         // so other calls can reuse this stack space.
1061 #ifdef USING_SPLIT_STACK
1062         __splitstack_getcontext(&g->stack_context[0]);
1063 #else
1064         g->gcinitial_sp = &mp;
1065         // Setting gcstack_size to 0 is a marker meaning that gcinitial_sp
1066         // is the top of the stack, not the bottom.
1067         g->gcstack_size = 0;
1068         g->gcnext_sp = &mp;
1069 #endif
1070         getcontext(&g->context);
1071
1072         if(g->entry != nil) {
1073                 // Got here from mcall.
1074                 void (*pfn)(G*) = (void (*)(G*))g->entry;
1075                 G* gp = (G*)g->param;
1076                 pfn(gp);
1077                 *(int*)0x21 = 0x21;
1078         }
1079         runtime_minit();
1080
1081 #ifdef USING_SPLIT_STACK
1082         {
1083                 int dont_block_signals = 0;
1084                 __splitstack_block_signals(&dont_block_signals, nil);
1085         }
1086 #endif
1087
1088         // Install signal handlers; after minit so that minit can
1089         // prepare the thread to be able to handle the signals.
1090         if(m == &runtime_m0) {
1091                 if(runtime_iscgo && !runtime_cgoHasExtraM) {
1092                         runtime_cgoHasExtraM = true;
1093                         runtime_newextram();
1094                         runtime_needextram = 0;
1095                 }
1096                 runtime_initsig();
1097         }
1098
1099         if(m->mstartfn)
1100                 m->mstartfn();
1101
1102         if(m->helpgc) {
1103                 m->helpgc = 0;
1104                 stopm();
1105         } else if(m != &runtime_m0) {
1106                 acquirep(m->nextp);
1107                 m->nextp = nil;
1108         }
1109         schedule();
1110
1111         // TODO(brainman): This point is never reached, because scheduler
1112         // does not release os threads at the moment. But once this path
1113         // is enabled, we must remove our seh here.
1114
1115         return nil;
1116 }
1117
1118 typedef struct CgoThreadStart CgoThreadStart;
1119 struct CgoThreadStart
1120 {
1121         M *m;
1122         G *g;
1123         uintptr *tls;
1124         void (*fn)(void);
1125 };
1126
1127 // Allocate a new m unassociated with any thread.
1128 // Can use p for allocation context if needed.
1129 M*
1130 runtime_allocm(P *p, int32 stacksize, byte** ret_g0_stack, size_t* ret_g0_stacksize)
1131 {
1132         M *mp;
1133
1134         m->locks++;  // disable GC because it can be called from sysmon
1135         if(m->p == nil)
1136                 acquirep(p);  // temporarily borrow p for mallocs in this function
1137 #if 0
1138         if(mtype == nil) {
1139                 Eface e;
1140                 runtime_gc_m_ptr(&e);
1141                 mtype = ((const PtrType*)e.__type_descriptor)->__element_type;
1142         }
1143 #endif
1144
1145         mp = runtime_mal(sizeof *mp);
1146         mcommoninit(mp);
1147         mp->g0 = runtime_malg(stacksize, ret_g0_stack, ret_g0_stacksize);
1148
1149         if(p == m->p)
1150                 releasep();
1151         m->locks--;
1152
1153         return mp;
1154 }
1155
1156 static G*
1157 allocg(void)
1158 {
1159         G *gp;
1160         // static Type *gtype;
1161
1162         // if(gtype == nil) {
1163         //      Eface e;
1164         //      runtime_gc_g_ptr(&e);
1165         //      gtype = ((PtrType*)e.__type_descriptor)->__element_type;
1166         // }
1167         // gp = runtime_cnew(gtype);
1168         gp = runtime_malloc(sizeof(G));
1169         return gp;
1170 }
1171
1172 static M* lockextra(bool nilokay);
1173 static void unlockextra(M*);
1174
1175 // needm is called when a cgo callback happens on a
1176 // thread without an m (a thread not created by Go).
1177 // In this case, needm is expected to find an m to use
1178 // and return with m, g initialized correctly.
1179 // Since m and g are not set now (likely nil, but see below)
1180 // needm is limited in what routines it can call. In particular
1181 // it can only call nosplit functions (textflag 7) and cannot
1182 // do any scheduling that requires an m.
1183 //
1184 // In order to avoid needing heavy lifting here, we adopt
1185 // the following strategy: there is a stack of available m's
1186 // that can be stolen. Using compare-and-swap
1187 // to pop from the stack has ABA races, so we simulate
1188 // a lock by doing an exchange (via casp) to steal the stack
1189 // head and replace the top pointer with MLOCKED (1).
1190 // This serves as a simple spin lock that we can use even
1191 // without an m. The thread that locks the stack in this way
1192 // unlocks the stack by storing a valid stack head pointer.
1193 //
1194 // In order to make sure that there is always an m structure
1195 // available to be stolen, we maintain the invariant that there
1196 // is always one more than needed. At the beginning of the
1197 // program (if cgo is in use) the list is seeded with a single m.
1198 // If needm finds that it has taken the last m off the list, its job
1199 // is - once it has installed its own m so that it can do things like
1200 // allocate memory - to create a spare m and put it on the list.
1201 //
1202 // Each of these extra m's also has a g0 and a curg that are
1203 // pressed into service as the scheduling stack and current
1204 // goroutine for the duration of the cgo callback.
1205 //
1206 // When the callback is done with the m, it calls dropm to
1207 // put the m back on the list.
1208 //
1209 // Unlike the gc toolchain, we start running on curg, since we are
1210 // just going to return and let the caller continue.
1211 void
1212 runtime_needm(void)
1213 {
1214         M *mp;
1215
1216         if(runtime_needextram) {
1217                 // Can happen if C/C++ code calls Go from a global ctor.
1218                 // Can not throw, because scheduler is not initialized yet.
1219                 int rv __attribute__((unused));
1220                 rv = runtime_write(2, "fatal error: cgo callback before cgo call\n",
1221                         sizeof("fatal error: cgo callback before cgo call\n")-1);
1222                 runtime_exit(1);
1223         }
1224
1225         // Lock extra list, take head, unlock popped list.
1226         // nilokay=false is safe here because of the invariant above,
1227         // that the extra list always contains or will soon contain
1228         // at least one m.
1229         mp = lockextra(false);
1230
1231         // Set needextram when we've just emptied the list,
1232         // so that the eventual call into cgocallbackg will
1233         // allocate a new m for the extra list. We delay the
1234         // allocation until then so that it can be done
1235         // after exitsyscall makes sure it is okay to be
1236         // running at all (that is, there's no garbage collection
1237         // running right now).
1238         mp->needextram = mp->schedlink == nil;
1239         unlockextra(mp->schedlink);
1240
1241         // Install m and g (= m->curg).
1242         runtime_setmg(mp, mp->curg);
1243
1244         // Initialize g's context as in mstart.
1245         initcontext();
1246         g->status = Gsyscall;
1247         g->entry = nil;
1248         g->param = nil;
1249 #ifdef USING_SPLIT_STACK
1250         __splitstack_getcontext(&g->stack_context[0]);
1251 #else
1252         g->gcinitial_sp = &mp;
1253         g->gcstack = nil;
1254         g->gcstack_size = 0;
1255         g->gcnext_sp = &mp;
1256 #endif
1257         getcontext(&g->context);
1258
1259         if(g->entry != nil) {
1260                 // Got here from mcall.
1261                 void (*pfn)(G*) = (void (*)(G*))g->entry;
1262                 G* gp = (G*)g->param;
1263                 pfn(gp);
1264                 *(int*)0x22 = 0x22;
1265         }
1266
1267         // Initialize this thread to use the m.
1268         runtime_minit();
1269
1270 #ifdef USING_SPLIT_STACK
1271         {
1272                 int dont_block_signals = 0;
1273                 __splitstack_block_signals(&dont_block_signals, nil);
1274         }
1275 #endif
1276 }
1277
1278 // newextram allocates an m and puts it on the extra list.
1279 // It is called with a working local m, so that it can do things
1280 // like call schedlock and allocate.
1281 void
1282 runtime_newextram(void)
1283 {
1284         M *mp, *mnext;
1285         G *gp;
1286         byte *g0_sp, *sp;
1287         size_t g0_spsize, spsize;
1288
1289         // Create extra goroutine locked to extra m.
1290         // The goroutine is the context in which the cgo callback will run.
1291         // The sched.pc will never be returned to, but setting it to
1292         // runtime.goexit makes clear to the traceback routines where
1293         // the goroutine stack ends.
1294         mp = runtime_allocm(nil, StackMin, &g0_sp, &g0_spsize);
1295         gp = runtime_malg(StackMin, &sp, &spsize);
1296         gp->status = Gdead;
1297         mp->curg = gp;
1298         mp->locked = LockInternal;
1299         mp->lockedg = gp;
1300         gp->lockedm = mp;
1301         gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
1302         // put on allg for garbage collector
1303         allgadd(gp);
1304
1305         // The context for gp will be set up in runtime_needm.  But
1306         // here we need to set up the context for g0.
1307         getcontext(&mp->g0->context);
1308         mp->g0->context.uc_stack.ss_sp = g0_sp;
1309         mp->g0->context.uc_stack.ss_size = g0_spsize;
1310         makecontext(&mp->g0->context, kickoff, 0);
1311
1312         // Add m to the extra list.
1313         mnext = lockextra(true);
1314         mp->schedlink = mnext;
1315         unlockextra(mp);
1316 }
1317
1318 // dropm is called when a cgo callback has called needm but is now
1319 // done with the callback and returning back into the non-Go thread.
1320 // It puts the current m back onto the extra list.
1321 //
1322 // The main expense here is the call to signalstack to release the
1323 // m's signal stack, and then the call to needm on the next callback
1324 // from this thread. It is tempting to try to save the m for next time,
1325 // which would eliminate both these costs, but there might not be
1326 // a next time: the current thread (which Go does not control) might exit.
1327 // If we saved the m for that thread, there would be an m leak each time
1328 // such a thread exited. Instead, we acquire and release an m on each
1329 // call. These should typically not be scheduling operations, just a few
1330 // atomics, so the cost should be small.
1331 //
1332 // TODO(rsc): An alternative would be to allocate a dummy pthread per-thread
1333 // variable using pthread_key_create. Unlike the pthread keys we already use
1334 // on OS X, this dummy key would never be read by Go code. It would exist
1335 // only so that we could register at thread-exit-time destructor.
1336 // That destructor would put the m back onto the extra list.
1337 // This is purely a performance optimization. The current version,
1338 // in which dropm happens on each cgo call, is still correct too.
1339 // We may have to keep the current version on systems with cgo
1340 // but without pthreads, like Windows.
1341 void
1342 runtime_dropm(void)
1343 {
1344         M *mp, *mnext;
1345
1346         // Undo whatever initialization minit did during needm.
1347         runtime_unminit();
1348
1349         // Clear m and g, and return m to the extra list.
1350         // After the call to setmg we can only call nosplit functions.
1351         mp = m;
1352         runtime_setmg(nil, nil);
1353
1354         mp->curg->status = Gdead;
1355         mp->curg->gcstack = nil;
1356         mp->curg->gcnext_sp = nil;
1357
1358         mnext = lockextra(true);
1359         mp->schedlink = mnext;
1360         unlockextra(mp);
1361 }
1362
1363 #define MLOCKED ((M*)1)
1364
1365 // lockextra locks the extra list and returns the list head.
1366 // The caller must unlock the list by storing a new list head
1367 // to runtime.extram. If nilokay is true, then lockextra will
1368 // return a nil list head if that's what it finds. If nilokay is false,
1369 // lockextra will keep waiting until the list head is no longer nil.
1370 static M*
1371 lockextra(bool nilokay)
1372 {
1373         M *mp;
1374         void (*yield)(void);
1375
1376         for(;;) {
1377                 mp = runtime_atomicloadp(&runtime_extram);
1378                 if(mp == MLOCKED) {
1379                         yield = runtime_osyield;
1380                         yield();
1381                         continue;
1382                 }
1383                 if(mp == nil && !nilokay) {
1384                         runtime_usleep(1);
1385                         continue;
1386                 }
1387                 if(!runtime_casp(&runtime_extram, mp, MLOCKED)) {
1388                         yield = runtime_osyield;
1389                         yield();
1390                         continue;
1391                 }
1392                 break;
1393         }
1394         return mp;
1395 }
1396
1397 static void
1398 unlockextra(M *mp)
1399 {
1400         runtime_atomicstorep(&runtime_extram, mp);
1401 }
1402
1403 static int32
1404 countextra()
1405 {
1406         M *mp, *mc;
1407         int32 c;
1408
1409         for(;;) {
1410                 mp = runtime_atomicloadp(&runtime_extram);
1411                 if(mp == MLOCKED) {
1412                         runtime_osyield();
1413                         continue;
1414                 }
1415                 if(!runtime_casp(&runtime_extram, mp, MLOCKED)) {
1416                         runtime_osyield();
1417                         continue;
1418                 }
1419                 c = 0;
1420                 for(mc = mp; mc != nil; mc = mc->schedlink)
1421                         c++;
1422                 runtime_atomicstorep(&runtime_extram, mp);
1423                 return c;
1424         }
1425 }
1426
1427 // Create a new m.  It will start off with a call to fn, or else the scheduler.
1428 static void
1429 newm(void(*fn)(void), P *p)
1430 {
1431         M *mp;
1432
1433         mp = runtime_allocm(p, -1, nil, nil);
1434         mp->nextp = p;
1435         mp->mstartfn = fn;
1436
1437         runtime_newosproc(mp);
1438 }
1439
1440 // Stops execution of the current m until new work is available.
1441 // Returns with acquired P.
1442 static void
1443 stopm(void)
1444 {
1445         if(m->locks)
1446                 runtime_throw("stopm holding locks");
1447         if(m->p)
1448                 runtime_throw("stopm holding p");
1449         if(m->spinning) {
1450                 m->spinning = false;
1451                 runtime_xadd(&runtime_sched.nmspinning, -1);
1452         }
1453
1454 retry:
1455         runtime_lock(&runtime_sched);
1456         mput(m);
1457         runtime_unlock(&runtime_sched);
1458         runtime_notesleep(&m->park);
1459         runtime_noteclear(&m->park);
1460         if(m->helpgc) {
1461                 runtime_gchelper();
1462                 m->helpgc = 0;
1463                 m->mcache = nil;
1464                 goto retry;
1465         }
1466         acquirep(m->nextp);
1467         m->nextp = nil;
1468 }
1469
1470 static void
1471 mspinning(void)
1472 {
1473         m->spinning = true;
1474 }
1475
1476 // Schedules some M to run the p (creates an M if necessary).
1477 // If p==nil, tries to get an idle P, if no idle P's does nothing.
1478 static void
1479 startm(P *p, bool spinning)
1480 {
1481         M *mp;
1482         void (*fn)(void);
1483
1484         runtime_lock(&runtime_sched);
1485         if(p == nil) {
1486                 p = pidleget();
1487                 if(p == nil) {
1488                         runtime_unlock(&runtime_sched);
1489                         if(spinning)
1490                                 runtime_xadd(&runtime_sched.nmspinning, -1);
1491                         return;
1492                 }
1493         }
1494         mp = mget();
1495         runtime_unlock(&runtime_sched);
1496         if(mp == nil) {
1497                 fn = nil;
1498                 if(spinning)
1499                         fn = mspinning;
1500                 newm(fn, p);
1501                 return;
1502         }
1503         if(mp->spinning)
1504                 runtime_throw("startm: m is spinning");
1505         if(mp->nextp)
1506                 runtime_throw("startm: m has p");
1507         mp->spinning = spinning;
1508         mp->nextp = p;
1509         runtime_notewakeup(&mp->park);
1510 }
1511
1512 // Hands off P from syscall or locked M.
1513 static void
1514 handoffp(P *p)
1515 {
1516         // if it has local work, start it straight away
1517         if(p->runqhead != p->runqtail || runtime_sched.runqsize) {
1518                 startm(p, false);
1519                 return;
1520         }
1521         // no local work, check that there are no spinning/idle M's,
1522         // otherwise our help is not required
1523         if(runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) == 0 &&  // TODO: fast atomic
1524                 runtime_cas(&runtime_sched.nmspinning, 0, 1)) {
1525                 startm(p, true);
1526                 return;
1527         }
1528         runtime_lock(&runtime_sched);
1529         if(runtime_sched.gcwaiting) {
1530                 p->status = Pgcstop;
1531                 if(--runtime_sched.stopwait == 0)
1532                         runtime_notewakeup(&runtime_sched.stopnote);
1533                 runtime_unlock(&runtime_sched);
1534                 return;
1535         }
1536         if(runtime_sched.runqsize) {
1537                 runtime_unlock(&runtime_sched);
1538                 startm(p, false);
1539                 return;
1540         }
1541         // If this is the last running P and nobody is polling network,
1542         // need to wakeup another M to poll network.
1543         if(runtime_sched.npidle == (uint32)runtime_gomaxprocs-1 && runtime_atomicload64(&runtime_sched.lastpoll) != 0) {
1544                 runtime_unlock(&runtime_sched);
1545                 startm(p, false);
1546                 return;
1547         }
1548         pidleput(p);
1549         runtime_unlock(&runtime_sched);
1550 }
1551
1552 // Tries to add one more P to execute G's.
1553 // Called when a G is made runnable (newproc, ready).
1554 static void
1555 wakep(void)
1556 {
1557         // be conservative about spinning threads
1558         if(!runtime_cas(&runtime_sched.nmspinning, 0, 1))
1559                 return;
1560         startm(nil, true);
1561 }
1562
1563 // Stops execution of the current m that is locked to a g until the g is runnable again.
1564 // Returns with acquired P.
1565 static void
1566 stoplockedm(void)
1567 {
1568         P *p;
1569
1570         if(m->lockedg == nil || m->lockedg->lockedm != m)
1571                 runtime_throw("stoplockedm: inconsistent locking");
1572         if(m->p) {
1573                 // Schedule another M to run this p.
1574                 p = releasep();
1575                 handoffp(p);
1576         }
1577         incidlelocked(1);
1578         // Wait until another thread schedules lockedg again.
1579         runtime_notesleep(&m->park);
1580         runtime_noteclear(&m->park);
1581         if(m->lockedg->status != Grunnable)
1582                 runtime_throw("stoplockedm: not runnable");
1583         acquirep(m->nextp);
1584         m->nextp = nil;
1585 }
1586
1587 // Schedules the locked m to run the locked gp.
1588 static void
1589 startlockedm(G *gp)
1590 {
1591         M *mp;
1592         P *p;
1593
1594         mp = gp->lockedm;
1595         if(mp == m)
1596                 runtime_throw("startlockedm: locked to me");
1597         if(mp->nextp)
1598                 runtime_throw("startlockedm: m has p");
1599         // directly handoff current P to the locked m
1600         incidlelocked(-1);
1601         p = releasep();
1602         mp->nextp = p;
1603         runtime_notewakeup(&mp->park);
1604         stopm();
1605 }
1606
1607 // Stops the current m for stoptheworld.
1608 // Returns when the world is restarted.
1609 static void
1610 gcstopm(void)
1611 {
1612         P *p;
1613
1614         if(!runtime_sched.gcwaiting)
1615                 runtime_throw("gcstopm: not waiting for gc");
1616         if(m->spinning) {
1617                 m->spinning = false;
1618                 runtime_xadd(&runtime_sched.nmspinning, -1);
1619         }
1620         p = releasep();
1621         runtime_lock(&runtime_sched);
1622         p->status = Pgcstop;
1623         if(--runtime_sched.stopwait == 0)
1624                 runtime_notewakeup(&runtime_sched.stopnote);
1625         runtime_unlock(&runtime_sched);
1626         stopm();
1627 }
1628
1629 // Schedules gp to run on the current M.
1630 // Never returns.
1631 static void
1632 execute(G *gp)
1633 {
1634         int32 hz;
1635
1636         if(gp->status != Grunnable) {
1637                 runtime_printf("execute: bad g status %d\n", gp->status);
1638                 runtime_throw("execute: bad g status");
1639         }
1640         gp->status = Grunning;
1641         gp->waitsince = 0;
1642         m->p->schedtick++;
1643         m->curg = gp;
1644         gp->m = m;
1645
1646         // Check whether the profiler needs to be turned on or off.
1647         hz = runtime_sched.profilehz;
1648         if(m->profilehz != hz)
1649                 runtime_resetcpuprofiler(hz);
1650
1651         runtime_gogo(gp);
1652 }
1653
1654 // Finds a runnable goroutine to execute.
1655 // Tries to steal from other P's, get g from global queue, poll network.
1656 static G*
1657 findrunnable(void)
1658 {
1659         G *gp;
1660         P *p;
1661         int32 i;
1662
1663 top:
1664         if(runtime_sched.gcwaiting) {
1665                 gcstopm();
1666                 goto top;
1667         }
1668         if(runtime_fingwait && runtime_fingwake && (gp = runtime_wakefing()) != nil)
1669                 runtime_ready(gp);
1670         // local runq
1671         gp = runqget(m->p);
1672         if(gp)
1673                 return gp;
1674         // global runq
1675         if(runtime_sched.runqsize) {
1676                 runtime_lock(&runtime_sched);
1677                 gp = globrunqget(m->p, 0);
1678                 runtime_unlock(&runtime_sched);
1679                 if(gp)
1680                         return gp;
1681         }
1682         // poll network
1683         gp = runtime_netpoll(false);  // non-blocking
1684         if(gp) {
1685                 injectglist(gp->schedlink);
1686                 gp->status = Grunnable;
1687                 return gp;
1688         }
1689         // If number of spinning M's >= number of busy P's, block.
1690         // This is necessary to prevent excessive CPU consumption
1691         // when GOMAXPROCS>>1 but the program parallelism is low.
1692         if(!m->spinning && 2 * runtime_atomicload(&runtime_sched.nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched.npidle))  // TODO: fast atomic
1693                 goto stop;
1694         if(!m->spinning) {
1695                 m->spinning = true;
1696                 runtime_xadd(&runtime_sched.nmspinning, 1);
1697         }
1698         // random steal from other P's
1699         for(i = 0; i < 2*runtime_gomaxprocs; i++) {
1700                 if(runtime_sched.gcwaiting)
1701                         goto top;
1702                 p = runtime_allp[runtime_fastrand1()%runtime_gomaxprocs];
1703                 if(p == m->p)
1704                         gp = runqget(p);
1705                 else
1706                         gp = runqsteal(m->p, p);
1707                 if(gp)
1708                         return gp;
1709         }
1710 stop:
1711         // return P and block
1712         runtime_lock(&runtime_sched);
1713         if(runtime_sched.gcwaiting) {
1714                 runtime_unlock(&runtime_sched);
1715                 goto top;
1716         }
1717         if(runtime_sched.runqsize) {
1718                 gp = globrunqget(m->p, 0);
1719                 runtime_unlock(&runtime_sched);
1720                 return gp;
1721         }
1722         p = releasep();
1723         pidleput(p);
1724         runtime_unlock(&runtime_sched);
1725         if(m->spinning) {
1726                 m->spinning = false;
1727                 runtime_xadd(&runtime_sched.nmspinning, -1);
1728         }
1729         // check all runqueues once again
1730         for(i = 0; i < runtime_gomaxprocs; i++) {
1731                 p = runtime_allp[i];
1732                 if(p && p->runqhead != p->runqtail) {
1733                         runtime_lock(&runtime_sched);
1734                         p = pidleget();
1735                         runtime_unlock(&runtime_sched);
1736                         if(p) {
1737                                 acquirep(p);
1738                                 goto top;
1739                         }
1740                         break;
1741                 }
1742         }
1743         // poll network
1744         if(runtime_xchg64(&runtime_sched.lastpoll, 0) != 0) {
1745                 if(m->p)
1746                         runtime_throw("findrunnable: netpoll with p");
1747                 if(m->spinning)
1748                         runtime_throw("findrunnable: netpoll with spinning");
1749                 gp = runtime_netpoll(true);  // block until new work is available
1750                 runtime_atomicstore64(&runtime_sched.lastpoll, runtime_nanotime());
1751                 if(gp) {
1752                         runtime_lock(&runtime_sched);
1753                         p = pidleget();
1754                         runtime_unlock(&runtime_sched);
1755                         if(p) {
1756                                 acquirep(p);
1757                                 injectglist(gp->schedlink);
1758                                 gp->status = Grunnable;
1759                                 return gp;
1760                         }
1761                         injectglist(gp);
1762                 }
1763         }
1764         stopm();
1765         goto top;
1766 }
1767
1768 static void
1769 resetspinning(void)
1770 {
1771         int32 nmspinning;
1772
1773         if(m->spinning) {
1774                 m->spinning = false;
1775                 nmspinning = runtime_xadd(&runtime_sched.nmspinning, -1);
1776                 if(nmspinning < 0)
1777                         runtime_throw("findrunnable: negative nmspinning");
1778         } else
1779                 nmspinning = runtime_atomicload(&runtime_sched.nmspinning);
1780
1781         // M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
1782         // so see if we need to wakeup another P here.
1783         if (nmspinning == 0 && runtime_atomicload(&runtime_sched.npidle) > 0)
1784                 wakep();
1785 }
1786
1787 // Injects the list of runnable G's into the scheduler.
1788 // Can run concurrently with GC.
1789 static void
1790 injectglist(G *glist)
1791 {
1792         int32 n;
1793         G *gp;
1794
1795         if(glist == nil)
1796                 return;
1797         runtime_lock(&runtime_sched);
1798         for(n = 0; glist; n++) {
1799                 gp = glist;
1800                 glist = gp->schedlink;
1801                 gp->status = Grunnable;
1802                 globrunqput(gp);
1803         }
1804         runtime_unlock(&runtime_sched);
1805
1806         for(; n && runtime_sched.npidle; n--)
1807                 startm(nil, false);
1808 }
1809
// One round of scheduler: find a runnable goroutine and execute it.
// Never returns.
//
// Candidates are tried in this order: the global run queue (only on
// every 61st scheduler tick of the current P, and only when it is
// non-empty), the current P's local run queue, then findrunnable().
static void
schedule(void)
{
        G *gp;
        uint32 tick;

        // Entering the scheduler while this M holds runtime locks is fatal.
        if(m->locks)
                runtime_throw("schedule: holding locks");

top:
        // While a GC stop is requested, call gcstopm() and then re-check.
        if(runtime_sched.gcwaiting) {
                gcstopm();
                goto top;
        }

        gp = nil;
        // Check the global runnable queue once in a while to ensure fairness.
        // Otherwise two goroutines can completely occupy the local runqueue
        // by constantly respawning each other.
        tick = m->p->schedtick;
        // This is a fancy way to say tick%61==0,
        // it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors.
        if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime_sched.runqsize > 0) {
                runtime_lock(&runtime_sched);
                gp = globrunqget(m->p, 1);
                runtime_unlock(&runtime_sched);
                // Work was found, so this M stops counting as spinning.
                if(gp)
                        resetspinning();
        }
        if(gp == nil) {
                gp = runqget(m->p);
                // Finding local work while still marked spinning is treated
                // as an inconsistency and aborts the program.
                if(gp && m->spinning)
                        runtime_throw("schedule: spinning with local work");
        }
        if(gp == nil) {
                gp = findrunnable();  // blocks until work is available
                resetspinning();
        }

        if(gp->lockedm) {
                // Hands off own p to the locked m,
                // then blocks waiting for a new p.
                startlockedm(gp);
                goto top;
        }

        execute(gp);
}
1860
// Puts the current goroutine into a waiting state and calls unlockf.
// If unlockf returns false, the goroutine is resumed.
//
// unlockf and lock are stashed in the current M (waitunlockf/waitlock)
// and reason in g->waitreason; the actual state change is performed by
// park0, reached through runtime_mcall.
// Throws "bad g status" if the caller is not in Grunning state.
void
runtime_park(bool(*unlockf)(G*, void*), void *lock, const char *reason)
{
        if(g->status != Grunning)
                runtime_throw("bad g status");
        m->waitlock = lock;
        m->waitunlockf = unlockf;
        g->waitreason = reason;
        runtime_mcall(park0);
}
1873
// Unlock callback used by runtime_parkunlock: releases lock with
// runtime_unlock and always returns true. gp is unused.
static bool
parkunlock(G *gp, void *lock)
{
        USED(gp);
        runtime_unlock(lock);
        return true;
}
1881
// Puts the current goroutine into a waiting state and unlocks the lock.
// The goroutine can be made runnable again by calling runtime_ready(gp).
// Implemented as runtime_park with parkunlock as the unlock callback.
void
runtime_parkunlock(Lock *lock, const char *reason)
{
        runtime_park(parkunlock, lock, reason);
}
1889
// runtime_park continuation on g0.
// Marks gp as Gwaiting, detaches it from the current M and runs the
// unlock callback that runtime_park stored in m->waitunlockf.
// If the callback returns false, gp is made Grunnable and executed
// again right away; otherwise this M moves on to other work.
static void
park0(G *gp)
{
        bool ok;

        gp->status = Gwaiting;
        gp->m = nil;
        m->curg = nil;
        if(m->waitunlockf) {
                ok = m->waitunlockf(gp, m->waitlock);
                // The callback and its argument are one-shot: clear them.
                m->waitunlockf = nil;
                m->waitlock = nil;
                if(!ok) {
                        gp->status = Grunnable;
                        execute(gp);  // Schedule it back, never returns.
                }
        }
        // This M has a locked goroutine: call stoplockedm(), then execute gp
        // (presumably after gp has been made runnable again -- confirm in stoplockedm).
        if(m->lockedg) {
                stoplockedm();
                execute(gp);  // Never returns.
        }
        schedule();
}
1914
// Scheduler yield.
// Switches to runtime_gosched0 through runtime_mcall.
// Throws "bad g status" if the caller is not in Grunning state.
void
runtime_gosched(void)
{
        if(g->status != Grunning)
                runtime_throw("bad g status");
        runtime_mcall(runtime_gosched0);
}
1923
// runtime_gosched continuation on g0.
// Marks gp Grunnable, detaches it from the current M and puts it on the
// global run queue (under the scheduler lock), then schedules other work.
void
runtime_gosched0(G *gp)
{
        gp->status = Grunnable;
        gp->m = nil;
        m->curg = nil;
        runtime_lock(&runtime_sched);
        globrunqput(gp);
        runtime_unlock(&runtime_sched);
        // This M has a locked goroutine: call stoplockedm(), then execute gp.
        if(m->lockedg) {
                stoplockedm();
                execute(gp);  // Never returns.
        }
        schedule();
}
1940
// Finishes execution of the current goroutine.
// Need to mark it as nosplit, because it runs with sp > stackbase (as runtime_lessstack).
// Since it does not return it does not matter.  But if it is preempted
// at the split stack check, GC will complain about inconsistent sp.
// NOTE(review): the declaration below carries noinline, not a
// no_split_stack attribute -- confirm that this matches the intent above.
//
// Throws "bad g status" if the caller is not in Grunning state; otherwise
// switches to goexit0 through runtime_mcall.
void runtime_goexit(void) __attribute__ ((noinline));
void
runtime_goexit(void)
{
        if(g->status != Grunning)
                runtime_throw("bad g status");
        runtime_mcall(goexit0);
}
1953
// runtime_goexit continuation on g0.
// Marks gp Gdead, clears its per-goroutine state, breaks any g<->m lock
// association, puts gp on the current P's free list and schedules other work.
static void
goexit0(G *gp)
{
        gp->status = Gdead;
        gp->entry = nil;
        gp->m = nil;
        gp->lockedm = nil;
        gp->paniconfault = 0;
        gp->defer = nil; // should be true already but just in case.
        gp->panic = nil; // non-nil for Goexit during panic. points at stack-allocated data.
        gp->writenbuf = 0;
        gp->writebuf = nil;
        gp->waitreason = nil;
        gp->param = nil;
        m->curg = nil;
        m->lockedg = nil;
        // Any bit other than LockExternal still set in m->locked at this
        // point is reported and treated as a fatal error.
        if(m->locked & ~LockExternal) {
                runtime_printf("invalid m->locked = %d\n", m->locked);
                runtime_throw("internal lockOSThread error");
        }
        m->locked = 0;
        gfput(m->p, gp);
        schedule();
}
1979
// The goroutine g is about to enter a system call.
// Record that it's not using the cpu anymore.
// This is called only from the go syscall library and cgocall,
// not from the low-level system calls used by the runtime.
//
// Entersyscall cannot split the stack: the runtime_gosave must
// make g->sched refer to the caller's stack segment, because
// entersyscall is going to return immediately after.
//
// The function only snapshots the registers into g->gcregs and then
// delegates all bookkeeping to doentersyscall.

void runtime_entersyscall(void) __attribute__ ((no_split_stack));
static void doentersyscall(void) __attribute__ ((no_split_stack, noinline));

void
runtime_entersyscall()
{
        // Save the registers in the g structure so that any pointers
        // held in registers will be seen by the garbage collector.
        getcontext(&g->gcregs);

        // Do the work in a separate function, so that this function
        // doesn't save any registers on its own stack.  If this
        // function does save any registers, we might store the wrong
        // value in the call to getcontext.
        //
        // FIXME: This assumes that we do not need to save any
        // callee-saved registers to access the TLS variable g.  We
        // don't want to put the ucontext_t on the stack because it is
        // large and we can not split the stack here.
        doentersyscall();
}
2010
// Bookkeeping half of runtime_entersyscall.
// Records stack bounds for GC/traceback, moves g to Gsyscall, wakes
// sysmon if it is waiting, detaches the P from this M (P becomes
// Psyscall) and, if a GC stop is pending, hands the P over to it.
static void
doentersyscall()
{
        // Disable preemption because during this function g is in Gsyscall status,
        // but can have inconsistent g->sched, do not let GC observe it.
        m->locks++;

        // Leave SP around for GC and traceback.
#ifdef USING_SPLIT_STACK
        g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size,
                                       &g->gcnext_segment, &g->gcnext_sp,
                                       &g->gcinitial_sp);
#else
        {
                void *v;

                // The address of a fresh local stands in for the current SP.
                g->gcnext_sp = (byte *) &v;
        }
#endif

        g->status = Gsyscall;

        // Checked once without the lock and again under it before waking sysmon.
        if(runtime_atomicload(&runtime_sched.sysmonwait)) {  // TODO: fast atomic
                runtime_lock(&runtime_sched);
                if(runtime_atomicload(&runtime_sched.sysmonwait)) {
                        runtime_atomicstore(&runtime_sched.sysmonwait, 0);
                        runtime_notewakeup(&runtime_sched.sysmonnote);
                }
                runtime_unlock(&runtime_sched);
        }

        // Detach from the P. m->p itself is left set; exitsyscallfast uses
        // it to try to re-acquire the same P.
        m->mcache = nil;
        m->p->m = nil;
        runtime_atomicstore(&m->p->status, Psyscall);
        if(runtime_sched.gcwaiting) {
                runtime_lock(&runtime_sched);
                // If the P can be moved Psyscall -> Pgcstop, account for it in
                // stopwait and wake stopnote when the count reaches zero.
                if (runtime_sched.stopwait > 0 && runtime_cas(&m->p->status, Psyscall, Pgcstop)) {
                        if(--runtime_sched.stopwait == 0)
                                runtime_notewakeup(&runtime_sched.stopnote);
                }
                runtime_unlock(&runtime_sched);
        }

        m->locks--;
}
2056
// The same as runtime_entersyscall(), but with a hint that the syscall is blocking.
// Unlike runtime_entersyscall, the P is released immediately (releasep)
// and passed to handoffp instead of being left in Psyscall state.
void
runtime_entersyscallblock(void)
{
        P *p;

        m->locks++;  // see comment in entersyscall

        // Leave SP around for GC and traceback.
#ifdef USING_SPLIT_STACK
        g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size,
                                       &g->gcnext_segment, &g->gcnext_sp,
                                       &g->gcinitial_sp);
#else
        // The address of the local p stands in for the current SP.
        g->gcnext_sp = (byte *) &p;
#endif

        // Save the registers in the g structure so that any pointers
        // held in registers will be seen by the garbage collector.
        getcontext(&g->gcregs);

        g->status = Gsyscall;

        p = releasep();
        handoffp(p);
        if(g->isbackground)  // do not consider blocked scavenger for deadlock detection
                incidlelocked(1);

        m->locks--;
}
2087
// The goroutine g exited its system call.
// Arrange for it to run on a cpu again.
// This is called only from the go syscall library, not
// from the low-level system calls used by the runtime.
//
// Fast path: exitsyscallfast() obtained a P, so the goroutine keeps
// running on this M. Slow path: exitsyscall0 (run through runtime_mcall)
// queues the goroutine, and this function resumes only once it has been
// scheduled again.
void
runtime_exitsyscall(void)
{
        G *gp;

        m->locks++;  // see comment in entersyscall

        gp = g;
        if(gp->isbackground)  // do not consider blocked scavenger for deadlock detection
                incidlelocked(-1);

        g->waitsince = 0;
        if(exitsyscallfast()) {
                // There's a cpu for us, so we can run.
                m->p->syscalltick++;
                gp->status = Grunning;
                // Garbage collector isn't running (since we are),
                // so okay to clear gcstack and gcsp.
#ifdef USING_SPLIT_STACK
                gp->gcstack = nil;
#endif
                gp->gcnext_sp = nil;
                runtime_memclr(&gp->gcregs, sizeof gp->gcregs);
                m->locks--;
                return;
        }

        m->locks--;

        // Call the scheduler.
        runtime_mcall(exitsyscall0);

        // Scheduler returned, so we're allowed to run now.
        // Delete the gcstack information that we left for
        // the garbage collector during the system call.
        // Must wait until now because until gosched returns
        // we don't know for sure that the garbage collector
        // is not running.
#ifdef USING_SPLIT_STACK
        gp->gcstack = nil;
#endif
        gp->gcnext_sp = nil;
        runtime_memclr(&gp->gcregs, sizeof gp->gcregs);

        // Don't refer to m again, we might be running on a different
        // thread after returning from runtime_mcall.
        runtime_m()->p->syscalltick++;
}
2140
// Fast path of runtime_exitsyscall: tries to obtain a P for the current M
// without going through the scheduler.
// Returns true if the M now owns a P (either the one it had before the
// syscall or any idle one), false otherwise. On failure m->p is nil.
static bool
exitsyscallfast(void)
{
        P *p;

        // Freezetheworld sets stopwait but does not retake P's.
        if(runtime_sched.stopwait) {
                m->p = nil;
                return false;
        }

        // Try to re-acquire the last P.
        // The plain status read is a cheap pre-check; the CAS decides.
        if(m->p && m->p->status == Psyscall && runtime_cas(&m->p->status, Psyscall, Prunning)) {
                // There's a cpu for us, so we can run.
                m->mcache = m->p->mcache;
                m->p->m = m;
                return true;
        }
        // Try to get any other idle P.
        m->p = nil;
        if(runtime_sched.pidle) {
                runtime_lock(&runtime_sched);
                p = pidleget();
                // Having taken an idle P, wake sysmon if it is waiting.
                if(p && runtime_atomicload(&runtime_sched.sysmonwait)) {
                        runtime_atomicstore(&runtime_sched.sysmonwait, 0);
                        runtime_notewakeup(&runtime_sched.sysmonnote);
                }
                runtime_unlock(&runtime_sched);
                if(p) {
                        acquirep(p);
                        return true;
                }
        }
        return false;
}
2176
2177 // runtime_exitsyscall slow path on g0.
2178 // Failed to acquire P, enqueue gp as runnable.
2179 static void
2180 exitsyscall0(G *gp)
2181 {
2182         P *p;
2183
2184         gp->status = Grunnable;
2185         gp->m = nil;
2186         m->curg = nil;
2187         runtime_lock(&runtime_sched);
2188         p = pidleget();
2189         if(p == nil)
2190                 globrunqput(gp);
2191         else if(runtime_atomicload(&runtime_sched.sysmonwait)) {
2192                 runtime_atomicstore(&runtime_sched.sysmonwait, 0);
2193                 runtime_notewakeup(&runtime_sched.sysmonnote);
2194         }
2195         runtime_unlock(&runtime_sched);
2196         if(p) {
2197                 acquirep(p);
2198                 execute(gp);  // Never returns.
2199         }
2200         if(m->lockedg) {
2201                 // Wait until another thread schedules gp and so m again.
2202                 stoplockedm();
2203                 execute(gp);  // Never returns.
2204         }
2205         stopm();
2206         schedule();  // Never returns.
2207 }
2208
2209 // Called from syscall package before fork.
2210 void syscall_runtime_BeforeFork(void)
2211   __asm__(GOSYM_PREFIX "syscall.runtime_BeforeFork");
2212 void
2213 syscall_runtime_BeforeFork(void)
2214 {
2215         // Fork can hang if preempted with signals frequently enough (see issue 5517).
2216         // Ensure that we stay on the same M where we disable profiling.
2217         runtime_m()->locks++;
2218         if(runtime_m()->profilehz != 0)
2219                 runtime_resetcpuprofiler(0);
2220 }
2221
2222 // Called from syscall package after fork in parent.
2223 void syscall_runtime_AfterFork(void)
2224   __asm__(GOSYM_PREFIX "syscall.runtime_AfterFork");
2225 void
2226 syscall_runtime_AfterFork(void)
2227 {
2228         int32 hz;
2229
2230         hz = runtime_sched.profilehz;
2231         if(hz != 0)
2232                 runtime_resetcpuprofiler(hz);
2233         runtime_m()->locks--;
2234 }
2235
2236 // Allocate a new g, with a stack big enough for stacksize bytes.
2237 G*
2238 runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize)
2239 {
2240         G *newg;
2241
2242         newg = allocg();
2243         if(stacksize >= 0) {
2244 #if USING_SPLIT_STACK
2245                 int dont_block_signals = 0;
2246
2247                 *ret_stack = __splitstack_makecontext(stacksize,
2248                                                       &newg->stack_context[0],
2249                                                       ret_stacksize);
2250                 __splitstack_block_signals_context(&newg->stack_context[0],
2251                                                    &dont_block_signals, nil);
2252 #else
2253                 *ret_stack = runtime_mallocgc(stacksize, 0, FlagNoProfiling|FlagNoGC);
2254                 *ret_stacksize = stacksize;
2255                 newg->gcinitial_sp = *ret_stack;
2256                 newg->gcstack_size = stacksize;
2257                 runtime_xadd(&runtime_stacks_sys, stacksize);
2258 #endif
2259         }
2260         return newg;
2261 }
2262
2263 /* For runtime package testing.  */
2264
2265
// NOTE: the comment below describes __go_go, which is defined after the
// two runtime-testing wrappers that follow.
// Create a new g running fn with siz bytes of arguments.
// Put it on the queue of g's waiting to run.
// The compiler turns a go statement into a call to this.
// Cannot split the stack because it assumes that the arguments
// are available sequentially after &fn; they would not be
// copied if a stack split occurred.  It's OK for this to call
// functions that split the stack.
// Wrapper exported under the symbol runtime.entersyscall; it simply
// forwards to runtime_entersyscall.
void runtime_testing_entersyscall(void)
  __asm__ (GOSYM_PREFIX "runtime.entersyscall");
void
runtime_testing_entersyscall()
{
        runtime_entersyscall();
}
2280
// Wrapper exported under the symbol runtime.exitsyscall; it simply
// forwards to runtime_exitsyscall.
void runtime_testing_exitsyscall(void)
  __asm__ (GOSYM_PREFIX "runtime.exitsyscall");

void
runtime_testing_exitsyscall()
{
        runtime_exitsyscall();
}
2289
// Creates a new goroutine that will run fn(arg) and queues it on the
// current P's local run queue.
//
// fn:  entry function; a nil value is a fatal error.
// arg: stored in the new G's param field.
//
// The G is taken from the P's free list when possible (reusing its
// stack); otherwise a fresh G with a StackMin-sized stack is allocated
// and registered with allgadd. Returns the new G.
G*
__go_go(void (*fn)(void*), void* arg)
{
        byte *sp;
        size_t spsize;
        G *newg;
        P *p;

        if(fn == nil) {
                m->throwing = -1;  // do not dump full stacks
                runtime_throw("go of nil func value");
        }
        m->locks++;  // disable preemption because it can be holding p in a local var

        p = m->p;
        if((newg = gfget(p)) != nil) {
                // Reusing a dead G: recover its stack.
#ifdef USING_SPLIT_STACK
                int dont_block_signals = 0;

                sp = __splitstack_resetcontext(&newg->stack_context[0],
                                               &spsize);
                __splitstack_block_signals_context(&newg->stack_context[0],
                                                   &dont_block_signals, nil);
#else
                // Stack base and size were recorded on the G by runtime_malg.
                sp = newg->gcinitial_sp;
                spsize = newg->gcstack_size;
                if(spsize == 0)
                        runtime_throw("bad spsize in __go_go");
                newg->gcnext_sp = sp;
#endif
        } else {
                newg = runtime_malg(StackMin, &sp, &spsize);
                allgadd(newg);
        }

        newg->entry = (byte*)fn;
        newg->param = arg;
        newg->gopc = (uintptr)__builtin_return_address(0);
        newg->status = Grunnable;
        // Goroutine ids are handed out from a per-P range; when the range
        // is used up, reserve another GoidCacheBatch ids from the global
        // generator.
        if(p->goidcache == p->goidcacheend) {
                p->goidcache = runtime_xadd64(&runtime_sched.goidgen, GoidCacheBatch);
                p->goidcacheend = p->goidcache + GoidCacheBatch;
        }
        newg->goid = p->goidcache++;

        {
                // Avoid warnings about variables clobbered by
                // longjmp.
                byte * volatile vsp = sp;
                size_t volatile vspsize = spsize;
                G * volatile vnewg = newg;

                // Build a context that starts in kickoff on the new stack.
                getcontext(&vnewg->context);
                vnewg->context.uc_stack.ss_sp = vsp;
#ifdef MAKECONTEXT_STACK_TOP
                vnewg->context.uc_stack.ss_sp += vspsize;
#endif
                vnewg->context.uc_stack.ss_size = vspsize;
                makecontext(&vnewg->context, kickoff, 0);

                runqput(p, vnewg);

                // Wake another P only if some are idle, no M is already
                // spinning, and this is not the goroutine for runtime_main.
                if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main)  // TODO: fast atomic
                        wakep();
                m->locks--;
                return vnewg;
        }
}
2359
2360 static void
2361 allgadd(G *gp)
2362 {
2363         G **new;
2364         uintptr cap;
2365
2366         runtime_lock(&allglock);
2367         if(runtime_allglen >= allgcap) {
2368                 cap = 4096/sizeof(new[0]);
2369                 if(cap < 2*allgcap)
2370                         cap = 2*allgcap;
2371                 new = runtime_malloc(cap*sizeof(new[0]));
2372                 if(new == nil)
2373                         runtime_throw("runtime: cannot allocate memory");
2374                 if(runtime_allg != nil) {
2375                         runtime_memmove(new, runtime_allg, runtime_allglen*sizeof(new[0]));
2376                         runtime_free(runtime_allg);
2377                 }
2378                 runtime_allg = new;
2379                 allgcap = cap;
2380         }
2381         runtime_allg[runtime_allglen++] = gp;
2382         runtime_unlock(&allglock);
2383 }
2384
2385 // Put on gfree list.
2386 // If local list is too long, transfer a batch to the global list.
2387 static void
2388 gfput(P *p, G *gp)
2389 {
2390         gp->schedlink = p->gfree;
2391         p->gfree = gp;
2392         p->gfreecnt++;
2393         if(p->gfreecnt >= 64) {
2394                 runtime_lock(&runtime_sched.gflock);
2395                 while(p->gfreecnt >= 32) {
2396                         p->gfreecnt--;
2397                         gp = p->gfree;
2398                         p->gfree = gp->schedlink;
2399                         gp->schedlink = runtime_sched.gfree;
2400                         runtime_sched.gfree = gp;
2401                 }
2402                 runtime_unlock(&runtime_sched.gflock);
2403         }
2404 }
2405
2406 // Get from gfree list.
2407 // If local list is empty, grab a batch from global list.
2408 static G*
2409 gfget(P *p)
2410 {
2411         G *gp;
2412
2413 retry:
2414         gp = p->gfree;
2415         if(gp == nil && runtime_sched.gfree) {
2416                 runtime_lock(&runtime_sched.gflock);
2417                 while(p->gfreecnt < 32 && runtime_sched.gfree) {
2418                         p->gfreecnt++;
2419                         gp = runtime_sched.gfree;
2420                         runtime_sched.gfree = gp->schedlink;
2421                         gp->schedlink = p->gfree;
2422                         p->gfree = gp;
2423                 }
2424                 runtime_unlock(&runtime_sched.gflock);
2425                 goto retry;
2426         }
2427         if(gp) {
2428                 p->gfree = gp->schedlink;
2429                 p->gfreecnt--;
2430         }
2431         return gp;
2432 }
2433
2434 // Purge all cached G's from gfree list to the global list.
2435 static void
2436 gfpurge(P *p)
2437 {
2438         G *gp;
2439
2440         runtime_lock(&runtime_sched.gflock);
2441         while(p->gfreecnt) {
2442                 p->gfreecnt--;
2443                 gp = p->gfree;
2444                 p->gfree = gp->schedlink;
2445                 gp->schedlink = runtime_sched.gfree;
2446                 runtime_sched.gfree = gp;
2447         }
2448         runtime_unlock(&runtime_sched.gflock);
2449 }
2450
// Thin wrapper that forwards to runtime_breakpoint().
void
runtime_Breakpoint(void)
{
        runtime_breakpoint();
}
2456
// Exported under the symbol runtime.Gosched; forwards to runtime_gosched().
void runtime_Gosched (void) __asm__ (GOSYM_PREFIX "runtime.Gosched");

void
runtime_Gosched(void)
{
        runtime_gosched();
}
2464
2465 // Implementation of runtime.GOMAXPROCS.
2466 // delete when scheduler is even stronger
2467 int32
2468 runtime_gomaxprocsfunc(int32 n)
2469 {
2470         int32 ret;
2471
2472         if(n > MaxGomaxprocs)
2473                 n = MaxGomaxprocs;
2474         runtime_lock(&runtime_sched);
2475         ret = runtime_gomaxprocs;
2476         if(n <= 0 || n == ret) {
2477                 runtime_unlock(&runtime_sched);
2478                 return ret;
2479         }
2480         runtime_unlock(&runtime_sched);
2481
2482         runtime_semacquire(&runtime_worldsema, false);
2483         m->gcing = 1;
2484         runtime_stoptheworld();
2485         newprocs = n;
2486         m->gcing = 0;
2487         runtime_semrelease(&runtime_worldsema);
2488         runtime_starttheworld();
2489
2490         return ret;
2491 }
2492
// lockOSThread is called by runtime.LockOSThread and runtime.lockOSThread below
// after they modify m->locked. Do not allow preemption during this call,
// or else the m might be different in this function than in the caller.
//
// Links the current g and the current m to each other.
static void
lockOSThread(void)
{
        m->lockedg = g;
        g->lockedm = m;
}
2502
// Exported under the symbol runtime.LockOSThread.
// Sets the LockExternal bit in m->locked, then ties g and m together.
void    runtime_LockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.LockOSThread");
void
runtime_LockOSThread(void)
{
        m->locked |= LockExternal;
        lockOSThread();
}
2510
// Runtime-internal variant of LockOSThread: adds LockInternal to
// m->locked (so calls are counted and must be matched by
// runtime_unlockOSThread), then ties g and m together.
void
runtime_lockOSThread(void)
{
        m->locked += LockInternal;
        lockOSThread();
}
2517
2518
2519 // unlockOSThread is called by runtime.UnlockOSThread and runtime.unlockOSThread below
2520 // after they update m->locked. Do not allow preemption during this call,
2521 // or else the m might be in different in this function than in the caller.
2522 static void
2523 unlockOSThread(void)
2524 {
2525         if(m->locked != 0)
2526                 return;
2527         m->lockedg = nil;
2528         g->lockedm = nil;
2529 }
2530
// Exported under the symbol runtime.UnlockOSThread.
// Clears the LockExternal bit in m->locked, then lets unlockOSThread
// decide whether g and m can be untied.
void    runtime_UnlockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.UnlockOSThread");

void
runtime_UnlockOSThread(void)
{
        m->locked &= ~LockExternal;
        unlockOSThread();
}
2539
// Runtime-internal counterpart of runtime_lockOSThread: subtracts
// LockInternal from m->locked, then lets unlockOSThread decide whether
// g and m can be untied.
// Throws if m->locked is smaller than LockInternal on entry.
void
runtime_unlockOSThread(void)
{
        if(m->locked < LockInternal)
                runtime_throw("runtime: internal error: misuse of lockOSThread/unlockOSThread");
        m->locked -= LockInternal;
        unlockOSThread();
}
2548
2549 bool
2550 runtime_lockedOSThread(void)
2551 {
2552         return g->lockedm != nil && m->lockedg != nil;
2553 }
2554
2555 int32
2556 runtime_gcount(void)
2557 {
2558         G *gp;
2559         int32 n, s;
2560         uintptr i;
2561
2562         n = 0;
2563         runtime_lock(&allglock);
2564         // TODO(dvyukov): runtime.NumGoroutine() is O(N).
2565         // We do not want to increment/decrement centralized counter in newproc/goexit,
2566         // just to make runtime.NumGoroutine() faster.
2567         // Compromise solution is to introduce per-P counters of active goroutines.
2568         for(i = 0; i < runtime_allglen; i++) {
2569                 gp = runtime_allg[i];
2570                 s = gp->status;
2571                 if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting)
2572                         n++;
2573         }
2574         runtime_unlock(&allglock);
2575         return n;
2576 }
2577
// Returns runtime_sched.mcount. The field is read without taking the
// scheduler lock.
int32
runtime_mcount(void)
{
        return runtime_sched.mcount;
}
2583
// State of the CPU profiler, shared between runtime_setcpuprofilerate
// and the SIGPROF handler runtime_sigprof.
static struct {
        Lock;                                   // embedded lock; used as runtime_lock(&prof)
        void (*fn)(uintptr*, int32);            // callback receiving (pcs, count); nil when off
        int32 hz;                               // requested rate; 0 when off
        uintptr pcbuf[TracebackMaxFrames];      // PCs handed to fn
        Location locbuf[TracebackMaxFrames];    // scratch buffer filled by runtime_callers
} prof;

// Empty functions. runtime_sigprof reports their addresses as a
// stand-in PC when it does not take a real traceback.
static void System(void) {}
static void GC(void) {}
2594
// Called if we receive a SIGPROF signal.
// Gathers the current call stack into prof.pcbuf and passes it to
// prof.fn. When no traceback is taken (the M has no mcache, or
// runtime_callers is already in progress) or it yields no frames, a
// two-entry stack is reported instead: the caller's pc followed by the
// address of GC or System.
void
runtime_sigprof()
{
        M *mp = m;
        int32 n, i;
        bool traceback;

        // Cheap unlocked check; repeated below under the lock for prof.fn.
        if(prof.fn == nil || prof.hz == 0)
                return;

        if(mp == nil)
                return;

        // Profiling runs concurrently with GC, so it must not allocate.
        mp->mallocing++;

        traceback = true;

        if(mp->mcache == nil)
                traceback = false;

        runtime_lock(&prof);
        if(prof.fn == nil) {
                runtime_unlock(&prof);
                mp->mallocing--;
                return;
        }
        n = 0;

        if(runtime_atomicload(&runtime_in_callers) > 0) {
                // If SIGPROF arrived while already fetching runtime
                // callers we can have trouble on older systems
                // because the unwind library calls dl_iterate_phdr
                // which was not recursive in the past.
                traceback = false;
        }

        if(traceback) {
                n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf), false);
                // Only the pc of each Location is passed on to the callback.
                for(i = 0; i < n; i++)
                        prof.pcbuf[i] = prof.locbuf[i].pc;
        }
        if(!traceback || n <= 0) {
                // Fallback: synthesize a two-frame stack.
                n = 2;
                prof.pcbuf[0] = (uintptr)runtime_getcallerpc(&n);
                if(mp->gcing || mp->helpgc)
                        prof.pcbuf[1] = (uintptr)GC;
                else
                        prof.pcbuf[1] = (uintptr)System;
        }
        prof.fn(prof.pcbuf, n);
        runtime_unlock(&prof);
        mp->mallocing--;
}
2650
2651 // Arrange to call fn with a traceback hz times a second.
2652 void
2653 runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz)
2654 {
2655         // Force sane arguments.
2656         if(hz < 0)
2657                 hz = 0;
2658         if(hz == 0)
2659                 fn = nil;
2660         if(fn == nil)
2661                 hz = 0;
2662
2663         // Disable preemption, otherwise we can be rescheduled to another thread
2664         // that has profiling enabled.
2665         m->locks++;
2666
2667         // Stop profiler on this thread so that it is safe to lock prof.
2668         // if a profiling signal came in while we had prof locked,
2669         // it would deadlock.
2670         runtime_resetcpuprofiler(0);
2671
2672         runtime_lock(&prof);
2673         prof.fn = fn;
2674         prof.hz = hz;
2675         runtime_unlock(&prof);
2676         runtime_lock(&runtime_sched);
2677         runtime_sched.profilehz = hz;
2678         runtime_unlock(&runtime_sched);
2679
2680         if(hz != 0)
2681                 runtime_resetcpuprofiler(hz);
2682
2683         m->locks--;
2684 }
2685
// Change number of processors.  The world is stopped, sched is locked.
//
// new: the desired number of P's; must be in 1..MaxGomaxprocs.
//
// Steps: make sure P's 0..new-1 exist and have an mcache, drain every
// local run queue into the global one, redistribute G's over the new
// set of P's, retire P's beyond new, bind the current M to allp[0],
// put the remaining P's on the idle list, and finally publish new as
// runtime_gomaxprocs.
static void
procresize(int32 new)
{
        int32 i, old;
        bool empty;
        G *gp;
        P *p;

        old = runtime_gomaxprocs;
        if(old < 0 || old > MaxGomaxprocs || new <= 0 || new >MaxGomaxprocs)
                runtime_throw("procresize: invalid arg");
        // initialize new P's
        for(i = 0; i < new; i++) {
                p = runtime_allp[i];
                if(p == nil) {
                        p = (P*)runtime_mallocgc(sizeof(*p), 0, FlagNoInvokeGC);
                        p->id = i;
                        p->status = Pgcstop;
                        runtime_atomicstorep(&runtime_allp[i], p);
                }
                if(p->mcache == nil) {
                        // The very first P takes over the mcache the current M
                        // already has; every other P gets a new one.
                        if(old==0 && i==0)
                                p->mcache = m->mcache;  // bootstrap
                        else
                                p->mcache = runtime_allocmcache();
                }
        }

        // redistribute runnable G's evenly
        // collect all runnable goroutines in global queue preserving FIFO order
        // FIFO order is required to ensure fairness even during frequent GCs
        // see http://golang.org/issue/7126
        // Each pass takes one G from every non-empty local queue; passes
        // repeat until all local queues are empty.
        empty = false;
        while(!empty) {
                empty = true;
                for(i = 0; i < old; i++) {
                        p = runtime_allp[i];
                        if(p->runqhead == p->runqtail)
                                continue;
                        empty = false;
                        // pop from tail of local queue
                        p->runqtail--;
                        gp = p->runq[p->runqtail%nelem(p->runq)];
                        // push onto head of global queue
                        gp->schedlink = runtime_sched.runqhead;
                        runtime_sched.runqhead = gp;
                        if(runtime_sched.runqtail == nil)
                                runtime_sched.runqtail = gp;
                        runtime_sched.runqsize++;
                }
        }
        // fill local queues with at most nelem(p->runq)/2 goroutines
        // start at 1 because current M already executes some G and will acquire allp[0] below,
        // so if we have a spare G we want to put it into allp[1].
        for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched.runqsize > 0; i++) {
                gp = runtime_sched.runqhead;
                runtime_sched.runqhead = gp->schedlink;
                if(runtime_sched.runqhead == nil)
                        runtime_sched.runqtail = nil;
                runtime_sched.runqsize--;
                // Round-robin over the new set of P's.
                runqput(runtime_allp[i%new], gp);
        }

        // free unused P's
        for(i = new; i < old; i++) {
                p = runtime_allp[i];
                runtime_freemcache(p->mcache);
                p->mcache = nil;
                gfpurge(p);
                p->status = Pdead;
                // can't free P itself because it can be referenced by an M in syscall
        }

        // Detach the current M from whatever P it had, then bind it to allp[0].
        if(m->p)
                m->p->m = nil;
        m->p = nil;
        m->mcache = nil;
        p = runtime_allp[0];
        p->m = nil;
        p->status = Pidle;
        acquirep(p);
        // All other P's start out idle.
        for(i = new-1; i > 0; i--) {
                p = runtime_allp[i];
                p->status = Pidle;
                pidleput(p);
        }
        runtime_atomicstore((uint32*)&runtime_gomaxprocs, new);
}
2775
2776 // Associate p and the current m.
2777 static void
2778 acquirep(P *p)
2779 {
2780         if(m->p || m->mcache)
2781                 runtime_throw("acquirep: already in go");
2782         if(p->m || p->status != Pidle) {
2783                 runtime_printf("acquirep: p->m=%p(%d) p->status=%d\n", p->m, p->m ? p->m->id : 0, p->status);
2784                 runtime_throw("acquirep: invalid p state");
2785         }
2786         m->mcache = p->mcache;
2787         m->p = p;
2788         p->m = m;
2789         p->status = Prunning;
2790 }
2791
2792 // Disassociate p and the current m.
2793 static P*
2794 releasep(void)
2795 {
2796         P *p;
2797
2798         if(m->p == nil || m->mcache == nil)
2799                 runtime_throw("releasep: invalid arg");
2800         p = m->p;
2801         if(p->m != m || p->mcache != m->mcache || p->status != Prunning) {
2802                 runtime_printf("releasep: m=%p m->p=%p p->m=%p m->mcache=%p p->mcache=%p p->status=%d\n",
2803                         m, m->p, p->m, m->mcache, p->mcache, p->status);
2804                 runtime_throw("releasep: invalid p state");
2805         }
2806         m->p = nil;
2807         m->mcache = nil;
2808         p->m = nil;
2809         p->status = Pidle;
2810         return p;
2811 }
2812
2813 static void
2814 incidlelocked(int32 v)
2815 {
2816         runtime_lock(&runtime_sched);
2817         runtime_sched.nmidlelocked += v;
2818         if(v > 0)
2819                 checkdead();
2820         runtime_unlock(&runtime_sched);
2821 }
2822
2823 // Check for deadlock situation.
2824 // The check is based on number of running M's, if 0 -> deadlock.
2825 static void
2826 checkdead(void)
2827 {
2828         G *gp;
2829         int32 run, grunning, s;
2830         uintptr i;
2831
2832         // For -buildmode=c-shared or -buildmode=c-archive it's OK if
2833         // there are no running goroutines.  The calling program is
2834         // assumed to be running.
2835         if(runtime_isarchive) {
2836                 return;
2837         }
2838
2839         // -1 for sysmon
2840         run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.nmidlelocked - 1 - countextra();
2841         if(run > 0)
2842                 return;
2843         // If we are dying because of a signal caught on an already idle thread,
2844         // freezetheworld will cause all running threads to block.
2845         // And runtime will essentially enter into deadlock state,
2846         // except that there is a thread that will call runtime_exit soon.
2847         if(runtime_panicking > 0)
2848                 return;
2849         if(run < 0) {
2850                 runtime_printf("runtime: checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
2851                         runtime_sched.nmidle, runtime_sched.nmidlelocked, runtime_sched.mcount);
2852                 runtime_throw("checkdead: inconsistent counts");
2853         }
2854         grunning = 0;
2855         runtime_lock(&allglock);
2856         for(i = 0; i < runtime_allglen; i++) {
2857                 gp = runtime_allg[i];
2858                 if(gp->isbackground)
2859                         continue;
2860                 s = gp->status;
2861                 if(s == Gwaiting)
2862                         grunning++;
2863                 else if(s == Grunnable || s == Grunning || s == Gsyscall) {
2864                         runtime_unlock(&allglock);
2865                         runtime_printf("runtime: checkdead: find g %D in status %d\n", gp->goid, s);
2866                         runtime_throw("checkdead: runnable g");
2867                 }
2868         }
2869         runtime_unlock(&allglock);
2870         if(grunning == 0)  // possible if main goroutine calls runtime_Goexit()
2871                 runtime_throw("no goroutines (main called runtime.Goexit) - deadlock!");
2872         m->throwing = -1;  // do not dump full stacks
2873         runtime_throw("all goroutines are asleep - deadlock!");
2874 }
2875
2876 static void
2877 sysmon(void)
2878 {
2879         uint32 idle, delay;
2880         int64 now, lastpoll, lasttrace;
2881         G *gp;
2882
2883         lasttrace = 0;
2884         idle = 0;  // how many cycles in succession we had not wokeup somebody
2885         delay = 0;
2886         for(;;) {
2887                 if(idle == 0)  // start with 20us sleep...
2888                         delay = 20;
2889                 else if(idle > 50)  // start doubling the sleep after 1ms...
2890                         delay *= 2;
2891                 if(delay > 10*1000)  // up to 10ms
2892                         delay = 10*1000;
2893                 runtime_usleep(delay);
2894                 if(runtime_debug.schedtrace <= 0 &&
2895                         (runtime_sched.gcwaiting || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs)) {  // TODO: fast atomic
2896                         runtime_lock(&runtime_sched);
2897                         if(runtime_atomicload(&runtime_sched.gcwaiting) || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) {
2898                                 runtime_atomicstore(&runtime_sched.sysmonwait, 1);
2899                                 runtime_unlock(&runtime_sched);
2900                                 runtime_notesleep(&runtime_sched.sysmonnote);
2901                                 runtime_noteclear(&runtime_sched.sysmonnote);
2902                                 idle = 0;
2903                                 delay = 20;
2904                         } else
2905                                 runtime_unlock(&runtime_sched);
2906                 }
2907                 // poll network if not polled for more than 10ms
2908                 lastpoll = runtime_atomicload64(&runtime_sched.lastpoll);
2909                 now = runtime_nanotime();
2910                 if(lastpoll != 0 && lastpoll + 10*1000*1000 < now) {
2911                         runtime_cas64(&runtime_sched.lastpoll, lastpoll, now);
2912                         gp = runtime_netpoll(false);  // non-blocking
2913                         if(gp) {
2914                                 // Need to decrement number of idle locked M's
2915                                 // (pretending that one more is running) before injectglist.
2916                                 // Otherwise it can lead to the following situation:
2917                                 // injectglist grabs all P's but before it starts M's to run the P's,
2918                                 // another M returns from syscall, finishes running its G,
2919                                 // observes that there is no work to do and no other running M's
2920                                 // and reports deadlock.
2921                                 incidlelocked(-1);
2922                                 injectglist(gp);
2923                                 incidlelocked(1);
2924                         }
2925                 }
2926                 // retake P's blocked in syscalls
2927                 // and preempt long running G's
2928                 if(retake(now))
2929                         idle = 0;
2930                 else
2931                         idle++;
2932
2933                 if(runtime_debug.schedtrace > 0 && lasttrace + runtime_debug.schedtrace*1000000ll <= now) {
2934                         lasttrace = now;
2935                         runtime_schedtrace(runtime_debug.scheddetail);
2936                 }
2937         }
2938 }
2939
2940 typedef struct Pdesc Pdesc;
2941 struct Pdesc
2942 {
2943         uint32  schedtick;
2944         int64   schedwhen;
2945         uint32  syscalltick;
2946         int64   syscallwhen;
2947 };
2948 static Pdesc pdesc[MaxGomaxprocs];
2949
2950 static uint32
2951 retake(int64 now)
2952 {
2953         uint32 i, s, n;
2954         int64 t;
2955         P *p;
2956         Pdesc *pd;
2957
2958         n = 0;
2959         for(i = 0; i < (uint32)runtime_gomaxprocs; i++) {
2960                 p = runtime_allp[i];
2961                 if(p==nil)
2962                         continue;
2963                 pd = &pdesc[i];
2964                 s = p->status;
2965                 if(s == Psyscall) {
2966                         // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
2967                         t = p->syscalltick;
2968                         if(pd->syscalltick != t) {
2969                                 pd->syscalltick = t;
2970                                 pd->syscallwhen = now;
2971                                 continue;
2972                         }
2973                         // On the one hand we don't want to retake Ps if there is no other work to do,
2974                         // but on the other hand we want to retake them eventually
2975                         // because they can prevent the sysmon thread from deep sleep.
2976                         if(p->runqhead == p->runqtail &&
2977                                 runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0 &&
2978                                 pd->syscallwhen + 10*1000*1000 > now)
2979                                 continue;
2980                         // Need to decrement number of idle locked M's
2981                         // (pretending that one more is running) before the CAS.
2982                         // Otherwise the M from which we retake can exit the syscall,
2983                         // increment nmidle and report deadlock.
2984                         incidlelocked(-1);
2985                         if(runtime_cas(&p->status, s, Pidle)) {
2986                                 n++;
2987                                 handoffp(p);
2988                         }
2989                         incidlelocked(1);
2990                 } else if(s == Prunning) {
2991                         // Preempt G if it's running for more than 10ms.
2992                         t = p->schedtick;
2993                         if(pd->schedtick != t) {
2994                                 pd->schedtick = t;
2995                                 pd->schedwhen = now;
2996                                 continue;
2997                         }
2998                         if(pd->schedwhen + 10*1000*1000 > now)
2999                                 continue;
3000                         // preemptone(p);
3001                 }
3002         }
3003         return n;
3004 }
3005
// Ask every goroutine to stop because it has been preempted.
// The contract is best-effort: a goroutine may be missed if a processor
// has only just started running it.  No locks need to be held.
// The result says whether a preemption request reached at least one
// goroutine.  This implementation issues no requests at all, so the
// answer is always false.
static bool
preemptall(void)
{
        return false;
}
3016
3017 void
3018 runtime_schedtrace(bool detailed)
3019 {
3020         static int64 starttime;
3021         int64 now;
3022         int64 id1, id2, id3;
3023         int32 i, t, h;
3024         uintptr gi;
3025         const char *fmt;
3026         M *mp, *lockedm;
3027         G *gp, *lockedg;
3028         P *p;
3029
3030         now = runtime_nanotime();
3031         if(starttime == 0)
3032                 starttime = now;
3033
3034         runtime_lock(&runtime_sched);
3035         runtime_printf("SCHED %Dms: gomaxprocs=%d idleprocs=%d threads=%d idlethreads=%d runqueue=%d",
3036                 (now-starttime)/1000000, runtime_gomaxprocs, runtime_sched.npidle, runtime_sched.mcount,
3037                 runtime_sched.nmidle, runtime_sched.runqsize);
3038         if(detailed) {
3039                 runtime_printf(" gcwaiting=%d nmidlelocked=%d nmspinning=%d stopwait=%d sysmonwait=%d\n",
3040                         runtime_sched.gcwaiting, runtime_sched.nmidlelocked, runtime_sched.nmspinning,
3041                         runtime_sched.stopwait, runtime_sched.sysmonwait);
3042         }
3043         // We must be careful while reading data from P's, M's and G's.
3044         // Even if we hold schedlock, most data can be changed concurrently.
3045         // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil.
3046         for(i = 0; i < runtime_gomaxprocs; i++) {
3047                 p = runtime_allp[i];
3048                 if(p == nil)
3049                         continue;
3050                 mp = p->m;
3051                 h = runtime_atomicload(&p->runqhead);
3052                 t = runtime_atomicload(&p->runqtail);
3053                 if(detailed)
3054                         runtime_printf("  P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d gfreecnt=%d\n",
3055                                 i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, t-h, p->gfreecnt);
3056                 else {
3057                         // In non-detailed mode format lengths of per-P run queues as:
3058                         // [len1 len2 len3 len4]
3059                         fmt = " %d";
3060                         if(runtime_gomaxprocs == 1)
3061                                 fmt = " [%d]\n";
3062                         else if(i == 0)
3063                                 fmt = " [%d";
3064                         else if(i == runtime_gomaxprocs-1)
3065                                 fmt = " %d]\n";
3066                         runtime_printf(fmt, t-h);
3067                 }
3068         }
3069         if(!detailed) {
3070                 runtime_unlock(&runtime_sched);
3071                 return;
3072         }
3073         for(mp = runtime_allm; mp; mp = mp->alllink) {
3074                 p = mp->p;
3075                 gp = mp->curg;
3076                 lockedg = mp->lockedg;
3077                 id1 = -1;
3078                 if(p)
3079                         id1 = p->id;
3080                 id2 = -1;
3081                 if(gp)
3082                         id2 = gp->goid;
3083                 id3 = -1;
3084                 if(lockedg)
3085                         id3 = lockedg->goid;
3086                 runtime_printf("  M%d: p=%D curg=%D mallocing=%d throwing=%d gcing=%d"
3087                         " locks=%d dying=%d helpgc=%d spinning=%d blocked=%d lockedg=%D\n",
3088                         mp->id, id1, id2,
3089                         mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc,
3090                         mp->spinning, m->blocked, id3);
3091         }
3092         runtime_lock(&allglock);
3093         for(gi = 0; gi < runtime_allglen; gi++) {
3094                 gp = runtime_allg[gi];
3095                 mp = gp->m;
3096                 lockedm = gp->lockedm;
3097                 runtime_printf("  G%D: status=%d(%s) m=%d lockedm=%d\n",
3098                         gp->goid, gp->status, gp->waitreason, mp ? mp->id : -1,
3099                         lockedm ? lockedm->id : -1);
3100         }
3101         runtime_unlock(&allglock);
3102         runtime_unlock(&runtime_sched);
3103 }
3104
3105 // Put mp on midle list.
3106 // Sched must be locked.
3107 static void
3108 mput(M *mp)
3109 {
3110         mp->schedlink = runtime_sched.midle;
3111         runtime_sched.midle = mp;
3112         runtime_sched.nmidle++;
3113         checkdead();
3114 }
3115
3116 // Try to get an m from midle list.
3117 // Sched must be locked.
3118 static M*
3119 mget(void)
3120 {
3121         M *mp;
3122
3123         if((mp = runtime_sched.midle) != nil){
3124                 runtime_sched.midle = mp->schedlink;
3125                 runtime_sched.nmidle--;
3126         }
3127         return mp;
3128 }
3129
3130 // Put gp on the global runnable queue.
3131 // Sched must be locked.
3132 static void
3133 globrunqput(G *gp)
3134 {
3135         gp->schedlink = nil;
3136         if(runtime_sched.runqtail)
3137                 runtime_sched.runqtail->schedlink = gp;
3138         else
3139                 runtime_sched.runqhead = gp;
3140         runtime_sched.runqtail = gp;
3141         runtime_sched.runqsize++;
3142 }
3143
3144 // Put a batch of runnable goroutines on the global runnable queue.
3145 // Sched must be locked.
3146 static void
3147 globrunqputbatch(G *ghead, G *gtail, int32 n)
3148 {
3149         gtail->schedlink = nil;
3150         if(runtime_sched.runqtail)
3151                 runtime_sched.runqtail->schedlink = ghead;
3152         else
3153                 runtime_sched.runqhead = ghead;
3154         runtime_sched.runqtail = gtail;
3155         runtime_sched.runqsize += n;
3156 }
3157
3158 // Try get a batch of G's from the global runnable queue.
3159 // Sched must be locked.
3160 static G*
3161 globrunqget(P *p, int32 max)
3162 {
3163         G *gp, *gp1;
3164         int32 n;
3165
3166         if(runtime_sched.runqsize == 0)
3167                 return nil;
3168         n = runtime_sched.runqsize/runtime_gomaxprocs+1;
3169         if(n > runtime_sched.runqsize)
3170                 n = runtime_sched.runqsize;
3171         if(max > 0 && n > max)
3172                 n = max;
3173         if((uint32)n > nelem(p->runq)/2)
3174                 n = nelem(p->runq)/2;
3175         runtime_sched.runqsize -= n;
3176         if(runtime_sched.runqsize == 0)
3177                 runtime_sched.runqtail = nil;
3178         gp = runtime_sched.runqhead;
3179         runtime_sched.runqhead = gp->schedlink;
3180         n--;
3181         while(n--) {
3182                 gp1 = runtime_sched.runqhead;
3183                 runtime_sched.runqhead = gp1->schedlink;
3184                 runqput(p, gp1);
3185         }
3186         return gp;
3187 }
3188
3189 // Put p to on pidle list.
3190 // Sched must be locked.
3191 static void
3192 pidleput(P *p)
3193 {
3194         p->link = runtime_sched.pidle;
3195         runtime_sched.pidle = p;
3196         runtime_xadd(&runtime_sched.npidle, 1);  // TODO: fast atomic
3197 }
3198
3199 // Try get a p from pidle list.
3200 // Sched must be locked.
3201 static P*
3202 pidleget(void)
3203 {
3204         P *p;
3205
3206         p = runtime_sched.pidle;
3207         if(p) {
3208                 runtime_sched.pidle = p->link;
3209                 runtime_xadd(&runtime_sched.npidle, -1);  // TODO: fast atomic
3210         }
3211         return p;
3212 }
3213
3214 // Try to put g on local runnable queue.
3215 // If it's full, put onto global queue.
3216 // Executed only by the owner P.
3217 static void
3218 runqput(P *p, G *gp)
3219 {
3220         uint32 h, t;
3221
3222 retry:
3223         h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with consumers
3224         t = p->runqtail;
3225         if(t - h < nelem(p->runq)) {
3226                 p->runq[t%nelem(p->runq)] = gp;
3227                 runtime_atomicstore(&p->runqtail, t+1);  // store-release, makes the item available for consumption
3228                 return;
3229         }
3230         if(runqputslow(p, gp, h, t))
3231                 return;
3232         // the queue is not full, now the put above must suceed
3233         goto retry;
3234 }
3235
3236 // Put g and a batch of work from local runnable queue on global queue.
3237 // Executed only by the owner P.
3238 static bool
3239 runqputslow(P *p, G *gp, uint32 h, uint32 t)
3240 {
3241         G *batch[nelem(p->runq)/2+1];
3242         uint32 n, i;
3243
3244         // First, grab a batch from local queue.
3245         n = t-h;
3246         n = n/2;
3247         if(n != nelem(p->runq)/2)
3248                 runtime_throw("runqputslow: queue is not full");
3249         for(i=0; i<n; i++)
3250                 batch[i] = p->runq[(h+i)%nelem(p->runq)];
3251         if(!runtime_cas(&p->runqhead, h, h+n))  // cas-release, commits consume
3252                 return false;
3253         batch[n] = gp;
3254         // Link the goroutines.
3255         for(i=0; i<n; i++)
3256                 batch[i]->schedlink = batch[i+1];
3257         // Now put the batch on global queue.
3258         runtime_lock(&runtime_sched);
3259         globrunqputbatch(batch[0], batch[n], n+1);
3260         runtime_unlock(&runtime_sched);
3261         return true;
3262 }
3263
3264 // Get g from local runnable queue.
3265 // Executed only by the owner P.
3266 static G*
3267 runqget(P *p)
3268 {
3269         G *gp;
3270         uint32 t, h;
3271
3272         for(;;) {
3273                 h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with other consumers
3274                 t = p->runqtail;
3275                 if(t == h)
3276                         return nil;
3277                 gp = p->runq[h%nelem(p->runq)];
3278                 if(runtime_cas(&p->runqhead, h, h+1))  // cas-release, commits consume
3279                         return gp;
3280         }
3281 }
3282
3283 // Grabs a batch of goroutines from local runnable queue.
3284 // batch array must be of size nelem(p->runq)/2. Returns number of grabbed goroutines.
3285 // Can be executed by any P.
3286 static uint32
3287 runqgrab(P *p, G **batch)
3288 {
3289         uint32 t, h, n, i;
3290
3291         for(;;) {
3292                 h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with other consumers
3293                 t = runtime_atomicload(&p->runqtail);  // load-acquire, synchronize with the producer
3294                 n = t-h;
3295                 n = n - n/2;
3296                 if(n == 0)
3297                         break;
3298                 if(n > nelem(p->runq)/2)  // read inconsistent h and t
3299                         continue;
3300                 for(i=0; i<n; i++)
3301                         batch[i] = p->runq[(h+i)%nelem(p->runq)];
3302                 if(runtime_cas(&p->runqhead, h, h+n))  // cas-release, commits consume
3303                         break;
3304         }
3305         return n;
3306 }
3307
3308 // Steal half of elements from local runnable queue of p2
3309 // and put onto local runnable queue of p.
3310 // Returns one of the stolen elements (or nil if failed).
3311 static G*
3312 runqsteal(P *p, P *p2)
3313 {
3314         G *gp;
3315         G *batch[nelem(p->runq)/2];
3316         uint32 t, h, n, i;
3317
3318         n = runqgrab(p2, batch);
3319         if(n == 0)
3320                 return nil;
3321         n--;
3322         gp = batch[n];
3323         if(n == 0)
3324                 return gp;
3325         h = runtime_atomicload(&p->runqhead);  // load-acquire, synchronize with consumers
3326         t = p->runqtail;
3327         if(t - h + n >= nelem(p->runq))
3328                 runtime_throw("runqsteal: runq overflow");
3329         for(i=0; i<n; i++, t++)
3330                 p->runq[t%nelem(p->runq)] = batch[i];
3331         runtime_atomicstore(&p->runqtail, t);  // store-release, makes the item available for consumption
3332         return gp;
3333 }
3334
3335 void runtime_testSchedLocalQueue(void)
3336   __asm__("runtime.testSchedLocalQueue");
3337
3338 void
3339 runtime_testSchedLocalQueue(void)
3340 {
3341         P p;
3342         G gs[nelem(p.runq)];
3343         int32 i, j;
3344
3345         runtime_memclr((byte*)&p, sizeof(p));
3346
3347         for(i = 0; i < (int32)nelem(gs); i++) {
3348                 if(runqget(&p) != nil)
3349                         runtime_throw("runq is not empty initially");
3350                 for(j = 0; j < i; j++)
3351                         runqput(&p, &gs[i]);
3352                 for(j = 0; j < i; j++) {
3353                         if(runqget(&p) != &gs[i]) {
3354                                 runtime_printf("bad element at iter %d/%d\n", i, j);
3355                                 runtime_throw("bad element");
3356                         }
3357                 }
3358                 if(runqget(&p) != nil)
3359                         runtime_throw("runq is not empty afterwards");
3360         }
3361 }
3362
3363 void runtime_testSchedLocalQueueSteal(void)
3364   __asm__("runtime.testSchedLocalQueueSteal");
3365
3366 void
3367 runtime_testSchedLocalQueueSteal(void)
3368 {
3369         P p1, p2;
3370         G gs[nelem(p1.runq)], *gp;
3371         int32 i, j, s;
3372
3373         runtime_memclr((byte*)&p1, sizeof(p1));
3374         runtime_memclr((byte*)&p2, sizeof(p2));
3375
3376         for(i = 0; i < (int32)nelem(gs); i++) {
3377                 for(j = 0; j < i; j++) {
3378                         gs[j].sig = 0;
3379                         runqput(&p1, &gs[j]);
3380                 }
3381                 gp = runqsteal(&p2, &p1);
3382                 s = 0;
3383                 if(gp) {
3384                         s++;
3385                         gp->sig++;
3386                 }
3387                 while((gp = runqget(&p2)) != nil) {
3388                         s++;
3389                         gp->sig++;
3390                 }
3391                 while((gp = runqget(&p1)) != nil)
3392                         gp->sig++;
3393                 for(j = 0; j < i; j++) {
3394                         if(gs[j].sig != 1) {
3395                                 runtime_printf("bad element %d(%d) at iter %d\n", j, gs[j].sig, i);
3396                                 runtime_throw("bad element");
3397                         }
3398                 }
3399                 if(s != i/2 && s != i/2+1) {
3400                         runtime_printf("bad steal %d, want %d or %d, iter %d\n",
3401                                 s, i/2, i/2+1, i);
3402                         runtime_throw("bad steal");
3403                 }
3404         }
3405 }
3406
3407 int32
3408 runtime_setmaxthreads(int32 in)
3409 {
3410         int32 out;
3411
3412         runtime_lock(&runtime_sched);
3413         out = runtime_sched.maxmcount;
3414         runtime_sched.maxmcount = in;
3415         checkmcount();
3416         runtime_unlock(&runtime_sched);
3417         return out;
3418 }
3419
3420 void
3421 runtime_proc_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj))
3422 {
3423         enqueue1(wbufp, (Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
3424         enqueue1(wbufp, (Obj){(byte*)&runtime_main_init_done, sizeof runtime_main_init_done, 0});
3425 }
3426
3427 // Return whether we are waiting for a GC.  This gc toolchain uses
3428 // preemption instead.
3429 bool
3430 runtime_gcwaiting(void)
3431 {
3432         return runtime_sched.gcwaiting;
3433 }