[official-gcc.git] / libgo / runtime / proc.c
blob 03d1c1a271c0c341823b7e0bd794dd73b68ca6d9
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 #include <limits.h>
6 #include <signal.h>
7 #include <stdlib.h>
8 #include <pthread.h>
9 #include <unistd.h>
11 #include "config.h"
13 #ifdef HAVE_DL_ITERATE_PHDR
14 #include <link.h>
15 #endif
17 #include "runtime.h"
18 #include "arch.h"
19 #include "defs.h"
20 #include "malloc.h"
21 #include "race.h"
22 #include "go-type.h"
23 #include "go-defer.h"
25 #ifdef USING_SPLIT_STACK
27 /* FIXME: These are not declared anywhere. */
29 extern void __splitstack_getcontext(void *context[10]);
31 extern void __splitstack_setcontext(void *context[10]);
33 extern void *__splitstack_makecontext(size_t, void *context[10], size_t *);
35 extern void * __splitstack_resetcontext(void *context[10], size_t *);
37 extern void *__splitstack_find(void *, void *, size_t *, void **, void **,
38 void **);
40 extern void __splitstack_block_signals (int *, int *);
42 extern void __splitstack_block_signals_context (void *context[10], int *,
43 int *);
45 #endif
47 #ifndef PTHREAD_STACK_MIN
48 # define PTHREAD_STACK_MIN 8192
49 #endif
51 #if defined(USING_SPLIT_STACK) && defined(LINKER_SUPPORTS_SPLIT_STACK)
52 # define StackMin PTHREAD_STACK_MIN
53 #else
54 # define StackMin 2 * 1024 * 1024
55 #endif
57 uintptr runtime_stacks_sys;
59 static void gtraceback(G*);
61 #ifdef __rtems__
62 #define __thread
63 #endif
65 static __thread G *g;
66 static __thread M *m;
68 #ifndef SETCONTEXT_CLOBBERS_TLS
70 static inline void
71 initcontext(void)
75 static inline void
76 fixcontext(ucontext_t *c __attribute__ ((unused)))
80 #else
82 # if defined(__x86_64__) && defined(__sun__)
84 // x86_64 Solaris 10 and 11 have a bug: setcontext switches the %fs
85 // register to that of the thread which called getcontext. The effect
86 // is that the address of all __thread variables changes. This bug
87 // also affects pthread_self() and pthread_getspecific. We work
88 // around it by clobbering the context field directly to keep %fs the
89 // same.
91 static __thread greg_t fs;
93 static inline void
94 initcontext(void)
96 ucontext_t c;
98 getcontext(&c);
99 fs = c.uc_mcontext.gregs[REG_FSBASE];
102 static inline void
103 fixcontext(ucontext_t* c)
105 c->uc_mcontext.gregs[REG_FSBASE] = fs;
108 # elif defined(__NetBSD__)
110 // NetBSD has a bug: setcontext clobbers tlsbase, we need to save
111 // and restore it ourselves.
113 static __thread __greg_t tlsbase;
115 static inline void
116 initcontext(void)
118 ucontext_t c;
120 getcontext(&c);
121 tlsbase = c.uc_mcontext._mc_tlsbase;
124 static inline void
125 fixcontext(ucontext_t* c)
127 c->uc_mcontext._mc_tlsbase = tlsbase;
130 # else
132 # error unknown case for SETCONTEXT_CLOBBERS_TLS
134 # endif
136 #endif
138 // We can not always refer to the TLS variables directly. The
139 // compiler will call tls_get_addr to get the address of the variable,
140 // and it may hold it in a register across a call to schedule. When
141 // we get back from the call we may be running in a different thread,
142 // in which case the register now points to the TLS variable for a
143 // different thread. We use non-inlinable functions to avoid this
144 // when necessary.
146 G* runtime_g(void) __attribute__ ((noinline, no_split_stack));
149 runtime_g(void)
151 return g;
154 M* runtime_m(void) __attribute__ ((noinline, no_split_stack));
157 runtime_m(void)
159 return m;
162 // Set m and g.
163 void
164 runtime_setmg(M* mp, G* gp)
166 m = mp;
167 g = gp;
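/* -----------------------------------------------------------------------
 * Editor's illustration (not part of proc.c): a minimal, self-contained
 * sketch of the TLS-caching hazard described in the comment above and of
 * why runtime_g()/runtime_m() are noinline.  The names tls_g, getg and
 * might_reschedule are hypothetical; only the pattern matters.  Because
 * getg cannot be inlined, its TLS lookup is redone after any call that may
 * have moved us to another OS thread, instead of reusing a cached address.
 * ----------------------------------------------------------------------- */

static __thread void *tls_g;

static void *getg(void) __attribute__ ((noinline));

static void *
getg(void)
{
	return tls_g;
}

extern void might_reschedule(void);	/* may resume on a different thread */

static void
tls_example(void)
{
	void *before = getg();	/* value for the thread we run on now */

	might_reschedule();

	void *after = getg();	/* re-read through the call; a cached
				   &tls_g here could belong to the old thread */
	(void) before;
	(void) after;
}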
170 // Start a new thread.
171 static void
172 runtime_newosproc(M *mp)
174 pthread_attr_t attr;
175 sigset_t clear, old;
176 pthread_t tid;
177 int ret;
179 if(pthread_attr_init(&attr) != 0)
180 runtime_throw("pthread_attr_init");
181 if(pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0)
182 runtime_throw("pthread_attr_setdetachstate");
184 // Block signals during pthread_create so that the new thread
185 // starts with signals disabled. It will enable them in minit.
186 sigfillset(&clear);
188 #ifdef SIGTRAP
189 // Blocking SIGTRAP reportedly breaks gdb on Alpha GNU/Linux.
190 sigdelset(&clear, SIGTRAP);
191 #endif
193 sigemptyset(&old);
194 pthread_sigmask(SIG_BLOCK, &clear, &old);
195 ret = pthread_create(&tid, &attr, runtime_mstart, mp);
196 pthread_sigmask(SIG_SETMASK, &old, nil);
198 if (ret != 0)
199 runtime_throw("pthread_create");
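/* -----------------------------------------------------------------------
 * Editor's illustration (not part of proc.c): a standalone sketch of the
 * "block signals around pthread_create" pattern used by runtime_newosproc
 * above, so the child starts with signals masked and can unblock them once
 * it has finished its own setup (as minit does).  It is simplified: the
 * SIGTRAP special case above is omitted, and the thread body is trivial.
 * ----------------------------------------------------------------------- */

#include <pthread.h>
#include <signal.h>
#include <stddef.h>

static void *
blocked_thread_body(void *arg)
{
	/* The new thread inherits the creator's (fully blocked) mask and
	   would unblock the signals it wants once it is ready. */
	(void) arg;
	return NULL;
}

static int
spawn_with_signals_blocked(void)
{
	sigset_t all, old;
	pthread_t tid;
	int ret;

	sigfillset(&all);
	pthread_sigmask(SIG_BLOCK, &all, &old);		/* mask everything */
	ret = pthread_create(&tid, NULL, blocked_thread_body, NULL);
	pthread_sigmask(SIG_SETMASK, &old, NULL);	/* restore our mask */
	if (ret == 0)
		pthread_join(tid, NULL);
	return ret;
}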
202 // First function run by a new goroutine. This replaces gogocall.
203 static void
204 kickoff(void)
206 void (*fn)(void*);
208 if(g->traceback != nil)
209 gtraceback(g);
211 fn = (void (*)(void*))(g->entry);
212 fn(g->param);
213 runtime_goexit();
216 // Switch context to a different goroutine. This is like longjmp.
217 void runtime_gogo(G*) __attribute__ ((noinline));
218 void
219 runtime_gogo(G* newg)
221 #ifdef USING_SPLIT_STACK
222 __splitstack_setcontext(&newg->stack_context[0]);
223 #endif
224 g = newg;
225 newg->fromgogo = true;
226 fixcontext(&newg->context);
227 setcontext(&newg->context);
228 runtime_throw("gogo setcontext returned");
231 // Save context and call fn passing g as a parameter. This is like
232 // setjmp. Because getcontext always returns 0, unlike setjmp, we use
233 // g->fromgogo as a code. It will be true if we got here via
234 // setcontext. g == nil the first time this is called in a new m.
235 void runtime_mcall(void (*)(G*)) __attribute__ ((noinline));
236 void
237 runtime_mcall(void (*pfn)(G*))
239 M *mp;
240 G *gp;
242 // Ensure that all registers are on the stack for the garbage
243 // collector.
244 __builtin_unwind_init();
246 mp = m;
247 gp = g;
248 if(gp == mp->g0)
249 runtime_throw("runtime: mcall called on m->g0 stack");
251 if(gp != nil) {
253 #ifdef USING_SPLIT_STACK
254 __splitstack_getcontext(&g->stack_context[0]);
255 #else
256 gp->gcnext_sp = &pfn;
257 #endif
258 gp->fromgogo = false;
259 getcontext(&gp->context);
261 // When we return from getcontext, we may be running
262 // in a new thread. That means that m and g may have
263 // changed. They are global variables so we will
264 // reload them, but the addresses of m and g may be
265 // cached in our local stack frame, and those
266 // addresses may be wrong. Call functions to reload
267 // the values for this thread.
268 mp = runtime_m();
269 gp = runtime_g();
271 if(gp->traceback != nil)
272 gtraceback(gp);
274 if (gp == nil || !gp->fromgogo) {
275 #ifdef USING_SPLIT_STACK
276 __splitstack_setcontext(&mp->g0->stack_context[0]);
277 #endif
278 mp->g0->entry = (byte*)pfn;
279 mp->g0->param = gp;
281 // It's OK to set g directly here because this case
282 // can not occur if we got here via a setcontext to
283 // the getcontext call just above.
284 g = mp->g0;
286 fixcontext(&mp->g0->context);
287 setcontext(&mp->g0->context);
288 runtime_throw("runtime: mcall function returned");
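/* -----------------------------------------------------------------------
 * Editor's illustration (not part of proc.c): the getcontext/setcontext
 * idiom that runtime_mcall and runtime_gogo rely on.  getcontext always
 * "returns success", so a flag (playing the role of g->fromgogo) is needed
 * to tell the original return apart from the one reached via setcontext.
 * ----------------------------------------------------------------------- */

#include <stdio.h>
#include <ucontext.h>

static volatile int came_back;	/* analogous to g->fromgogo */

static void
context_roundtrip(void)
{
	ucontext_t c;

	came_back = 0;
	getcontext(&c);
	if(!came_back) {
		came_back = 1;
		printf("plain return from getcontext\n");
		setcontext(&c);		/* "longjmp" to the getcontext above */
		/* not reached */
	}
	printf("returned a second time, via setcontext\n");
}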
292 // Goroutine scheduler
293 // The scheduler's job is to distribute ready-to-run goroutines over worker threads.
295 // The main concepts are:
296 // G - goroutine.
297 // M - worker thread, or machine.
298 // P - processor, a resource that is required to execute Go code.
299 // An M must have an associated P to execute Go code; however, it can be
300 // blocked or in a syscall without an associated P.
302 // Design doc at http://golang.org/s/go11sched.
304 typedef struct Sched Sched;
305 struct Sched {
306 Lock;
308 uint64 goidgen;
309 M* midle; // idle m's waiting for work
310 int32 nmidle; // number of idle m's waiting for work
311 int32 nmidlelocked; // number of locked m's waiting for work
312 int32 mcount; // number of m's that have been created
313 int32 maxmcount; // maximum number of m's allowed (or die)
315 P* pidle; // idle P's
316 uint32 npidle;
317 uint32 nmspinning;
319 // Global runnable queue.
320 G* runqhead;
321 G* runqtail;
322 int32 runqsize;
324 // Global cache of dead G's.
325 Lock gflock;
326 G* gfree;
328 uint32 gcwaiting; // gc is waiting to run
329 int32 stopwait;
330 Note stopnote;
331 uint32 sysmonwait;
332 Note sysmonnote;
333 uint64 lastpoll;
335 int32 profilehz; // cpu profiling rate
338 enum
340 // The max value of GOMAXPROCS.
341 // There are no fundamental restrictions on the value.
342 MaxGomaxprocs = 1<<8,
344 // Number of goroutine ids to grab from runtime_sched.goidgen to local per-P cache at once.
345 // 16 seems to provide enough amortization, but other than that it's a mostly arbitrary number.
346 GoidCacheBatch = 16,
349 Sched runtime_sched;
350 int32 runtime_gomaxprocs;
351 uint32 runtime_needextram = 1;
352 bool runtime_iscgo = true;
353 M runtime_m0;
354 G runtime_g0; // idle goroutine for m0
355 G* runtime_lastg;
356 M* runtime_allm;
357 P** runtime_allp;
358 M* runtime_extram;
359 int8* runtime_goos;
360 int32 runtime_ncpu;
361 bool runtime_precisestack;
362 static int32 newprocs;
364 static Lock allglock; // the following vars are protected by this lock or by stoptheworld
365 G** runtime_allg;
366 uintptr runtime_allglen;
367 static uintptr allgcap;
369 void* runtime_mstart(void*);
370 static void runqput(P*, G*);
371 static G* runqget(P*);
372 static bool runqputslow(P*, G*, uint32, uint32);
373 static G* runqsteal(P*, P*);
374 static void mput(M*);
375 static M* mget(void);
376 static void mcommoninit(M*);
377 static void schedule(void);
378 static void procresize(int32);
379 static void acquirep(P*);
380 static P* releasep(void);
381 static void newm(void(*)(void), P*);
382 static void stopm(void);
383 static void startm(P*, bool);
384 static void handoffp(P*);
385 static void wakep(void);
386 static void stoplockedm(void);
387 static void startlockedm(G*);
388 static void sysmon(void);
389 static uint32 retake(int64);
390 static void incidlelocked(int32);
391 static void checkdead(void);
392 static void exitsyscall0(G*);
393 static void park0(G*);
394 static void goexit0(G*);
395 static void gfput(P*, G*);
396 static G* gfget(P*);
397 static void gfpurge(P*);
398 static void globrunqput(G*);
399 static void globrunqputbatch(G*, G*, int32);
400 static G* globrunqget(P*, int32);
401 static P* pidleget(void);
402 static void pidleput(P*);
403 static void injectglist(G*);
404 static bool preemptall(void);
405 static bool exitsyscallfast(void);
406 static void allgadd(G*);
408 // The bootstrap sequence is:
410 // call osinit
411 // call schedinit
412 // make & queue new G
413 // call runtime_mstart
415 // The new G calls runtime_main.
416 void
417 runtime_schedinit(void)
419 int32 n, procs;
420 const byte *p;
421 Eface i;
423 m = &runtime_m0;
424 g = &runtime_g0;
425 m->g0 = g;
426 m->curg = g;
427 g->m = m;
429 initcontext();
431 runtime_sched.maxmcount = 10000;
432 runtime_precisestack = 0;
434 // runtime_symtabinit();
435 runtime_mallocinit();
436 mcommoninit(m);
438 // Initialize the itable value for newErrorCString,
439 // so that the next time it gets called, possibly
440 // in a fault during a garbage collection, it will not
441 // need to allocate memory.
442 runtime_newErrorCString(0, &i);
444 // Initialize the cached gotraceback value, since
445 // gotraceback calls getenv, which mallocs on Plan 9.
446 runtime_gotraceback(nil);
448 runtime_goargs();
449 runtime_goenvs();
450 runtime_parsedebugvars();
452 runtime_sched.lastpoll = runtime_nanotime();
453 procs = 1;
454 p = runtime_getenv("GOMAXPROCS");
455 if(p != nil && (n = runtime_atoi(p)) > 0) {
456 if(n > MaxGomaxprocs)
457 n = MaxGomaxprocs;
458 procs = n;
460 runtime_allp = runtime_malloc((MaxGomaxprocs+1)*sizeof(runtime_allp[0]));
461 procresize(procs);
463 // Can not enable GC until all roots are registered.
464 // mstats.enablegc = 1;
466 // if(raceenabled)
467 // g->racectx = runtime_raceinit();
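/* -----------------------------------------------------------------------
 * Editor's illustration (not part of proc.c): the GOMAXPROCS handling in
 * runtime_schedinit above, restated with the plain C library instead of
 * the runtime's getenv/atoi wrappers.  MAXPROCS_CAP stands in for the
 * MaxGomaxprocs enumerator defined earlier in this file.
 * ----------------------------------------------------------------------- */

#include <stdlib.h>

static int
procs_from_env(void)
{
	enum { MAXPROCS_CAP = 1 << 8 };		/* mirrors MaxGomaxprocs */
	const char *p;
	int n, procs;

	procs = 1;				/* default: a single P */
	p = getenv("GOMAXPROCS");
	if(p != NULL && (n = atoi(p)) > 0) {
		if(n > MAXPROCS_CAP)		/* clamp to the cap */
			n = MAXPROCS_CAP;
		procs = n;
	}
	return procs;
}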
470 extern void main_init(void) __asm__ (GOSYM_PREFIX "__go_init_main");
471 extern void main_main(void) __asm__ (GOSYM_PREFIX "main.main");
473 static void
474 initDone(void *arg __attribute__ ((unused))) {
475 runtime_unlockOSThread();
478 // The main goroutine.
479 // Note: C frames in general are not copyable during stack growth, for two reasons:
480 // 1) We don't know where in a frame to find pointers to other stack locations.
481 // 2) There's no guarantee that globals or heap values do not point into the frame.
483 // The C frame for runtime.main is copyable, because:
484 // 1) There are no pointers to other stack locations in the frame
485 // (d.fn points at a global, d.link is nil, d.argp is -1).
486 // 2) The only pointer into this frame is from the defer chain,
487 // which is explicitly handled during stack copying.
488 void
489 runtime_main(void* dummy __attribute__((unused)))
491 Defer d;
492 _Bool frame;
494 newm(sysmon, nil);
496 // Lock the main goroutine onto this, the main OS thread,
497 // during initialization. Most programs won't care, but a few
498 // do require certain calls to be made by the main thread.
499 // Those can arrange for main.main to run in the main thread
500 // by calling runtime.LockOSThread during initialization
501 // to preserve the lock.
502 runtime_lockOSThread();
504 // Defer unlock so that runtime.Goexit during init does the unlock too.
505 d.__pfn = initDone;
506 d.__next = g->defer;
507 d.__arg = (void*)-1;
508 d.__panic = g->panic;
509 d.__retaddr = nil;
510 d.__makefunc_can_recover = 0;
511 d.__frame = &frame;
512 d.__special = true;
513 g->defer = &d;
515 if(m != &runtime_m0)
516 runtime_throw("runtime_main not on m0");
517 __go_go(runtime_MHeap_Scavenger, nil);
518 main_init();
520 if(g->defer != &d || d.__pfn != initDone)
521 runtime_throw("runtime: bad defer entry after init");
522 g->defer = d.__next;
523 runtime_unlockOSThread();
525 // For gccgo we have to wait until after main is initialized
526 // to enable GC, because initializing main registers the GC
527 // roots.
528 mstats.enablegc = 1;
530 main_main();
531 if(raceenabled)
532 runtime_racefini();
534 // Make racy client program work: if panicking on
535 // another goroutine at the same time as main returns,
536 // let the other goroutine finish printing the panic trace.
537 // Once it does, it will exit. See issue 3934.
538 if(runtime_panicking)
539 runtime_park(nil, nil, "panicwait");
541 runtime_exit(0);
542 for(;;)
543 *(int32*)0 = 0;
546 void
547 runtime_goroutineheader(G *gp)
549 const char *status;
550 int64 waitfor;
552 switch(gp->status) {
553 case Gidle:
554 status = "idle";
555 break;
556 case Grunnable:
557 status = "runnable";
558 break;
559 case Grunning:
560 status = "running";
561 break;
562 case Gsyscall:
563 status = "syscall";
564 break;
565 case Gwaiting:
566 if(gp->waitreason)
567 status = gp->waitreason;
568 else
569 status = "waiting";
570 break;
571 default:
572 status = "???";
573 break;
576 // approx time the G is blocked, in minutes
577 waitfor = 0;
578 if((gp->status == Gwaiting || gp->status == Gsyscall) && gp->waitsince != 0)
579 waitfor = (runtime_nanotime() - gp->waitsince) / (60LL*1000*1000*1000);
581 if(waitfor < 1)
582 runtime_printf("goroutine %D [%s]:\n", gp->goid, status);
583 else
584 runtime_printf("goroutine %D [%s, %D minutes]:\n", gp->goid, status, waitfor);
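/* -----------------------------------------------------------------------
 * Editor's illustration (not part of proc.c): the waitfor computation in
 * runtime_goroutineheader above as a tiny standalone helper -- whole
 * minutes between two nanosecond timestamps (1 minute = 60 * 10^9 ns).
 * ----------------------------------------------------------------------- */

#include <stdint.h>

static int64_t
minutes_blocked(int64_t now_ns, int64_t waitsince_ns)
{
	if(waitsince_ns == 0 || now_ns < waitsince_ns)
		return 0;
	return (now_ns - waitsince_ns) / (60LL*1000*1000*1000);
}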
587 void
588 runtime_printcreatedby(G *g)
590 if(g != nil && g->gopc != 0 && g->goid != 1) {
591 String fn;
592 String file;
593 intgo line;
595 if(__go_file_line(g->gopc - 1, &fn, &file, &line)) {
596 runtime_printf("created by %S\n", fn);
597 runtime_printf("\t%S:%D\n", file, (int64) line);
602 struct Traceback
604 G* gp;
605 Location locbuf[TracebackMaxFrames];
606 int32 c;
609 void
610 runtime_tracebackothers(G * volatile me)
612 G * volatile gp;
613 Traceback tb;
614 int32 traceback;
615 volatile uintptr i;
617 tb.gp = me;
618 traceback = runtime_gotraceback(nil);
620 // Show the current goroutine first, if we haven't already.
621 if((gp = m->curg) != nil && gp != me) {
622 runtime_printf("\n");
623 runtime_goroutineheader(gp);
624 gp->traceback = &tb;
626 #ifdef USING_SPLIT_STACK
627 __splitstack_getcontext(&me->stack_context[0]);
628 #endif
629 getcontext(&me->context);
631 if(gp->traceback != nil) {
632 runtime_gogo(gp);
635 runtime_printtrace(tb.locbuf, tb.c, false);
636 runtime_printcreatedby(gp);
639 runtime_lock(&allglock);
640 for(i = 0; i < runtime_allglen; i++) {
641 gp = runtime_allg[i];
642 if(gp == me || gp == m->curg || gp->status == Gdead)
643 continue;
644 if(gp->issystem && traceback < 2)
645 continue;
646 runtime_printf("\n");
647 runtime_goroutineheader(gp);
649 // Our only mechanism for doing a stack trace is
650 // _Unwind_Backtrace. And that only works for the
651 // current thread, not for other random goroutines.
652 // So we need to switch context to the goroutine, get
653 // the backtrace, and then switch back.
655 // This means that if g is running or in a syscall, we
656 // can't reliably print a stack trace. FIXME.
658 if(gp->status == Grunning) {
659 runtime_printf("\tgoroutine running on other thread; stack unavailable\n");
660 runtime_printcreatedby(gp);
661 } else if(gp->status == Gsyscall) {
662 runtime_printf("\tgoroutine in C code; stack unavailable\n");
663 runtime_printcreatedby(gp);
664 } else {
665 gp->traceback = &tb;
667 #ifdef USING_SPLIT_STACK
668 __splitstack_getcontext(&me->stack_context[0]);
669 #endif
670 getcontext(&me->context);
672 if(gp->traceback != nil) {
673 runtime_gogo(gp);
676 runtime_printtrace(tb.locbuf, tb.c, false);
677 runtime_printcreatedby(gp);
680 runtime_unlock(&allglock);
683 static void
684 checkmcount(void)
686 // sched lock is held
687 if(runtime_sched.mcount > runtime_sched.maxmcount) {
688 runtime_printf("runtime: program exceeds %d-thread limit\n", runtime_sched.maxmcount);
689 runtime_throw("thread exhaustion");
693 // Do a stack trace of gp, and then restore the context to
694 // gp->dotraceback.
696 static void
697 gtraceback(G* gp)
699 Traceback* traceback;
701 traceback = gp->traceback;
702 gp->traceback = nil;
703 traceback->c = runtime_callers(1, traceback->locbuf,
704 sizeof traceback->locbuf / sizeof traceback->locbuf[0], false);
705 runtime_gogo(traceback->gp);
708 static void
709 mcommoninit(M *mp)
711 // If there is no mcache runtime_callers() will crash,
712 // and we are most likely in the sysmon thread, so the stack is senseless anyway.
713 if(m->mcache)
714 runtime_callers(1, mp->createstack, nelem(mp->createstack), false);
716 mp->fastrand = 0x49f6428aUL + mp->id + runtime_cputicks();
718 runtime_lock(&runtime_sched);
719 mp->id = runtime_sched.mcount++;
720 checkmcount();
721 runtime_mpreinit(mp);
723 // Add to runtime_allm so garbage collector doesn't free m
724 // when it is just in a register or thread-local storage.
725 mp->alllink = runtime_allm;
726 // runtime_NumCgoCall() iterates over allm w/o schedlock,
727 // so we need to publish it safely.
728 runtime_atomicstorep(&runtime_allm, mp);
729 runtime_unlock(&runtime_sched);
732 // Mark gp ready to run.
733 void
734 runtime_ready(G *gp)
736 // Mark runnable.
737 m->locks++; // disable preemption because it can be holding p in a local var
738 if(gp->status != Gwaiting) {
739 runtime_printf("goroutine %D has status %d\n", gp->goid, gp->status);
740 runtime_throw("bad g->status in ready");
742 gp->status = Grunnable;
743 runqput(m->p, gp);
744 if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0) // TODO: fast atomic
745 wakep();
746 m->locks--;
749 int32
750 runtime_gcprocs(void)
752 int32 n;
754 // Figure out how many CPUs to use during GC.
755 // Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
756 runtime_lock(&runtime_sched);
757 n = runtime_gomaxprocs;
758 if(n > runtime_ncpu)
759 n = runtime_ncpu > 0 ? runtime_ncpu : 1;
760 if(n > MaxGcproc)
761 n = MaxGcproc;
762 if(n > runtime_sched.nmidle+1) // one M is currently running
763 n = runtime_sched.nmidle+1;
764 runtime_unlock(&runtime_sched);
765 return n;
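/* -----------------------------------------------------------------------
 * Editor's illustration (not part of proc.c): the clamping performed by
 * runtime_gcprocs above, restated as a pure function.  GCPROC_CAP is a
 * stand-in for MaxGcproc, which is defined elsewhere in the runtime; the
 * value used here is only an assumption for the sketch.
 * ----------------------------------------------------------------------- */

#include <stdint.h>

static int32_t
gcprocs_limit(int32_t gomaxprocs, int32_t ncpu, int32_t nmidle)
{
	enum { GCPROC_CAP = 32 };	/* assumed value of MaxGcproc */
	int32_t n = gomaxprocs;

	if(n > ncpu)
		n = ncpu > 0 ? ncpu : 1;
	if(n > GCPROC_CAP)
		n = GCPROC_CAP;
	if(n > nmidle + 1)		/* one M is currently running */
		n = nmidle + 1;
	return n;
}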
768 static bool
769 needaddgcproc(void)
771 int32 n;
773 runtime_lock(&runtime_sched);
774 n = runtime_gomaxprocs;
775 if(n > runtime_ncpu)
776 n = runtime_ncpu;
777 if(n > MaxGcproc)
778 n = MaxGcproc;
779 n -= runtime_sched.nmidle+1; // one M is currently running
780 runtime_unlock(&runtime_sched);
781 return n > 0;
784 void
785 runtime_helpgc(int32 nproc)
787 M *mp;
788 int32 n, pos;
790 runtime_lock(&runtime_sched);
791 pos = 0;
792 for(n = 1; n < nproc; n++) { // one M is currently running
793 if(runtime_allp[pos]->mcache == m->mcache)
794 pos++;
795 mp = mget();
796 if(mp == nil)
797 runtime_throw("runtime_gcprocs inconsistency");
798 mp->helpgc = n;
799 mp->mcache = runtime_allp[pos]->mcache;
800 pos++;
801 runtime_notewakeup(&mp->park);
803 runtime_unlock(&runtime_sched);
806 // Similar to stoptheworld but best-effort and can be called several times.
807 // There is no reverse operation; it is used during crashing.
808 // This function must not lock any mutexes.
809 void
810 runtime_freezetheworld(void)
812 int32 i;
814 if(runtime_gomaxprocs == 1)
815 return;
816 // stopwait and preemption requests can be lost
817 // due to races with concurrently executing threads,
818 // so try several times
819 for(i = 0; i < 5; i++) {
820 // this should tell the scheduler to not start any new goroutines
821 runtime_sched.stopwait = 0x7fffffff;
822 runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
823 // this should stop running goroutines
824 if(!preemptall())
825 break; // no running goroutines
826 runtime_usleep(1000);
828 // to be sure
829 runtime_usleep(1000);
830 preemptall();
831 runtime_usleep(1000);
834 void
835 runtime_stoptheworld(void)
837 int32 i;
838 uint32 s;
839 P *p;
840 bool wait;
842 runtime_lock(&runtime_sched);
843 runtime_sched.stopwait = runtime_gomaxprocs;
844 runtime_atomicstore((uint32*)&runtime_sched.gcwaiting, 1);
845 preemptall();
846 // stop current P
847 m->p->status = Pgcstop;
848 runtime_sched.stopwait--;
849 // try to retake all P's in Psyscall status
850 for(i = 0; i < runtime_gomaxprocs; i++) {
851 p = runtime_allp[i];
852 s = p->status;
853 if(s == Psyscall && runtime_cas(&p->status, s, Pgcstop))
854 runtime_sched.stopwait--;
856 // stop idle P's
857 while((p = pidleget()) != nil) {
858 p->status = Pgcstop;
859 runtime_sched.stopwait--;
861 wait = runtime_sched.stopwait > 0;
862 runtime_unlock(&runtime_sched);
864 // wait for remaining P's to stop voluntarily
865 if(wait) {
866 runtime_notesleep(&runtime_sched.stopnote);
867 runtime_noteclear(&runtime_sched.stopnote);
869 if(runtime_sched.stopwait)
870 runtime_throw("stoptheworld: not stopped");
871 for(i = 0; i < runtime_gomaxprocs; i++) {
872 p = runtime_allp[i];
873 if(p->status != Pgcstop)
874 runtime_throw("stoptheworld: not stopped");
878 static void
879 mhelpgc(void)
881 m->helpgc = -1;
884 void
885 runtime_starttheworld(void)
887 P *p, *p1;
888 M *mp;
889 G *gp;
890 bool add;
892 m->locks++; // disable preemption because it can be holding p in a local var
893 gp = runtime_netpoll(false); // non-blocking
894 injectglist(gp);
895 add = needaddgcproc();
896 runtime_lock(&runtime_sched);
897 if(newprocs) {
898 procresize(newprocs);
899 newprocs = 0;
900 } else
901 procresize(runtime_gomaxprocs);
902 runtime_sched.gcwaiting = 0;
904 p1 = nil;
905 while((p = pidleget()) != nil) {
906 // procresize() puts p's with work at the beginning of the list.
907 // Once we reach a p without a run queue, the rest don't have one either.
908 if(p->runqhead == p->runqtail) {
909 pidleput(p);
910 break;
912 p->m = mget();
913 p->link = p1;
914 p1 = p;
916 if(runtime_sched.sysmonwait) {
917 runtime_sched.sysmonwait = false;
918 runtime_notewakeup(&runtime_sched.sysmonnote);
920 runtime_unlock(&runtime_sched);
922 while(p1) {
923 p = p1;
924 p1 = p1->link;
925 if(p->m) {
926 mp = p->m;
927 p->m = nil;
928 if(mp->nextp)
929 runtime_throw("starttheworld: inconsistent mp->nextp");
930 mp->nextp = p;
931 runtime_notewakeup(&mp->park);
932 } else {
933 // Start M to run P. Do not start another M below.
934 newm(nil, p);
935 add = false;
939 if(add) {
940 // If GC could have used another helper proc, start one now,
941 // in the hope that it will be available next time.
942 // It would have been even better to start it before the collection,
943 // but doing so requires allocating memory, so it's tricky to
944 // coordinate. This lazy approach works out in practice:
945 // we don't mind if the first couple gc rounds don't have quite
946 // the maximum number of procs.
947 newm(mhelpgc, nil);
949 m->locks--;
952 // Called to start an M.
953 void*
954 runtime_mstart(void* mp)
956 m = (M*)mp;
957 g = m->g0;
959 initcontext();
961 g->entry = nil;
962 g->param = nil;
964 // Record top of stack for use by mcall.
965 // Once we call schedule we're never coming back,
966 // so other calls can reuse this stack space.
967 #ifdef USING_SPLIT_STACK
968 __splitstack_getcontext(&g->stack_context[0]);
969 #else
970 g->gcinitial_sp = &mp;
971 // Setting gcstack_size to 0 is a marker meaning that gcinitial_sp
972 // is the top of the stack, not the bottom.
973 g->gcstack_size = 0;
974 g->gcnext_sp = &mp;
975 #endif
976 getcontext(&g->context);
978 if(g->entry != nil) {
979 // Got here from mcall.
980 void (*pfn)(G*) = (void (*)(G*))g->entry;
981 G* gp = (G*)g->param;
982 pfn(gp);
983 *(int*)0x21 = 0x21;
985 runtime_minit();
987 #ifdef USING_SPLIT_STACK
989 int dont_block_signals = 0;
990 __splitstack_block_signals(&dont_block_signals, nil);
992 #endif
994 // Install signal handlers; after minit so that minit can
995 // prepare the thread to be able to handle the signals.
996 if(m == &runtime_m0)
997 runtime_initsig();
999 if(m->mstartfn)
1000 m->mstartfn();
1002 if(m->helpgc) {
1003 m->helpgc = 0;
1004 stopm();
1005 } else if(m != &runtime_m0) {
1006 acquirep(m->nextp);
1007 m->nextp = nil;
1009 schedule();
1011 // TODO(brainman): This point is never reached, because the scheduler
1012 // does not release OS threads at the moment. But once this path
1013 // is enabled, we must remove our seh here.
1015 return nil;
1018 typedef struct CgoThreadStart CgoThreadStart;
1019 struct CgoThreadStart
1021 M *m;
1022 G *g;
1023 uintptr *tls;
1024 void (*fn)(void);
1027 // Allocate a new m unassociated with any thread.
1028 // Can use p for allocation context if needed.
1030 runtime_allocm(P *p, int32 stacksize, byte** ret_g0_stack, size_t* ret_g0_stacksize)
1032 M *mp;
1034 m->locks++; // disable GC because it can be called from sysmon
1035 if(m->p == nil)
1036 acquirep(p); // temporarily borrow p for mallocs in this function
1037 #if 0
1038 if(mtype == nil) {
1039 Eface e;
1040 runtime_gc_m_ptr(&e);
1041 mtype = ((const PtrType*)e.__type_descriptor)->__element_type;
1043 #endif
1045 mp = runtime_mal(sizeof *mp);
1046 mcommoninit(mp);
1047 mp->g0 = runtime_malg(stacksize, ret_g0_stack, ret_g0_stacksize);
1049 if(p == m->p)
1050 releasep();
1051 m->locks--;
1053 return mp;
1056 static G*
1057 allocg(void)
1059 G *gp;
1060 // static Type *gtype;
1062 // if(gtype == nil) {
1063 // Eface e;
1064 // runtime_gc_g_ptr(&e);
1065 // gtype = ((PtrType*)e.__type_descriptor)->__element_type;
1066 // }
1067 // gp = runtime_cnew(gtype);
1068 gp = runtime_malloc(sizeof(G));
1069 return gp;
1072 static M* lockextra(bool nilokay);
1073 static void unlockextra(M*);
1075 // needm is called when a cgo callback happens on a
1076 // thread without an m (a thread not created by Go).
1077 // In this case, needm is expected to find an m to use
1078 // and return with m, g initialized correctly.
1079 // Since m and g are not set now (likely nil, but see below)
1080 // needm is limited in what routines it can call. In particular
1081 // it can only call nosplit functions (textflag 7) and cannot
1082 // do any scheduling that requires an m.
1084 // In order to avoid needing heavy lifting here, we adopt
1085 // the following strategy: there is a stack of available m's
1086 // that can be stolen. Using compare-and-swap
1087 // to pop from the stack has ABA races, so we simulate
1088 // a lock by doing an exchange (via casp) to steal the stack
1089 // head and replace the top pointer with MLOCKED (1).
1090 // This serves as a simple spin lock that we can use even
1091 // without an m. The thread that locks the stack in this way
1092 // unlocks the stack by storing a valid stack head pointer.
1094 // In order to make sure that there is always an m structure
1095 // available to be stolen, we maintain the invariant that there
1096 // is always one more than needed. At the beginning of the
1097 // program (if cgo is in use) the list is seeded with a single m.
1098 // If needm finds that it has taken the last m off the list, its job
1099 // is - once it has installed its own m so that it can do things like
1100 // allocate memory - to create a spare m and put it on the list.
1102 // Each of these extra m's also has a g0 and a curg that are
1103 // pressed into service as the scheduling stack and current
1104 // goroutine for the duration of the cgo callback.
1106 // When the callback is done with the m, it calls dropm to
1107 // put the m back on the list.
1109 // Unlike the gc toolchain, we start running on curg, since we are
1110 // just going to return and let the caller continue.
1111 void
1112 runtime_needm(void)
1114 M *mp;
1116 if(runtime_needextram) {
1117 // Can happen if C/C++ code calls Go from a global ctor.
1118 // Can not throw, because scheduler is not initialized yet.
1119 int rv __attribute__((unused));
1120 rv = runtime_write(2, "fatal error: cgo callback before cgo call\n",
1121 sizeof("fatal error: cgo callback before cgo call\n")-1);
1122 runtime_exit(1);
1125 // Lock extra list, take head, unlock popped list.
1126 // nilokay=false is safe here because of the invariant above,
1127 // that the extra list always contains or will soon contain
1128 // at least one m.
1129 mp = lockextra(false);
1131 // Set needextram when we've just emptied the list,
1132 // so that the eventual call into cgocallbackg will
1133 // allocate a new m for the extra list. We delay the
1134 // allocation until then so that it can be done
1135 // after exitsyscall makes sure it is okay to be
1136 // running at all (that is, there's no garbage collection
1137 // running right now).
1138 mp->needextram = mp->schedlink == nil;
1139 unlockextra(mp->schedlink);
1141 // Install m and g (= m->curg).
1142 runtime_setmg(mp, mp->curg);
1144 // Initialize g's context as in mstart.
1145 initcontext();
1146 g->status = Gsyscall;
1147 g->entry = nil;
1148 g->param = nil;
1149 #ifdef USING_SPLIT_STACK
1150 __splitstack_getcontext(&g->stack_context[0]);
1151 #else
1152 g->gcinitial_sp = &mp;
1153 g->gcstack = nil;
1154 g->gcstack_size = 0;
1155 g->gcnext_sp = &mp;
1156 #endif
1157 getcontext(&g->context);
1159 if(g->entry != nil) {
1160 // Got here from mcall.
1161 void (*pfn)(G*) = (void (*)(G*))g->entry;
1162 G* gp = (G*)g->param;
1163 pfn(gp);
1164 *(int*)0x22 = 0x22;
1167 // Initialize this thread to use the m.
1168 runtime_minit();
1170 #ifdef USING_SPLIT_STACK
1172 int dont_block_signals = 0;
1173 __splitstack_block_signals(&dont_block_signals, nil);
1175 #endif
1178 // newextram allocates an m and puts it on the extra list.
1179 // It is called with a working local m, so that it can do things
1180 // like call schedlock and allocate.
1181 void
1182 runtime_newextram(void)
1184 M *mp, *mnext;
1185 G *gp;
1186 byte *g0_sp, *sp;
1187 size_t g0_spsize, spsize;
1189 // Create extra goroutine locked to extra m.
1190 // The goroutine is the context in which the cgo callback will run.
1191 // The sched.pc will never be returned to, but setting it to
1192 // runtime.goexit makes clear to the traceback routines where
1193 // the goroutine stack ends.
1194 mp = runtime_allocm(nil, StackMin, &g0_sp, &g0_spsize);
1195 gp = runtime_malg(StackMin, &sp, &spsize);
1196 gp->status = Gdead;
1197 mp->curg = gp;
1198 mp->locked = LockInternal;
1199 mp->lockedg = gp;
1200 gp->lockedm = mp;
1201 gp->goid = runtime_xadd64(&runtime_sched.goidgen, 1);
1202 // put on allg for garbage collector
1203 allgadd(gp);
1205 // The context for gp will be set up in runtime_needm. But
1206 // here we need to set up the context for g0.
1207 getcontext(&mp->g0->context);
1208 mp->g0->context.uc_stack.ss_sp = g0_sp;
1209 mp->g0->context.uc_stack.ss_size = g0_spsize;
1210 makecontext(&mp->g0->context, kickoff, 0);
1212 // Add m to the extra list.
1213 mnext = lockextra(true);
1214 mp->schedlink = mnext;
1215 unlockextra(mp);
1218 // dropm is called when a cgo callback has called needm but is now
1219 // done with the callback and returning back into the non-Go thread.
1220 // It puts the current m back onto the extra list.
1222 // The main expense here is the call to signalstack to release the
1223 // m's signal stack, and then the call to needm on the next callback
1224 // from this thread. It is tempting to try to save the m for next time,
1225 // which would eliminate both these costs, but there might not be
1226 // a next time: the current thread (which Go does not control) might exit.
1227 // If we saved the m for that thread, there would be an m leak each time
1228 // such a thread exited. Instead, we acquire and release an m on each
1229 // call. These should typically not be scheduling operations, just a few
1230 // atomics, so the cost should be small.
1232 // TODO(rsc): An alternative would be to allocate a dummy pthread per-thread
1233 // variable using pthread_key_create. Unlike the pthread keys we already use
1234 // on OS X, this dummy key would never be read by Go code. It would exist
1235 // only so that we could register at thread-exit-time destructor.
1236 // That destructor would put the m back onto the extra list.
1237 // This is purely a performance optimization. The current version,
1238 // in which dropm happens on each cgo call, is still correct too.
1239 // We may have to keep the current version on systems with cgo
1240 // but without pthreads, like Windows.
1241 void
1242 runtime_dropm(void)
1244 M *mp, *mnext;
1246 // Undo whatever initialization minit did during needm.
1247 runtime_unminit();
1249 // Clear m and g, and return m to the extra list.
1250 // After the call to setmg we can only call nosplit functions.
1251 mp = m;
1252 runtime_setmg(nil, nil);
1254 mp->curg->status = Gdead;
1255 mp->curg->gcstack = nil;
1256 mp->curg->gcnext_sp = nil;
1258 mnext = lockextra(true);
1259 mp->schedlink = mnext;
1260 unlockextra(mp);
1263 #define MLOCKED ((M*)1)
1265 // lockextra locks the extra list and returns the list head.
1266 // The caller must unlock the list by storing a new list head
1267 // to runtime.extram. If nilokay is true, then lockextra will
1268 // return a nil list head if that's what it finds. If nilokay is false,
1269 // lockextra will keep waiting until the list head is no longer nil.
1270 static M*
1271 lockextra(bool nilokay)
1273 M *mp;
1274 void (*yield)(void);
1276 for(;;) {
1277 mp = runtime_atomicloadp(&runtime_extram);
1278 if(mp == MLOCKED) {
1279 yield = runtime_osyield;
1280 yield();
1281 continue;
1283 if(mp == nil && !nilokay) {
1284 runtime_usleep(1);
1285 continue;
1287 if(!runtime_casp(&runtime_extram, mp, MLOCKED)) {
1288 yield = runtime_osyield;
1289 yield();
1290 continue;
1292 break;
1294 return mp;
1297 static void
1298 unlockextra(M *mp)
1300 runtime_atomicstorep(&runtime_extram, mp);
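/* -----------------------------------------------------------------------
 * Editor's illustration (not part of proc.c): the "lock a list head with a
 * sentinel pointer" idiom used by lockextra/unlockextra above, rewritten
 * with GCC __atomic builtins instead of the runtime's casp/atomicloadp
 * helpers.  struct node, list_head and LIST_LOCKED are hypothetical
 * stand-ins for M, runtime_extram and MLOCKED.
 * ----------------------------------------------------------------------- */

#include <sched.h>
#include <stdbool.h>
#include <stddef.h>

struct node { struct node *next; };

#define LIST_LOCKED ((struct node*)1)

static struct node *list_head;		/* shared list head */

static struct node *
list_lock(void)
{
	struct node *h;

	for(;;) {
		h = __atomic_load_n(&list_head, __ATOMIC_ACQUIRE);
		if(h == LIST_LOCKED) {		/* someone else holds it */
			sched_yield();
			continue;
		}
		/* Swap in the sentinel; failure means another thread raced us. */
		if(__atomic_compare_exchange_n(&list_head, &h, LIST_LOCKED,
					       false, __ATOMIC_ACQ_REL,
					       __ATOMIC_ACQUIRE))
			return h;	/* we own the list; h is the old head */
		sched_yield();
	}
}

static void
list_unlock(struct node *newhead)
{
	/* Storing a real head (possibly NULL) releases the "lock". */
	__atomic_store_n(&list_head, newhead, __ATOMIC_RELEASE);
}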
1303 static int32
1304 countextra()
1306 M *mp, *mc;
1307 int32 c;
1309 for(;;) {
1310 mp = runtime_atomicloadp(&runtime_extram);
1311 if(mp == MLOCKED) {
1312 runtime_osyield();
1313 continue;
1315 if(!runtime_casp(&runtime_extram, mp, MLOCKED)) {
1316 runtime_osyield();
1317 continue;
1319 c = 0;
1320 for(mc = mp; mc != nil; mc = mc->schedlink)
1321 c++;
1322 runtime_atomicstorep(&runtime_extram, mp);
1323 return c;
1327 // Create a new m. It will start off with a call to fn, or else the scheduler.
1328 static void
1329 newm(void(*fn)(void), P *p)
1331 M *mp;
1333 mp = runtime_allocm(p, -1, nil, nil);
1334 mp->nextp = p;
1335 mp->mstartfn = fn;
1337 runtime_newosproc(mp);
1340 // Stops execution of the current m until new work is available.
1341 // Returns with acquired P.
1342 static void
1343 stopm(void)
1345 if(m->locks)
1346 runtime_throw("stopm holding locks");
1347 if(m->p)
1348 runtime_throw("stopm holding p");
1349 if(m->spinning) {
1350 m->spinning = false;
1351 runtime_xadd(&runtime_sched.nmspinning, -1);
1354 retry:
1355 runtime_lock(&runtime_sched);
1356 mput(m);
1357 runtime_unlock(&runtime_sched);
1358 runtime_notesleep(&m->park);
1359 runtime_noteclear(&m->park);
1360 if(m->helpgc) {
1361 runtime_gchelper();
1362 m->helpgc = 0;
1363 m->mcache = nil;
1364 goto retry;
1366 acquirep(m->nextp);
1367 m->nextp = nil;
1370 static void
1371 mspinning(void)
1373 m->spinning = true;
1376 // Schedules some M to run the p (creates an M if necessary).
1377 // If p==nil, tries to get an idle P, if no idle P's does nothing.
1378 static void
1379 startm(P *p, bool spinning)
1381 M *mp;
1382 void (*fn)(void);
1384 runtime_lock(&runtime_sched);
1385 if(p == nil) {
1386 p = pidleget();
1387 if(p == nil) {
1388 runtime_unlock(&runtime_sched);
1389 if(spinning)
1390 runtime_xadd(&runtime_sched.nmspinning, -1);
1391 return;
1394 mp = mget();
1395 runtime_unlock(&runtime_sched);
1396 if(mp == nil) {
1397 fn = nil;
1398 if(spinning)
1399 fn = mspinning;
1400 newm(fn, p);
1401 return;
1403 if(mp->spinning)
1404 runtime_throw("startm: m is spinning");
1405 if(mp->nextp)
1406 runtime_throw("startm: m has p");
1407 mp->spinning = spinning;
1408 mp->nextp = p;
1409 runtime_notewakeup(&mp->park);
1412 // Hands off P from syscall or locked M.
1413 static void
1414 handoffp(P *p)
1416 // if it has local work, start it straight away
1417 if(p->runqhead != p->runqtail || runtime_sched.runqsize) {
1418 startm(p, false);
1419 return;
1421 // no local work, check that there are no spinning/idle M's,
1422 // otherwise our help is not required
1423 if(runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) == 0 && // TODO: fast atomic
1424 runtime_cas(&runtime_sched.nmspinning, 0, 1)) {
1425 startm(p, true);
1426 return;
1428 runtime_lock(&runtime_sched);
1429 if(runtime_sched.gcwaiting) {
1430 p->status = Pgcstop;
1431 if(--runtime_sched.stopwait == 0)
1432 runtime_notewakeup(&runtime_sched.stopnote);
1433 runtime_unlock(&runtime_sched);
1434 return;
1436 if(runtime_sched.runqsize) {
1437 runtime_unlock(&runtime_sched);
1438 startm(p, false);
1439 return;
1441 // If this is the last running P and nobody is polling network,
1442 // need to wakeup another M to poll network.
1443 if(runtime_sched.npidle == (uint32)runtime_gomaxprocs-1 && runtime_atomicload64(&runtime_sched.lastpoll) != 0) {
1444 runtime_unlock(&runtime_sched);
1445 startm(p, false);
1446 return;
1448 pidleput(p);
1449 runtime_unlock(&runtime_sched);
1452 // Tries to add one more P to execute G's.
1453 // Called when a G is made runnable (newproc, ready).
1454 static void
1455 wakep(void)
1457 // be conservative about spinning threads
1458 if(!runtime_cas(&runtime_sched.nmspinning, 0, 1))
1459 return;
1460 startm(nil, true);
1463 // Stops execution of the current m that is locked to a g until the g is runnable again.
1464 // Returns with acquired P.
1465 static void
1466 stoplockedm(void)
1468 P *p;
1470 if(m->lockedg == nil || m->lockedg->lockedm != m)
1471 runtime_throw("stoplockedm: inconsistent locking");
1472 if(m->p) {
1473 // Schedule another M to run this p.
1474 p = releasep();
1475 handoffp(p);
1477 incidlelocked(1);
1478 // Wait until another thread schedules lockedg again.
1479 runtime_notesleep(&m->park);
1480 runtime_noteclear(&m->park);
1481 if(m->lockedg->status != Grunnable)
1482 runtime_throw("stoplockedm: not runnable");
1483 acquirep(m->nextp);
1484 m->nextp = nil;
1487 // Schedules the locked m to run the locked gp.
1488 static void
1489 startlockedm(G *gp)
1491 M *mp;
1492 P *p;
1494 mp = gp->lockedm;
1495 if(mp == m)
1496 runtime_throw("startlockedm: locked to me");
1497 if(mp->nextp)
1498 runtime_throw("startlockedm: m has p");
1499 // directly handoff current P to the locked m
1500 incidlelocked(-1);
1501 p = releasep();
1502 mp->nextp = p;
1503 runtime_notewakeup(&mp->park);
1504 stopm();
1507 // Stops the current m for stoptheworld.
1508 // Returns when the world is restarted.
1509 static void
1510 gcstopm(void)
1512 P *p;
1514 if(!runtime_sched.gcwaiting)
1515 runtime_throw("gcstopm: not waiting for gc");
1516 if(m->spinning) {
1517 m->spinning = false;
1518 runtime_xadd(&runtime_sched.nmspinning, -1);
1520 p = releasep();
1521 runtime_lock(&runtime_sched);
1522 p->status = Pgcstop;
1523 if(--runtime_sched.stopwait == 0)
1524 runtime_notewakeup(&runtime_sched.stopnote);
1525 runtime_unlock(&runtime_sched);
1526 stopm();
1529 // Schedules gp to run on the current M.
1530 // Never returns.
1531 static void
1532 execute(G *gp)
1534 int32 hz;
1536 if(gp->status != Grunnable) {
1537 runtime_printf("execute: bad g status %d\n", gp->status);
1538 runtime_throw("execute: bad g status");
1540 gp->status = Grunning;
1541 gp->waitsince = 0;
1542 m->p->schedtick++;
1543 m->curg = gp;
1544 gp->m = m;
1546 // Check whether the profiler needs to be turned on or off.
1547 hz = runtime_sched.profilehz;
1548 if(m->profilehz != hz)
1549 runtime_resetcpuprofiler(hz);
1551 runtime_gogo(gp);
1554 // Finds a runnable goroutine to execute.
1555 // Tries to steal from other P's, get g from global queue, poll network.
1556 static G*
1557 findrunnable(void)
1559 G *gp;
1560 P *p;
1561 int32 i;
1563 top:
1564 if(runtime_sched.gcwaiting) {
1565 gcstopm();
1566 goto top;
1568 if(runtime_fingwait && runtime_fingwake && (gp = runtime_wakefing()) != nil)
1569 runtime_ready(gp);
1570 // local runq
1571 gp = runqget(m->p);
1572 if(gp)
1573 return gp;
1574 // global runq
1575 if(runtime_sched.runqsize) {
1576 runtime_lock(&runtime_sched);
1577 gp = globrunqget(m->p, 0);
1578 runtime_unlock(&runtime_sched);
1579 if(gp)
1580 return gp;
1582 // poll network
1583 gp = runtime_netpoll(false); // non-blocking
1584 if(gp) {
1585 injectglist(gp->schedlink);
1586 gp->status = Grunnable;
1587 return gp;
1589 // If number of spinning M's >= number of busy P's, block.
1590 // This is necessary to prevent excessive CPU consumption
1591 // when GOMAXPROCS>>1 but the program parallelism is low.
1592 if(!m->spinning && 2 * runtime_atomicload(&runtime_sched.nmspinning) >= runtime_gomaxprocs - runtime_atomicload(&runtime_sched.npidle)) // TODO: fast atomic
1593 goto stop;
1594 if(!m->spinning) {
1595 m->spinning = true;
1596 runtime_xadd(&runtime_sched.nmspinning, 1);
1598 // random steal from other P's
1599 for(i = 0; i < 2*runtime_gomaxprocs; i++) {
1600 if(runtime_sched.gcwaiting)
1601 goto top;
1602 p = runtime_allp[runtime_fastrand1()%runtime_gomaxprocs];
1603 if(p == m->p)
1604 gp = runqget(p);
1605 else
1606 gp = runqsteal(m->p, p);
1607 if(gp)
1608 return gp;
1610 stop:
1611 // return P and block
1612 runtime_lock(&runtime_sched);
1613 if(runtime_sched.gcwaiting) {
1614 runtime_unlock(&runtime_sched);
1615 goto top;
1617 if(runtime_sched.runqsize) {
1618 gp = globrunqget(m->p, 0);
1619 runtime_unlock(&runtime_sched);
1620 return gp;
1622 p = releasep();
1623 pidleput(p);
1624 runtime_unlock(&runtime_sched);
1625 if(m->spinning) {
1626 m->spinning = false;
1627 runtime_xadd(&runtime_sched.nmspinning, -1);
1629 // check all runqueues once again
1630 for(i = 0; i < runtime_gomaxprocs; i++) {
1631 p = runtime_allp[i];
1632 if(p && p->runqhead != p->runqtail) {
1633 runtime_lock(&runtime_sched);
1634 p = pidleget();
1635 runtime_unlock(&runtime_sched);
1636 if(p) {
1637 acquirep(p);
1638 goto top;
1640 break;
1643 // poll network
1644 if(runtime_xchg64(&runtime_sched.lastpoll, 0) != 0) {
1645 if(m->p)
1646 runtime_throw("findrunnable: netpoll with p");
1647 if(m->spinning)
1648 runtime_throw("findrunnable: netpoll with spinning");
1649 gp = runtime_netpoll(true); // block until new work is available
1650 runtime_atomicstore64(&runtime_sched.lastpoll, runtime_nanotime());
1651 if(gp) {
1652 runtime_lock(&runtime_sched);
1653 p = pidleget();
1654 runtime_unlock(&runtime_sched);
1655 if(p) {
1656 acquirep(p);
1657 injectglist(gp->schedlink);
1658 gp->status = Grunnable;
1659 return gp;
1661 injectglist(gp);
1664 stopm();
1665 goto top;
1668 static void
1669 resetspinning(void)
1671 int32 nmspinning;
1673 if(m->spinning) {
1674 m->spinning = false;
1675 nmspinning = runtime_xadd(&runtime_sched.nmspinning, -1);
1676 if(nmspinning < 0)
1677 runtime_throw("findrunnable: negative nmspinning");
1678 } else
1679 nmspinning = runtime_atomicload(&runtime_sched.nmspinning);
1681 // M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
1682 // so see if we need to wakeup another P here.
1683 if (nmspinning == 0 && runtime_atomicload(&runtime_sched.npidle) > 0)
1684 wakep();
1687 // Injects the list of runnable G's into the scheduler.
1688 // Can run concurrently with GC.
1689 static void
1690 injectglist(G *glist)
1692 int32 n;
1693 G *gp;
1695 if(glist == nil)
1696 return;
1697 runtime_lock(&runtime_sched);
1698 for(n = 0; glist; n++) {
1699 gp = glist;
1700 glist = gp->schedlink;
1701 gp->status = Grunnable;
1702 globrunqput(gp);
1704 runtime_unlock(&runtime_sched);
1706 for(; n && runtime_sched.npidle; n--)
1707 startm(nil, false);
1710 // One round of scheduler: find a runnable goroutine and execute it.
1711 // Never returns.
1712 static void
1713 schedule(void)
1715 G *gp;
1716 uint32 tick;
1718 if(m->locks)
1719 runtime_throw("schedule: holding locks");
1721 top:
1722 if(runtime_sched.gcwaiting) {
1723 gcstopm();
1724 goto top;
1727 gp = nil;
1728 // Check the global runnable queue once in a while to ensure fairness.
1729 // Otherwise two goroutines can completely occupy the local runqueue
1730 // by constantly respawning each other.
1731 tick = m->p->schedtick;
1732 // This is a fancy way to say tick%61==0,
1733 // it uses 2 MUL instructions instead of a single DIV and so is faster on modern processors.
1734 if(tick - (((uint64)tick*0x4325c53fu)>>36)*61 == 0 && runtime_sched.runqsize > 0) {
1735 runtime_lock(&runtime_sched);
1736 gp = globrunqget(m->p, 1);
1737 runtime_unlock(&runtime_sched);
1738 if(gp)
1739 resetspinning();
1741 if(gp == nil) {
1742 gp = runqget(m->p);
1743 if(gp && m->spinning)
1744 runtime_throw("schedule: spinning with local work");
1746 if(gp == nil) {
1747 gp = findrunnable(); // blocks until work is available
1748 resetspinning();
1751 if(gp->lockedm) {
1752 // Hands off own p to the locked m,
1753 // then blocks waiting for a new p.
1754 startlockedm(gp);
1755 goto top;
1758 execute(gp);
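/* -----------------------------------------------------------------------
 * Editor's illustration (not part of proc.c): a standalone check of the
 * multiply-shift identity used in schedule() above,
 *     tick - (((uint64)tick*0x4325c53fu)>>36)*61 == tick%61,
 * where 0x4325c53f is ceil(2^36/61).  The identity holds for every 32-bit
 * tick; the loop below only spot-checks a range.
 * ----------------------------------------------------------------------- */

#include <assert.h>
#include <stdint.h>

static uint32_t
mod61_fast(uint32_t tick)
{
	return tick - (uint32_t)((((uint64_t)tick * 0x4325c53fu) >> 36) * 61);
}

static void
check_mod61(void)
{
	uint32_t t;

	for(t = 0; t < 1000000; t++)
		assert(mod61_fast(t) == t % 61);
}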
1761 // Puts the current goroutine into a waiting state and calls unlockf.
1762 // If unlockf returns false, the goroutine is resumed.
1763 void
1764 runtime_park(bool(*unlockf)(G*, void*), void *lock, const char *reason)
1766 if(g->status != Grunning)
1767 runtime_throw("bad g status");
1768 m->waitlock = lock;
1769 m->waitunlockf = unlockf;
1770 g->waitreason = reason;
1771 runtime_mcall(park0);
1774 static bool
1775 parkunlock(G *gp, void *lock)
1777 USED(gp);
1778 runtime_unlock(lock);
1779 return true;
1782 // Puts the current goroutine into a waiting state and unlocks the lock.
1783 // The goroutine can be made runnable again by calling runtime_ready(gp).
1784 void
1785 runtime_parkunlock(Lock *lock, const char *reason)
1787 runtime_park(parkunlock, lock, reason);
1790 // runtime_park continuation on g0.
1791 static void
1792 park0(G *gp)
1794 bool ok;
1796 gp->status = Gwaiting;
1797 gp->m = nil;
1798 m->curg = nil;
1799 if(m->waitunlockf) {
1800 ok = m->waitunlockf(gp, m->waitlock);
1801 m->waitunlockf = nil;
1802 m->waitlock = nil;
1803 if(!ok) {
1804 gp->status = Grunnable;
1805 execute(gp); // Schedule it back, never returns.
1808 if(m->lockedg) {
1809 stoplockedm();
1810 execute(gp); // Never returns.
1812 schedule();
1815 // Scheduler yield.
1816 void
1817 runtime_gosched(void)
1819 if(g->status != Grunning)
1820 runtime_throw("bad g status");
1821 runtime_mcall(runtime_gosched0);
1824 // runtime_gosched continuation on g0.
1825 void
1826 runtime_gosched0(G *gp)
1828 gp->status = Grunnable;
1829 gp->m = nil;
1830 m->curg = nil;
1831 runtime_lock(&runtime_sched);
1832 globrunqput(gp);
1833 runtime_unlock(&runtime_sched);
1834 if(m->lockedg) {
1835 stoplockedm();
1836 execute(gp); // Never returns.
1838 schedule();
1841 // Finishes execution of the current goroutine.
1842 // Need to mark it as nosplit, because it runs with sp > stackbase (as runtime_lessstack).
1843 // Since it does not return it does not matter. But if it is preempted
1844 // at the split stack check, GC will complain about inconsistent sp.
1845 void runtime_goexit(void) __attribute__ ((noinline));
1846 void
1847 runtime_goexit(void)
1849 if(g->status != Grunning)
1850 runtime_throw("bad g status");
1851 if(raceenabled)
1852 runtime_racegoend();
1853 runtime_mcall(goexit0);
1856 // runtime_goexit continuation on g0.
1857 static void
1858 goexit0(G *gp)
1860 gp->status = Gdead;
1861 gp->entry = nil;
1862 gp->m = nil;
1863 gp->lockedm = nil;
1864 gp->paniconfault = 0;
1865 gp->defer = nil; // should be true already but just in case.
1866 gp->panic = nil; // non-nil for Goexit during panic. points at stack-allocated data.
1867 gp->writenbuf = 0;
1868 gp->writebuf = nil;
1869 gp->waitreason = nil;
1870 gp->param = nil;
1871 m->curg = nil;
1872 m->lockedg = nil;
1873 if(m->locked & ~LockExternal) {
1874 runtime_printf("invalid m->locked = %d\n", m->locked);
1875 runtime_throw("internal lockOSThread error");
1877 m->locked = 0;
1878 gfput(m->p, gp);
1879 schedule();
1882 // The goroutine g is about to enter a system call.
1883 // Record that it's not using the cpu anymore.
1884 // This is called only from the go syscall library and cgocall,
1885 // not from the low-level system calls used by the runtime.
1887 // Entersyscall cannot split the stack: the runtime_gosave must
1888 // make g->sched refer to the caller's stack segment, because
1889 // entersyscall is going to return immediately after.
1891 void runtime_entersyscall(void) __attribute__ ((no_split_stack));
1892 static void doentersyscall(void) __attribute__ ((no_split_stack, noinline));
1894 void
1895 runtime_entersyscall()
1897 // Save the registers in the g structure so that any pointers
1898 // held in registers will be seen by the garbage collector.
1899 getcontext(&g->gcregs);
1901 // Do the work in a separate function, so that this function
1902 // doesn't save any registers on its own stack. If this
1903 // function does save any registers, we might store the wrong
1904 // value in the call to getcontext.
1906 // FIXME: This assumes that we do not need to save any
1907 // callee-saved registers to access the TLS variable g. We
1908 // don't want to put the ucontext_t on the stack because it is
1909 // large and we can not split the stack here.
1910 doentersyscall();
1913 static void
1914 doentersyscall()
1916 // Disable preemption because during this function g is in Gsyscall status,
1917 // but can have inconsistent g->sched, do not let GC observe it.
1918 m->locks++;
1920 // Leave SP around for GC and traceback.
1921 #ifdef USING_SPLIT_STACK
1922 g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size,
1923 &g->gcnext_segment, &g->gcnext_sp,
1924 &g->gcinitial_sp);
1925 #else
1927 void *v;
1929 g->gcnext_sp = (byte *) &v;
1931 #endif
1933 g->status = Gsyscall;
1935 if(runtime_atomicload(&runtime_sched.sysmonwait)) { // TODO: fast atomic
1936 runtime_lock(&runtime_sched);
1937 if(runtime_atomicload(&runtime_sched.sysmonwait)) {
1938 runtime_atomicstore(&runtime_sched.sysmonwait, 0);
1939 runtime_notewakeup(&runtime_sched.sysmonnote);
1941 runtime_unlock(&runtime_sched);
1944 m->mcache = nil;
1945 m->p->m = nil;
1946 runtime_atomicstore(&m->p->status, Psyscall);
1947 if(runtime_sched.gcwaiting) {
1948 runtime_lock(&runtime_sched);
1949 if (runtime_sched.stopwait > 0 && runtime_cas(&m->p->status, Psyscall, Pgcstop)) {
1950 if(--runtime_sched.stopwait == 0)
1951 runtime_notewakeup(&runtime_sched.stopnote);
1953 runtime_unlock(&runtime_sched);
1956 m->locks--;
1959 // The same as runtime_entersyscall(), but with a hint that the syscall is blocking.
1960 void
1961 runtime_entersyscallblock(void)
1963 P *p;
1965 m->locks++; // see comment in entersyscall
1967 // Leave SP around for GC and traceback.
1968 #ifdef USING_SPLIT_STACK
1969 g->gcstack = __splitstack_find(nil, nil, &g->gcstack_size,
1970 &g->gcnext_segment, &g->gcnext_sp,
1971 &g->gcinitial_sp);
1972 #else
1973 g->gcnext_sp = (byte *) &p;
1974 #endif
1976 // Save the registers in the g structure so that any pointers
1977 // held in registers will be seen by the garbage collector.
1978 getcontext(&g->gcregs);
1980 g->status = Gsyscall;
1982 p = releasep();
1983 handoffp(p);
1984 if(g->isbackground) // do not consider blocked scavenger for deadlock detection
1985 incidlelocked(1);
1987 m->locks--;
1990 // The goroutine g exited its system call.
1991 // Arrange for it to run on a cpu again.
1992 // This is called only from the go syscall library, not
1993 // from the low-level system calls used by the runtime.
1994 void
1995 runtime_exitsyscall(void)
1997 G *gp;
1999 m->locks++; // see comment in entersyscall
2001 gp = g;
2002 if(gp->isbackground) // do not consider blocked scavenger for deadlock detection
2003 incidlelocked(-1);
2005 g->waitsince = 0;
2006 if(exitsyscallfast()) {
2007 // There's a cpu for us, so we can run.
2008 m->p->syscalltick++;
2009 gp->status = Grunning;
2010 // Garbage collector isn't running (since we are),
2011 // so okay to clear gcstack and gcsp.
2012 #ifdef USING_SPLIT_STACK
2013 gp->gcstack = nil;
2014 #endif
2015 gp->gcnext_sp = nil;
2016 runtime_memclr(&gp->gcregs, sizeof gp->gcregs);
2017 m->locks--;
2018 return;
2021 m->locks--;
2023 // Call the scheduler.
2024 runtime_mcall(exitsyscall0);
2026 // Scheduler returned, so we're allowed to run now.
2027 // Delete the gcstack information that we left for
2028 // the garbage collector during the system call.
2029 // Must wait until now because until gosched returns
2030 // we don't know for sure that the garbage collector
2031 // is not running.
2032 #ifdef USING_SPLIT_STACK
2033 gp->gcstack = nil;
2034 #endif
2035 gp->gcnext_sp = nil;
2036 runtime_memclr(&gp->gcregs, sizeof gp->gcregs);
2038 // Don't refer to m again, we might be running on a different
2039 // thread after returning from runtime_mcall.
2040 runtime_m()->p->syscalltick++;
2043 static bool
2044 exitsyscallfast(void)
2046 P *p;
2048 // Freezetheworld sets stopwait but does not retake P's.
2049 if(runtime_sched.stopwait) {
2050 m->p = nil;
2051 return false;
2054 // Try to re-acquire the last P.
2055 if(m->p && m->p->status == Psyscall && runtime_cas(&m->p->status, Psyscall, Prunning)) {
2056 // There's a cpu for us, so we can run.
2057 m->mcache = m->p->mcache;
2058 m->p->m = m;
2059 return true;
2061 // Try to get any other idle P.
2062 m->p = nil;
2063 if(runtime_sched.pidle) {
2064 runtime_lock(&runtime_sched);
2065 p = pidleget();
2066 if(p && runtime_atomicload(&runtime_sched.sysmonwait)) {
2067 runtime_atomicstore(&runtime_sched.sysmonwait, 0);
2068 runtime_notewakeup(&runtime_sched.sysmonnote);
2070 runtime_unlock(&runtime_sched);
2071 if(p) {
2072 acquirep(p);
2073 return true;
2076 return false;
2079 // runtime_exitsyscall slow path on g0.
2080 // Failed to acquire P, enqueue gp as runnable.
2081 static void
2082 exitsyscall0(G *gp)
2084 P *p;
2086 gp->status = Grunnable;
2087 gp->m = nil;
2088 m->curg = nil;
2089 runtime_lock(&runtime_sched);
2090 p = pidleget();
2091 if(p == nil)
2092 globrunqput(gp);
2093 else if(runtime_atomicload(&runtime_sched.sysmonwait)) {
2094 runtime_atomicstore(&runtime_sched.sysmonwait, 0);
2095 runtime_notewakeup(&runtime_sched.sysmonnote);
2097 runtime_unlock(&runtime_sched);
2098 if(p) {
2099 acquirep(p);
2100 execute(gp); // Never returns.
2102 if(m->lockedg) {
2103 // Wait until another thread schedules gp and so m again.
2104 stoplockedm();
2105 execute(gp); // Never returns.
2107 stopm();
2108 schedule(); // Never returns.
2111 // Called from syscall package before fork.
2112 void syscall_runtime_BeforeFork(void)
2113 __asm__(GOSYM_PREFIX "syscall.runtime_BeforeFork");
2114 void
2115 syscall_runtime_BeforeFork(void)
2117 // Fork can hang if preempted with signals frequently enough (see issue 5517).
2118 // Ensure that we stay on the same M where we disable profiling.
2119 runtime_m()->locks++;
2120 if(runtime_m()->profilehz != 0)
2121 runtime_resetcpuprofiler(0);
2124 // Called from syscall package after fork in parent.
2125 void syscall_runtime_AfterFork(void)
2126 __asm__(GOSYM_PREFIX "syscall.runtime_AfterFork");
2127 void
2128 syscall_runtime_AfterFork(void)
2130 int32 hz;
2132 hz = runtime_sched.profilehz;
2133 if(hz != 0)
2134 runtime_resetcpuprofiler(hz);
2135 runtime_m()->locks--;
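/* -----------------------------------------------------------------------
 * Editor's illustration (not part of proc.c): the idea behind the two
 * fork hooks above -- silence the CPU-profiling timer before fork and
 * restore it in the parent -- sketched with setitimer(ITIMER_PROF, ...).
 * This assumes the profiler is driven by a profiling interval timer; it is
 * not runtime_resetcpuprofiler's actual implementation.
 * ----------------------------------------------------------------------- */

#include <stddef.h>
#include <sys/time.h>

static struct itimerval saved_prof;

static void
prof_pause_before_fork(void)
{
	struct itimerval off = { {0, 0}, {0, 0} };

	/* Save the current profiling interval and stop SIGPROF delivery. */
	setitimer(ITIMER_PROF, &off, &saved_prof);
}

static void
prof_resume_in_parent(void)
{
	/* Put the saved profiling interval back after fork. */
	setitimer(ITIMER_PROF, &saved_prof, NULL);
}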
2138 // Allocate a new g, with a stack big enough for stacksize bytes.
2140 runtime_malg(int32 stacksize, byte** ret_stack, size_t* ret_stacksize)
2142 G *newg;
2144 newg = allocg();
2145 if(stacksize >= 0) {
2146 #if USING_SPLIT_STACK
2147 int dont_block_signals = 0;
2149 *ret_stack = __splitstack_makecontext(stacksize,
2150 &newg->stack_context[0],
2151 ret_stacksize);
2152 __splitstack_block_signals_context(&newg->stack_context[0],
2153 &dont_block_signals, nil);
2154 #else
2155 *ret_stack = runtime_mallocgc(stacksize, 0, FlagNoProfiling|FlagNoGC);
2156 *ret_stacksize = stacksize;
2157 newg->gcinitial_sp = *ret_stack;
2158 newg->gcstack_size = stacksize;
2159 runtime_xadd(&runtime_stacks_sys, stacksize);
2160 #endif
2162 return newg;
2165 /* For runtime package testing. */
2168 // Create a new g running fn with siz bytes of arguments.
2169 // Put it on the queue of g's waiting to run.
2170 // The compiler turns a go statement into a call to this.
2171 // Cannot split the stack because it assumes that the arguments
2172 // are available sequentially after &fn; they would not be
2173 // copied if a stack split occurred. It's OK for this to call
2174 // functions that split the stack.
2175 void runtime_testing_entersyscall(void)
2176 __asm__ (GOSYM_PREFIX "runtime.entersyscall");
2177 void
2178 runtime_testing_entersyscall()
2180 runtime_entersyscall();
2183 void runtime_testing_exitsyscall(void)
2184 __asm__ (GOSYM_PREFIX "runtime.exitsyscall");
2186 void
2187 runtime_testing_exitsyscall()
2189 runtime_exitsyscall();
2192 G*
2193 __go_go(void (*fn)(void*), void* arg)
2195 byte *sp;
2196 size_t spsize;
2197 G *newg;
2198 P *p;
2200 //runtime_printf("newproc1 %p %p narg=%d nret=%d\n", fn->fn, argp, narg, nret);
2201 if(fn == nil) {
2202 m->throwing = -1; // do not dump full stacks
2203 runtime_throw("go of nil func value");
2205 m->locks++; // disable preemption because it can be holding p in a local var
2207 p = m->p;
2208 if((newg = gfget(p)) != nil) {
2209 #ifdef USING_SPLIT_STACK
2210 int dont_block_signals = 0;
2212 sp = __splitstack_resetcontext(&newg->stack_context[0],
2213 &spsize);
2214 __splitstack_block_signals_context(&newg->stack_context[0],
2215 &dont_block_signals, nil);
2216 #else
2217 sp = newg->gcinitial_sp;
2218 spsize = newg->gcstack_size;
2219 if(spsize == 0)
2220 runtime_throw("bad spsize in __go_go");
2221 newg->gcnext_sp = sp;
2222 #endif
2223 } else {
2224 newg = runtime_malg(StackMin, &sp, &spsize);
2225 allgadd(newg);
2228 newg->entry = (byte*)fn;
2229 newg->param = arg;
2230 newg->gopc = (uintptr)__builtin_return_address(0);
2231 newg->status = Grunnable;
2232 if(p->goidcache == p->goidcacheend) {
2233 p->goidcache = runtime_xadd64(&runtime_sched.goidgen, GoidCacheBatch);
2234 p->goidcacheend = p->goidcache + GoidCacheBatch;
2236 newg->goid = p->goidcache++;
2239 // Avoid warnings about variables clobbered by
2240 // longjmp.
2241 byte * volatile vsp = sp;
2242 size_t volatile vspsize = spsize;
2243 G * volatile vnewg = newg;
2245 getcontext(&vnewg->context);
2246 vnewg->context.uc_stack.ss_sp = vsp;
2247 #ifdef MAKECONTEXT_STACK_TOP
2248 vnewg->context.uc_stack.ss_sp += vspsize;
2249 #endif
2250 vnewg->context.uc_stack.ss_size = vspsize;
2251 makecontext(&vnewg->context, kickoff, 0);
2253 runqput(p, vnewg);
2255 if(runtime_atomicload(&runtime_sched.npidle) != 0 && runtime_atomicload(&runtime_sched.nmspinning) == 0 && fn != runtime_main) // TODO: fast atomic
2256 wakep();
2257 m->locks--;
2258 return vnewg;
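// allgadd appends gp to the global runtime_allg array under allglock,
// growing the backing array to the larger of 4096 bytes and twice the
// old capacity when it is full.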
2262 static void
2263 allgadd(G *gp)
2265 G **new;
2266 uintptr cap;
2268 runtime_lock(&allglock);
2269 if(runtime_allglen >= allgcap) {
2270 cap = 4096/sizeof(new[0]);
2271 if(cap < 2*allgcap)
2272 cap = 2*allgcap;
2273 new = runtime_malloc(cap*sizeof(new[0]));
2274 if(new == nil)
2275 runtime_throw("runtime: cannot allocate memory");
2276 if(runtime_allg != nil) {
2277 runtime_memmove(new, runtime_allg, runtime_allglen*sizeof(new[0]));
2278 runtime_free(runtime_allg);
2280 runtime_allg = new;
2281 allgcap = cap;
2283 runtime_allg[runtime_allglen++] = gp;
2284 runtime_unlock(&allglock);
2287 // Put on gfree list.
2288 // If local list is too long, transfer a batch to the global list.
2289 static void
2290 gfput(P *p, G *gp)
2292 gp->schedlink = p->gfree;
2293 p->gfree = gp;
2294 p->gfreecnt++;
2295 if(p->gfreecnt >= 64) {
2296 runtime_lock(&runtime_sched.gflock);
2297 while(p->gfreecnt >= 32) {
2298 p->gfreecnt--;
2299 gp = p->gfree;
2300 p->gfree = gp->schedlink;
2301 gp->schedlink = runtime_sched.gfree;
2302 runtime_sched.gfree = gp;
2304 runtime_unlock(&runtime_sched.gflock);
2308 // Get from gfree list.
2309 // If local list is empty, grab a batch from global list.
2310 static G*
2311 gfget(P *p)
2313 G *gp;
2315 retry:
2316 gp = p->gfree;
2317 if(gp == nil && runtime_sched.gfree) {
2318 runtime_lock(&runtime_sched.gflock);
2319 while(p->gfreecnt < 32 && runtime_sched.gfree) {
2320 p->gfreecnt++;
2321 gp = runtime_sched.gfree;
2322 runtime_sched.gfree = gp->schedlink;
2323 gp->schedlink = p->gfree;
2324 p->gfree = gp;
2326 runtime_unlock(&runtime_sched.gflock);
2327 goto retry;
2329 if(gp) {
2330 p->gfree = gp->schedlink;
2331 p->gfreecnt--;
2333 return gp;
2336 // Purge all cached G's from gfree list to the global list.
2337 static void
2338 gfpurge(P *p)
2340 G *gp;
2342 runtime_lock(&runtime_sched.gflock);
2343 while(p->gfreecnt) {
2344 p->gfreecnt--;
2345 gp = p->gfree;
2346 p->gfree = gp->schedlink;
2347 gp->schedlink = runtime_sched.gfree;
2348 runtime_sched.gfree = gp;
2350 runtime_unlock(&runtime_sched.gflock);
2353 void
2354 runtime_Breakpoint(void)
2356 runtime_breakpoint();
2359 void runtime_Gosched (void) __asm__ (GOSYM_PREFIX "runtime.Gosched");
2361 void
2362 runtime_Gosched(void)
2364 runtime_gosched();
2367 // Implementation of runtime.GOMAXPROCS.
2368 // delete when scheduler is even stronger
2369 int32
2370 runtime_gomaxprocsfunc(int32 n)
2372 int32 ret;
2374 if(n > MaxGomaxprocs)
2375 n = MaxGomaxprocs;
2376 runtime_lock(&runtime_sched);
2377 ret = runtime_gomaxprocs;
2378 if(n <= 0 || n == ret) {
2379 runtime_unlock(&runtime_sched);
2380 return ret;
2382 runtime_unlock(&runtime_sched);
2384 runtime_semacquire(&runtime_worldsema, false);
2385 m->gcing = 1;
2386 runtime_stoptheworld();
2387 newprocs = n;
2388 m->gcing = 0;
2389 runtime_semrelease(&runtime_worldsema);
2390 runtime_starttheworld();
2392 return ret;
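// Note that only the request is recorded here: newprocs is applied by
// procresize when the world is restarted, and the previous setting is
// what gets returned to the caller.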
2395 // lockOSThread is called by runtime.LockOSThread and runtime.lockOSThread below
2396 // after they modify m->locked. Do not allow preemption during this call,
2397 // or else the m might be different in this function than in the caller.
2398 static void
2399 lockOSThread(void)
2401 m->lockedg = g;
2402 g->lockedm = m;
2405 void runtime_LockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.LockOSThread");
2406 void
2407 runtime_LockOSThread(void)
2409 m->locked |= LockExternal;
2410 lockOSThread();
2413 void
2414 runtime_lockOSThread(void)
2416 m->locked += LockInternal;
2417 lockOSThread();
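// m->locked encodes both kinds of locking: LockExternal is a single
// flag bit set for runtime.LockOSThread, while LockInternal counts
// nested runtime-internal lockOSThread calls. The g stays wired to the
// m until both are clear (see unlockOSThread below).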
2421 // unlockOSThread is called by runtime.UnlockOSThread and runtime.unlockOSThread below
2422 // after they update m->locked. Do not allow preemption during this call,
2423 // or else the m might be different in this function than in the caller.
2424 static void
2425 unlockOSThread(void)
2427 if(m->locked != 0)
2428 return;
2429 m->lockedg = nil;
2430 g->lockedm = nil;
2433 void runtime_UnlockOSThread(void) __asm__ (GOSYM_PREFIX "runtime.UnlockOSThread");
2435 void
2436 runtime_UnlockOSThread(void)
2438 m->locked &= ~LockExternal;
2439 unlockOSThread();
2442 void
2443 runtime_unlockOSThread(void)
2445 if(m->locked < LockInternal)
2446 runtime_throw("runtime: internal error: misuse of lockOSThread/unlockOSThread");
2447 m->locked -= LockInternal;
2448 unlockOSThread();
2451 bool
2452 runtime_lockedOSThread(void)
2454 return g->lockedm != nil && m->lockedg != nil;
2457 int32
2458 runtime_gcount(void)
2460 G *gp;
2461 int32 n, s;
2462 uintptr i;
2464 n = 0;
2465 runtime_lock(&allglock);
2466 // TODO(dvyukov): runtime.NumGoroutine() is O(N).
2467 // We do not want to increment/decrement centralized counter in newproc/goexit,
2468 // just to make runtime.NumGoroutine() faster.
2469 // Compromise solution is to introduce per-P counters of active goroutines.
2470 for(i = 0; i < runtime_allglen; i++) {
2471 gp = runtime_allg[i];
2472 s = gp->status;
2473 if(s == Grunnable || s == Grunning || s == Gsyscall || s == Gwaiting)
2474 n++;
2476 runtime_unlock(&allglock);
2477 return n;
2480 int32
2481 runtime_mcount(void)
2483 return runtime_sched.mcount;
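// CPU profiling state, protected by the embedded Lock: the callback
// installed by runtime_setcpuprofilerate, the sampling rate in hz, and
// scratch buffers used to record one traceback per SIGPROF.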
2486 static struct {
2487 Lock;
2488 void (*fn)(uintptr*, int32);
2489 int32 hz;
2490 uintptr pcbuf[TracebackMaxFrames];
2491 Location locbuf[TracebackMaxFrames];
2492 } prof;
2494 static void System(void) {}
2495 static void GC(void) {}
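// System and GC are never called; their addresses serve as sentinel
// PCs in profile samples when runtime_sigprof cannot collect a real
// traceback.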
2497 // Called if we receive a SIGPROF signal.
2498 void
2499 runtime_sigprof()
2501 M *mp = m;
2502 int32 n, i;
2503 bool traceback;
2505 if(prof.fn == nil || prof.hz == 0)
2506 return;
2508 if(mp == nil)
2509 return;
2511 // Profiling runs concurrently with GC, so it must not allocate.
2512 mp->mallocing++;
2514 traceback = true;
2516 if(mp->mcache == nil)
2517 traceback = false;
2519 runtime_lock(&prof);
2520 if(prof.fn == nil) {
2521 runtime_unlock(&prof);
2522 mp->mallocing--;
2523 return;
2525 n = 0;
2527 if(runtime_atomicload(&runtime_in_callers) > 0) {
2528 // If SIGPROF arrived while already fetching runtime
2529 // callers we can have trouble on older systems
2530 // because the unwind library calls dl_iterate_phdr
2531 // which was not recursive in the past.
2532 traceback = false;
2535 if(traceback) {
2536 n = runtime_callers(0, prof.locbuf, nelem(prof.locbuf), false);
2537 for(i = 0; i < n; i++)
2538 prof.pcbuf[i] = prof.locbuf[i].pc;
2540 if(!traceback || n <= 0) {
2541 n = 2;
2542 prof.pcbuf[0] = (uintptr)runtime_getcallerpc(&n);
2543 if(mp->gcing || mp->helpgc)
2544 prof.pcbuf[1] = (uintptr)GC;
2545 else
2546 prof.pcbuf[1] = (uintptr)System;
2548 prof.fn(prof.pcbuf, n);
2549 runtime_unlock(&prof);
2550 mp->mallocing--;
2553 // Arrange to call fn with a traceback hz times a second.
2554 void
2555 runtime_setcpuprofilerate(void (*fn)(uintptr*, int32), int32 hz)
2557 // Force sane arguments.
2558 if(hz < 0)
2559 hz = 0;
2560 if(hz == 0)
2561 fn = nil;
2562 if(fn == nil)
2563 hz = 0;
2565 // Disable preemption, otherwise we can be rescheduled to another thread
2566 // that has profiling enabled.
2567 m->locks++;
2569 // Stop profiler on this thread so that it is safe to lock prof.
2570 // If a profiling signal came in while we had prof locked,
2571 // it would deadlock.
2572 runtime_resetcpuprofiler(0);
2574 runtime_lock(&prof);
2575 prof.fn = fn;
2576 prof.hz = hz;
2577 runtime_unlock(&prof);
2578 runtime_lock(&runtime_sched);
2579 runtime_sched.profilehz = hz;
2580 runtime_unlock(&runtime_sched);
2582 if(hz != 0)
2583 runtime_resetcpuprofiler(hz);
2585 m->locks--;
2588 // Change number of processors. The world is stopped, sched is locked.
2589 static void
2590 procresize(int32 new)
2592 int32 i, old;
2593 bool empty;
2594 G *gp;
2595 P *p;
2597 old = runtime_gomaxprocs;
2598 if(old < 0 || old > MaxGomaxprocs || new <= 0 || new > MaxGomaxprocs)
2599 runtime_throw("procresize: invalid arg");
2600 // initialize new P's
2601 for(i = 0; i < new; i++) {
2602 p = runtime_allp[i];
2603 if(p == nil) {
2604 p = (P*)runtime_mallocgc(sizeof(*p), 0, FlagNoInvokeGC);
2605 p->id = i;
2606 p->status = Pgcstop;
2607 runtime_atomicstorep(&runtime_allp[i], p);
2609 if(p->mcache == nil) {
2610 if(old==0 && i==0)
2611 p->mcache = m->mcache; // bootstrap
2612 else
2613 p->mcache = runtime_allocmcache();
2617 // redistribute runnable G's evenly
2618 // collect all runnable goroutines in global queue preserving FIFO order
2619 // FIFO order is required to ensure fairness even during frequent GCs
2620 // see http://golang.org/issue/7126
2621 empty = false;
2622 while(!empty) {
2623 empty = true;
2624 for(i = 0; i < old; i++) {
2625 p = runtime_allp[i];
2626 if(p->runqhead == p->runqtail)
2627 continue;
2628 empty = false;
2629 // pop from tail of local queue
2630 p->runqtail--;
2631 gp = p->runq[p->runqtail%nelem(p->runq)];
2632 // push onto head of global queue
2633 gp->schedlink = runtime_sched.runqhead;
2634 runtime_sched.runqhead = gp;
2635 if(runtime_sched.runqtail == nil)
2636 runtime_sched.runqtail = gp;
2637 runtime_sched.runqsize++;
2640 // fill local queues with at most nelem(p->runq)/2 goroutines
2641 // start at 1 because current M already executes some G and will acquire allp[0] below,
2642 // so if we have a spare G we want to put it into allp[1].
2643 for(i = 1; (uint32)i < (uint32)new * nelem(p->runq)/2 && runtime_sched.runqsize > 0; i++) {
2644 gp = runtime_sched.runqhead;
2645 runtime_sched.runqhead = gp->schedlink;
2646 if(runtime_sched.runqhead == nil)
2647 runtime_sched.runqtail = nil;
2648 runtime_sched.runqsize--;
2649 runqput(runtime_allp[i%new], gp);
2652 // free unused P's
2653 for(i = new; i < old; i++) {
2654 p = runtime_allp[i];
2655 runtime_freemcache(p->mcache);
2656 p->mcache = nil;
2657 gfpurge(p);
2658 p->status = Pdead;
2659 // can't free P itself because it can be referenced by an M in syscall
2662 if(m->p)
2663 m->p->m = nil;
2664 m->p = nil;
2665 m->mcache = nil;
2666 p = runtime_allp[0];
2667 p->m = nil;
2668 p->status = Pidle;
2669 acquirep(p);
2670 for(i = new-1; i > 0; i--) {
2671 p = runtime_allp[i];
2672 p->status = Pidle;
2673 pidleput(p);
2675 runtime_atomicstore((uint32*)&runtime_gomaxprocs, new);
2678 // Associate p and the current m.
2679 static void
2680 acquirep(P *p)
2682 if(m->p || m->mcache)
2683 runtime_throw("acquirep: already in go");
2684 if(p->m || p->status != Pidle) {
2685 runtime_printf("acquirep: p->m=%p(%d) p->status=%d\n", p->m, p->m ? p->m->id : 0, p->status);
2686 runtime_throw("acquirep: invalid p state");
2688 m->mcache = p->mcache;
2689 m->p = p;
2690 p->m = m;
2691 p->status = Prunning;
2694 // Disassociate p and the current m.
2695 static P*
2696 releasep(void)
2698 P *p;
2700 if(m->p == nil || m->mcache == nil)
2701 runtime_throw("releasep: invalid arg");
2702 p = m->p;
2703 if(p->m != m || p->mcache != m->mcache || p->status != Prunning) {
2704 runtime_printf("releasep: m=%p m->p=%p p->m=%p m->mcache=%p p->mcache=%p p->status=%d\n",
2705 m, m->p, p->m, m->mcache, p->mcache, p->status);
2706 runtime_throw("releasep: invalid p state");
2708 m->p = nil;
2709 m->mcache = nil;
2710 p->m = nil;
2711 p->status = Pidle;
2712 return p;
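// incidlelocked adjusts the count of idle M's that are locked to a G
// and therefore cannot run other goroutines; whenever the count grows
// we re-run the deadlock check.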
2715 static void
2716 incidlelocked(int32 v)
2718 runtime_lock(&runtime_sched);
2719 runtime_sched.nmidlelocked += v;
2720 if(v > 0)
2721 checkdead();
2722 runtime_unlock(&runtime_sched);
2725 // Check for deadlock situation.
2726 // The check is based on the number of running M's; if it is 0, we have a deadlock.
2727 static void
2728 checkdead(void)
2730 G *gp;
2731 int32 run, grunning, s;
2732 uintptr i;
2734 // -1 for sysmon
2735 run = runtime_sched.mcount - runtime_sched.nmidle - runtime_sched.nmidlelocked - 1 - countextra();
2736 if(run > 0)
2737 return;
2738 // If we are dying because of a signal caught on an already idle thread,
2739 // freezetheworld will cause all running threads to block.
2740 // And runtime will essentially enter into deadlock state,
2741 // except that there is a thread that will call runtime_exit soon.
2742 if(runtime_panicking > 0)
2743 return;
2744 if(run < 0) {
2745 runtime_printf("runtime: checkdead: nmidle=%d nmidlelocked=%d mcount=%d\n",
2746 runtime_sched.nmidle, runtime_sched.nmidlelocked, runtime_sched.mcount);
2747 runtime_throw("checkdead: inconsistent counts");
2749 grunning = 0;
2750 runtime_lock(&allglock);
2751 for(i = 0; i < runtime_allglen; i++) {
2752 gp = runtime_allg[i];
2753 if(gp->isbackground)
2754 continue;
2755 s = gp->status;
2756 if(s == Gwaiting)
2757 grunning++;
2758 else if(s == Grunnable || s == Grunning || s == Gsyscall) {
2759 runtime_unlock(&allglock);
2760 runtime_printf("runtime: checkdead: find g %D in status %d\n", gp->goid, s);
2761 runtime_throw("checkdead: runnable g");
2764 runtime_unlock(&allglock);
2765 if(grunning == 0) // possible if main goroutine calls runtime_Goexit()
2766 runtime_throw("no goroutines (main called runtime.Goexit) - deadlock!");
2767 m->throwing = -1; // do not dump full stacks
2768 runtime_throw("all goroutines are asleep - deadlock!");
2771 static void
2772 sysmon(void)
2774 uint32 idle, delay;
2775 int64 now, lastpoll, lasttrace;
2776 G *gp;
2778 lasttrace = 0;
2779 idle = 0; // how many cycles in succession we have not woken anybody up
2780 delay = 0;
2781 for(;;) {
2782 if(idle == 0) // start with 20us sleep...
2783 delay = 20;
2784 else if(idle > 50) // start doubling the sleep after 1ms...
2785 delay *= 2;
2786 if(delay > 10*1000) // up to 10ms
2787 delay = 10*1000;
2788 runtime_usleep(delay);
2789 if(runtime_debug.schedtrace <= 0 &&
2790 (runtime_sched.gcwaiting || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs)) { // TODO: fast atomic
2791 runtime_lock(&runtime_sched);
2792 if(runtime_atomicload(&runtime_sched.gcwaiting) || runtime_atomicload(&runtime_sched.npidle) == (uint32)runtime_gomaxprocs) {
2793 runtime_atomicstore(&runtime_sched.sysmonwait, 1);
2794 runtime_unlock(&runtime_sched);
2795 runtime_notesleep(&runtime_sched.sysmonnote);
2796 runtime_noteclear(&runtime_sched.sysmonnote);
2797 idle = 0;
2798 delay = 20;
2799 } else
2800 runtime_unlock(&runtime_sched);
2802 // poll network if not polled for more than 10ms
2803 lastpoll = runtime_atomicload64(&runtime_sched.lastpoll);
2804 now = runtime_nanotime();
2805 if(lastpoll != 0 && lastpoll + 10*1000*1000 < now) {
2806 runtime_cas64(&runtime_sched.lastpoll, lastpoll, now);
2807 gp = runtime_netpoll(false); // non-blocking
2808 if(gp) {
2809 // Need to decrement number of idle locked M's
2810 // (pretending that one more is running) before injectglist.
2811 // Otherwise it can lead to the following situation:
2812 // injectglist grabs all P's but before it starts M's to run the P's,
2813 // another M returns from syscall, finishes running its G,
2814 // observes that there is no work to do and no other running M's
2815 // and reports deadlock.
2816 incidlelocked(-1);
2817 injectglist(gp);
2818 incidlelocked(1);
2821 // retake P's blocked in syscalls
2822 // and preempt long running G's
2823 if(retake(now))
2824 idle = 0;
2825 else
2826 idle++;
2828 if(runtime_debug.schedtrace > 0 && lasttrace + runtime_debug.schedtrace*1000000ll <= now) {
2829 lasttrace = now;
2830 runtime_schedtrace(runtime_debug.scheddetail);
2835 typedef struct Pdesc Pdesc;
2836 struct Pdesc
2838 uint32 schedtick;
2839 int64 schedwhen;
2840 uint32 syscalltick;
2841 int64 syscallwhen;
2843 static Pdesc pdesc[MaxGomaxprocs];
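// retake returns P's that have sat in Psyscall for at least one sysmon
// tick to the idle list (handing them off to another M). It would also
// preempt G's running for more than 10ms, but preemption is disabled in
// this port (note the commented-out preemptone call). Returns the
// number of P's retaken.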
2845 static uint32
2846 retake(int64 now)
2848 uint32 i, s, n;
2849 int64 t;
2850 P *p;
2851 Pdesc *pd;
2853 n = 0;
2854 for(i = 0; i < (uint32)runtime_gomaxprocs; i++) {
2855 p = runtime_allp[i];
2856 if(p==nil)
2857 continue;
2858 pd = &pdesc[i];
2859 s = p->status;
2860 if(s == Psyscall) {
2861 // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
2862 t = p->syscalltick;
2863 if(pd->syscalltick != t) {
2864 pd->syscalltick = t;
2865 pd->syscallwhen = now;
2866 continue;
2868 // On the one hand we don't want to retake Ps if there is no other work to do,
2869 // but on the other hand we want to retake them eventually
2870 // because they can prevent the sysmon thread from deep sleep.
2871 if(p->runqhead == p->runqtail &&
2872 runtime_atomicload(&runtime_sched.nmspinning) + runtime_atomicload(&runtime_sched.npidle) > 0 &&
2873 pd->syscallwhen + 10*1000*1000 > now)
2874 continue;
2875 // Need to decrement number of idle locked M's
2876 // (pretending that one more is running) before the CAS.
2877 // Otherwise the M from which we retake can exit the syscall,
2878 // increment nmidle and report deadlock.
2879 incidlelocked(-1);
2880 if(runtime_cas(&p->status, s, Pidle)) {
2881 n++;
2882 handoffp(p);
2884 incidlelocked(1);
2885 } else if(s == Prunning) {
2886 // Preempt G if it's running for more than 10ms.
2887 t = p->schedtick;
2888 if(pd->schedtick != t) {
2889 pd->schedtick = t;
2890 pd->schedwhen = now;
2891 continue;
2893 if(pd->schedwhen + 10*1000*1000 > now)
2894 continue;
2895 // preemptone(p);
2898 return n;
2901 // Tell all goroutines that they have been preempted and they should stop.
2902 // This function is purely best-effort. It can fail to inform a goroutine if a
2903 // processor just started running it.
2904 // No locks need to be held.
2905 // Returns true if preemption request was issued to at least one goroutine.
2906 static bool
2907 preemptall(void)
2909 return false;
2912 void
2913 runtime_schedtrace(bool detailed)
2915 static int64 starttime;
2916 int64 now;
2917 int64 id1, id2, id3;
2918 int32 i, t, h;
2919 uintptr gi;
2920 const char *fmt;
2921 M *mp, *lockedm;
2922 G *gp, *lockedg;
2923 P *p;
2925 now = runtime_nanotime();
2926 if(starttime == 0)
2927 starttime = now;
2929 runtime_lock(&runtime_sched);
2930 runtime_printf("SCHED %Dms: gomaxprocs=%d idleprocs=%d threads=%d idlethreads=%d runqueue=%d",
2931 (now-starttime)/1000000, runtime_gomaxprocs, runtime_sched.npidle, runtime_sched.mcount,
2932 runtime_sched.nmidle, runtime_sched.runqsize);
2933 if(detailed) {
2934 runtime_printf(" gcwaiting=%d nmidlelocked=%d nmspinning=%d stopwait=%d sysmonwait=%d\n",
2935 runtime_sched.gcwaiting, runtime_sched.nmidlelocked, runtime_sched.nmspinning,
2936 runtime_sched.stopwait, runtime_sched.sysmonwait);
2938 // We must be careful while reading data from P's, M's and G's.
2939 // Even if we hold schedlock, most data can be changed concurrently.
2940 // E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil.
2941 for(i = 0; i < runtime_gomaxprocs; i++) {
2942 p = runtime_allp[i];
2943 if(p == nil)
2944 continue;
2945 mp = p->m;
2946 h = runtime_atomicload(&p->runqhead);
2947 t = runtime_atomicload(&p->runqtail);
2948 if(detailed)
2949 runtime_printf(" P%d: status=%d schedtick=%d syscalltick=%d m=%d runqsize=%d gfreecnt=%d\n",
2950 i, p->status, p->schedtick, p->syscalltick, mp ? mp->id : -1, t-h, p->gfreecnt);
2951 else {
2952 // In non-detailed mode format lengths of per-P run queues as:
2953 // [len1 len2 len3 len4]
2954 fmt = " %d";
2955 if(runtime_gomaxprocs == 1)
2956 fmt = " [%d]\n";
2957 else if(i == 0)
2958 fmt = " [%d";
2959 else if(i == runtime_gomaxprocs-1)
2960 fmt = " %d]\n";
2961 runtime_printf(fmt, t-h);
2964 if(!detailed) {
2965 runtime_unlock(&runtime_sched);
2966 return;
2968 for(mp = runtime_allm; mp; mp = mp->alllink) {
2969 p = mp->p;
2970 gp = mp->curg;
2971 lockedg = mp->lockedg;
2972 id1 = -1;
2973 if(p)
2974 id1 = p->id;
2975 id2 = -1;
2976 if(gp)
2977 id2 = gp->goid;
2978 id3 = -1;
2979 if(lockedg)
2980 id3 = lockedg->goid;
2981 runtime_printf(" M%d: p=%D curg=%D mallocing=%d throwing=%d gcing=%d"
2982 " locks=%d dying=%d helpgc=%d spinning=%d blocked=%d lockedg=%D\n",
2983 mp->id, id1, id2,
2984 mp->mallocing, mp->throwing, mp->gcing, mp->locks, mp->dying, mp->helpgc,
2985 mp->spinning, mp->blocked, id3);
2987 runtime_lock(&allglock);
2988 for(gi = 0; gi < runtime_allglen; gi++) {
2989 gp = runtime_allg[gi];
2990 mp = gp->m;
2991 lockedm = gp->lockedm;
2992 runtime_printf(" G%D: status=%d(%s) m=%d lockedm=%d\n",
2993 gp->goid, gp->status, gp->waitreason, mp ? mp->id : -1,
2994 lockedm ? lockedm->id : -1);
2996 runtime_unlock(&allglock);
2997 runtime_unlock(&runtime_sched);
3000 // Put mp on midle list.
3001 // Sched must be locked.
3002 static void
3003 mput(M *mp)
3005 mp->schedlink = runtime_sched.midle;
3006 runtime_sched.midle = mp;
3007 runtime_sched.nmidle++;
3008 checkdead();
3011 // Try to get an m from midle list.
3012 // Sched must be locked.
3013 static M*
3014 mget(void)
3016 M *mp;
3018 if((mp = runtime_sched.midle) != nil){
3019 runtime_sched.midle = mp->schedlink;
3020 runtime_sched.nmidle--;
3022 return mp;
3025 // Put gp on the global runnable queue.
3026 // Sched must be locked.
3027 static void
3028 globrunqput(G *gp)
3030 gp->schedlink = nil;
3031 if(runtime_sched.runqtail)
3032 runtime_sched.runqtail->schedlink = gp;
3033 else
3034 runtime_sched.runqhead = gp;
3035 runtime_sched.runqtail = gp;
3036 runtime_sched.runqsize++;
3039 // Put a batch of runnable goroutines on the global runnable queue.
3040 // Sched must be locked.
3041 static void
3042 globrunqputbatch(G *ghead, G *gtail, int32 n)
3044 gtail->schedlink = nil;
3045 if(runtime_sched.runqtail)
3046 runtime_sched.runqtail->schedlink = ghead;
3047 else
3048 runtime_sched.runqhead = ghead;
3049 runtime_sched.runqtail = gtail;
3050 runtime_sched.runqsize += n;
3053 // Try to get a batch of G's from the global runnable queue.
3054 // Sched must be locked.
3055 static G*
3056 globrunqget(P *p, int32 max)
3058 G *gp, *gp1;
3059 int32 n;
3061 if(runtime_sched.runqsize == 0)
3062 return nil;
3063 n = runtime_sched.runqsize/runtime_gomaxprocs+1;
3064 if(n > runtime_sched.runqsize)
3065 n = runtime_sched.runqsize;
3066 if(max > 0 && n > max)
3067 n = max;
3068 if((uint32)n > nelem(p->runq)/2)
3069 n = nelem(p->runq)/2;
3070 runtime_sched.runqsize -= n;
3071 if(runtime_sched.runqsize == 0)
3072 runtime_sched.runqtail = nil;
3073 gp = runtime_sched.runqhead;
3074 runtime_sched.runqhead = gp->schedlink;
3075 n--;
3076 while(n--) {
3077 gp1 = runtime_sched.runqhead;
3078 runtime_sched.runqhead = gp1->schedlink;
3079 runqput(p, gp1);
3081 return gp;
3084 // Put p on the pidle list.
3085 // Sched must be locked.
3086 static void
3087 pidleput(P *p)
3089 p->link = runtime_sched.pidle;
3090 runtime_sched.pidle = p;
3091 runtime_xadd(&runtime_sched.npidle, 1); // TODO: fast atomic
3094 // Try to get a p from the pidle list.
3095 // Sched must be locked.
3096 static P*
3097 pidleget(void)
3099 P *p;
3101 p = runtime_sched.pidle;
3102 if(p) {
3103 runtime_sched.pidle = p->link;
3104 runtime_xadd(&runtime_sched.npidle, -1); // TODO: fast atomic
3106 return p;
3109 // Try to put g on local runnable queue.
3110 // If it's full, put onto global queue.
3111 // Executed only by the owner P.
3112 static void
3113 runqput(P *p, G *gp)
3115 uint32 h, t;
3117 retry:
3118 h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with consumers
3119 t = p->runqtail;
3120 if(t - h < nelem(p->runq)) {
3121 p->runq[t%nelem(p->runq)] = gp;
3122 runtime_atomicstore(&p->runqtail, t+1); // store-release, makes the item available for consumption
3123 return;
3125 if(runqputslow(p, gp, h, t))
3126 return;
3127 // the queue is not full, so the put above must succeed when we retry
3128 goto retry;
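// The local run queue is a fixed-size ring indexed by free-running
// head/tail counters: only the owner P advances runqtail (with a
// store-release), while consumers advance runqhead with a CAS, so
// t - h is always the number of queued G's.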
3131 // Put g and a batch of work from local runnable queue on global queue.
3132 // Executed only by the owner P.
3133 static bool
3134 runqputslow(P *p, G *gp, uint32 h, uint32 t)
3136 G *batch[nelem(p->runq)/2+1];
3137 uint32 n, i;
3139 // First, grab a batch from local queue.
3140 n = t-h;
3141 n = n/2;
3142 if(n != nelem(p->runq)/2)
3143 runtime_throw("runqputslow: queue is not full");
3144 for(i=0; i<n; i++)
3145 batch[i] = p->runq[(h+i)%nelem(p->runq)];
3146 if(!runtime_cas(&p->runqhead, h, h+n)) // cas-release, commits consume
3147 return false;
3148 batch[n] = gp;
3149 // Link the goroutines.
3150 for(i=0; i<n; i++)
3151 batch[i]->schedlink = batch[i+1];
3152 // Now put the batch on global queue.
3153 runtime_lock(&runtime_sched);
3154 globrunqputbatch(batch[0], batch[n], n+1);
3155 runtime_unlock(&runtime_sched);
3156 return true;
3159 // Get g from local runnable queue.
3160 // Executed only by the owner P.
3161 static G*
3162 runqget(P *p)
3164 G *gp;
3165 uint32 t, h;
3167 for(;;) {
3168 h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with other consumers
3169 t = p->runqtail;
3170 if(t == h)
3171 return nil;
3172 gp = p->runq[h%nelem(p->runq)];
3173 if(runtime_cas(&p->runqhead, h, h+1)) // cas-release, commits consume
3174 return gp;
3178 // Grabs a batch of goroutines from the local runnable queue.
3179 // The batch array must be of size nelem(p->runq)/2. Returns the number of grabbed goroutines.
3180 // Can be executed by any P.
3181 static uint32
3182 runqgrab(P *p, G **batch)
3184 uint32 t, h, n, i;
3186 for(;;) {
3187 h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with other consumers
3188 t = runtime_atomicload(&p->runqtail); // load-acquire, synchronize with the producer
3189 n = t-h;
3190 n = n - n/2;
3191 if(n == 0)
3192 break;
3193 if(n > nelem(p->runq)/2) // read inconsistent h and t
3194 continue;
3195 for(i=0; i<n; i++)
3196 batch[i] = p->runq[(h+i)%nelem(p->runq)];
3197 if(runtime_cas(&p->runqhead, h, h+n)) // cas-release, commits consume
3198 break;
3200 return n;
3203 // Steal half of elements from local runnable queue of p2
3204 // and put onto local runnable queue of p.
3205 // Returns one of the stolen elements (or nil if failed).
3206 static G*
3207 runqsteal(P *p, P *p2)
3209 G *gp;
3210 G *batch[nelem(p->runq)/2];
3211 uint32 t, h, n, i;
3213 n = runqgrab(p2, batch);
3214 if(n == 0)
3215 return nil;
3216 n--;
3217 gp = batch[n];
3218 if(n == 0)
3219 return gp;
3220 h = runtime_atomicload(&p->runqhead); // load-acquire, synchronize with consumers
3221 t = p->runqtail;
3222 if(t - h + n >= nelem(p->runq))
3223 runtime_throw("runqsteal: runq overflow");
3224 for(i=0; i<n; i++, t++)
3225 p->runq[t%nelem(p->runq)] = batch[i];
3226 runtime_atomicstore(&p->runqtail, t); // store-release, makes the item available for consumption
3227 return gp;
3230 void runtime_testSchedLocalQueue(void)
3231 __asm__("runtime.testSchedLocalQueue");
3233 void
3234 runtime_testSchedLocalQueue(void)
3236 P p;
3237 G gs[nelem(p.runq)];
3238 int32 i, j;
3240 runtime_memclr((byte*)&p, sizeof(p));
3242 for(i = 0; i < (int32)nelem(gs); i++) {
3243 if(runqget(&p) != nil)
3244 runtime_throw("runq is not empty initially");
3245 for(j = 0; j < i; j++)
3246 runqput(&p, &gs[i]);
3247 for(j = 0; j < i; j++) {
3248 if(runqget(&p) != &gs[i]) {
3249 runtime_printf("bad element at iter %d/%d\n", i, j);
3250 runtime_throw("bad element");
3253 if(runqget(&p) != nil)
3254 runtime_throw("runq is not empty afterwards");
3258 void runtime_testSchedLocalQueueSteal(void)
3259 __asm__("runtime.testSchedLocalQueueSteal");
3261 void
3262 runtime_testSchedLocalQueueSteal(void)
3264 P p1, p2;
3265 G gs[nelem(p1.runq)], *gp;
3266 int32 i, j, s;
3268 runtime_memclr((byte*)&p1, sizeof(p1));
3269 runtime_memclr((byte*)&p2, sizeof(p2));
3271 for(i = 0; i < (int32)nelem(gs); i++) {
3272 for(j = 0; j < i; j++) {
3273 gs[j].sig = 0;
3274 runqput(&p1, &gs[j]);
3276 gp = runqsteal(&p2, &p1);
3277 s = 0;
3278 if(gp) {
3279 s++;
3280 gp->sig++;
3282 while((gp = runqget(&p2)) != nil) {
3283 s++;
3284 gp->sig++;
3286 while((gp = runqget(&p1)) != nil)
3287 gp->sig++;
3288 for(j = 0; j < i; j++) {
3289 if(gs[j].sig != 1) {
3290 runtime_printf("bad element %d(%d) at iter %d\n", j, gs[j].sig, i);
3291 runtime_throw("bad element");
3294 if(s != i/2 && s != i/2+1) {
3295 runtime_printf("bad steal %d, want %d or %d, iter %d\n",
3296 s, i/2, i/2+1, i);
3297 runtime_throw("bad steal");
3302 int32
3303 runtime_setmaxthreads(int32 in)
3305 int32 out;
3307 runtime_lock(&runtime_sched);
3308 out = runtime_sched.maxmcount;
3309 runtime_sched.maxmcount = in;
3310 checkmcount();
3311 runtime_unlock(&runtime_sched);
3312 return out;
3315 void
3316 runtime_proc_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj))
3318 enqueue1(wbufp, (Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
3321 // When a function calls a closure, it passes the closure value to
3322 // __go_set_closure immediately before the function call. When a
3323 // function uses a closure, it calls __go_get_closure immediately on
3324 // function entry. This is a hack, but it will work on any system.
3325 // It would be better to use the static chain register when there is
3326 // one. It is also worth considering expanding these functions
3327 // directly in the compiler.
3329 void
3330 __go_set_closure(void* v)
3332 g->closure = v;
3335 void *
3336 __go_get_closure(void)
3338 return g->closure;
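// Illustrative sketch (not actual compiler output) of the sequence the
// convention above implies at a call site:
//
//	__go_set_closure(closure_value);
//	fn(arg);
//
// and at the entry of a function that uses its closure:
//
//	void *closure = __go_get_closure();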
3341 // Return whether we are waiting for a GC. The gc toolchain uses
3342 // preemption instead of polling this flag.
3343 bool
3344 runtime_gcwaiting(void)
3346 return runtime_sched.gcwaiting;