libgo/go/runtime/mgc.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Garbage collector (GC).
6 //
7 // The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple
8 // GC threads to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
9 // non-generational and non-compacting. Allocation is done using size segregated per P allocation
10 // areas to minimize fragmentation while eliminating locks in the common case.
12 // The algorithm decomposes into several steps.
13 // This is a high level description of the algorithm being used. For an overview of GC a good
14 // place to start is Richard Jones' gchandbook.org.
16 // The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
17 // Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
18 // On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
19 // 966-975.
20 // For journal quality proofs that these steps are complete, correct, and terminate see
21 // Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
22 // Concurrency and Computation: Practice and Experience 15(3-5), 2003.
24 // 1. GC performs sweep termination.
26 // a. Stop the world. This causes all Ps to reach a GC safe-point.
28 // b. Sweep any unswept spans. There will only be unswept spans if
29 // this GC cycle was forced before the expected time.
31 // 2. GC performs the "mark 1" sub-phase. In this sub-phase, Ps are
32 // allowed to locally cache parts of the work queue.
34 // a. Prepare for the mark phase by setting gcphase to _GCmark
35 // (from _GCoff), enabling the write barrier, enabling mutator
36 // assists, and enqueueing root mark jobs. No objects may be
37 // scanned until all Ps have enabled the write barrier, which is
38 // accomplished using STW.
40 // b. Start the world. From this point, GC work is done by mark
41 // workers started by the scheduler and by assists performed as
42 // part of allocation. The write barrier shades both the
43 // overwritten pointer and the new pointer value for any pointer
44 // writes (see mbarrier.go for details and the sketch after this list). Newly allocated objects
45 // are immediately marked black.
47 // c. GC performs root marking jobs. This includes scanning all
48 // stacks, shading all globals, and shading any heap pointers in
49 // off-heap runtime data structures. Scanning a stack stops a
50 // goroutine, shades any pointers found on its stack, and then
51 // resumes the goroutine.
53 // d. GC drains the work queue of grey objects, scanning each grey
54 // object to black and shading all pointers found in the object
55 // (which in turn may add those pointers to the work queue).
57 // 3. Once the global work queue is empty (but local work queue caches
58 // may still contain work), GC performs the "mark 2" sub-phase.
60 // a. GC stops all workers, disables local work queue caches,
61 // flushes each P's local work queue cache to the global work queue
62 // cache, and reenables workers.
64 // b. GC again drains the work queue, as in 2d above.
66 // 4. Once the work queue is empty, GC performs mark termination.
68 // a. Stop the world.
70 // b. Set gcphase to _GCmarktermination, and disable workers and
71 // assists.
73 // c. Drain any remaining work from the work queue (typically there
74 // will be none).
76 // d. Perform other housekeeping like flushing mcaches.
78 // 5. GC performs the sweep phase.
80 // a. Prepare for the sweep phase by setting gcphase to _GCoff,
81 // setting up sweep state and disabling the write barrier.
83 // b. Start the world. From this point on, newly allocated objects
84 // are white, and allocating sweeps spans before use if necessary.
86 // c. GC does concurrent sweeping in the background and in response
87 // to allocation. See description below.
89 // 6. When sufficient allocation has taken place, replay the sequence
90 // starting with 1 above. See discussion of GC rate below.
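// As a sketch (illustration only, assuming a shade() helper that greys a
// white object), a pointer write *slot = ptr during the mark phase behaves
// roughly like:
//
//	shade(*slot) // the overwritten value cannot be hidden from the GC
//	shade(ptr)   // the newly installed value is visibly reachable
//	*slot = ptr
//
// See mbarrier.go for the real barrier implementation.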
92 // Concurrent sweep.
94 // The sweep phase proceeds concurrently with normal program execution.
95 // The heap is swept span-by-span both lazily (when a goroutine needs another span)
96 // and concurrently in a background goroutine (this helps programs that are not CPU bound).
97 // At the end of STW mark termination all spans are marked as "needs sweeping".
99 // The background sweeper goroutine simply sweeps spans one-by-one.
101 // To avoid requesting more OS memory while there are unswept spans, when a
102 // goroutine needs another span, it first attempts to reclaim that much memory
103 // by sweeping. When a goroutine needs to allocate a new small-object span, it
104 // sweeps small-object spans for the same object size until it frees at least
105 // one object. When a goroutine needs to allocate a large-object span from the heap,
106 // it sweeps spans until it frees at least that many pages into the heap. There is
107 // one case where this may not suffice: if a goroutine sweeps and frees two
108 // nonadjacent one-page spans to the heap, it will allocate a new two-page
109 // span, but there can still be other one-page unswept spans which could be
110 // combined into a two-page span.
112 // It's critical to ensure that no operations proceed on unswept spans (that would corrupt
113 // mark bits in the GC bitmap). During GC all mcaches are flushed into the central cache,
114 // so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
115 // When a goroutine explicitly frees an object or sets a finalizer, it ensures that
116 // the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
117 // The finalizer goroutine is kicked off only when all spans are swept.
118 // When the next GC starts, it sweeps all not-yet-swept spans (if any).
120 // GC rate.
121 // Next GC is after we've allocated an extra amount of memory proportional to
122 // the amount already in use. The proportion is controlled by the GOGC environment variable
123 // (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
124 // (this mark is tracked in the next_gc variable). This keeps the GC cost in linear
125 // proportion to the allocation cost. Adjusting GOGC just changes the linear constant
126 // (and also the amount of extra memory used).
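// As a worked example (illustration only), the goal computed by the pacer
// below is
//
//	next_gc = heap_marked + heap_marked*GOGC/100
//
// so with GOGC=100 and 4M of marked (live) heap, next_gc is 8M; the trigger
// for actually starting the next cycle sits somewhat below that goal (see
// gcSetTriggerRatio).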
128 // Oblets
130 // In order to prevent long pauses while scanning large objects and to
131 // improve parallelism, the garbage collector breaks up scan jobs for
132 // objects larger than maxObletBytes into "oblets" of at most
133 // maxObletBytes. When scanning encounters the beginning of a large
134 // object, it scans only the first oblet and enqueues the remaining
135 // oblets as new scan jobs.
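// A sketch of that splitting (illustration only; "enqueue" stands in for
// putting a pointer on the mark work queue, see mgcmark.go):
//
//	if size > maxObletBytes {
//		for oblet := base + maxObletBytes; oblet < base+size; oblet += maxObletBytes {
//			enqueue(oblet)
//		}
//		size = maxObletBytes // scan only the first oblet now
//	}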
137 package runtime
139 import (
140 "runtime/internal/atomic"
141 "runtime/internal/sys"
142 "unsafe"
145 const (
146 _DebugGC = 0
147 _ConcurrentSweep = true
148 _FinBlockSize = 4 * 1024
150 // sweepMinHeapDistance is a lower bound on the heap distance
151 // (in bytes) reserved for concurrent sweeping between GC
152 // cycles. This will be scaled by gcpercent/100.
153 sweepMinHeapDistance = 1024 * 1024
156 // heapminimum is the minimum heap size at which to trigger GC.
157 // For small heaps, this overrides the usual GOGC*live set rule.
159 // When there is a very small live set but a lot of allocation, simply
160 // collecting when the heap reaches GOGC*live results in many GC
161 // cycles and high total per-GC overhead. This minimum amortizes this
162 // per-GC overhead while keeping the heap reasonably small.
164 // During initialization this is set to 4MB*GOGC/100. In the case of
165 // GOGC==0, this will set heapminimum to 0, resulting in constant
166 // collection even when the heap size is small, which is useful for
167 // debugging.
168 var heapminimum uint64 = defaultHeapMinimum
170 // defaultHeapMinimum is the value of heapminimum for GOGC==100.
171 const defaultHeapMinimum = 4 << 20
173 // Initialized from $GOGC. GOGC=off means no GC.
174 var gcpercent int32
176 func gcinit() {
177 if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
178 throw("size of Workbuf is suboptimal")
181 // No sweep on the first cycle.
182 mheap_.sweepdone = 1
184 // Set a reasonable initial GC trigger.
185 memstats.triggerRatio = 7 / 8.0
187 // Fake a heap_marked value so it looks like a trigger at
188 // heapminimum is the appropriate growth from heap_marked.
189 // This will go into computing the initial GC goal.
190 memstats.heap_marked = uint64(float64(heapminimum) / (1 + memstats.triggerRatio))
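// (Worked check, for illustration: with triggerRatio = 7/8 this sets
// heap_marked to heapminimum/1.875, so the trigger computed later as
// heap_marked*(1+triggerRatio) comes out to exactly heapminimum.)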
192 // Set gcpercent from the environment. This will also compute
193 // and set the GC trigger and goal.
194 _ = setGCPercent(readgogc())
196 work.startSema = 1
197 work.markDoneSema = 1
200 func readgogc() int32 {
201 p := gogetenv("GOGC")
202 if p == "off" {
203 return -1
205 if n, ok := atoi32(p); ok {
206 return n
208 return 100
211 // gcenable is called after the bulk of the runtime initialization,
212 // just before we're about to start letting user code run.
213 // It kicks off the background sweeper goroutine and enables GC.
214 func gcenable() {
215 c := make(chan int, 1)
216 expectSystemGoroutine()
217 go bgsweep(c)
219 memstats.enablegc = true // now that runtime is initialized, GC is okay
222 //go:linkname setGCPercent runtime_debug.setGCPercent
223 func setGCPercent(in int32) (out int32) {
224 lock(&mheap_.lock)
225 out = gcpercent
226 if in < 0 {
227 in = -1
229 gcpercent = in
230 heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
231 // Update pacing in response to gcpercent change.
232 gcSetTriggerRatio(memstats.triggerRatio)
233 unlock(&mheap_.lock)
234 return out
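// For example (illustration only), user code reaches this via runtime/debug:
//
//	old := debug.SetGCPercent(50) // collect after 50% heap growth
//	defer debug.SetGCPercent(old) // restore the previous setting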
237 // Garbage collector phase.
238 // Indicates to the write barrier and synchronization code which tasks to perform.
239 var gcphase uint32
241 // The compiler knows about this variable.
242 // If you change it, you must change builtin/runtime.go, too.
243 // If you change the first four bytes, you must also change the write
244 // barrier insertion code.
245 var writeBarrier struct {
246 enabled bool // compiler emits a check of this before calling write barrier
247 pad [3]byte // compiler uses 32-bit load for "enabled" field
248 needed bool // whether we need a write barrier for current GC phase
249 cgo bool // whether we need a write barrier for a cgo check
250 alignme uint64 // guarantee alignment so that compiler can use a 32 or 64-bit load
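// For illustration (not the literal expansion, which varies by compiler), a
// compiled pointer write p.f = x behaves roughly like:
//
//	if writeBarrier.enabled {
//		// call the pointer write barrier (see mbarrier.go)
//	} else {
//		p.f = x
//	}
//
// which is why the 32-bit load of "enabled" above must stay cheap.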
253 // gcBlackenEnabled is 1 if mutator assists and background mark
254 // workers are allowed to blacken objects. This must only be set when
255 // gcphase == _GCmark.
256 var gcBlackenEnabled uint32
258 // gcBlackenPromptly indicates that optimizations that may
259 // hide work from the global work queue should be disabled.
261 // If gcBlackenPromptly is true, per-P gcWork caches should
262 // be flushed immediately and new objects should be allocated black.
264 // There is a tension between allocating objects white and
265 // allocating them black. If allocated white, objects that die before
266 // being marked can be collected during this GC cycle. On the other
267 // hand allocating them black will reduce _GCmarktermination latency
268 // since more work is done in the mark phase. This tension is resolved
269 // by allocating white until the mark phase is approaching its end and
270 // then allocating black for the remainder of the mark phase.
271 var gcBlackenPromptly bool
273 const (
274 _GCoff = iota // GC not running; sweeping in background, write barrier disabled
275 _GCmark // GC marking roots and workbufs: allocate black, write barrier ENABLED
276 _GCmarktermination // GC mark termination: allocate black, Ps help GC, write barrier ENABLED
279 //go:nosplit
280 func setGCPhase(x uint32) {
281 atomic.Store(&gcphase, x)
282 writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
283 writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
286 // gcMarkWorkerMode represents the mode that a concurrent mark worker
287 // should operate in.
289 // Concurrent marking happens through four different mechanisms. One
290 // is mutator assists, which happen in response to allocations and are
291 // not scheduled. The other three are variations in the per-P mark
292 // workers and are distinguished by gcMarkWorkerMode.
293 type gcMarkWorkerMode int
295 const (
296 // gcMarkWorkerDedicatedMode indicates that the P of a mark
297 // worker is dedicated to running that mark worker. The mark
298 // worker should run without preemption.
299 gcMarkWorkerDedicatedMode gcMarkWorkerMode = iota
301 // gcMarkWorkerFractionalMode indicates that a P is currently
302 // running the "fractional" mark worker. The fractional worker
303 // is necessary when GOMAXPROCS*gcGoalUtilization is not an
304 // integer. The fractional worker should run until it is
305 // preempted and will be scheduled to pick up the fractional
306 // part of GOMAXPROCS*gcGoalUtilization.
307 gcMarkWorkerFractionalMode
309 // gcMarkWorkerIdleMode indicates that a P is running the mark
310 // worker because it has nothing else to do. The idle worker
311 // should run until it is preempted and account its time
312 // against gcController.idleMarkTime.
313 gcMarkWorkerIdleMode
316 // gcMarkWorkerModeStrings are the string labels of gcMarkWorkerModes
317 // to use in execution traces.
318 var gcMarkWorkerModeStrings = [...]string{
319 "GC (dedicated)",
320 "GC (fractional)",
321 "GC (idle)",
324 // gcController implements the GC pacing controller that determines
325 // when to trigger concurrent garbage collection and how much marking
326 // work to do in mutator assists and background marking.
328 // It uses a feedback control algorithm to adjust the memstats.gc_trigger
329 // trigger based on the heap growth and GC CPU utilization each cycle.
330 // This algorithm optimizes for heap growth to match GOGC and for CPU
331 // utilization between assist and background marking to be 25% of
332 // GOMAXPROCS. The high-level design of this algorithm is documented
333 // at https://golang.org/s/go15gcpacing.
335 // All fields of gcController are used only during a single mark
336 // cycle.
337 var gcController gcControllerState
339 type gcControllerState struct {
340 // scanWork is the total scan work performed this cycle. This
341 // is updated atomically during the cycle. Updates occur in
342 // bounded batches, since it is both written and read
343 // throughout the cycle. At the end of the cycle, this is how
344 // much of the retained heap is scannable.
346 // Currently this is the bytes of heap scanned. For most uses,
347 // this is an opaque unit of work, but for estimation the
348 // definition is important.
349 scanWork int64
351 // bgScanCredit is the scan work credit accumulated by the
352 // concurrent background scan. This credit is accumulated by
353 // the background scan and stolen by mutator assists. This is
354 // updated atomically. Updates occur in bounded batches, since
355 // it is both written and read throughout the cycle.
356 bgScanCredit int64
358 // assistTime is the nanoseconds spent in mutator assists
359 // during this cycle. This is updated atomically. Updates
360 // occur in bounded batches, since it is both written and read
361 // throughout the cycle.
362 assistTime int64
364 // dedicatedMarkTime is the nanoseconds spent in dedicated
365 // mark workers during this cycle. This is updated atomically
366 // at the end of the concurrent mark phase.
367 dedicatedMarkTime int64
369 // fractionalMarkTime is the nanoseconds spent in the
370 // fractional mark worker during this cycle. This is updated
371 // atomically throughout the cycle and will be up-to-date if
372 // the fractional mark worker is not currently running.
373 fractionalMarkTime int64
375 // idleMarkTime is the nanoseconds spent in idle marking
376 // during this cycle. This is updated atomically throughout
377 // the cycle.
378 idleMarkTime int64
380 // markStartTime is the absolute start time in nanoseconds
381 // that assists and background mark workers started.
382 markStartTime int64
384 // dedicatedMarkWorkersNeeded is the number of dedicated mark
385 // workers that need to be started. This is computed at the
386 // beginning of each cycle and decremented atomically as
387 // dedicated mark workers get started.
388 dedicatedMarkWorkersNeeded int64
390 // assistWorkPerByte is the ratio of scan work to allocated
391 // bytes that should be performed by mutator assists. This is
392 // computed at the beginning of each cycle and updated every
393 // time heap_scan is updated.
394 assistWorkPerByte float64
396 // assistBytesPerWork is 1/assistWorkPerByte.
397 assistBytesPerWork float64
399 // fractionalUtilizationGoal is the fraction of wall clock
400 // time that should be spent in the fractional mark worker.
401 // For example, if the overall mark utilization goal is 25%
402 // and GOMAXPROCS is 6, one P will be a dedicated mark worker
403 // and this will be set to 0.5 so that 50% of the time some P
404 // is in a fractional mark worker. This is computed at the
405 // beginning of each cycle.
406 fractionalUtilizationGoal float64
408 _ [sys.CacheLineSize]byte
410 // fractionalMarkWorkersNeeded is the number of fractional
411 // mark workers that need to be started. This is either 0 or
412 // 1. This is potentially updated atomically at every
413 // scheduling point (hence it gets its own cache line).
414 fractionalMarkWorkersNeeded int64
416 _ [sys.CacheLineSize]byte
419 // startCycle resets the GC controller's state and computes estimates
420 // for a new GC cycle. The caller must hold worldsema.
421 func (c *gcControllerState) startCycle() {
422 c.scanWork = 0
423 c.bgScanCredit = 0
424 c.assistTime = 0
425 c.dedicatedMarkTime = 0
426 c.fractionalMarkTime = 0
427 c.idleMarkTime = 0
429 // If this is the first GC cycle or we're operating on a very
430 // small heap, fake heap_marked so it looks like gc_trigger is
431 // the appropriate growth from heap_marked, even though the
432 // real heap_marked may not have a meaningful value (on the
433 // first cycle) or may be much smaller (resulting in a large
434 // error response).
435 if memstats.gc_trigger <= heapminimum {
436 memstats.heap_marked = uint64(float64(memstats.gc_trigger) / (1 + memstats.triggerRatio))
439 // Re-compute the heap goal for this cycle in case something
440 // changed. This is the same calculation we use elsewhere.
441 memstats.next_gc = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
442 if gcpercent < 0 {
443 memstats.next_gc = ^uint64(0)
446 // Ensure that the heap goal is at least a little larger than
447 // the current live heap size. This may not be the case if GC
448 // start is delayed or if the allocation that pushed heap_live
449 // over gc_trigger is large or if the trigger is really close to
450 // GOGC. Assist is proportional to this distance, so enforce a
451 // minimum distance, even if it means going over the GOGC goal
452 // by a tiny bit.
453 if memstats.next_gc < memstats.heap_live+1024*1024 {
454 memstats.next_gc = memstats.heap_live + 1024*1024
457 // Compute the total mark utilization goal and divide it among
458 // dedicated and fractional workers.
459 totalUtilizationGoal := float64(gomaxprocs) * gcGoalUtilization
460 c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal)
461 c.fractionalUtilizationGoal = totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)
462 if c.fractionalUtilizationGoal > 0 {
463 c.fractionalMarkWorkersNeeded = 1
464 } else {
465 c.fractionalMarkWorkersNeeded = 0
468 // Clear per-P state
469 for _, p := range &allp {
470 if p == nil {
471 break
473 p.gcAssistTime = 0
476 // Compute initial values for controls that are updated
477 // throughout the cycle.
478 c.revise()
480 if debug.gcpacertrace > 0 {
481 print("pacer: assist ratio=", c.assistWorkPerByte,
482 " (scan ", memstats.heap_scan>>20, " MB in ",
483 work.initialHeapLive>>20, "->",
484 memstats.next_gc>>20, " MB)",
485 " workers=", c.dedicatedMarkWorkersNeeded,
486 "+", c.fractionalMarkWorkersNeeded, "\n")
490 // revise updates the assist ratio during the GC cycle to account for
491 // improved estimates. This should be called either under STW or
492 // whenever memstats.heap_scan, memstats.heap_live, or
493 // memstats.next_gc is updated (with mheap_.lock held).
495 // It should only be called when gcBlackenEnabled != 0 (because this
496 // is when assists are enabled and the necessary statistics are
497 // available).
498 func (c *gcControllerState) revise() {
499 // Compute the expected scan work remaining.
501 // Note that we currently count allocations during GC as both
502 // scannable heap (heap_scan) and scan work completed
503 // (scanWork), so this difference won't be changed by
504 // allocations during GC.
506 // This particular estimate is a strict upper bound on the
507 // possible remaining scan work for the current heap.
508 // You might consider dividing this by 2 (or by
509 // (100+GOGC)/100) to counter this over-estimation, but
510 // benchmarks show that this has almost no effect on mean
511 // mutator utilization, heap size, or assist time and it
512 // introduces the danger of under-estimating and letting the
513 // mutator outpace the garbage collector.
514 scanWorkExpected := int64(memstats.heap_scan) - c.scanWork
515 if scanWorkExpected < 1000 {
516 // We set a somewhat arbitrary lower bound on
517 // remaining scan work since if we aim a little high,
518 // we can miss by a little.
520 // We *do* need to enforce that this is at least 1,
521 // since marking is racy and double-scanning objects
522 // may legitimately make the expected scan work
523 // negative.
524 scanWorkExpected = 1000
527 // Compute the heap distance remaining.
528 heapDistance := int64(memstats.next_gc) - int64(atomic.Load64(&memstats.heap_live))
529 if heapDistance <= 0 {
530 // This shouldn't happen, but if it does, avoid
531 // dividing by zero or setting the assist negative.
532 heapDistance = 1
535 // Compute the mutator assist ratio so by the time the mutator
536 // allocates the remaining heap bytes up to next_gc, it will
537 // have done (or stolen) the remaining amount of scan work.
538 c.assistWorkPerByte = float64(scanWorkExpected) / float64(heapDistance)
539 c.assistBytesPerWork = float64(heapDistance) / float64(scanWorkExpected)
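// (Worked example, for illustration: if 64 MB of scan work is expected to
// remain and 16 MB can still be allocated before reaching next_gc, then
// assistWorkPerByte is 4, i.e. each byte allocated during assists must pay
// for 4 bytes of scan work.)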
542 // endCycle computes the trigger ratio for the next cycle.
543 func (c *gcControllerState) endCycle() float64 {
544 if work.userForced {
545 // Forced GC means this cycle didn't start at the
546 // trigger, so where it finished isn't good
547 // information about how to adjust the trigger.
548 // Just leave it where it is.
549 return memstats.triggerRatio
552 // Proportional response gain for the trigger controller. Must
553 // be in [0, 1]. Lower values smooth out transient effects but
554 // take longer to respond to phase changes. Higher values
555 // react to phase changes quickly, but are more affected by
556 // transient changes. Values near 1 may be unstable.
557 const triggerGain = 0.5
559 // Compute next cycle trigger ratio. First, this computes the
560 // "error" for this cycle; that is, how far off the trigger
561 // was from what it should have been, accounting for both heap
562 // growth and GC CPU utilization. We compute the actual heap
563 // growth during this cycle and scale that by how far off from
564 // the goal CPU utilization we were (to estimate the heap
565 // growth if we had the desired CPU utilization). The
566 // difference between this estimate and the GOGC-based goal
567 // heap growth is the error.
568 goalGrowthRatio := float64(gcpercent) / 100
569 actualGrowthRatio := float64(memstats.heap_live)/float64(memstats.heap_marked) - 1
570 assistDuration := nanotime() - c.markStartTime
572 // Assume background mark hit its utilization goal.
573 utilization := gcGoalUtilization
574 // Add assist utilization; avoid divide by zero.
575 if assistDuration > 0 {
576 utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
579 triggerError := goalGrowthRatio - memstats.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-memstats.triggerRatio)
581 // Finally, we adjust the trigger for next time by this error,
582 // damped by the proportional gain.
583 triggerRatio := memstats.triggerRatio + triggerGain*triggerError
585 if debug.gcpacertrace > 0 {
586 // Print controller state in terms of the design
587 // document.
588 H_m_prev := memstats.heap_marked
589 h_t := memstats.triggerRatio
590 H_T := memstats.gc_trigger
591 h_a := actualGrowthRatio
592 H_a := memstats.heap_live
593 h_g := goalGrowthRatio
594 H_g := int64(float64(H_m_prev) * (1 + h_g))
595 u_a := utilization
596 u_g := gcGoalUtilization
597 W_a := c.scanWork
598 print("pacer: H_m_prev=", H_m_prev,
599 " h_t=", h_t, " H_T=", H_T,
600 " h_a=", h_a, " H_a=", H_a,
601 " h_g=", h_g, " H_g=", H_g,
602 " u_a=", u_a, " u_g=", u_g,
603 " W_a=", W_a,
604 " goalΔ=", goalGrowthRatio-h_t,
605 " actualΔ=", h_a-h_t,
606 " u_a/u_g=", u_a/u_g,
607 "\n")
610 return triggerRatio
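// (Worked example, for illustration: with GOGC=100 the goal growth ratio is
// 1.0. If the trigger ratio was 0.7, the heap actually grew by 0.85 before
// marking finished, and utilization matched the 25% goal, then
// triggerError = 1.0 - 0.7 - 1.0*(0.85-0.7) = 0.15 and the next trigger
// ratio becomes 0.7 + 0.5*0.15 = 0.775.)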
613 // enlistWorker encourages another dedicated mark worker to start on
614 // another P if there are spare worker slots. It is used by putfull
615 // when more work is made available.
617 //go:nowritebarrier
618 func (c *gcControllerState) enlistWorker() {
619 // If there are idle Ps, wake one so it will run an idle worker.
620 // NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
622 // if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
623 // wakep()
624 // return
625 // }
627 // There are no idle Ps. If we need more dedicated workers,
628 // try to preempt a running P so it will switch to a worker.
629 if c.dedicatedMarkWorkersNeeded <= 0 {
630 return
632 // Pick a random other P to preempt.
633 if gomaxprocs <= 1 {
634 return
636 gp := getg()
637 if gp == nil || gp.m == nil || gp.m.p == 0 {
638 return
640 myID := gp.m.p.ptr().id
641 for tries := 0; tries < 5; tries++ {
642 id := int32(fastrandn(uint32(gomaxprocs - 1)))
643 if id >= myID {
644 id++
646 p := allp[id]
647 if p.status != _Prunning {
648 continue
650 if preemptone(p) {
651 return
656 // findRunnableGCWorker returns the background mark worker for _p_ if it
657 // should be run. This must only be called when gcBlackenEnabled != 0.
658 func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
659 if gcBlackenEnabled == 0 {
660 throw("gcControllerState.findRunnable: blackening not enabled")
662 if _p_.gcBgMarkWorker == 0 {
663 // The mark worker associated with this P is blocked
664 // performing a mark transition. We can't run it
665 // because it may be on some other run or wait queue.
666 return nil
669 if !gcMarkWorkAvailable(_p_) {
670 // No work to be done right now. This can happen at
671 // the end of the mark phase when there are still
672 // assists tapering off. Don't bother running a worker
673 // now because it'll just return immediately.
674 return nil
677 decIfPositive := func(ptr *int64) bool {
678 if *ptr > 0 {
679 if atomic.Xaddint64(ptr, -1) >= 0 {
680 return true
682 // We lost a race
683 atomic.Xaddint64(ptr, +1)
685 return false
688 if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
689 // This P is now dedicated to marking until the end of
690 // the concurrent mark phase.
691 _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
692 } else {
693 if !decIfPositive(&c.fractionalMarkWorkersNeeded) {
694 // No more workers are needed right now.
695 return nil
698 // This P has picked the token for the fractional worker.
699 // Is the GC currently under or at the utilization goal?
700 // If so, do more work.
702 // We used to check whether doing one time slice of work
703 // would remain under the utilization goal, but that has the
704 // effect of delaying work until the mutator has run for
705 // enough time slices to pay for the work. During those time
706 // slices, write barriers are enabled, so the mutator is running slower.
707 // Now instead we do the work whenever we're under or at the
708 // utilization goal and pay for it by letting the mutator run later.
709 // This doesn't change the overall utilization averages, but it
710 // front loads the GC work so that the GC finishes earlier and
711 // write barriers can be turned off sooner, effectively giving
712 // the mutator a faster machine.
714 // The old, slower behavior can be restored by setting
715 // gcForcePreemptNS = forcePreemptNS.
716 const gcForcePreemptNS = 0
718 // TODO(austin): We could fast path this and basically
719 // eliminate contention on c.fractionalMarkWorkersNeeded by
720 // precomputing the minimum time at which it's worth
721 // next scheduling the fractional worker. Then Ps
722 // don't have to fight in the window where we've
723 // passed that deadline and no one has started the
724 // worker yet.
726 // TODO(austin): Shorter preemption interval for mark
727 // worker to improve fairness and give this
728 // finer-grained control over schedule?
729 now := nanotime() - gcController.markStartTime
730 then := now + gcForcePreemptNS
731 timeUsed := c.fractionalMarkTime + gcForcePreemptNS
732 if then > 0 && float64(timeUsed)/float64(then) > c.fractionalUtilizationGoal {
733 // Nope, we'd overshoot the utilization goal
734 atomic.Xaddint64(&c.fractionalMarkWorkersNeeded, +1)
735 return nil
737 _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
740 // Run the background mark worker
741 gp := _p_.gcBgMarkWorker.ptr()
742 casgstatus(gp, _Gwaiting, _Grunnable)
743 if trace.enabled {
744 traceGoUnpark(gp, 0)
746 return gp
749 // gcSetTriggerRatio sets the trigger ratio and updates everything
750 // derived from it: the absolute trigger, the heap goal, mark pacing,
751 // and sweep pacing.
753 // This can be called any time. If the GC is in the middle of a
754 // concurrent phase, it will adjust the pacing of that phase.
756 // This depends on gcpercent, memstats.heap_marked, and
757 // memstats.heap_live. These must be up to date.
759 // mheap_.lock must be held or the world must be stopped.
760 func gcSetTriggerRatio(triggerRatio float64) {
761 // Set the trigger ratio, capped to reasonable bounds.
762 if triggerRatio < 0 {
763 // This can happen if the mutator is allocating very
764 // quickly or the GC is scanning very slowly.
765 triggerRatio = 0
766 } else if gcpercent >= 0 {
767 // Ensure there's always a little margin so that the
768 // mutator assist ratio isn't infinity.
769 maxTriggerRatio := 0.95 * float64(gcpercent) / 100
770 if triggerRatio > maxTriggerRatio {
771 triggerRatio = maxTriggerRatio
774 memstats.triggerRatio = triggerRatio
776 // Compute the absolute GC trigger from the trigger ratio.
778 // We trigger the next GC cycle when the allocated heap has
779 // grown by the trigger ratio over the marked heap size.
780 trigger := ^uint64(0)
781 if gcpercent >= 0 {
782 trigger = uint64(float64(memstats.heap_marked) * (1 + triggerRatio))
783 // Don't trigger below the minimum heap size.
784 minTrigger := heapminimum
785 if !gosweepdone() {
786 // Concurrent sweep happens in the heap growth
787 // from heap_live to gc_trigger, so ensure
788 // that concurrent sweep has some heap growth
789 // in which to perform sweeping before we
790 // start the next GC cycle.
791 sweepMin := atomic.Load64(&memstats.heap_live) + sweepMinHeapDistance*uint64(gcpercent)/100
792 if sweepMin > minTrigger {
793 minTrigger = sweepMin
796 if trigger < minTrigger {
797 trigger = minTrigger
799 if int64(trigger) < 0 {
800 print("runtime: next_gc=", memstats.next_gc, " heap_marked=", memstats.heap_marked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n")
801 throw("gc_trigger underflow")
804 memstats.gc_trigger = trigger
806 // Compute the next GC goal, which is when the allocated heap
807 // has grown by GOGC/100 over the heap marked by the last
808 // cycle.
809 goal := ^uint64(0)
810 if gcpercent >= 0 {
811 goal = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
812 if goal < trigger {
813 // The trigger ratio is always less than GOGC/100, but
814 // other bounds on the trigger may have raised it.
815 // Push up the goal, too.
816 goal = trigger
819 memstats.next_gc = goal
820 if trace.enabled {
821 traceNextGC()
824 // Update mark pacing.
825 if gcphase != _GCoff {
826 gcController.revise()
829 // Update sweep pacing.
830 if gosweepdone() {
831 mheap_.sweepPagesPerByte = 0
832 } else {
833 // Concurrent sweep needs to sweep all of the in-use
834 // pages by the time the allocated heap reaches the GC
835 // trigger. Compute the ratio of in-use pages to sweep
836 // per byte allocated, accounting for the fact that
837 // some might already be swept.
838 heapLiveBasis := atomic.Load64(&memstats.heap_live)
839 heapDistance := int64(trigger) - int64(heapLiveBasis)
840 // Add a little margin so rounding errors and
841 // concurrent sweep are less likely to leave pages
842 // unswept when GC starts.
843 heapDistance -= 1024 * 1024
844 if heapDistance < _PageSize {
845 // Avoid setting the sweep ratio extremely high
846 heapDistance = _PageSize
848 pagesSwept := atomic.Load64(&mheap_.pagesSwept)
849 sweepDistancePages := int64(mheap_.pagesInUse) - int64(pagesSwept)
850 if sweepDistancePages <= 0 {
851 mheap_.sweepPagesPerByte = 0
852 } else {
853 mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance)
854 mheap_.sweepHeapLiveBasis = heapLiveBasis
855 // Write pagesSweptBasis last, since this
856 // signals concurrent sweeps to recompute
857 // their debt.
858 atomic.Store64(&mheap_.pagesSweptBasis, pagesSwept)
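// (Worked example, for illustration: if 512 in-use pages are still unswept
// and, after the 1 MB margin, 4 MB of allocation remains before the trigger,
// then sweepPagesPerByte = 512/(4<<20), i.e. the proportional sweeper must
// sweep about one page per 8 KB allocated.)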
863 // gcGoalUtilization is the goal CPU utilization for background
864 // marking as a fraction of GOMAXPROCS.
865 const gcGoalUtilization = 0.25
867 // gcCreditSlack is the amount of scan work credit that can
868 // accumulate locally before updating gcController.scanWork and,
869 // optionally, gcController.bgScanCredit. Lower values give a more
870 // accurate assist ratio and make it more likely that assists will
871 // successfully steal background credit. Higher values reduce memory
872 // contention.
873 const gcCreditSlack = 2000
875 // gcAssistTimeSlack is the nanoseconds of mutator assist time that
876 // can accumulate on a P before updating gcController.assistTime.
877 const gcAssistTimeSlack = 5000
879 // gcOverAssistWork determines how many extra units of scan work a GC
880 // assist does when an assist happens. This amortizes the cost of an
881 // assist by pre-paying for this many bytes of future allocations.
882 const gcOverAssistWork = 64 << 10
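// For example (illustration only), an assist that owes 2 KB of scan work
// instead performs 2 KB + 64 KB of work and banks the surplus as assist
// credit on the goroutine, so its next several allocations need no assist.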
884 var work struct {
885 full lfstack // lock-free list of full blocks workbuf
886 empty lfstack // lock-free list of empty blocks workbuf
887 pad0 [sys.CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
889 wbufSpans struct {
890 lock mutex
891 // free is a list of spans dedicated to workbufs, but
892 // that don't currently contain any workbufs.
893 free mSpanList
894 // busy is a list of all spans containing workbufs on
895 // one of the workbuf lists.
896 busy mSpanList
899 // Restore 64-bit alignment on 32-bit.
900 _ uint32
902 // bytesMarked is the number of bytes marked this cycle. This
903 // includes bytes blackened in scanned objects, noscan objects
904 // that go straight to black, and permagrey objects scanned by
905 // markroot during the concurrent scan phase. This is updated
906 // atomically during the cycle. Updates may be batched
907 // arbitrarily, since the value is only read at the end of the
908 // cycle.
910 // Because of benign races during marking, this number may not
911 // be the exact number of marked bytes, but it should be very
912 // close.
914 // Put this field here because it needs 64-bit atomic access
915 // (and thus 8-byte alignment even on 32-bit architectures).
916 bytesMarked uint64
918 markrootNext uint32 // next markroot job
919 markrootJobs uint32 // number of markroot jobs
921 nproc uint32
922 tstart int64
923 nwait uint32
924 ndone uint32
925 alldone note
927 // helperDrainBlock indicates that GC mark termination helpers
928 // should pass gcDrainBlock to gcDrain to block in the
929 // getfull() barrier. Otherwise, they should pass gcDrainNoBlock.
931 // TODO: This is a temporary fallback to work around races
932 // that cause early mark termination.
933 helperDrainBlock bool
935 // Number of roots of various root types. Set by gcMarkRootPrepare.
936 nFlushCacheRoots int
937 nDataRoots, nSpanRoots, nStackRoots int
939 // markrootDone indicates that roots have been marked at least
940 // once during the current GC cycle. This is checked by root
941 // marking operations that have to happen only during the
942 // first root marking pass, whether that's during the
943 // concurrent mark phase in current GC or mark termination in
944 // STW GC.
945 markrootDone bool
947 // Each type of GC state transition is protected by a lock.
948 // Since multiple threads can simultaneously detect the state
949 // transition condition, any thread that detects a transition
950 // condition must acquire the appropriate transition lock,
951 // re-check the transition condition and return if it no
952 // longer holds or perform the transition if it does.
953 // Likewise, any transition must invalidate the transition
954 // condition before releasing the lock. This ensures that each
955 // transition is performed by exactly one thread and threads
956 // that need the transition to happen block until it has
957 // happened.
959 // startSema protects the transition from "off" to mark or
960 // mark termination.
961 startSema uint32
962 // markDoneSema protects transitions from mark 1 to mark 2 and
963 // from mark 2 to mark termination.
964 markDoneSema uint32
966 bgMarkReady note // signal background mark worker has started
967 bgMarkDone uint32 // cas to 1 when at a background mark completion point
968 // Background mark completion signaling
970 // mode is the concurrency mode of the current GC cycle.
971 mode gcMode
973 // userForced indicates the current GC cycle was forced by an
974 // explicit user call.
975 userForced bool
977 // totaltime is the CPU nanoseconds spent in GC since the
978 // program started if debug.gctrace > 0.
979 totaltime int64
981 // initialHeapLive is the value of memstats.heap_live at the
982 // beginning of this GC cycle.
983 initialHeapLive uint64
985 // assistQueue is a queue of assists that are blocked because
986 // there was neither enough credit to steal nor enough work to
987 // do.
988 assistQueue struct {
989 lock mutex
990 head, tail guintptr
993 // sweepWaiters is a list of blocked goroutines to wake when
994 // we transition from mark termination to sweep.
995 sweepWaiters struct {
996 lock mutex
997 head guintptr
1000 // cycles is the number of completed GC cycles, where a GC
1001 // cycle is sweep termination, mark, mark termination, and
1002 // sweep. This differs from memstats.numgc, which is
1003 // incremented at mark termination.
1004 cycles uint32
1006 // Timing/utilization stats for this cycle.
1007 stwprocs, maxprocs int32
1008 tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
1010 pauseNS int64 // total STW time this cycle
1011 pauseStart int64 // nanotime() of last STW
1013 // debug.gctrace heap sizes for this cycle.
1014 heap0, heap1, heap2, heapGoal uint64
1017 // GC runs a garbage collection and blocks the caller until the
1018 // garbage collection is complete. It may also block the entire
1019 // program.
1020 func GC() {
1021 // We consider a cycle to be: sweep termination, mark, mark
1022 // termination, and sweep. This function shouldn't return
1023 // until a full cycle has been completed, from beginning to
1024 // end. Hence, we always want to finish up the current cycle
1025 // and start a new one. That means:
1027 // 1. In sweep termination, mark, or mark termination of cycle
1028 // N, wait until mark termination N completes and transitions
1029 // to sweep N.
1031 // 2. In sweep N, help with sweep N.
1033 // At this point we can begin a full cycle N+1.
1035 // 3. Trigger cycle N+1 by starting sweep termination N+1.
1037 // 4. Wait for mark termination N+1 to complete.
1039 // 5. Help with sweep N+1 until it's done.
1041 // This all has to be written to deal with the fact that the
1042 // GC may move ahead on its own. For example, when we block
1043 // until mark termination N, we may wake up in cycle N+2.
1045 gp := getg()
1047 // Prevent the GC phase or cycle count from changing.
1048 lock(&work.sweepWaiters.lock)
1049 n := atomic.Load(&work.cycles)
1050 if gcphase == _GCmark {
1051 // Wait until sweep termination, mark, and mark
1052 // termination of cycle N complete.
1053 gp.schedlink = work.sweepWaiters.head
1054 work.sweepWaiters.head.set(gp)
1055 goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1)
1056 } else {
1057 // We're in sweep N already.
1058 unlock(&work.sweepWaiters.lock)
1061 // We're now in sweep N or later. Trigger GC cycle N+1, which
1062 // will first finish sweep N if necessary and then enter sweep
1063 // termination N+1.
1064 gcStart(gcBackgroundMode, gcTrigger{kind: gcTriggerCycle, n: n + 1})
1066 // Wait for mark termination N+1 to complete.
1067 lock(&work.sweepWaiters.lock)
1068 if gcphase == _GCmark && atomic.Load(&work.cycles) == n+1 {
1069 gp.schedlink = work.sweepWaiters.head
1070 work.sweepWaiters.head.set(gp)
1071 goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1)
1072 } else {
1073 unlock(&work.sweepWaiters.lock)
1076 // Finish sweep N+1 before returning. We do this both to
1077 // complete the cycle and because runtime.GC() is often used
1078 // as part of tests and benchmarks to get the system into a
1079 // relatively stable and isolated state.
1080 for atomic.Load(&work.cycles) == n+1 && gosweepone() != ^uintptr(0) {
1081 sweep.nbgsweep++
1082 Gosched()
1085 // Callers may assume that the heap profile reflects the
1086 // just-completed cycle when this returns (historically this
1087 // happened because this was a STW GC), but right now the
1088 // profile still reflects mark termination N, not N+1.
1090 // As soon as all of the sweep frees from cycle N+1 are done,
1091 // we can go ahead and publish the heap profile.
1093 // First, wait for sweeping to finish. (We know there are no
1094 // more spans on the sweep queue, but we may be concurrently
1095 // sweeping spans, so we have to wait.)
1096 for atomic.Load(&work.cycles) == n+1 && atomic.Load(&mheap_.sweepers) != 0 {
1097 Gosched()
1100 // Now we're really done with sweeping, so we can publish the
1101 // stable heap profile. Only do this if we haven't already hit
1102 // another mark termination.
1103 mp := acquirem()
1104 cycle := atomic.Load(&work.cycles)
1105 if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
1106 mProf_PostSweep()
1108 releasem(mp)
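// For example (illustration only), tests commonly force a quiescent heap
// before reading statistics:
//
//	runtime.GC()
//	var ms runtime.MemStats
//	runtime.ReadMemStats(&ms)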
1111 // gcMode indicates how concurrent a GC cycle should be.
1112 type gcMode int
1114 const (
1115 gcBackgroundMode gcMode = iota // concurrent GC and sweep
1116 gcForceMode // stop-the-world GC now, concurrent sweep
1117 gcForceBlockMode // stop-the-world GC now and STW sweep (forced by user)
1120 // A gcTrigger is a predicate for starting a GC cycle. Specifically,
1121 // it is an exit condition for the _GCoff phase.
1122 type gcTrigger struct {
1123 kind gcTriggerKind
1124 now int64 // gcTriggerTime: current time
1125 n uint32 // gcTriggerCycle: cycle number to start
1128 type gcTriggerKind int
1130 const (
1131 // gcTriggerAlways indicates that a cycle should be started
1132 // unconditionally, even if GOGC is off or we're in a cycle
1133 // right now. This cannot be consolidated with other cycles.
1134 gcTriggerAlways gcTriggerKind = iota
1136 // gcTriggerHeap indicates that a cycle should be started when
1137 // the heap size reaches the trigger heap size computed by the
1138 // controller.
1139 gcTriggerHeap
1141 // gcTriggerTime indicates that a cycle should be started when
1142 // it's been more than forcegcperiod nanoseconds since the
1143 // previous GC cycle.
1144 gcTriggerTime
1146 // gcTriggerCycle indicates that a cycle should be started if
1147 // we have not yet started cycle number gcTrigger.n (relative
1148 // to work.cycles).
1149 gcTriggerCycle
1152 // test returns true if the trigger condition is satisfied, meaning
1153 // that the exit condition for the _GCoff phase has been met. The exit
1154 // condition should be tested when allocating.
1155 func (t gcTrigger) test() bool {
1156 if !memstats.enablegc || panicking != 0 {
1157 return false
1159 if t.kind == gcTriggerAlways {
1160 return true
1162 if gcphase != _GCoff || gcpercent < 0 {
1163 return false
1165 switch t.kind {
1166 case gcTriggerHeap:
1167 // Non-atomic access to heap_live for performance. If
1168 // we are going to trigger on this, this thread just
1169 // atomically wrote heap_live anyway and we'll see our
1170 // own write.
1171 return memstats.heap_live >= memstats.gc_trigger
1172 case gcTriggerTime:
1173 lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
1174 return lastgc != 0 && t.now-lastgc > forcegcperiod
1175 case gcTriggerCycle:
1176 // t.n > work.cycles, but accounting for wraparound.
1177 return int32(t.n-work.cycles) > 0
1179 return true
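// For example (illustration only), the allocator's slow path tests
//
//	gcTrigger{kind: gcTriggerHeap}
//
// after growing the heap, and the forcegc helper tests
//
//	gcTrigger{kind: gcTriggerTime, now: nanotime()}
//
// to start a cycle when more than forcegcperiod has elapsed since the last one.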
1182 // gcStart transitions the GC from _GCoff to _GCmark (in background
1183 // mode) or directly to mark termination (in the stop-the-world modes) by
1184 // performing sweep termination and GC initialization.
1186 // This may return without performing this transition in some cases,
1187 // such as when called on a system stack or with locks held.
1188 func gcStart(mode gcMode, trigger gcTrigger) {
1189 // Since this is called from malloc and malloc is called in
1190 // the guts of a number of libraries that might be holding
1191 // locks, don't attempt to start GC in non-preemptible or
1192 // potentially unstable situations.
1193 mp := acquirem()
1194 if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
1195 releasem(mp)
1196 return
1198 releasem(mp)
1199 mp = nil
1201 // Pick up the remaining unswept/not being swept spans concurrently
1203 // This shouldn't happen if we're being invoked in background
1204 // mode since proportional sweep should have just finished
1205 // sweeping everything, but rounding errors, etc, may leave a
1206 // few spans unswept. In forced mode, this is necessary since
1207 // GC can be forced at any point in the sweeping cycle.
1209 // We check the transition condition continuously here in case
1210 // this G gets delayed into the next GC cycle.
1211 for trigger.test() && gosweepone() != ^uintptr(0) {
1212 sweep.nbgsweep++
1215 // Perform GC initialization and the sweep termination
1216 // transition.
1217 semacquire(&work.startSema)
1218 // Re-check transition condition under transition lock.
1219 if !trigger.test() {
1220 semrelease(&work.startSema)
1221 return
1224 // For stats, check if this GC was forced by the user.
1225 work.userForced = trigger.kind == gcTriggerAlways || trigger.kind == gcTriggerCycle
1227 // In gcstoptheworld debug mode, upgrade the mode accordingly.
1228 // We do this after re-checking the transition condition so
1229 // that multiple goroutines that detect the heap trigger don't
1230 // start multiple STW GCs.
1231 if mode == gcBackgroundMode {
1232 if debug.gcstoptheworld == 1 {
1233 mode = gcForceMode
1234 } else if debug.gcstoptheworld == 2 {
1235 mode = gcForceBlockMode
1239 // Ok, we're doing it! Stop everybody else
1240 semacquire(&worldsema)
1242 if trace.enabled {
1243 traceGCStart()
1246 if mode == gcBackgroundMode {
1247 gcBgMarkStartWorkers()
1250 gcResetMarkState()
1252 work.stwprocs, work.maxprocs = gcprocs(), gomaxprocs
1253 work.heap0 = atomic.Load64(&memstats.heap_live)
1254 work.pauseNS = 0
1255 work.mode = mode
1257 now := nanotime()
1258 work.tSweepTerm = now
1259 work.pauseStart = now
1260 systemstack(stopTheWorldWithSema)
1261 // Finish sweep before we start concurrent scan.
1262 systemstack(func() {
1263 finishsweep_m()
1265 // clearpools before we start the GC. If we wait, the memory will not be
1266 // reclaimed until the next GC cycle.
1267 clearpools()
1269 work.cycles++
1270 if mode == gcBackgroundMode { // Do as much work concurrently as possible
1271 gcController.startCycle()
1272 work.heapGoal = memstats.next_gc
1274 // Enter concurrent mark phase and enable
1275 // write barriers.
1277 // Because the world is stopped, all Ps will
1278 // observe that write barriers are enabled by
1279 // the time we start the world and begin
1280 // scanning.
1282 // Write barriers must be enabled before assists are
1283 // enabled because they must be enabled before
1284 // any non-leaf heap objects are marked. Since
1285 // allocations are blocked until assists can
1286 // happen, we want to enable assists as early as
1287 // possible.
1288 setGCPhase(_GCmark)
1290 gcBgMarkPrepare() // Must happen before assist enable.
1291 gcMarkRootPrepare()
1293 // Mark all active tinyalloc blocks. Since we're
1294 // allocating from these, they need to be black like
1295 // other allocations. The alternative is to blacken
1296 // the tiny block on every allocation from it, which
1297 // would slow down the tiny allocator.
1298 gcMarkTinyAllocs()
1300 // At this point all Ps have enabled the write
1301 // barrier, thus maintaining the no white to
1302 // black invariant. Enable mutator assists to
1303 // put back-pressure on fast allocating
1304 // mutators.
1305 atomic.Store(&gcBlackenEnabled, 1)
1307 // Assists and workers can start the moment we start
1308 // the world.
1309 gcController.markStartTime = now
1311 // Concurrent mark.
1312 systemstack(startTheWorldWithSema)
1313 now = nanotime()
1314 work.pauseNS += now - work.pauseStart
1315 work.tMark = now
1316 } else {
1317 t := nanotime()
1318 work.tMark, work.tMarkTerm = t, t
1319 work.heapGoal = work.heap0
1321 // Perform mark termination. This will restart the world.
1322 gcMarkTermination(memstats.triggerRatio)
1325 semrelease(&work.startSema)
1328 // gcMarkDone transitions the GC from mark 1 to mark 2 and from mark 2
1329 // to mark termination.
1331 // This should be called when all mark work has been drained. In mark
1332 // 1, this includes all root marking jobs, global work buffers, and
1333 // active work buffers in assists and background workers; however,
1334 // work may still be cached in per-P work buffers. In mark 2, per-P
1335 // caches are disabled.
1337 // The calling context must be preemptible.
1339 // Note that it is explicitly okay to have write barriers in this
1340 // function because completion of concurrent mark is best-effort
1341 // anyway. Any work created by write barriers here will be cleaned up
1342 // by mark termination.
1343 func gcMarkDone() {
1344 top:
1345 semacquire(&work.markDoneSema)
1347 // Re-check transition condition under transition lock.
1348 if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
1349 semrelease(&work.markDoneSema)
1350 return
1353 // Disallow starting new workers so that any remaining workers
1354 // in the current mark phase will drain out.
1356 // TODO(austin): Should dedicated workers keep an eye on this
1357 // and exit gcDrain promptly?
1358 atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff)
1359 atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, -0xffffffff)
1361 if !gcBlackenPromptly {
1362 // Transition from mark 1 to mark 2.
1364 // The global work list is empty, but there can still be work
1365 // sitting in the per-P work caches.
1366 // Flush and disable work caches.
1368 // Disallow caching workbufs and indicate that we're in mark 2.
1369 gcBlackenPromptly = true
1371 // Prevent completion of mark 2 until we've flushed
1372 // cached workbufs.
1373 atomic.Xadd(&work.nwait, -1)
1375 // GC is set up for mark 2. Let Gs blocked on the
1376 // transition lock go while we flush caches.
1377 semrelease(&work.markDoneSema)
1379 systemstack(func() {
1380 // Flush all currently cached workbufs and
1381 // ensure all Ps see gcBlackenPromptly. This
1382 // also blocks until any remaining mark 1
1383 // workers have exited their loop so we can
1384 // start new mark 2 workers.
1385 forEachP(func(_p_ *p) {
1386 _p_.gcw.dispose()
1390 // Check that roots are marked. We should be able to
1391 // do this before the forEachP, but based on issue
1392 // #16083 there may be a (harmless) race where we can
1393 // enter mark 2 while some workers are still scanning
1394 // stacks. The forEachP ensures these scans are done.
1396 // TODO(austin): Figure out the race and fix this
1397 // properly.
1398 gcMarkRootCheck()
1400 // Now we can start up mark 2 workers.
1401 atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff)
1402 atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 0xffffffff)
1404 incnwait := atomic.Xadd(&work.nwait, +1)
1405 if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
1406 // This loop will make progress because
1407 // gcBlackenPromptly is now true, so it won't
1408 // take this same "if" branch.
1409 goto top
1411 } else {
1412 // Transition to mark termination.
1413 now := nanotime()
1414 work.tMarkTerm = now
1415 work.pauseStart = now
1416 getg().m.preemptoff = "gcing"
1417 systemstack(stopTheWorldWithSema)
1418 // The gcphase is _GCmark; it will transition to _GCmarktermination
1419 // below. The important thing is that the wb remains active until
1420 // all marking is complete. This includes writes made by the GC.
1422 // Record that one root marking pass has completed.
1423 work.markrootDone = true
1425 // Disable assists and background workers. We must do
1426 // this before waking blocked assists.
1427 atomic.Store(&gcBlackenEnabled, 0)
1429 // Wake all blocked assists. These will run when we
1430 // start the world again.
1431 gcWakeAllAssists()
1433 // Likewise, release the transition lock. Blocked
1434 // workers and assists will run when we start the
1435 // world again.
1436 semrelease(&work.markDoneSema)
1438 // endCycle depends on all gcWork cache stats being
1439 // flushed. This is ensured by mark 2.
1440 nextTriggerRatio := gcController.endCycle()
1442 // Perform mark termination. This will restart the world.
1443 gcMarkTermination(nextTriggerRatio)
1447 func gcMarkTermination(nextTriggerRatio float64) {
1448 // World is stopped.
1449 // Start marktermination which includes enabling the write barrier.
1450 atomic.Store(&gcBlackenEnabled, 0)
1451 gcBlackenPromptly = false
1452 setGCPhase(_GCmarktermination)
1454 work.heap1 = memstats.heap_live
1455 startTime := nanotime()
1457 mp := acquirem()
1458 mp.preemptoff = "gcing"
1459 _g_ := getg()
1460 _g_.m.traceback = 2
1461 gp := _g_.m.curg
1462 casgstatus(gp, _Grunning, _Gwaiting)
1463 gp.waitreason = "garbage collection"
1465 // Run gc on the g0 stack. We do this so that the g stack
1466 // we're currently running on will no longer change. Cuts
1467 // the root set down a bit (g0 stacks are not scanned, and
1468 // we don't need to scan gc's internal state). We also
1469 // need to switch to g0 so we can shrink the stack.
1470 systemstack(func() {
1471 gcMark(startTime)
1472 // Must return immediately.
1473 // The outer function's stack may have moved
1474 // during gcMark (it shrinks stacks, including the
1475 // outer function's stack), so we must not refer
1476 // to any of its variables. Return back to the
1477 // non-system stack to pick up the new addresses
1478 // before continuing.
1481 systemstack(func() {
1482 work.heap2 = work.bytesMarked
1483 if debug.gccheckmark > 0 {
1484 // Run a full stop-the-world mark using checkmark bits,
1485 // to check that we didn't forget to mark anything during
1486 // the concurrent mark process.
1487 gcResetMarkState()
1488 initCheckmarks()
1489 gcMark(startTime)
1490 clearCheckmarks()
1493 // marking is complete so we can turn the write barrier off
1494 setGCPhase(_GCoff)
1495 gcSweep(work.mode)
1497 if debug.gctrace > 1 {
1498 startTime = nanotime()
1499 // The g stacks have been scanned so
1500 // they have gcscanvalid==true and gcworkdone==true.
1501 // Reset these so that all stacks will be rescanned.
1502 gcResetMarkState()
1503 finishsweep_m()
1505 // Still in STW but gcphase is _GCoff, reset to _GCmarktermination
1506 // At this point all objects will be found during the gcMark which
1507 // does a complete STW mark and object scan.
1508 setGCPhase(_GCmarktermination)
1509 gcMark(startTime)
1510 setGCPhase(_GCoff) // marking is done, turn off wb.
1511 gcSweep(work.mode)
1515 _g_.m.traceback = 0
1516 casgstatus(gp, _Gwaiting, _Grunning)
1518 if trace.enabled {
1519 traceGCDone()
1522 // all done
1523 mp.preemptoff = ""
1525 if gcphase != _GCoff {
1526 throw("gc done but gcphase != _GCoff")
1529 // Update GC trigger and pacing for the next cycle.
1530 gcSetTriggerRatio(nextTriggerRatio)
1532 // Update timing memstats
1533 now := nanotime()
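// time_now returns wall-clock seconds plus the nanoseconds within that
// second, so sec*1e9+int64(nsec) is Unix time in nanoseconds
// (for example, sec=3, nsec=5 gives 3000000005).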
1534 sec, nsec, _ := time_now()
1535 unixNow := sec*1e9 + int64(nsec)
1536 work.pauseNS += now - work.pauseStart
1537 work.tEnd = now
1538 atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
1539 atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
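// pause_ns and pause_end are fixed-size circular buffers indexed by
// numgc, so only the most recent pauses are retained; pause_total_ns
// keeps the running sum.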
1540 memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
1541 memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
1542 memstats.pause_total_ns += uint64(work.pauseNS)
1544 // Update work.totaltime.
1545 sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
1546 // We report idle marking time below, but omit it from the
1547 // overall utilization here since it's "free".
1548 markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
1549 markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
1550 cycleCpu := sweepTermCpu + markCpu + markTermCpu
1551 work.totaltime += cycleCpu
1553 // Compute overall GC CPU utilization.
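// totalCpu is, roughly, the CPU time available to the program since it
// started: sched.totaltime accumulates GOMAXPROCS*elapsed across past
// GOMAXPROCS changes, and the second term covers the time since the
// last change. gc_cpu_fraction is cumulative GC CPU over that capacity.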
1554 totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
1555 memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
1557 // Reset sweep state.
1558 sweep.nbgsweep = 0
1559 sweep.npausesweep = 0
1561 if work.userForced {
1562 memstats.numforcedgc++
1565 // Bump GC cycle count and wake goroutines waiting on sweep.
1566 lock(&work.sweepWaiters.lock)
1567 memstats.numgc++
1568 injectglist(work.sweepWaiters.head.ptr())
1569 work.sweepWaiters.head = 0
1570 unlock(&work.sweepWaiters.lock)
1572 // Finish the current heap profiling cycle and start a new
1573 // heap profiling cycle. We do this before starting the world
1574 // so events don't leak into the wrong cycle.
1575 mProf_NextCycle()
1577 systemstack(startTheWorldWithSema)
1579 // Flush the heap profile so we can start a new cycle next GC.
1580 // This is relatively expensive, so we don't do it with the
1581 // world stopped.
1582 mProf_Flush()
1584 // Prepare workbufs for freeing by the sweeper. We do this
1585 // asynchronously because it can take non-trivial time.
1586 prepareFreeWorkbufs()
1588 // Print gctrace before dropping worldsema. As soon as we drop
1589 // worldsema another cycle could start and smash the stats
1590 // we're trying to print.
1591 if debug.gctrace > 0 {
1592 util := int(memstats.gc_cpu_fraction * 100)
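// The resulting line looks roughly like this (values invented):
//
//	gc 7 @2.104s 1%: 0.018+1.3+0.076 ms clock, 0.054+0.35/1.0/3.0+0.23 ms cpu, 4->5->1 MB, 5 MB goal, 4 P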
1594 var sbuf [24]byte
1595 printlock()
1596 print("gc ", memstats.numgc,
1597 " @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
1598 util, "%: ")
1599 prev := work.tSweepTerm
1600 for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
1601 if i != 0 {
1602 print("+")
1604 print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
1605 prev = ns
1607 print(" ms clock, ")
1608 for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
1609 if i == 2 || i == 3 {
1610 // Separate mark time components with /.
1611 print("/")
1612 } else if i != 0 {
1613 print("+")
1615 print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
1617 print(" ms cpu, ",
1618 work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
1619 work.heapGoal>>20, " MB goal, ",
1620 work.maxprocs, " P")
1621 if work.userForced {
1622 print(" (forced)")
1624 print("\n")
1625 printunlock()
1628 semrelease(&worldsema)
1629 // Careful: another GC cycle may start now.
1631 releasem(mp)
1632 mp = nil
1634 // now that gc is done, kick off finalizer thread if needed
1635 if !concurrentSweep {
1636 // give the queued finalizers, if any, a chance to run
1637 Gosched()
1641 // gcBgMarkStartWorkers prepares background mark worker goroutines.
1642 // These goroutines will not run until the mark phase, but they must
1643 // be started while the world is not stopped and from a regular G
1644 // stack. The caller must hold worldsema.
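// After starting a worker it waits on work.bgMarkReady until that
// worker has signaled that it is initialized, so workers are created
// one at a time and all of them exist before marking begins.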
1645 func gcBgMarkStartWorkers() {
1646 // Background marking is performed by per-P G's. Ensure that
1647 // each P has a background GC G.
1648 for _, p := range &allp {
1649 if p == nil || p.status == _Pdead {
1650 break
1652 if p.gcBgMarkWorker == 0 {
1653 expectSystemGoroutine()
1654 go gcBgMarkWorker(p)
1655 notetsleepg(&work.bgMarkReady, -1)
1656 noteclear(&work.bgMarkReady)
1661 // gcBgMarkPrepare sets up state for background marking.
1662 // Mutator assists must not yet be enabled.
1663 func gcBgMarkPrepare() {
1664 // Background marking will stop when the work queues are empty
1665 // and there are no more workers (note that, since this is
1666 // concurrent, this may be a transient state, but mark
1667 // termination will clean it up). Between background workers
1668 // and assists, we don't really know how many workers there
1669 // will be, so we pretend to have an arbitrarily large number
1670 // of workers, almost all of which are "waiting". While a
1671 // worker is working it decrements nwait. If nproc == nwait,
1672 // there are no workers.
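// Concretely: a worker decrements nwait when it starts working (so
// nwait != nproc while anyone is working) and increments it when it
// stops; the worker whose increment brings nwait back to nproc checks
// whether any work remains and, if not, signals completion (see
// gcBgMarkWorker below).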
1673 work.nproc = ^uint32(0)
1674 work.nwait = ^uint32(0)
1677 func gcBgMarkWorker(_p_ *p) {
1678 setSystemGoroutine()
1680 gp := getg()
1682 type parkInfo struct {
1683 m muintptr // Release this m on park.
1684 attach puintptr // If non-nil, attach to this p on park.
1686 // We pass park to a gopark unlock function, so it can't be on
1687 // the stack (see gopark). Prevent deadlock from recursively
1688 // starting GC by disabling preemption.
1689 gp.m.preemptoff = "GC worker init"
1690 park := new(parkInfo)
1691 gp.m.preemptoff = ""
1693 park.m.set(acquirem())
1694 park.attach.set(_p_)
1695 // Inform gcBgMarkStartWorkers that this worker is ready.
1696 // After this point, the background mark worker is scheduled
1697 // cooperatively by gcController.findRunnable. Hence, it must
1698 // never be preempted, as this would put it into _Grunnable
1699 // and put it on a run queue. Instead, when the preempt flag
1700 // is set, this puts itself into _Gwaiting to be woken up by
1701 // gcController.findRunnable at the appropriate time.
1702 notewakeup(&work.bgMarkReady)
1704 for {
1705 // Go to sleep until woken by gcController.findRunnable.
1706 // We can't releasem yet since even the call to gopark
1707 // may be preempted.
1708 gopark(func(g *g, parkp unsafe.Pointer) bool {
1709 park := (*parkInfo)(parkp)
1711 // The worker G is no longer running, so it's
1712 // now safe to allow preemption.
1713 releasem(park.m.ptr())
1715 // If the worker isn't attached to its P,
1716 // attach now. During initialization and after
1717 // a phase change, the worker may have been
1718 // running on a different P. As soon as we
1719 // attach, the owner P may schedule the
1720 // worker, so this must be done after the G is
1721 // stopped.
1722 if park.attach != 0 {
1723 p := park.attach.ptr()
1724 park.attach.set(nil)
1725 // cas the worker because we may be
1726 // racing with a new worker starting
1727 // on this P.
1728 if !p.gcBgMarkWorker.cas(0, guintptr(unsafe.Pointer(g))) {
1729 // The P got a new worker.
1730 // Exit this worker.
1731 return false
1734 return true
1735 }, unsafe.Pointer(park), "GC worker (idle)", traceEvGoBlock, 0)
1737 // Loop until the P dies and disassociates this
1738 // worker (the P may later be reused, in which case
1739 // it will get a new worker) or we failed to associate.
1740 if _p_.gcBgMarkWorker.ptr() != gp {
1741 break
1744 // Disable preemption so we can use the gcw. If the
1745 // scheduler wants to preempt us, we'll stop draining,
1746 // dispose the gcw, and then preempt.
1747 park.m.set(acquirem())
1749 if gcBlackenEnabled == 0 {
1750 throw("gcBgMarkWorker: blackening not enabled")
1753 startTime := nanotime()
1755 decnwait := atomic.Xadd(&work.nwait, -1)
1756 if decnwait == work.nproc {
1757 println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
1758 throw("work.nwait was > work.nproc")
1761 systemstack(func() {
1762 // Mark our goroutine preemptible so its stack
1763 // can be scanned. This lets two mark workers
1764 // scan each other (otherwise, they would
1765 // deadlock). We must not modify anything on
1766 // the G stack. However, stack shrinking is
1767 // disabled for mark workers, so it is safe to
1768 // read from the G stack.
1769 casgstatus(gp, _Grunning, _Gwaiting)
1770 switch _p_.gcMarkWorkerMode {
1771 default:
1772 throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
1773 case gcMarkWorkerDedicatedMode:
1774 gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
1775 if gp.preempt {
1776 // We were preempted. This is
1777 // a useful signal to kick
1778 // everything out of the run
1779 // queue so it can run
1780 // somewhere else.
1781 lock(&sched.lock)
1782 for {
1783 gp, _ := runqget(_p_)
1784 if gp == nil {
1785 break
1787 globrunqput(gp)
1789 unlock(&sched.lock)
1791 // Go back to draining, this time
1792 // without preemption.
1793 gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit)
1794 case gcMarkWorkerFractionalMode:
1795 gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
1796 case gcMarkWorkerIdleMode:
1797 gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1799 casgstatus(gp, _Gwaiting, _Grunning)
1802 // If we are nearing the end of mark, dispose
1803 // of the cache promptly. We must do this
1804 // before signaling that we're no longer
1805 // working so that other workers can't observe
1806 // no workers and no work while we have this
1807 // cached, and before we compute done.
1808 if gcBlackenPromptly {
1809 _p_.gcw.dispose()
1812 // Account for time.
1813 duration := nanotime() - startTime
1814 switch _p_.gcMarkWorkerMode {
1815 case gcMarkWorkerDedicatedMode:
1816 atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
1817 atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
1818 case gcMarkWorkerFractionalMode:
1819 atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
1820 atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
1821 case gcMarkWorkerIdleMode:
1822 atomic.Xaddint64(&gcController.idleMarkTime, duration)
1825 // Was this the last worker and did we run out
1826 // of work?
1827 incnwait := atomic.Xadd(&work.nwait, +1)
1828 if incnwait > work.nproc {
1829 println("runtime: p.gcMarkWorkerMode=", _p_.gcMarkWorkerMode,
1830 "work.nwait=", incnwait, "work.nproc=", work.nproc)
1831 throw("work.nwait > work.nproc")
1834 // If this worker reached a background mark completion
1835 // point, signal the main GC goroutine.
1836 if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
1837 // Make this G preemptible and disassociate it
1838 // as the worker for this P so
1839 // findRunnableGCWorker doesn't try to
1840 // schedule it.
1841 _p_.gcBgMarkWorker.set(nil)
1842 releasem(park.m.ptr())
1844 gcMarkDone()
1846 // Disable preemption and prepare to reattach
1847 // to the P.
1849 // We may be running on a different P at this
1850 // point, so we can't reattach until this G is
1851 // parked.
1852 park.m.set(acquirem())
1853 park.attach.set(_p_)
1858 // gcMarkWorkAvailable returns true if executing a mark worker
1859 // on p is potentially useful. p may be nil, in which case it only
1860 // checks the global sources of work.
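// For example, gcBgMarkWorker calls gcMarkWorkAvailable(nil) after it
// stops working to decide whether it was the last worker with no
// global work left, in which case it calls gcMarkDone.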
1861 func gcMarkWorkAvailable(p *p) bool {
1862 if p != nil && !p.gcw.empty() {
1863 return true
1865 if !work.full.empty() {
1866 return true // global work available
1868 if work.markrootNext < work.markrootJobs {
1869 return true // root scan work available
1871 return false
1874 // gcMark runs the mark (or, for concurrent GC, mark termination)
1875 // All gcWork caches must be empty.
1876 // STW is in effect at this point.
1877 //TODO go:nowritebarrier
1878 func gcMark(start_time int64) {
1879 if debug.allocfreetrace > 0 {
1880 tracegc()
1883 if gcphase != _GCmarktermination {
1884 throw("in gcMark expecting to see gcphase as _GCmarktermination")
1886 work.tstart = start_time
1888 // Queue root marking jobs.
1889 gcMarkRootPrepare()
1891 work.nwait = 0
1892 work.ndone = 0
1893 work.nproc = uint32(gcprocs())
1895 if work.full == 0 && work.nDataRoots+work.nSpanRoots+work.nStackRoots == 0 {
1896 // There's no work on the work queue and no root jobs
1897 // that can produce work, so don't bother entering the
1898 // getfull() barrier.
1900 // This will be the situation the vast majority of the
1901 // time after concurrent mark. However, we still need
1902 // a fallback for STW GC and because there are some
1903 // known races that occasionally leave work around for
1904 // mark termination.
1906 // We're still hedging our bets here: if we do
1907 // accidentally produce some work, we'll still process
1908 // it, just not necessarily in parallel.
1910 // TODO(austin): Fix the races and remove
1911 // work draining from mark termination so we don't
1912 // need the fallback path.
1913 work.helperDrainBlock = false
1914 } else {
1915 work.helperDrainBlock = true
1918 if trace.enabled {
1919 traceGCScanStart()
1922 if work.nproc > 1 {
1923 noteclear(&work.alldone)
1924 helpgc(int32(work.nproc))
1927 gchelperstart()
1929 gcw := &getg().m.p.ptr().gcw
1930 if work.helperDrainBlock {
1931 gcDrain(gcw, gcDrainBlock)
1932 } else {
1933 gcDrain(gcw, gcDrainNoBlock)
1935 gcw.dispose()
1937 if debug.gccheckmark > 0 {
1938 // This is expensive when there's a large number of
1939 // Gs, so only do it if checkmark is also enabled.
1940 gcMarkRootCheck()
1942 if work.full != 0 {
1943 throw("work.full != 0")
1946 if work.nproc > 1 {
1947 notesleep(&work.alldone)
1950 // Record that at least one root marking pass has completed.
1951 work.markrootDone = true
1953 // Double-check that all gcWork caches are empty. This should
1954 // be ensured by mark 2 before we enter mark termination.
1955 for i := 0; i < int(gomaxprocs); i++ {
1956 gcw := &allp[i].gcw
1957 if !gcw.empty() {
1958 throw("P has cached GC work at end of mark termination")
1960 if gcw.scanWork != 0 || gcw.bytesMarked != 0 {
1961 throw("P has unflushed stats at end of mark termination")
1965 if trace.enabled {
1966 traceGCScanDone()
1969 cachestats()
1971 // Update the marked heap stat.
1972 memstats.heap_marked = work.bytesMarked
1974 // Update other GC heap size stats. This must happen after
1975 // cachestats (which flushes local statistics to these) and
1976 // flushallmcaches (which modifies heap_live).
1977 memstats.heap_live = work.bytesMarked
1978 memstats.heap_scan = uint64(gcController.scanWork)
1980 if trace.enabled {
1981 traceHeapAlloc()
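// gcSweep moves the heap to a new sweep generation and starts the
// sweep phase. Normally it just wakes the background sweeper; if
// _ConcurrentSweep is disabled or mode is gcForceBlockMode it instead
// sweeps every span (and frees workbufs) synchronously before
// returning.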
1985 func gcSweep(mode gcMode) {
1986 if gcphase != _GCoff {
1987 throw("gcSweep being done but phase is not GCoff")
1990 lock(&mheap_.lock)
1991 mheap_.sweepgen += 2
1992 mheap_.sweepdone = 0
1993 if mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 {
1994 // We should have drained this list during the last
1995 // sweep phase. We certainly need to start this phase
1996 // with an empty swept list.
1997 throw("non-empty swept list")
1999 mheap_.pagesSwept = 0
2000 unlock(&mheap_.lock)
2002 if !_ConcurrentSweep || mode == gcForceBlockMode {
2003 // Special case synchronous sweep.
2004 // Record that no proportional sweeping has to happen.
2005 lock(&mheap_.lock)
2006 mheap_.sweepPagesPerByte = 0
2007 unlock(&mheap_.lock)
2008 // Sweep all spans eagerly.
2009 for sweepone() != ^uintptr(0) {
2010 sweep.npausesweep++
2012 // Free workbufs eagerly.
2013 prepareFreeWorkbufs()
2014 for freeSomeWbufs(false) {
2016 // All "free" events for this mark/sweep cycle have
2017 // now happened, so we can make this profile cycle
2018 // available immediately.
2019 mProf_NextCycle()
2020 mProf_Flush()
2021 return
2024 // Background sweep.
2025 lock(&sweep.lock)
2026 if sweep.parked {
2027 sweep.parked = false
2028 ready(sweep.g, 0, true)
2030 unlock(&sweep.lock)
2033 // gcResetMarkState resets global state prior to marking (concurrent
2034 // or STW) and resets the stack scan state of all Gs.
2036 // This is safe to do without the world stopped because any Gs created
2037 // during or after this will start out in the reset state.
2038 func gcResetMarkState() {
2039 // This may be called during a concurrent phase, so make sure
2040 // allgs doesn't change.
2041 lock(&allglock)
2042 for _, gp := range allgs {
2043 gp.gcscandone = false // set to true in gcphasework
2044 gp.gcscanvalid = false // stack has not been scanned
2045 gp.gcAssistBytes = 0
2047 unlock(&allglock)
2049 work.bytesMarked = 0
2050 work.initialHeapLive = atomic.Load64(&memstats.heap_live)
2051 work.markrootDone = false
2054 // Hooks for other packages
2056 var poolcleanup func()
2058 //go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
2059 func sync_runtime_registerPoolCleanup(f func()) {
2060 poolcleanup = f
2063 func clearpools() {
2064 // clear sync.Pools
2065 if poolcleanup != nil {
2066 poolcleanup()
2069 // Clear central sudog cache.
2070 // Leave per-P caches alone, they have strictly bounded size.
2071 // Disconnect cached list before dropping it on the floor,
2072 // so that a dangling ref to one entry does not pin all of them.
2073 lock(&sched.sudoglock)
2074 var sg, sgnext *sudog
2075 for sg = sched.sudogcache; sg != nil; sg = sgnext {
2076 sgnext = sg.next
2077 sg.next = nil
2079 sched.sudogcache = nil
2080 unlock(&sched.sudoglock)
2082 // Clear central defer pools.
2083 // Leave per-P pools alone, they have strictly bounded size.
2084 lock(&sched.deferlock)
2085 // Disconnect cached list before dropping it on the floor,
2086 // so that a dangling ref to one entry does not pin all of them.
2087 var d, dlink *_defer
2088 for d = sched.deferpool; d != nil; d = dlink {
2089 dlink = d.link
2090 d.link = nil
2092 sched.deferpool = nil
2093 unlock(&sched.deferlock)
2096 // Timing
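// gchelper is run by the Ms recruited via helpgc during STW mark
// termination: each helper drains mark work in parallel with gcMark,
// and the last one to finish wakes work.alldone, which gcMark waits
// on when work.nproc > 1.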
2098 //go:nowritebarrier
2099 func gchelper() {
2100 _g_ := getg()
2101 _g_.m.traceback = 2
2102 gchelperstart()
2104 if trace.enabled {
2105 traceGCScanStart()
2108 // Parallel mark over GC roots and heap
2109 if gcphase == _GCmarktermination {
2110 gcw := &_g_.m.p.ptr().gcw
2111 if work.helperDrainBlock {
2112 gcDrain(gcw, gcDrainBlock) // blocks in getfull
2113 } else {
2114 gcDrain(gcw, gcDrainNoBlock)
2116 gcw.dispose()
2119 if trace.enabled {
2120 traceGCScanDone()
2123 nproc := atomic.Load(&work.nproc) // work.nproc can change right after we increment work.ndone
2124 if atomic.Xadd(&work.ndone, +1) == nproc-1 {
2125 notewakeup(&work.alldone)
2127 _g_.m.traceback = 0
2130 func gchelperstart() {
2131 _g_ := getg()
2133 if _g_.m.helpgc < 0 || _g_.m.helpgc >= _MaxGcproc {
2134 throw("gchelperstart: bad m->helpgc")
2136 if _g_ != _g_.m.g0 {
2137 throw("gchelper not running on g0 stack")
2141 // itoaDiv formats val/(10**dec) into buf.
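// For example, given a sufficiently large buf:
//
//	itoaDiv(buf, 12345, 3) // "12.345"
//	itoaDiv(buf, 123, 0)   // "123"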
2142 func itoaDiv(buf []byte, val uint64, dec int) []byte {
2143 i := len(buf) - 1
2144 idec := i - dec
2145 for val >= 10 || i >= idec {
2146 buf[i] = byte(val%10 + '0')
2147 i--
2148 if i == idec {
2149 buf[i] = '.'
2150 i--
2151 }
2152 val /= 10
2153 }
2154 buf[i] = byte(val + '0')
2155 return buf[i:]
2158 // fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
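// For example, 1500000ns formats as "1.5", 25300ns as "0.025", and
// 123456789ns as "123" (whole milliseconds once ns >= 10e6).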
2159 func fmtNSAsMS(buf []byte, ns uint64) []byte {
2160 if ns >= 10e6 {
2161 // Format as whole milliseconds.
2162 return itoaDiv(buf, ns/1e6, 0)
2164 // Format two digits of precision, with at most three decimal places.
2165 x := ns / 1e3
2166 if x == 0 {
2167 buf[0] = '0'
2168 return buf[:1]
2170 dec := 3
2171 for x >= 100 {
2172 x /= 10
2173 dec--
2175 return itoaDiv(buf, x, dec)