libgo/go/runtime/mgc.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Garbage collector (GC).
   6 //
   7 // The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple
   8 // GC thread to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
   9 // non-generational and non-compacting. Allocation is done using size segregated per P allocation
  10 // areas to minimize fragmentation while eliminating locks in the common case.
  11 //
  12 // The algorithm decomposes into several steps.
  13 // This is a high level description of the algorithm being used. For an overview of GC a good
  14 // place to start is Richard Jones' gchandbook.org.
  15 //
  16 // The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
  17 // Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
  18 // On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978),
  19 // 966-975.
  20 // For journal quality proofs that these steps are complete, correct, and terminate see
  21 // Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
  22 // Concurrency and Computation: Practice and Experience 15(3-5), 2003.
  23 //
  24 // 1. GC performs sweep termination.
  25 //
  26 //    a. Stop the world. This causes all Ps to reach a GC safe-point.
  27 //
  28 //    b. Sweep any unswept spans. There will only be unswept spans if
  29 //    this GC cycle was forced before the expected time.
  30 //
  31 // 2. GC performs the "mark 1" sub-phase. In this sub-phase, Ps are
  32 // allowed to locally cache parts of the work queue.
  33 //
  34 //    a. Prepare for the mark phase by setting gcphase to _GCmark
  35 //    (from _GCoff), enabling the write barrier, enabling mutator
  36 //    assists, and enqueueing root mark jobs. No objects may be
  37 //    scanned until all Ps have enabled the write barrier, which is
  38 //    accomplished using STW.
  39 //
  40 //    b. Start the world. From this point, GC work is done by mark
  41 //    workers started by the scheduler and by assists performed as
  42 //    part of allocation. The write barrier shades both the
  43 //    overwritten pointer and the new pointer value for any pointer
  44 //    writes (see mbarrier.go for details). Newly allocated objects
  45 //    are immediately marked black.
  46 //
  47 //    c. GC performs root marking jobs. This includes scanning all
  48 //    stacks, shading all globals, and shading any heap pointers in
  49 //    off-heap runtime data structures. Scanning a stack stops a
  50 //    goroutine, shades any pointers found on its stack, and then
  51 //    resumes the goroutine.
  52 //
  53 //    d. GC drains the work queue of grey objects, scanning each grey
  54 //    object to black and shading all pointers found in the object
  55 //    (which in turn may add those pointers to the work queue).
  56 //
  57 // 3. Once the global work queue is empty (but local work queue caches
  58 // may still contain work), GC performs the "mark 2" sub-phase.
  59 //
  60 //    a. GC stops all workers, disables local work queue caches,
  61 //    flushes each P's local work queue cache to the global work queue
  62 //    cache, and reenables workers.
  63 //
  64 //    b. GC again drains the work queue, as in 2d above.
  65 //
  66 // 4. Once the work queue is empty, GC performs mark termination.
  67 //
  68 //    a. Stop the world.
  69 //
  70 //    b. Set gcphase to _GCmarktermination, and disable workers and
  71 //    assists.
  72 //
  73 //    c. Drain any remaining work from the work queue (typically there
  74 //    will be none).
  75 //
  76 //    d. Perform other housekeeping like flushing mcaches.
  77 //
  78 // 5. GC performs the sweep phase.
  79 //
  80 //    a. Prepare for the sweep phase by setting gcphase to _GCoff,
  81 //    setting up sweep state and disabling the write barrier.
  82 //
  83 //    b. Start the world. From this point on, newly allocated objects
  84 //    are white, and allocating sweeps spans before use if necessary.
  85 //
  86 //    c. GC does concurrent sweeping in the background and in response
  87 //    to allocation. See description below.
  88 //
  89 // 6. When sufficient allocation has taken place, replay the sequence
  90 // starting with 1 above. See discussion of GC rate below.
  91
  92 // Concurrent sweep.
  93 //
  94 // The sweep phase proceeds concurrently with normal program execution.
  95 // The heap is swept span-by-span both lazily (when a goroutine needs another span)
  96 // and concurrently in a background goroutine (this helps programs that are not CPU bound).
  97 // At the end of STW mark termination all spans are marked as "needs sweeping".
  98 //
  99 // The background sweeper goroutine simply sweeps spans one-by-one.
 100 //
 101 // To avoid requesting more OS memory while there are unswept spans, when a
 102 // goroutine needs another span, it first attempts to reclaim that much memory
 103 // by sweeping. When a goroutine needs to allocate a new small-object span, it
 104 // sweeps small-object spans for the same object size until it frees at least
 105 // one object. When a goroutine needs to allocate large-object span from heap,
 106 // it sweeps spans until it frees at least that many pages into heap. There is
 107 // one case where this may not suffice: if a goroutine sweeps and frees two
 108 // nonadjacent one-page spans to the heap, it will allocate a new two-page
 109 // span, but there can still be other one-page unswept spans which could be
 110 // combined into a two-page span.
 111 //
 112 // It's critical to ensure that no operations proceed on unswept spans (that would corrupt
 113 // mark bits in GC bitmap). During GC all mcaches are flushed into the central cache,
 114 // so they are empty. When a goroutine grabs a new span into mcache, it sweeps it.
 115 // When a goroutine explicitly frees an object or sets a finalizer, it ensures that
 116 // the span is swept (either by sweeping it, or by waiting for the concurrent sweep to finish).
 117 // The finalizer goroutine is kicked off only when all spans are swept.
 118 // When the next GC starts, it sweeps all not-yet-swept spans (if any).
 119
 120 // GC rate.
 121 // Next GC is after we've allocated an extra amount of memory proportional to
 122 // the amount already in use. The proportion is controlled by GOGC environment variable
 123 // (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
 124 // (this mark is tracked in next_gc variable). This keeps the GC cost in linear
 125 // proportion to the allocation cost. Adjusting GOGC just changes the linear constant
 126 // (and also the amount of extra memory used).
 127
 128 // Oblets
 129 //
 130 // In order to prevent long pauses while scanning large objects and to
 131 // improve parallelism, the garbage collector breaks up scan jobs for
 132 // objects larger than maxObletBytes into "oblets" of at most
 133 // maxObletBytes. When scanning encounters the beginning of a large
 134 // object, it scans only the first oblet and enqueues the remaining
 135 // oblets as new scan jobs.
 136
 137 package runtime
 138
 139 import (
 140         "runtime/internal/atomic"
 141         "runtime/internal/sys"
 142         "unsafe"
 143 )
 144
 145 const (
 146         _DebugGC         = 0
 147         _ConcurrentSweep = true
 148         _FinBlockSize    = 4 * 1024
 149
 150         // sweepMinHeapDistance is a lower bound on the heap distance
 151         // (in bytes) reserved for concurrent sweeping between GC
 152         // cycles. This will be scaled by gcpercent/100.
 153         sweepMinHeapDistance = 1024 * 1024
 154 )
 155
 156 // heapminimum is the minimum heap size at which to trigger GC.
 157 // For small heaps, this overrides the usual GOGC*live set rule.
 158 //
 159 // When there is a very small live set but a lot of allocation, simply
 160 // collecting when the heap reaches GOGC*live results in many GC
 161 // cycles and high total per-GC overhead. This minimum amortizes this
 162 // per-GC overhead while keeping the heap reasonably small.
 163 //
 164 // During initialization this is set to 4MB*GOGC/100. In the case of
 165 // GOGC==0, this will set heapminimum to 0, resulting in constant
 166 // collection even when the heap size is small, which is useful for
 167 // debugging.
 168 var heapminimum uint64 = defaultHeapMinimum
 169
 170 // defaultHeapMinimum is the value of heapminimum for GOGC==100.
 171 const defaultHeapMinimum = 4 << 20
 172
 173 // Initialized from $GOGC.  GOGC=off means no GC.
 174 var gcpercent int32
 175
 176 func gcinit() {
 177         if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
 178                 throw("size of Workbuf is suboptimal")
 179         }
 180
 181         // No sweep on the first cycle.
 182         mheap_.sweepdone = 1
 183
 184         // Set a reasonable initial GC trigger.
 185         memstats.triggerRatio = 7 / 8.0
 186
 187         // Fake a heap_marked value so it looks like a trigger at
 188         // heapminimum is the appropriate growth from heap_marked.
 189         // This will go into computing the initial GC goal.
 190         memstats.heap_marked = uint64(float64(heapminimum) / (1 + memstats.triggerRatio))
 191
 192         // Set gcpercent from the environment. This will also compute
 193         // and set the GC trigger and goal.
 194         _ = setGCPercent(readgogc())
 195
 196         work.startSema = 1
 197         work.markDoneSema = 1
 198 }
 199
 200 func readgogc() int32 {
 201         p := gogetenv("GOGC")
 202         if p == "off" {
 203                 return -1
 204         }
 205         if n, ok := atoi32(p); ok {
 206                 return n
 207         }
 208         return 100
 209 }
 210
 211 // gcenable is called after the bulk of the runtime initialization,
 212 // just before we're about to start letting user code run.
 213 // It kicks off the background sweeper goroutine and enables GC.
 214 func gcenable() {
 215         c := make(chan int, 1)
 216         expectSystemGoroutine()
 217         go bgsweep(c)
 218         <-c
 219         memstats.enablegc = true // now that runtime is initialized, GC is okay
 220 }
 221
 222 //go:linkname setGCPercent runtime_debug.setGCPercent
 223 func setGCPercent(in int32) (out int32) {
 224         lock(&mheap_.lock)
 225         out = gcpercent
 226         if in < 0 {
 227                 in = -1
 228         }
 229         gcpercent = in
 230         heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
 231         // Update pacing in response to gcpercent change.
 232         gcSetTriggerRatio(memstats.triggerRatio)
 233         unlock(&mheap_.lock)
 234
 235         // If we just disabled GC, wait for any concurrent GC to
 236         // finish so we always return with no GC running.
 237         if in < 0 {
 238                 // Disable phase transitions.
 239                 lock(&work.sweepWaiters.lock)
 240                 if gcphase == _GCmark {
 241                         // GC is active. Wait until we reach sweeping.
 242                         gp := getg()
 243                         gp.schedlink = work.sweepWaiters.head
 244                         work.sweepWaiters.head.set(gp)
 245                         goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1)
 246                 } else {
 247                         // GC isn't active.
 248                         unlock(&work.sweepWaiters.lock)
 249                 }
 250         }
 251
 252         return out
 253 }
 254
 255 // Garbage collector phase.
 256 // Indicates to write barrier and synchronization task to perform.
 257 var gcphase uint32
 258
 259 // The compiler knows about this variable.
 260 // If you change it, you must change builtin/runtime.go, too.
 261 // If you change the first four bytes, you must also change the write
 262 // barrier insertion code.
 263 var writeBarrier struct {
 264         enabled bool    // compiler emits a check of this before calling write barrier
 265         pad     [3]byte // compiler uses 32-bit load for "enabled" field
 266         needed  bool    // whether we need a write barrier for current GC phase
 267         cgo     bool    // whether we need a write barrier for a cgo check
 268         alignme uint64  // guarantee alignment so that compiler can use a 32 or 64-bit load
 269 }
 270
 271 // gcBlackenEnabled is 1 if mutator assists and background mark
 272 // workers are allowed to blacken objects. This must only be set when
 273 // gcphase == _GCmark.
 274 var gcBlackenEnabled uint32
 275
 276 // gcBlackenPromptly indicates that optimizations that may
 277 // hide work from the global work queue should be disabled.
 278 //
 279 // If gcBlackenPromptly is true, per-P gcWork caches should
 280 // be flushed immediately and new objects should be allocated black.
 281 //
 282 // There is a tension between allocating objects white and
 283 // allocating them black. If white and the objects die before being
 284 // marked they can be collected during this GC cycle. On the other
 285 // hand allocating them black will reduce _GCmarktermination latency
 286 // since more work is done in the mark phase. This tension is resolved
 287 // by allocating white until the mark phase is approaching its end and
 288 // then allocating black for the remainder of the mark phase.
 289 var gcBlackenPromptly bool
 290
 291 const (
 292         _GCoff             = iota // GC not running; sweeping in background, write barrier disabled
 293         _GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
 294         _GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
 295 )
 296
 297 //go:nosplit
 298 func setGCPhase(x uint32) {
 299         atomic.Store(&gcphase, x)
 300         writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
 301         writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
 302 }
 303
 304 // gcMarkWorkerMode represents the mode that a concurrent mark worker
 305 // should operate in.
 306 //
 307 // Concurrent marking happens through four different mechanisms. One
 308 // is mutator assists, which happen in response to allocations and are
 309 // not scheduled. The other three are variations in the per-P mark
 310 // workers and are distinguished by gcMarkWorkerMode.
 311 type gcMarkWorkerMode int
 312
 313 const (
 314         // gcMarkWorkerDedicatedMode indicates that the P of a mark
 315         // worker is dedicated to running that mark worker. The mark
 316         // worker should run without preemption.
 317         gcMarkWorkerDedicatedMode gcMarkWorkerMode = iota
 318
 319         // gcMarkWorkerFractionalMode indicates that a P is currently
 320         // running the "fractional" mark worker. The fractional worker
 321         // is necessary when GOMAXPROCS*gcBackgroundUtilization is not
 322         // an integer. The fractional worker should run until it is
 323         // preempted and will be scheduled to pick up the fractional
 324         // part of GOMAXPROCS*gcBackgroundUtilization.
 325         gcMarkWorkerFractionalMode
 326
 327         // gcMarkWorkerIdleMode indicates that a P is running the mark
 328         // worker because it has nothing else to do. The idle worker
 329         // should run until it is preempted and account its time
 330         // against gcController.idleMarkTime.
 331         gcMarkWorkerIdleMode
 332 )
 333
 334 // gcMarkWorkerModeStrings are the strings labels of gcMarkWorkerModes
 335 // to use in execution traces.
 336 var gcMarkWorkerModeStrings = [...]string{
 337         "GC (dedicated)",
 338         "GC (fractional)",
 339         "GC (idle)",
 340 }
 341
 342 // gcController implements the GC pacing controller that determines
 343 // when to trigger concurrent garbage collection and how much marking
 344 // work to do in mutator assists and background marking.
 345 //
 346 // It uses a feedback control algorithm to adjust the memstats.gc_trigger
 347 // trigger based on the heap growth and GC CPU utilization each cycle.
 348 // This algorithm optimizes for heap growth to match GOGC and for CPU
 349 // utilization between assist and background marking to be 25% of
 350 // GOMAXPROCS. The high-level design of this algorithm is documented
 351 // at https://golang.org/s/go15gcpacing.
 352 //
 353 // All fields of gcController are used only during a single mark
 354 // cycle.
 355 var gcController gcControllerState
 356
 357 type gcControllerState struct {
 358         // scanWork is the total scan work performed this cycle. This
 359         // is updated atomically during the cycle. Updates occur in
 360         // bounded batches, since it is both written and read
 361         // throughout the cycle. At the end of the cycle, this is how
 362         // much of the retained heap is scannable.
 363         //
 364         // Currently this is the bytes of heap scanned. For most uses,
 365         // this is an opaque unit of work, but for estimation the
 366         // definition is important.
 367         scanWork int64
 368
 369         // bgScanCredit is the scan work credit accumulated by the
 370         // concurrent background scan. This credit is accumulated by
 371         // the background scan and stolen by mutator assists. This is
 372         // updated atomically. Updates occur in bounded batches, since
 373         // it is both written and read throughout the cycle.
 374         bgScanCredit int64
 375
 376         // assistTime is the nanoseconds spent in mutator assists
 377         // during this cycle. This is updated atomically. Updates
 378         // occur in bounded batches, since it is both written and read
 379         // throughout the cycle.
 380         assistTime int64
 381
 382         // dedicatedMarkTime is the nanoseconds spent in dedicated
 383         // mark workers during this cycle. This is updated atomically
 384         // at the end of the concurrent mark phase.
 385         dedicatedMarkTime int64
 386
 387         // fractionalMarkTime is the nanoseconds spent in the
 388         // fractional mark worker during this cycle. This is updated
 389         // atomically throughout the cycle and will be up-to-date if
 390         // the fractional mark worker is not currently running.
 391         fractionalMarkTime int64
 392
 393         // idleMarkTime is the nanoseconds spent in idle marking
 394         // during this cycle. This is updated atomically throughout
 395         // the cycle.
 396         idleMarkTime int64
 397
 398         // markStartTime is the absolute start time in nanoseconds
 399         // that assists and background mark workers started.
 400         markStartTime int64
 401
 402         // dedicatedMarkWorkersNeeded is the number of dedicated mark
 403         // workers that need to be started. This is computed at the
 404         // beginning of each cycle and decremented atomically as
 405         // dedicated mark workers get started.
 406         dedicatedMarkWorkersNeeded int64
 407
 408         // assistWorkPerByte is the ratio of scan work to allocated
 409         // bytes that should be performed by mutator assists. This is
 410         // computed at the beginning of each cycle and updated every
 411         // time heap_scan is updated.
 412         assistWorkPerByte float64
 413
 414         // assistBytesPerWork is 1/assistWorkPerByte.
 415         assistBytesPerWork float64
 416
 417         // fractionalUtilizationGoal is the fraction of wall clock
 418         // time that should be spent in the fractional mark worker on
 419         // each P that isn't running a dedicated worker.
 420         //
 421         // For example, if the utilization goal is 25% and there are
 422         // no dedicated workers, this will be 0.25. If there goal is
 423         // 25%, there is one dedicated worker, and GOMAXPROCS is 5,
 424         // this will be 0.05 to make up the missing 5%.
 425         //
 426         // If this is zero, no fractional workers are needed.
 427         fractionalUtilizationGoal float64
 428
 429         _ [sys.CacheLineSize]byte
 430 }
 431
 432 // startCycle resets the GC controller's state and computes estimates
 433 // for a new GC cycle. The caller must hold worldsema.
 434 func (c *gcControllerState) startCycle() {
 435         c.scanWork = 0
 436         c.bgScanCredit = 0
 437         c.assistTime = 0
 438         c.dedicatedMarkTime = 0
 439         c.fractionalMarkTime = 0
 440         c.idleMarkTime = 0
 441
 442         // If this is the first GC cycle or we're operating on a very
 443         // small heap, fake heap_marked so it looks like gc_trigger is
 444         // the appropriate growth from heap_marked, even though the
 445         // real heap_marked may not have a meaningful value (on the
 446         // first cycle) or may be much smaller (resulting in a large
 447         // error response).
 448         if memstats.gc_trigger <= heapminimum {
 449                 memstats.heap_marked = uint64(float64(memstats.gc_trigger) / (1 + memstats.triggerRatio))
 450         }
 451
 452         // Re-compute the heap goal for this cycle in case something
 453         // changed. This is the same calculation we use elsewhere.
 454         memstats.next_gc = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
 455         if gcpercent < 0 {
 456                 memstats.next_gc = ^uint64(0)
 457         }
 458
 459         // Ensure that the heap goal is at least a little larger than
 460         // the current live heap size. This may not be the case if GC
 461         // start is delayed or if the allocation that pushed heap_live
 462         // over gc_trigger is large or if the trigger is really close to
 463         // GOGC. Assist is proportional to this distance, so enforce a
 464         // minimum distance, even if it means going over the GOGC goal
 465         // by a tiny bit.
 466         if memstats.next_gc < memstats.heap_live+1024*1024 {
 467                 memstats.next_gc = memstats.heap_live + 1024*1024
 468         }
 469
 470         // Compute the background mark utilization goal. In general,
 471         // this may not come out exactly. We round the number of
 472         // dedicated workers so that the utilization is closest to
 473         // 25%. For small GOMAXPROCS, this would introduce too much
 474         // error, so we add fractional workers in that case.
 475         totalUtilizationGoal := float64(gomaxprocs) * gcBackgroundUtilization
 476         c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5)
 477         utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1
 478         const maxUtilError = 0.3
 479         if utilError < -maxUtilError || utilError > maxUtilError {
 480                 // Rounding put us more than 30% off our goal. With
 481                 // gcBackgroundUtilization of 25%, this happens for
 482                 // GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional
 483                 // workers to compensate.
 484                 if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal {
 485                         // Too many dedicated workers.
 486                         c.dedicatedMarkWorkersNeeded--
 487                 }
 488                 c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(gomaxprocs)
 489         } else {
 490                 c.fractionalUtilizationGoal = 0
 491         }
 492
 493         // Clear per-P state
 494         for _, p := range allp {
 495                 p.gcAssistTime = 0
 496                 p.gcFractionalMarkTime = 0
 497         }
 498
 499         // Compute initial values for controls that are updated
 500         // throughout the cycle.
 501         c.revise()
 502
 503         if debug.gcpacertrace > 0 {
 504                 print("pacer: assist ratio=", c.assistWorkPerByte,
 505                         " (scan ", memstats.heap_scan>>20, " MB in ",
 506                         work.initialHeapLive>>20, "->",
 507                         memstats.next_gc>>20, " MB)",
 508                         " workers=", c.dedicatedMarkWorkersNeeded,
 509                         "+", c.fractionalUtilizationGoal, "\n")
 510         }
 511 }
 512
 513 // revise updates the assist ratio during the GC cycle to account for
 514 // improved estimates. This should be called either under STW or
 515 // whenever memstats.heap_scan, memstats.heap_live, or
 516 // memstats.next_gc is updated (with mheap_.lock held).
 517 //
 518 // It should only be called when gcBlackenEnabled != 0 (because this
 519 // is when assists are enabled and the necessary statistics are
 520 // available).
 521 func (c *gcControllerState) revise() {
 522         gcpercent := gcpercent
 523         if gcpercent < 0 {
 524                 // If GC is disabled but we're running a forced GC,
 525                 // act like GOGC is huge for the below calculations.
 526                 gcpercent = 100000
 527         }
 528         live := atomic.Load64(&memstats.heap_live)
 529
 530         var heapGoal, scanWorkExpected int64
 531         if live <= memstats.next_gc {
 532                 // We're under the soft goal. Pace GC to complete at
 533                 // next_gc assuming the heap is in steady-state.
 534                 heapGoal = int64(memstats.next_gc)
 535
 536                 // Compute the expected scan work remaining.
 537                 //
 538                 // This is estimated based on the expected
 539                 // steady-state scannable heap. For example, with
 540                 // GOGC=100, only half of the scannable heap is
 541                 // expected to be live, so that's what we target.
 542                 //
 543                 // (This is a float calculation to avoid overflowing on
 544                 // 100*heap_scan.)
 545                 scanWorkExpected = int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent))
 546         } else {
 547                 // We're past the soft goal. Pace GC so that in the
 548                 // worst case it will complete by the hard goal.
 549                 const maxOvershoot = 1.1
 550                 heapGoal = int64(float64(memstats.next_gc) * maxOvershoot)
 551
 552                 // Compute the upper bound on the scan work remaining.
 553                 scanWorkExpected = int64(memstats.heap_scan)
 554         }
 555
 556         // Compute the remaining scan work estimate.
 557         //
 558         // Note that we currently count allocations during GC as both
 559         // scannable heap (heap_scan) and scan work completed
 560         // (scanWork), so allocation will change this difference will
 561         // slowly in the soft regime and not at all in the hard
 562         // regime.
 563         scanWorkRemaining := scanWorkExpected - c.scanWork
 564         if scanWorkRemaining < 1000 {
 565                 // We set a somewhat arbitrary lower bound on
 566                 // remaining scan work since if we aim a little high,
 567                 // we can miss by a little.
 568                 //
 569                 // We *do* need to enforce that this is at least 1,
 570                 // since marking is racy and double-scanning objects
 571                 // may legitimately make the remaining scan work
 572                 // negative, even in the hard goal regime.
 573                 scanWorkRemaining = 1000
 574         }
 575
 576         // Compute the heap distance remaining.
 577         heapRemaining := heapGoal - int64(live)
 578         if heapRemaining <= 0 {
 579                 // This shouldn't happen, but if it does, avoid
 580                 // dividing by zero or setting the assist negative.
 581                 heapRemaining = 1
 582         }
 583
 584         // Compute the mutator assist ratio so by the time the mutator
 585         // allocates the remaining heap bytes up to next_gc, it will
 586         // have done (or stolen) the remaining amount of scan work.
 587         c.assistWorkPerByte = float64(scanWorkRemaining) / float64(heapRemaining)
 588         c.assistBytesPerWork = float64(heapRemaining) / float64(scanWorkRemaining)
 589 }
 590
 591 // endCycle computes the trigger ratio for the next cycle.
 592 func (c *gcControllerState) endCycle() float64 {
 593         if work.userForced {
 594                 // Forced GC means this cycle didn't start at the
 595                 // trigger, so where it finished isn't good
 596                 // information about how to adjust the trigger.
 597                 // Just leave it where it is.
 598                 return memstats.triggerRatio
 599         }
 600
 601         // Proportional response gain for the trigger controller. Must
 602         // be in [0, 1]. Lower values smooth out transient effects but
 603         // take longer to respond to phase changes. Higher values
 604         // react to phase changes quickly, but are more affected by
 605         // transient changes. Values near 1 may be unstable.
 606         const triggerGain = 0.5
 607
 608         // Compute next cycle trigger ratio. First, this computes the
 609         // "error" for this cycle; that is, how far off the trigger
 610         // was from what it should have been, accounting for both heap
 611         // growth and GC CPU utilization. We compute the actual heap
 612         // growth during this cycle and scale that by how far off from
 613         // the goal CPU utilization we were (to estimate the heap
 614         // growth if we had the desired CPU utilization). The
 615         // difference between this estimate and the GOGC-based goal
 616         // heap growth is the error.
 617         goalGrowthRatio := float64(gcpercent) / 100
 618         actualGrowthRatio := float64(memstats.heap_live)/float64(memstats.heap_marked) - 1
 619         assistDuration := nanotime() - c.markStartTime
 620
 621         // Assume background mark hit its utilization goal.
 622         utilization := gcBackgroundUtilization
 623         // Add assist utilization; avoid divide by zero.
 624         if assistDuration > 0 {
 625                 utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
 626         }
 627
 628         triggerError := goalGrowthRatio - memstats.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-memstats.triggerRatio)
 629
 630         // Finally, we adjust the trigger for next time by this error,
 631         // damped by the proportional gain.
 632         triggerRatio := memstats.triggerRatio + triggerGain*triggerError
 633
 634         if debug.gcpacertrace > 0 {
 635                 // Print controller state in terms of the design
 636                 // document.
 637                 H_m_prev := memstats.heap_marked
 638                 h_t := memstats.triggerRatio
 639                 H_T := memstats.gc_trigger
 640                 h_a := actualGrowthRatio
 641                 H_a := memstats.heap_live
 642                 h_g := goalGrowthRatio
 643                 H_g := int64(float64(H_m_prev) * (1 + h_g))
 644                 u_a := utilization
 645                 u_g := gcGoalUtilization
 646                 W_a := c.scanWork
 647                 print("pacer: H_m_prev=", H_m_prev,
 648                         " h_t=", h_t, " H_T=", H_T,
 649                         " h_a=", h_a, " H_a=", H_a,
 650                         " h_g=", h_g, " H_g=", H_g,
 651                         " u_a=", u_a, " u_g=", u_g,
 652                         " W_a=", W_a,
 653                         " goalΔ=", goalGrowthRatio-h_t,
 654                         " actualΔ=", h_a-h_t,
 655                         " u_a/u_g=", u_a/u_g,
 656                         "\n")
 657         }
 658
 659         return triggerRatio
 660 }
 661
 662 // enlistWorker encourages another dedicated mark worker to start on
 663 // another P if there are spare worker slots. It is used by putfull
 664 // when more work is made available.
 665 //
 666 //go:nowritebarrier
 667 func (c *gcControllerState) enlistWorker() {
 668         // If there are idle Ps, wake one so it will run an idle worker.
 669         // NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
 670         //
 671         //      if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
 672         //              wakep()
 673         //              return
 674         //      }
 675
 676         // There are no idle Ps. If we need more dedicated workers,
 677         // try to preempt a running P so it will switch to a worker.
 678         if c.dedicatedMarkWorkersNeeded <= 0 {
 679                 return
 680         }
 681         // Pick a random other P to preempt.
 682         if gomaxprocs <= 1 {
 683                 return
 684         }
 685         gp := getg()
 686         if gp == nil || gp.m == nil || gp.m.p == 0 {
 687                 return
 688         }
 689         myID := gp.m.p.ptr().id
 690         for tries := 0; tries < 5; tries++ {
 691                 id := int32(fastrandn(uint32(gomaxprocs - 1)))
 692                 if id >= myID {
 693                         id++
 694                 }
 695                 p := allp[id]
 696                 if p.status != _Prunning {
 697                         continue
 698                 }
 699                 if preemptone(p) {
 700                         return
 701                 }
 702         }
 703 }
 704
 705 // findRunnableGCWorker returns the background mark worker for _p_ if it
 706 // should be run. This must only be called when gcBlackenEnabled != 0.
 707 func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
 708         if gcBlackenEnabled == 0 {
 709                 throw("gcControllerState.findRunnable: blackening not enabled")
 710         }
 711         if _p_.gcBgMarkWorker == 0 {
 712                 // The mark worker associated with this P is blocked
 713                 // performing a mark transition. We can't run it
 714                 // because it may be on some other run or wait queue.
 715                 return nil
 716         }
 717
 718         if !gcMarkWorkAvailable(_p_) {
 719                 // No work to be done right now. This can happen at
 720                 // the end of the mark phase when there are still
 721                 // assists tapering off. Don't bother running a worker
 722                 // now because it'll just return immediately.
 723                 return nil
 724         }
 725
 726         decIfPositive := func(ptr *int64) bool {
 727                 if *ptr > 0 {
 728                         if atomic.Xaddint64(ptr, -1) >= 0 {
 729                                 return true
 730                         }
 731                         // We lost a race
 732                         atomic.Xaddint64(ptr, +1)
 733                 }
 734                 return false
 735         }
 736
 737         if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
 738                 // This P is now dedicated to marking until the end of
 739                 // the concurrent mark phase.
 740                 _p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
 741         } else if c.fractionalUtilizationGoal == 0 {
 742                 // No need for fractional workers.
 743                 return nil
 744         } else {
 745                 // Is this P behind on the fractional utilization
 746                 // goal?
 747                 //
 748                 // This should be kept in sync with pollFractionalWorkerExit.
 749                 delta := nanotime() - gcController.markStartTime
 750                 if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal {
 751                         // Nope. No need to run a fractional worker.
 752                         return nil
 753                 }
 754                 // Run a fractional worker.
 755                 _p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
 756         }
 757
 758         // Run the background mark worker
 759         gp := _p_.gcBgMarkWorker.ptr()
 760         casgstatus(gp, _Gwaiting, _Grunnable)
 761         if trace.enabled {
 762                 traceGoUnpark(gp, 0)
 763         }
 764         return gp
 765 }
 766
 767 // pollFractionalWorkerExit returns true if a fractional mark worker
 768 // should self-preempt. It assumes it is called from the fractional
 769 // worker.
 770 func pollFractionalWorkerExit() bool {
 771         // This should be kept in sync with the fractional worker
 772         // scheduler logic in findRunnableGCWorker.
 773         now := nanotime()
 774         delta := now - gcController.markStartTime
 775         if delta <= 0 {
 776                 return true
 777         }
 778         p := getg().m.p.ptr()
 779         selfTime := p.gcFractionalMarkTime + (now - p.gcMarkWorkerStartTime)
 780         // Add some slack to the utilization goal so that the
 781         // fractional worker isn't behind again the instant it exits.
 782         return float64(selfTime)/float64(delta) > 1.2*gcController.fractionalUtilizationGoal
 783 }
 784
 785 // gcSetTriggerRatio sets the trigger ratio and updates everything
 786 // derived from it: the absolute trigger, the heap goal, mark pacing,
 787 // and sweep pacing.
 788 //
 789 // This can be called any time. If GC is the in the middle of a
 790 // concurrent phase, it will adjust the pacing of that phase.
 791 //
 792 // This depends on gcpercent, memstats.heap_marked, and
 793 // memstats.heap_live. These must be up to date.
 794 //
 795 // mheap_.lock must be held or the world must be stopped.
 796 func gcSetTriggerRatio(triggerRatio float64) {
 797         // Set the trigger ratio, capped to reasonable bounds.
 798         if triggerRatio < 0 {
 799                 // This can happen if the mutator is allocating very
 800                 // quickly or the GC is scanning very slowly.
 801                 triggerRatio = 0
 802         } else if gcpercent >= 0 {
 803                 // Ensure there's always a little margin so that the
 804                 // mutator assist ratio isn't infinity.
 805                 maxTriggerRatio := 0.95 * float64(gcpercent) / 100
 806                 if triggerRatio > maxTriggerRatio {
 807                         triggerRatio = maxTriggerRatio
 808                 }
 809         }
 810         memstats.triggerRatio = triggerRatio
 811
 812         // Compute the absolute GC trigger from the trigger ratio.
 813         //
 814         // We trigger the next GC cycle when the allocated heap has
 815         // grown by the trigger ratio over the marked heap size.
 816         trigger := ^uint64(0)
 817         if gcpercent >= 0 {
 818                 trigger = uint64(float64(memstats.heap_marked) * (1 + triggerRatio))
 819                 // Don't trigger below the minimum heap size.
 820                 minTrigger := heapminimum
 821                 if !gosweepdone() {
 822                         // Concurrent sweep happens in the heap growth
 823                         // from heap_live to gc_trigger, so ensure
 824                         // that concurrent sweep has some heap growth
 825                         // in which to perform sweeping before we
 826                         // start the next GC cycle.
 827                         sweepMin := atomic.Load64(&memstats.heap_live) + sweepMinHeapDistance*uint64(gcpercent)/100
 828                         if sweepMin > minTrigger {
 829                                 minTrigger = sweepMin
 830                         }
 831                 }
 832                 if trigger < minTrigger {
 833                         trigger = minTrigger
 834                 }
 835                 if int64(trigger) < 0 {
 836                         print("runtime: next_gc=", memstats.next_gc, " heap_marked=", memstats.heap_marked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n")
 837                         throw("gc_trigger underflow")
 838                 }
 839         }
 840         memstats.gc_trigger = trigger
 841
 842         // Compute the next GC goal, which is when the allocated heap
 843         // has grown by GOGC/100 over the heap marked by the last
 844         // cycle.
 845         goal := ^uint64(0)
 846         if gcpercent >= 0 {
 847                 goal = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
 848                 if goal < trigger {
 849                         // The trigger ratio is always less than GOGC/100, but
 850                         // other bounds on the trigger may have raised it.
 851                         // Push up the goal, too.
 852                         goal = trigger
 853                 }
 854         }
 855         memstats.next_gc = goal
 856         if trace.enabled {
 857                 traceNextGC()
 858         }
 859
 860         // Update mark pacing.
 861         if gcphase != _GCoff {
 862                 gcController.revise()
 863         }
 864
 865         // Update sweep pacing.
 866         if gosweepdone() {
 867                 mheap_.sweepPagesPerByte = 0
 868         } else {
 869                 // Concurrent sweep needs to sweep all of the in-use
 870                 // pages by the time the allocated heap reaches the GC
 871                 // trigger. Compute the ratio of in-use pages to sweep
 872                 // per byte allocated, accounting for the fact that
 873                 // some might already be swept.
 874                 heapLiveBasis := atomic.Load64(&memstats.heap_live)
 875                 heapDistance := int64(trigger) - int64(heapLiveBasis)
 876                 // Add a little margin so rounding errors and
 877                 // concurrent sweep are less likely to leave pages
 878                 // unswept when GC starts.
 879                 heapDistance -= 1024 * 1024
 880                 if heapDistance < _PageSize {
 881                         // Avoid setting the sweep ratio extremely high
 882                         heapDistance = _PageSize
 883                 }
 884                 pagesSwept := atomic.Load64(&mheap_.pagesSwept)
 885                 sweepDistancePages := int64(mheap_.pagesInUse) - int64(pagesSwept)
 886                 if sweepDistancePages <= 0 {
 887                         mheap_.sweepPagesPerByte = 0
 888                 } else {
 889                         mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance)
 890                         mheap_.sweepHeapLiveBasis = heapLiveBasis
 891                         // Write pagesSweptBasis last, since this
 892                         // signals concurrent sweeps to recompute
 893                         // their debt.
 894                         atomic.Store64(&mheap_.pagesSweptBasis, pagesSwept)
 895                 }
 896         }
 897 }
 898
 899 // gcGoalUtilization is the goal CPU utilization for
 900 // marking as a fraction of GOMAXPROCS.
 901 const gcGoalUtilization = 0.30
 902
 903 // gcBackgroundUtilization is the fixed CPU utilization for background
 904 // marking. It must be <= gcGoalUtilization. The difference between
 905 // gcGoalUtilization and gcBackgroundUtilization will be made up by
 906 // mark assists. The scheduler will aim to use within 50% of this
 907 // goal.
 908 //
 909 // Setting this to < gcGoalUtilization avoids saturating the trigger
 910 // feedback controller when there are no assists, which allows it to
 911 // better control CPU and heap growth. However, the larger the gap,
 912 // the more mutator assists are expected to happen, which impact
 913 // mutator latency.
 914 const gcBackgroundUtilization = 0.25
 915
 916 // gcCreditSlack is the amount of scan work credit that can can
 917 // accumulate locally before updating gcController.scanWork and,
 918 // optionally, gcController.bgScanCredit. Lower values give a more
 919 // accurate assist ratio and make it more likely that assists will
 920 // successfully steal background credit. Higher values reduce memory
 921 // contention.
 922 const gcCreditSlack = 2000
 923
 924 // gcAssistTimeSlack is the nanoseconds of mutator assist time that
 925 // can accumulate on a P before updating gcController.assistTime.
 926 const gcAssistTimeSlack = 5000
 927
 928 // gcOverAssistWork determines how many extra units of scan work a GC
 929 // assist does when an assist happens. This amortizes the cost of an
 930 // assist by pre-paying for this many bytes of future allocations.
 931 const gcOverAssistWork = 64 << 10
 932
 933 var work struct {
 934         full  lfstack                  // lock-free list of full blocks workbuf
 935         empty lfstack                  // lock-free list of empty blocks workbuf
 936         pad0  [sys.CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
 937
 938         wbufSpans struct {
 939                 lock mutex
 940                 // free is a list of spans dedicated to workbufs, but
 941                 // that don't currently contain any workbufs.
 942                 free mSpanList
 943                 // busy is a list of all spans containing workbufs on
 944                 // one of the workbuf lists.
 945                 busy mSpanList
 946         }
 947
 948         // Restore 64-bit alignment on 32-bit.
 949         _ uint32
 950
 951         // bytesMarked is the number of bytes marked this cycle. This
 952         // includes bytes blackened in scanned objects, noscan objects
 953         // that go straight to black, and permagrey objects scanned by
 954         // markroot during the concurrent scan phase. This is updated
 955         // atomically during the cycle. Updates may be batched
 956         // arbitrarily, since the value is only read at the end of the
 957         // cycle.
 958         //
 959         // Because of benign races during marking, this number may not
 960         // be the exact number of marked bytes, but it should be very
 961         // close.
 962         //
 963         // Put this field here because it needs 64-bit atomic access
 964         // (and thus 8-byte alignment even on 32-bit architectures).
 965         bytesMarked uint64
 966
 967         markrootNext uint32 // next markroot job
 968         markrootJobs uint32 // number of markroot jobs
 969
 970         nproc   uint32
 971         tstart  int64
 972         nwait   uint32
 973         ndone   uint32
 974         alldone note
 975
 976         // helperDrainBlock indicates that GC mark termination helpers
 977         // should pass gcDrainBlock to gcDrain to block in the
 978         // getfull() barrier. Otherwise, they should pass gcDrainNoBlock.
 979         //
 980         // TODO: This is a temporary fallback to work around races
 981         // that cause early mark termination.
 982         helperDrainBlock bool
 983
 984         // Number of roots of various root types. Set by gcMarkRootPrepare.
 985         nFlushCacheRoots                    int
 986         nDataRoots, nSpanRoots, nStackRoots int
 987
 988         // markrootDone indicates that roots have been marked at least
 989         // once during the current GC cycle. This is checked by root
 990         // marking operations that have to happen only during the
 991         // first root marking pass, whether that's during the
 992         // concurrent mark phase in current GC or mark termination in
 993         // STW GC.
 994         markrootDone bool
 995
 996         // Each type of GC state transition is protected by a lock.
 997         // Since multiple threads can simultaneously detect the state
 998         // transition condition, any thread that detects a transition
 999         // condition must acquire the appropriate transition lock,
1000         // re-check the transition condition and return if it no
1001         // longer holds or perform the transition if it does.
1002         // Likewise, any transition must invalidate the transition
1003         // condition before releasing the lock. This ensures that each
1004         // transition is performed by exactly one thread and threads
1005         // that need the transition to happen block until it has
1006         // happened.
1007         //
1008         // startSema protects the transition from "off" to mark or
1009         // mark termination.
1010         startSema uint32
1011         // markDoneSema protects transitions from mark 1 to mark 2 and
1012         // from mark 2 to mark termination.
1013         markDoneSema uint32
1014
1015         bgMarkReady note   // signal background mark worker has started
1016         bgMarkDone  uint32 // cas to 1 when at a background mark completion point
1017         // Background mark completion signaling
1018
1019         // mode is the concurrency mode of the current GC cycle.
1020         mode gcMode
1021
1022         // userForced indicates the current GC cycle was forced by an
1023         // explicit user call.
1024         userForced bool
1025
1026         // totaltime is the CPU nanoseconds spent in GC since the
1027         // program started if debug.gctrace > 0.
1028         totaltime int64
1029
1030         // initialHeapLive is the value of memstats.heap_live at the
1031         // beginning of this GC cycle.
1032         initialHeapLive uint64
1033
1034         // assistQueue is a queue of assists that are blocked because
1035         // there was neither enough credit to steal or enough work to
1036         // do.
1037         assistQueue struct {
1038                 lock       mutex
1039                 head, tail guintptr
1040         }
1041
1042         // sweepWaiters is a list of blocked goroutines to wake when
1043         // we transition from mark termination to sweep.
1044         sweepWaiters struct {
1045                 lock mutex
1046                 head guintptr
1047         }
1048
1049         // cycles is the number of completed GC cycles, where a GC
1050         // cycle is sweep termination, mark, mark termination, and
1051         // sweep. This differs from memstats.numgc, which is
1052         // incremented at mark termination.
1053         cycles uint32
1054
1055         // Timing/utilization stats for this cycle.
1056         stwprocs, maxprocs                 int32
1057         tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
1058
1059         pauseNS    int64 // total STW time this cycle
1060         pauseStart int64 // nanotime() of last STW
1061
1062         // debug.gctrace heap sizes for this cycle.
1063         heap0, heap1, heap2, heapGoal uint64
1064 }
1065
1066 // GC runs a garbage collection and blocks the caller until the
1067 // garbage collection is complete. It may also block the entire
1068 // program.
1069 func GC() {
1070         // We consider a cycle to be: sweep termination, mark, mark
1071         // termination, and sweep. This function shouldn't return
1072         // until a full cycle has been completed, from beginning to
1073         // end. Hence, we always want to finish up the current cycle
1074         // and start a new one. That means:
1075         //
1076         // 1. In sweep termination, mark, or mark termination of cycle
1077         // N, wait until mark termination N completes and transitions
1078         // to sweep N.
1079         //
1080         // 2. In sweep N, help with sweep N.
1081         //
1082         // At this point we can begin a full cycle N+1.
1083         //
1084         // 3. Trigger cycle N+1 by starting sweep termination N+1.
1085         //
1086         // 4. Wait for mark termination N+1 to complete.
1087         //
1088         // 5. Help with sweep N+1 until it's done.
1089         //
1090         // This all has to be written to deal with the fact that the
1091         // GC may move ahead on its own. For example, when we block
1092         // until mark termination N, we may wake up in cycle N+2.
1093
1094         gp := getg()
1095
1096         // Prevent the GC phase or cycle count from changing.
1097         lock(&work.sweepWaiters.lock)
1098         n := atomic.Load(&work.cycles)
1099         if gcphase == _GCmark {
1100                 // Wait until sweep termination, mark, and mark
1101                 // termination of cycle N complete.
1102                 gp.schedlink = work.sweepWaiters.head
1103                 work.sweepWaiters.head.set(gp)
1104                 goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1)
1105         } else {
1106                 // We're in sweep N already.
1107                 unlock(&work.sweepWaiters.lock)
1108         }
1109
1110         // We're now in sweep N or later. Trigger GC cycle N+1, which
1111         // will first finish sweep N if necessary and then enter sweep
1112         // termination N+1.
1113         gcStart(gcBackgroundMode, gcTrigger{kind: gcTriggerCycle, n: n + 1})
1114
1115         // Wait for mark termination N+1 to complete.
1116         lock(&work.sweepWaiters.lock)
1117         if gcphase == _GCmark && atomic.Load(&work.cycles) == n+1 {
1118                 gp.schedlink = work.sweepWaiters.head
1119                 work.sweepWaiters.head.set(gp)
1120                 goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1)
1121         } else {
1122                 unlock(&work.sweepWaiters.lock)
1123         }
1124
1125         // Finish sweep N+1 before returning. We do this both to
1126         // complete the cycle and because runtime.GC() is often used
1127         // as part of tests and benchmarks to get the system into a
1128         // relatively stable and isolated state.
1129         for atomic.Load(&work.cycles) == n+1 && gosweepone() != ^uintptr(0) {
1130                 sweep.nbgsweep++
1131                 Gosched()
1132         }
1133
1134         // Callers may assume that the heap profile reflects the
1135         // just-completed cycle when this returns (historically this
1136         // happened because this was a STW GC), but right now the
1137         // profile still reflects mark termination N, not N+1.
1138         //
1139         // As soon as all of the sweep frees from cycle N+1 are done,
1140         // we can go ahead and publish the heap profile.
1141         //
1142         // First, wait for sweeping to finish. (We know there are no
1143         // more spans on the sweep queue, but we may be concurrently
1144         // sweeping spans, so we have to wait.)
1145         for atomic.Load(&work.cycles) == n+1 && atomic.Load(&mheap_.sweepers) != 0 {
1146                 Gosched()
1147         }
1148
1149         // Now we're really done with sweeping, so we can publish the
1150         // stable heap profile. Only do this if we haven't already hit
1151         // another mark termination.
1152         mp := acquirem()
1153         cycle := atomic.Load(&work.cycles)
1154         if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
1155                 mProf_PostSweep()
1156         }
1157         releasem(mp)
1158 }
1159
1160 // gcMode indicates how concurrent a GC cycle should be.
1161 type gcMode int
1162
1163 const (
1164         gcBackgroundMode gcMode = iota // concurrent GC and sweep
1165         gcForceMode                    // stop-the-world GC now, concurrent sweep
1166         gcForceBlockMode               // stop-the-world GC now and STW sweep (forced by user)
1167 )
1168
1169 // A gcTrigger is a predicate for starting a GC cycle. Specifically,
1170 // it is an exit condition for the _GCoff phase.
1171 type gcTrigger struct {
1172         kind gcTriggerKind
1173         now  int64  // gcTriggerTime: current time
1174         n    uint32 // gcTriggerCycle: cycle number to start
1175 }
1176
1177 type gcTriggerKind int
1178
1179 const (
1180         // gcTriggerAlways indicates that a cycle should be started
1181         // unconditionally, even if GOGC is off or we're in a cycle
1182         // right now. This cannot be consolidated with other cycles.
1183         gcTriggerAlways gcTriggerKind = iota
1184
1185         // gcTriggerHeap indicates that a cycle should be started when
1186         // the heap size reaches the trigger heap size computed by the
1187         // controller.
1188         gcTriggerHeap
1189
1190         // gcTriggerTime indicates that a cycle should be started when
1191         // it's been more than forcegcperiod nanoseconds since the
1192         // previous GC cycle.
1193         gcTriggerTime
1194
1195         // gcTriggerCycle indicates that a cycle should be started if
1196         // we have not yet started cycle number gcTrigger.n (relative
1197         // to work.cycles).
1198         gcTriggerCycle
1199 )
1200
1201 // test returns true if the trigger condition is satisfied, meaning
1202 // that the exit condition for the _GCoff phase has been met. The exit
1203 // condition should be tested when allocating.
1204 func (t gcTrigger) test() bool {
1205         if !memstats.enablegc || panicking != 0 {
1206                 return false
1207         }
1208         if t.kind == gcTriggerAlways {
1209                 return true
1210         }
1211         if gcphase != _GCoff {
1212                 return false
1213         }
1214         switch t.kind {
1215         case gcTriggerHeap:
1216                 // Non-atomic access to heap_live for performance. If
1217                 // we are going to trigger on this, this thread just
1218                 // atomically wrote heap_live anyway and we'll see our
1219                 // own write.
1220                 return memstats.heap_live >= memstats.gc_trigger
1221         case gcTriggerTime:
1222                 if gcpercent < 0 {
1223                         return false
1224                 }
1225                 lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
1226                 return lastgc != 0 && t.now-lastgc > forcegcperiod
1227         case gcTriggerCycle:
1228                 // t.n > work.cycles, but accounting for wraparound.
1229                 return int32(t.n-work.cycles) > 0
1230         }
1231         return true
1232 }
1233
1234 // gcStart transitions the GC from _GCoff to _GCmark (if
1235 // !mode.stwMark) or _GCmarktermination (if mode.stwMark) by
1236 // performing sweep termination and GC initialization.
1237 //
1238 // This may return without performing this transition in some cases,
1239 // such as when called on a system stack or with locks held.
1240 func gcStart(mode gcMode, trigger gcTrigger) {
1241         // Since this is called from malloc and malloc is called in
1242         // the guts of a number of libraries that might be holding
1243         // locks, don't attempt to start GC in non-preemptible or
1244         // potentially unstable situations.
1245         mp := acquirem()
1246         if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
1247                 releasem(mp)
1248                 return
1249         }
1250         releasem(mp)
1251         mp = nil
1252
1253         // Pick up the remaining unswept/not being swept spans concurrently
1254         //
1255         // This shouldn't happen if we're being invoked in background
1256         // mode since proportional sweep should have just finished
1257         // sweeping everything, but rounding errors, etc, may leave a
1258         // few spans unswept. In forced mode, this is necessary since
1259         // GC can be forced at any point in the sweeping cycle.
1260         //
1261         // We check the transition condition continuously here in case
1262         // this G gets delayed in to the next GC cycle.
1263         for trigger.test() && gosweepone() != ^uintptr(0) {
1264                 sweep.nbgsweep++
1265         }
1266
1267         // Perform GC initialization and the sweep termination
1268         // transition.
1269         semacquire(&work.startSema)
1270         // Re-check transition condition under transition lock.
1271         if !trigger.test() {
1272                 semrelease(&work.startSema)
1273                 return
1274         }
1275
1276         // For stats, check if this GC was forced by the user.
1277         work.userForced = trigger.kind == gcTriggerAlways || trigger.kind == gcTriggerCycle
1278
1279         // In gcstoptheworld debug mode, upgrade the mode accordingly.
1280         // We do this after re-checking the transition condition so
1281         // that multiple goroutines that detect the heap trigger don't
1282         // start multiple STW GCs.
1283         if mode == gcBackgroundMode {
1284                 if debug.gcstoptheworld == 1 {
1285                         mode = gcForceMode
1286                 } else if debug.gcstoptheworld == 2 {
1287                         mode = gcForceBlockMode
1288                 }
1289         }
1290
1291         // Ok, we're doing it! Stop everybody else
1292         semacquire(&worldsema)
1293
1294         if trace.enabled {
1295                 traceGCStart()
1296         }
1297
1298         if mode == gcBackgroundMode {
1299                 gcBgMarkStartWorkers()
1300         }
1301
1302         gcResetMarkState()
1303
1304         work.stwprocs, work.maxprocs = gomaxprocs, gomaxprocs
1305         if work.stwprocs > ncpu {
1306                 // This is used to compute CPU time of the STW phases,
1307                 // so it can't be more than ncpu, even if GOMAXPROCS is.
1308                 work.stwprocs = ncpu
1309         }
1310         work.heap0 = atomic.Load64(&memstats.heap_live)
1311         work.pauseNS = 0
1312         work.mode = mode
1313
1314         now := nanotime()
1315         work.tSweepTerm = now
1316         work.pauseStart = now
1317         if trace.enabled {
1318                 traceGCSTWStart(1)
1319         }
1320         systemstack(stopTheWorldWithSema)
1321         // Finish sweep before we start concurrent scan.
1322         systemstack(func() {
1323                 finishsweep_m()
1324         })
1325         // clearpools before we start the GC. If we wait they memory will not be
1326         // reclaimed until the next GC cycle.
1327         clearpools()
1328
1329         work.cycles++
1330         if mode == gcBackgroundMode { // Do as much work concurrently as possible
1331                 gcController.startCycle()
1332                 work.heapGoal = memstats.next_gc
1333
1334                 // Enter concurrent mark phase and enable
1335                 // write barriers.
1336                 //
1337                 // Because the world is stopped, all Ps will
1338                 // observe that write barriers are enabled by
1339                 // the time we start the world and begin
1340                 // scanning.
1341                 //
1342                 // Write barriers must be enabled before assists are
1343                 // enabled because they must be enabled before
1344                 // any non-leaf heap objects are marked. Since
1345                 // allocations are blocked until assists can
1346                 // happen, we want enable assists as early as
1347                 // possible.
1348                 setGCPhase(_GCmark)
1349
1350                 gcBgMarkPrepare() // Must happen before assist enable.
1351                 gcMarkRootPrepare()
1352
1353                 // Mark all active tinyalloc blocks. Since we're
1354                 // allocating from these, they need to be black like
1355                 // other allocations. The alternative is to blacken
1356                 // the tiny block on every allocation from it, which
1357                 // would slow down the tiny allocator.
1358                 gcMarkTinyAllocs()
1359
1360                 // At this point all Ps have enabled the write
1361                 // barrier, thus maintaining the no white to
1362                 // black invariant. Enable mutator assists to
1363                 // put back-pressure on fast allocating
1364                 // mutators.
1365                 atomic.Store(&gcBlackenEnabled, 1)
1366
1367                 // Assists and workers can start the moment we start
1368                 // the world.
1369                 gcController.markStartTime = now
1370
1371                 // Concurrent mark.
1372                 systemstack(func() {
1373                         now = startTheWorldWithSema(trace.enabled)
1374                 })
1375                 work.pauseNS += now - work.pauseStart
1376                 work.tMark = now
1377         } else {
1378                 if trace.enabled {
1379                         // Switch to mark termination STW.
1380                         traceGCSTWDone()
1381                         traceGCSTWStart(0)
1382                 }
1383                 t := nanotime()
1384                 work.tMark, work.tMarkTerm = t, t
1385                 work.heapGoal = work.heap0
1386
1387                 // Perform mark termination. This will restart the world.
1388                 gcMarkTermination(memstats.triggerRatio)
1389         }
1390
1391         semrelease(&work.startSema)
1392 }
1393
1394 // gcMarkDone transitions the GC from mark 1 to mark 2 and from mark 2
1395 // to mark termination.
1396 //
1397 // This should be called when all mark work has been drained. In mark
1398 // 1, this includes all root marking jobs, global work buffers, and
1399 // active work buffers in assists and background workers; however,
1400 // work may still be cached in per-P work buffers. In mark 2, per-P
1401 // caches are disabled.
1402 //
1403 // The calling context must be preemptible.
1404 //
1405 // Note that it is explicitly okay to have write barriers in this
1406 // function because completion of concurrent mark is best-effort
1407 // anyway. Any work created by write barriers here will be cleaned up
1408 // by mark termination.
1409 func gcMarkDone() {
1410 top:
1411         semacquire(&work.markDoneSema)
1412
1413         // Re-check transition condition under transition lock.
1414         if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
1415                 semrelease(&work.markDoneSema)
1416                 return
1417         }
1418
1419         // Disallow starting new workers so that any remaining workers
1420         // in the current mark phase will drain out.
1421         //
1422         // TODO(austin): Should dedicated workers keep an eye on this
1423         // and exit gcDrain promptly?
1424         atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff)
1425         prevFractionalGoal := gcController.fractionalUtilizationGoal
1426         gcController.fractionalUtilizationGoal = 0
1427
1428         if !gcBlackenPromptly {
1429                 // Transition from mark 1 to mark 2.
1430                 //
1431                 // The global work list is empty, but there can still be work
1432                 // sitting in the per-P work caches.
1433                 // Flush and disable work caches.
1434
1435                 // Disallow caching workbufs and indicate that we're in mark 2.
1436                 gcBlackenPromptly = true
1437
1438                 // Prevent completion of mark 2 until we've flushed
1439                 // cached workbufs.
1440                 atomic.Xadd(&work.nwait, -1)
1441
1442                 // GC is set up for mark 2. Let Gs blocked on the
1443                 // transition lock go while we flush caches.
1444                 semrelease(&work.markDoneSema)
1445
1446                 systemstack(func() {
1447                         // Flush all currently cached workbufs and
1448                         // ensure all Ps see gcBlackenPromptly. This
1449                         // also blocks until any remaining mark 1
1450                         // workers have exited their loop so we can
1451                         // start new mark 2 workers.
1452                         forEachP(func(_p_ *p) {
1453                                 wbBufFlush1(_p_)
1454                                 _p_.gcw.dispose()
1455                         })
1456                 })
1457
1458                 // Check that roots are marked. We should be able to
1459                 // do this before the forEachP, but based on issue
1460                 // #16083 there may be a (harmless) race where we can
1461                 // enter mark 2 while some workers are still scanning
1462                 // stacks. The forEachP ensures these scans are done.
1463                 //
1464                 // TODO(austin): Figure out the race and fix this
1465                 // properly.
1466                 gcMarkRootCheck()
1467
1468                 // Now we can start up mark 2 workers.
1469                 atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff)
1470                 gcController.fractionalUtilizationGoal = prevFractionalGoal
1471
1472                 incnwait := atomic.Xadd(&work.nwait, +1)
1473                 if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
1474                         // This loop will make progress because
1475                         // gcBlackenPromptly is now true, so it won't
1476                         // take this same "if" branch.
1477                         goto top
1478                 }
1479         } else {
1480                 // Transition to mark termination.
1481                 now := nanotime()
1482                 work.tMarkTerm = now
1483                 work.pauseStart = now
1484                 getg().m.preemptoff = "gcing"
1485                 if trace.enabled {
1486                         traceGCSTWStart(0)
1487                 }
1488                 systemstack(stopTheWorldWithSema)
1489                 // The gcphase is _GCmark, it will transition to _GCmarktermination
1490                 // below. The important thing is that the wb remains active until
1491                 // all marking is complete. This includes writes made by the GC.
1492
1493                 // Record that one root marking pass has completed.
1494                 work.markrootDone = true
1495
1496                 // Disable assists and background workers. We must do
1497                 // this before waking blocked assists.
1498                 atomic.Store(&gcBlackenEnabled, 0)
1499
1500                 // Wake all blocked assists. These will run when we
1501                 // start the world again.
1502                 gcWakeAllAssists()
1503
1504                 // Likewise, release the transition lock. Blocked
1505                 // workers and assists will run when we start the
1506                 // world again.
1507                 semrelease(&work.markDoneSema)
1508
1509                 // endCycle depends on all gcWork cache stats being
1510                 // flushed. This is ensured by mark 2.
1511                 nextTriggerRatio := gcController.endCycle()
1512
1513                 // Perform mark termination. This will restart the world.
1514                 gcMarkTermination(nextTriggerRatio)
1515         }
1516 }
1517
1518 func gcMarkTermination(nextTriggerRatio float64) {
1519         // World is stopped.
1520         // Start marktermination which includes enabling the write barrier.
1521         atomic.Store(&gcBlackenEnabled, 0)
1522         gcBlackenPromptly = false
1523         setGCPhase(_GCmarktermination)
1524
1525         work.heap1 = memstats.heap_live
1526         startTime := nanotime()
1527
1528         mp := acquirem()
1529         mp.preemptoff = "gcing"
1530         _g_ := getg()
1531         _g_.m.traceback = 2
1532         gp := _g_.m.curg
1533         casgstatus(gp, _Grunning, _Gwaiting)
1534         gp.waitreason = "garbage collection"
1535
1536         // Run gc on the g0 stack. We do this so that the g stack
1537         // we're currently running on will no longer change. Cuts
1538         // the root set down a bit (g0 stacks are not scanned, and
1539         // we don't need to scan gc's internal state).  We also
1540         // need to switch to g0 so we can shrink the stack.
1541         systemstack(func() {
1542                 gcMark(startTime)
1543                 // Must return immediately.
1544                 // The outer function's stack may have moved
1545                 // during gcMark (it shrinks stacks, including the
1546                 // outer function's stack), so we must not refer
1547                 // to any of its variables. Return back to the
1548                 // non-system stack to pick up the new addresses
1549                 // before continuing.
1550         })
1551
1552         systemstack(func() {
1553                 work.heap2 = work.bytesMarked
1554                 if debug.gccheckmark > 0 {
1555                         // Run a full stop-the-world mark using checkmark bits,
1556                         // to check that we didn't forget to mark anything during
1557                         // the concurrent mark process.
1558                         gcResetMarkState()
1559                         initCheckmarks()
1560                         gcMark(startTime)
1561                         clearCheckmarks()
1562                 }
1563
1564                 // marking is complete so we can turn the write barrier off
1565                 setGCPhase(_GCoff)
1566                 gcSweep(work.mode)
1567
1568                 if debug.gctrace > 1 {
1569                         startTime = nanotime()
1570                         // The g stacks have been scanned so
1571                         // they have gcscanvalid==true and gcworkdone==true.
1572                         // Reset these so that all stacks will be rescanned.
1573                         gcResetMarkState()
1574                         finishsweep_m()
1575
1576                         // Still in STW but gcphase is _GCoff, reset to _GCmarktermination
1577                         // At this point all objects will be found during the gcMark which
1578                         // does a complete STW mark and object scan.
1579                         setGCPhase(_GCmarktermination)
1580                         gcMark(startTime)
1581                         setGCPhase(_GCoff) // marking is done, turn off wb.
1582                         gcSweep(work.mode)
1583                 }
1584         })
1585
1586         _g_.m.traceback = 0
1587         casgstatus(gp, _Gwaiting, _Grunning)
1588
1589         if trace.enabled {
1590                 traceGCDone()
1591         }
1592
1593         // all done
1594         mp.preemptoff = ""
1595
1596         if gcphase != _GCoff {
1597                 throw("gc done but gcphase != _GCoff")
1598         }
1599
1600         // Update GC trigger and pacing for the next cycle.
1601         gcSetTriggerRatio(nextTriggerRatio)
1602
1603         // Update timing memstats
1604         now := nanotime()
1605         sec, nsec, _ := time_now()
1606         unixNow := sec*1e9 + int64(nsec)
1607         work.pauseNS += now - work.pauseStart
1608         work.tEnd = now
1609         atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
1610         atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
1611         memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
1612         memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
1613         memstats.pause_total_ns += uint64(work.pauseNS)
1614
1615         // Update work.totaltime.
1616         sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
1617         // We report idle marking time below, but omit it from the
1618         // overall utilization here since it's "free".
1619         markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
1620         markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
1621         cycleCpu := sweepTermCpu + markCpu + markTermCpu
1622         work.totaltime += cycleCpu
1623
1624         // Compute overall GC CPU utilization.
1625         totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
1626         memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
1627
1628         // Reset sweep state.
1629         sweep.nbgsweep = 0
1630         sweep.npausesweep = 0
1631
1632         if work.userForced {
1633                 memstats.numforcedgc++
1634         }
1635
1636         // Bump GC cycle count and wake goroutines waiting on sweep.
1637         lock(&work.sweepWaiters.lock)
1638         memstats.numgc++
1639         injectglist(work.sweepWaiters.head.ptr())
1640         work.sweepWaiters.head = 0
1641         unlock(&work.sweepWaiters.lock)
1642
1643         // Finish the current heap profiling cycle and start a new
1644         // heap profiling cycle. We do this before starting the world
1645         // so events don't leak into the wrong cycle.
1646         mProf_NextCycle()
1647
1648         systemstack(func() { startTheWorldWithSema(true) })
1649
1650         // Flush the heap profile so we can start a new cycle next GC.
1651         // This is relatively expensive, so we don't do it with the
1652         // world stopped.
1653         mProf_Flush()
1654
1655         // Prepare workbufs for freeing by the sweeper. We do this
1656         // asynchronously because it can take non-trivial time.
1657         prepareFreeWorkbufs()
1658
1659         // Print gctrace before dropping worldsema. As soon as we drop
1660         // worldsema another cycle could start and smash the stats
1661         // we're trying to print.
1662         if debug.gctrace > 0 {
1663                 util := int(memstats.gc_cpu_fraction * 100)
1664
1665                 var sbuf [24]byte
1666                 printlock()
1667                 print("gc ", memstats.numgc,
1668                         " @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
1669                         util, "%: ")
1670                 prev := work.tSweepTerm
1671                 for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
1672                         if i != 0 {
1673                                 print("+")
1674                         }
1675                         print(string(fmtNSAsMS(sbuf[:], uint64(ns-prev))))
1676                         prev = ns
1677                 }
1678                 print(" ms clock, ")
1679                 for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
1680                         if i == 2 || i == 3 {
1681                                 // Separate mark time components with /.
1682                                 print("/")
1683                         } else if i != 0 {
1684                                 print("+")
1685                         }
1686                         print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
1687                 }
1688                 print(" ms cpu, ",
1689                         work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
1690                         work.heapGoal>>20, " MB goal, ",
1691                         work.maxprocs, " P")
1692                 if work.userForced {
1693                         print(" (forced)")
1694                 }
1695                 print("\n")
1696                 printunlock()
1697         }
1698
1699         semrelease(&worldsema)
1700         // Careful: another GC cycle may start now.
1701
1702         releasem(mp)
1703         mp = nil
1704
1705         // now that gc is done, kick off finalizer thread if needed
1706         if !concurrentSweep {
1707                 // give the queued finalizers, if any, a chance to run
1708                 Gosched()
1709         }
1710 }
1711
1712 // gcBgMarkStartWorkers prepares background mark worker goroutines.
1713 // These goroutines will not run until the mark phase, but they must
1714 // be started while the work is not stopped and from a regular G
1715 // stack. The caller must hold worldsema.
1716 func gcBgMarkStartWorkers() {
1717         // Background marking is performed by per-P G's. Ensure that
1718         // each P has a background GC G.
1719         for _, p := range allp {
1720                 if p.gcBgMarkWorker == 0 {
1721                         expectSystemGoroutine()
1722                         go gcBgMarkWorker(p)
1723                         notetsleepg(&work.bgMarkReady, -1)
1724                         noteclear(&work.bgMarkReady)
1725                 }
1726         }
1727 }
1728
1729 // gcBgMarkPrepare sets up state for background marking.
1730 // Mutator assists must not yet be enabled.
1731 func gcBgMarkPrepare() {
1732         // Background marking will stop when the work queues are empty
1733         // and there are no more workers (note that, since this is
1734         // concurrent, this may be a transient state, but mark
1735         // termination will clean it up). Between background workers
1736         // and assists, we don't really know how many workers there
1737         // will be, so we pretend to have an arbitrarily large number
1738         // of workers, almost all of which are "waiting". While a
1739         // worker is working it decrements nwait. If nproc == nwait,
1740         // there are no workers.
1741         work.nproc = ^uint32(0)
1742         work.nwait = ^uint32(0)
1743 }
1744
1745 func gcBgMarkWorker(_p_ *p) {
1746         setSystemGoroutine()
1747
1748         gp := getg()
1749
1750         type parkInfo struct {
1751                 m      muintptr // Release this m on park.
1752                 attach puintptr // If non-nil, attach to this p on park.
1753         }
1754         // We pass park to a gopark unlock function, so it can't be on
1755         // the stack (see gopark). Prevent deadlock from recursively
1756         // starting GC by disabling preemption.
1757         gp.m.preemptoff = "GC worker init"
1758         park := new(parkInfo)
1759         gp.m.preemptoff = ""
1760
1761         park.m.set(acquirem())
1762         park.attach.set(_p_)
1763         // Inform gcBgMarkStartWorkers that this worker is ready.
1764         // After this point, the background mark worker is scheduled
1765         // cooperatively by gcController.findRunnable. Hence, it must
1766         // never be preempted, as this would put it into _Grunnable
1767         // and put it on a run queue. Instead, when the preempt flag
1768         // is set, this puts itself into _Gwaiting to be woken up by
1769         // gcController.findRunnable at the appropriate time.
1770         notewakeup(&work.bgMarkReady)
1771
1772         for {
1773                 // Go to sleep until woken by gcController.findRunnable.
1774                 // We can't releasem yet since even the call to gopark
1775                 // may be preempted.
1776                 gopark(func(g *g, parkp unsafe.Pointer) bool {
1777                         park := (*parkInfo)(parkp)
1778
1779                         // The worker G is no longer running, so it's
1780                         // now safe to allow preemption.
1781                         releasem(park.m.ptr())
1782
1783                         // If the worker isn't attached to its P,
1784                         // attach now. During initialization and after
1785                         // a phase change, the worker may have been
1786                         // running on a different P. As soon as we
1787                         // attach, the owner P may schedule the
1788                         // worker, so this must be done after the G is
1789                         // stopped.
1790                         if park.attach != 0 {
1791                                 p := park.attach.ptr()
1792                                 park.attach.set(nil)
1793                                 // cas the worker because we may be
1794                                 // racing with a new worker starting
1795                                 // on this P.
1796                                 if !p.gcBgMarkWorker.cas(0, guintptr(unsafe.Pointer(g))) {
1797                                         // The P got a new worker.
1798                                         // Exit this worker.
1799                                         return false
1800                                 }
1801                         }
1802                         return true
1803                 }, unsafe.Pointer(park), "GC worker (idle)", traceEvGoBlock, 0)
1804
1805                 // Loop until the P dies and disassociates this
1806                 // worker (the P may later be reused, in which case
1807                 // it will get a new worker) or we failed to associate.
1808                 if _p_.gcBgMarkWorker.ptr() != gp {
1809                         break
1810                 }
1811
1812                 // Disable preemption so we can use the gcw. If the
1813                 // scheduler wants to preempt us, we'll stop draining,
1814                 // dispose the gcw, and then preempt.
1815                 park.m.set(acquirem())
1816
1817                 if gcBlackenEnabled == 0 {
1818                         throw("gcBgMarkWorker: blackening not enabled")
1819                 }
1820
1821                 startTime := nanotime()
1822                 _p_.gcMarkWorkerStartTime = startTime
1823
1824                 decnwait := atomic.Xadd(&work.nwait, -1)
1825                 if decnwait == work.nproc {
1826                         println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
1827                         throw("work.nwait was > work.nproc")
1828                 }
1829
1830                 systemstack(func() {
1831                         // Mark our goroutine preemptible so its stack
1832                         // can be scanned. This lets two mark workers
1833                         // scan each other (otherwise, they would
1834                         // deadlock). We must not modify anything on
1835                         // the G stack. However, stack shrinking is
1836                         // disabled for mark workers, so it is safe to
1837                         // read from the G stack.
1838                         casgstatus(gp, _Grunning, _Gwaiting)
1839                         switch _p_.gcMarkWorkerMode {
1840                         default:
1841                                 throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
1842                         case gcMarkWorkerDedicatedMode:
1843                                 gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
1844                                 if gp.preempt {
1845                                         // We were preempted. This is
1846                                         // a useful signal to kick
1847                                         // everything out of the run
1848                                         // queue so it can run
1849                                         // somewhere else.
1850                                         lock(&sched.lock)
1851                                         for {
1852                                                 gp, _ := runqget(_p_)
1853                                                 if gp == nil {
1854                                                         break
1855                                                 }
1856                                                 globrunqput(gp)
1857                                         }
1858                                         unlock(&sched.lock)
1859                                 }
1860                                 // Go back to draining, this time
1861                                 // without preemption.
1862                                 gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit)
1863                         case gcMarkWorkerFractionalMode:
1864                                 gcDrain(&_p_.gcw, gcDrainFractional|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1865                         case gcMarkWorkerIdleMode:
1866                                 gcDrain(&_p_.gcw, gcDrainIdle|gcDrainUntilPreempt|gcDrainFlushBgCredit)
1867                         }
1868                         casgstatus(gp, _Gwaiting, _Grunning)
1869                 })
1870
1871                 // If we are nearing the end of mark, dispose
1872                 // of the cache promptly. We must do this
1873                 // before signaling that we're no longer
1874                 // working so that other workers can't observe
1875                 // no workers and no work while we have this
1876                 // cached, and before we compute done.
1877                 if gcBlackenPromptly {
1878                         _p_.gcw.dispose()
1879                 }
1880
1881                 // Account for time.
1882                 duration := nanotime() - startTime
1883                 switch _p_.gcMarkWorkerMode {
1884                 case gcMarkWorkerDedicatedMode:
1885                         atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
1886                         atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
1887                 case gcMarkWorkerFractionalMode:
1888                         atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
1889                         atomic.Xaddint64(&_p_.gcFractionalMarkTime, duration)
1890                 case gcMarkWorkerIdleMode:
1891                         atomic.Xaddint64(&gcController.idleMarkTime, duration)
1892                 }
1893
1894                 // Was this the last worker and did we run out
1895                 // of work?
1896                 incnwait := atomic.Xadd(&work.nwait, +1)
1897                 if incnwait > work.nproc {
1898                         println("runtime: p.gcMarkWorkerMode=", _p_.gcMarkWorkerMode,
1899                                 "work.nwait=", incnwait, "work.nproc=", work.nproc)
1900                         throw("work.nwait > work.nproc")
1901                 }
1902
1903                 // If this worker reached a background mark completion
1904                 // point, signal the main GC goroutine.
1905                 if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
1906                         // Make this G preemptible and disassociate it
1907                         // as the worker for this P so
1908                         // findRunnableGCWorker doesn't try to
1909                         // schedule it.
1910                         _p_.gcBgMarkWorker.set(nil)
1911                         releasem(park.m.ptr())
1912
1913                         gcMarkDone()
1914
1915                         // Disable preemption and prepare to reattach
1916                         // to the P.
1917                         //
1918                         // We may be running on a different P at this
1919                         // point, so we can't reattach until this G is
1920                         // parked.
1921                         park.m.set(acquirem())
1922                         park.attach.set(_p_)
1923                 }
1924         }
1925 }
1926
1927 // gcMarkWorkAvailable returns true if executing a mark worker
1928 // on p is potentially useful. p may be nil, in which case it only
1929 // checks the global sources of work.
1930 func gcMarkWorkAvailable(p *p) bool {
1931         if p != nil && !p.gcw.empty() {
1932                 return true
1933         }
1934         if !work.full.empty() {
1935                 return true // global work available
1936         }
1937         if work.markrootNext < work.markrootJobs {
1938                 return true // root scan work available
1939         }
1940         return false
1941 }
1942
1943 // gcMark runs the mark (or, for concurrent GC, mark termination)
1944 // All gcWork caches must be empty.
1945 // STW is in effect at this point.
1946 //TODO go:nowritebarrier
1947 func gcMark(start_time int64) {
1948         if debug.allocfreetrace > 0 {
1949                 tracegc()
1950         }
1951
1952         if gcphase != _GCmarktermination {
1953                 throw("in gcMark expecting to see gcphase as _GCmarktermination")
1954         }
1955         work.tstart = start_time
1956
1957         // Queue root marking jobs.
1958         gcMarkRootPrepare()
1959
1960         work.nwait = 0
1961         work.ndone = 0
1962         work.nproc = uint32(gcprocs())
1963
1964         if work.full == 0 && work.nDataRoots+work.nSpanRoots+work.nStackRoots == 0 {
1965                 // There's no work on the work queue and no root jobs
1966                 // that can produce work, so don't bother entering the
1967                 // getfull() barrier.
1968                 //
1969                 // This will be the situation the vast majority of the
1970                 // time after concurrent mark. However, we still need
1971                 // a fallback for STW GC and because there are some
1972                 // known races that occasionally leave work around for
1973                 // mark termination.
1974                 //
1975                 // We're still hedging our bets here: if we do
1976                 // accidentally produce some work, we'll still process
1977                 // it, just not necessarily in parallel.
1978                 //
1979                 // TODO(austin): Fix the races and and remove
1980                 // work draining from mark termination so we don't
1981                 // need the fallback path.
1982                 work.helperDrainBlock = false
1983         } else {
1984                 work.helperDrainBlock = true
1985         }
1986
1987         if work.nproc > 1 {
1988                 noteclear(&work.alldone)
1989                 helpgc(int32(work.nproc))
1990         }
1991
1992         gchelperstart()
1993
1994         gcw := &getg().m.p.ptr().gcw
1995         if work.helperDrainBlock {
1996                 gcDrain(gcw, gcDrainBlock)
1997         } else {
1998                 gcDrain(gcw, gcDrainNoBlock)
1999         }
2000         gcw.dispose()
2001
2002         if debug.gccheckmark > 0 {
2003                 // This is expensive when there's a large number of
2004                 // Gs, so only do it if checkmark is also enabled.
2005                 gcMarkRootCheck()
2006         }
2007         if work.full != 0 {
2008                 throw("work.full != 0")
2009         }
2010
2011         if work.nproc > 1 {
2012                 notesleep(&work.alldone)
2013         }
2014
2015         // Record that at least one root marking pass has completed.
2016         work.markrootDone = true
2017
2018         // Double-check that all gcWork caches are empty. This should
2019         // be ensured by mark 2 before we enter mark termination.
2020         for _, p := range allp {
2021                 gcw := &p.gcw
2022                 if !gcw.empty() {
2023                         throw("P has cached GC work at end of mark termination")
2024                 }
2025                 if gcw.scanWork != 0 || gcw.bytesMarked != 0 {
2026                         throw("P has unflushed stats at end of mark termination")
2027                 }
2028         }
2029
2030         cachestats()
2031
2032         // Update the marked heap stat.
2033         memstats.heap_marked = work.bytesMarked
2034
2035         // Update other GC heap size stats. This must happen after
2036         // cachestats (which flushes local statistics to these) and
2037         // flushallmcaches (which modifies heap_live).
2038         memstats.heap_live = work.bytesMarked
2039         memstats.heap_scan = uint64(gcController.scanWork)
2040
2041         if trace.enabled {
2042                 traceHeapAlloc()
2043         }
2044 }
2045
2046 func gcSweep(mode gcMode) {
2047         if gcphase != _GCoff {
2048                 throw("gcSweep being done but phase is not GCoff")
2049         }
2050
2051         lock(&mheap_.lock)
2052         mheap_.sweepgen += 2
2053         mheap_.sweepdone = 0
2054         if mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 {
2055                 // We should have drained this list during the last
2056                 // sweep phase. We certainly need to start this phase
2057                 // with an empty swept list.
2058                 throw("non-empty swept list")
2059         }
2060         mheap_.pagesSwept = 0
2061         unlock(&mheap_.lock)
2062
2063         if !_ConcurrentSweep || mode == gcForceBlockMode {
2064                 // Special case synchronous sweep.
2065                 // Record that no proportional sweeping has to happen.
2066                 lock(&mheap_.lock)
2067                 mheap_.sweepPagesPerByte = 0
2068                 unlock(&mheap_.lock)
2069                 // Sweep all spans eagerly.
2070                 for sweepone() != ^uintptr(0) {
2071                         sweep.npausesweep++
2072                 }
2073                 // Free workbufs eagerly.
2074                 prepareFreeWorkbufs()
2075                 for freeSomeWbufs(false) {
2076                 }
2077                 // All "free" events for this mark/sweep cycle have
2078                 // now happened, so we can make this profile cycle
2079                 // available immediately.
2080                 mProf_NextCycle()
2081                 mProf_Flush()
2082                 return
2083         }
2084
2085         // Background sweep.
2086         lock(&sweep.lock)
2087         if sweep.parked {
2088                 sweep.parked = false
2089                 ready(sweep.g, 0, true)
2090         }
2091         unlock(&sweep.lock)
2092 }
2093
2094 // gcResetMarkState resets global state prior to marking (concurrent
2095 // or STW) and resets the stack scan state of all Gs.
2096 //
2097 // This is safe to do without the world stopped because any Gs created
2098 // during or after this will start out in the reset state.
2099 func gcResetMarkState() {
2100         // This may be called during a concurrent phase, so make sure
2101         // allgs doesn't change.
2102         lock(&allglock)
2103         for _, gp := range allgs {
2104                 gp.gcscandone = false  // set to true in gcphasework
2105                 gp.gcscanvalid = false // stack has not been scanned
2106                 gp.gcAssistBytes = 0
2107         }
2108         unlock(&allglock)
2109
2110         work.bytesMarked = 0
2111         work.initialHeapLive = atomic.Load64(&memstats.heap_live)
2112         work.markrootDone = false
2113 }
2114
2115 // Hooks for other packages
2116
2117 var poolcleanup func()
2118
2119 //go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
2120 func sync_runtime_registerPoolCleanup(f func()) {
2121         poolcleanup = f
2122 }
2123
2124 func clearpools() {
2125         // clear sync.Pools
2126         if poolcleanup != nil {
2127                 poolcleanup()
2128         }
2129
2130         // Clear central sudog cache.
2131         // Leave per-P caches alone, they have strictly bounded size.
2132         // Disconnect cached list before dropping it on the floor,
2133         // so that a dangling ref to one entry does not pin all of them.
2134         lock(&sched.sudoglock)
2135         var sg, sgnext *sudog
2136         for sg = sched.sudogcache; sg != nil; sg = sgnext {
2137                 sgnext = sg.next
2138                 sg.next = nil
2139         }
2140         sched.sudogcache = nil
2141         unlock(&sched.sudoglock)
2142
2143         // Clear central defer pools.
2144         // Leave per-P pools alone, they have strictly bounded size.
2145         lock(&sched.deferlock)
2146         // disconnect cached list before dropping it on the floor,
2147         // so that a dangling ref to one entry does not pin all of them.
2148         var d, dlink *_defer
2149         for d = sched.deferpool; d != nil; d = dlink {
2150                 dlink = d.link
2151                 d.link = nil
2152         }
2153         sched.deferpool = nil
2154         unlock(&sched.deferlock)
2155 }
2156
2157 // gchelper runs mark termination tasks on Ps other than the P
2158 // coordinating mark termination.
2159 //
2160 // The caller is responsible for ensuring that this has a P to run on,
2161 // even though it's running during STW. Because of this, it's allowed
2162 // to have write barriers.
2163 //
2164 //go:yeswritebarrierrec
2165 func gchelper() {
2166         _g_ := getg()
2167         _g_.m.traceback = 2
2168         gchelperstart()
2169
2170         // Parallel mark over GC roots and heap
2171         if gcphase == _GCmarktermination {
2172                 gcw := &_g_.m.p.ptr().gcw
2173                 if work.helperDrainBlock {
2174                         gcDrain(gcw, gcDrainBlock) // blocks in getfull
2175                 } else {
2176                         gcDrain(gcw, gcDrainNoBlock)
2177                 }
2178                 gcw.dispose()
2179         }
2180
2181         nproc := atomic.Load(&work.nproc) // work.nproc can change right after we increment work.ndone
2182         if atomic.Xadd(&work.ndone, +1) == nproc-1 {
2183                 notewakeup(&work.alldone)
2184         }
2185         _g_.m.traceback = 0
2186 }
2187
2188 func gchelperstart() {
2189         _g_ := getg()
2190
2191         if _g_.m.helpgc < 0 || _g_.m.helpgc >= _MaxGcproc {
2192                 throw("gchelperstart: bad m->helpgc")
2193         }
2194         if _g_ != _g_.m.g0 {
2195                 throw("gchelper not running on g0 stack")
2196         }
2197 }
2198
2199 // Timing
2200
2201 // itoaDiv formats val/(10**dec) into buf.
2202 func itoaDiv(buf []byte, val uint64, dec int) []byte {
2203         i := len(buf) - 1
2204         idec := i - dec
2205         for val >= 10 || i >= idec {
2206                 buf[i] = byte(val%10 + '0')
2207                 i--
2208                 if i == idec {
2209                         buf[i] = '.'
2210                         i--
2211                 }
2212                 val /= 10
2213         }
2214         buf[i] = byte(val + '0')
2215         return buf[i:]
2216 }
2217
2218 // fmtNSAsMS nicely formats ns nanoseconds as milliseconds.
2219 func fmtNSAsMS(buf []byte, ns uint64) []byte {
2220         if ns >= 10e6 {
2221                 // Format as whole milliseconds.
2222                 return itoaDiv(buf, ns/1e6, 0)
2223         }
2224         // Format two digits of precision, with at most three decimal places.
2225         x := ns / 1e3
2226         if x == 0 {
2227                 buf[0] = '0'
2228                 return buf[:1]
2229         }
2230         dec := 3
2231         for x >= 100 {
2232                 x /= 10
2233                 dec--
2234         }
2235         return itoaDiv(buf, x, dec)
2236 }