libgo/go/runtime/mbitmap.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Garbage collector: type and heap bitmaps.
   6 //
   7 // Stack, data, and bss bitmaps
   8 //
   9 // Stack frames and global variables in the data and bss sections are described
  10 // by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
  11 // to be visited during GC. The bits in each byte are consumed starting with
  12 // the low bit: 1<<0, 1<<1, and so on.
  13 //
  14 // Heap bitmap
  15 //
  16 // The allocated heap comes from a subset of the memory in the range [start, used),
  17 // where start == mheap_.arena_start and used == mheap_.arena_used.
  18 // The heap bitmap comprises 2 bits for each pointer-sized word in that range,
  19 // stored in bytes indexed backward in memory from start.
  20 // That is, the byte at address start-1 holds the 2-bit entries for the four words
  21 // start through start+3*ptrSize, the byte at start-2 holds the entries for
  22 // start+4*ptrSize through start+7*ptrSize, and so on.
  23 //
  24 // In each 2-bit entry, the lower bit holds the same information as in the 1-bit
  25 // bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
  26 // The meaning of the high bit depends on the position of the word being described
  27 // in its allocated object. In all words *except* the second word, the
  28 // high bit indicates that the object is still being described. In
  29 // these words, if a bit pair with a high bit 0 is encountered, the
  30 // low bit can also be assumed to be 0, and the object description is
  31 // over. This 00 is called the ``dead'' encoding: it signals that the
  32 // rest of the words in the object are uninteresting to the garbage
  33 // collector.
  34 //
  35 // In the second word, the high bit is the GC ``checkmarked'' bit (see below).
  36 //
  37 // The 2-bit entries are split when written into the byte, so that the top half
  38 // of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
  39 // bits.
  40 // This form allows a copy from the 1-bit to the 4-bit form to keep the
  41 // pointer bits contiguous, instead of having to space them out.
  42 //
  43 // The code makes use of the fact that the zero value for a heap bitmap
  44 // has no live pointer bit set and is (depending on position), not used,
  45 // not checkmarked, and is the dead encoding.
  46 // These properties must be preserved when modifying the encoding.
  47 //
  48 // The bitmap for noscan spans is not maintained. Code must ensure
  49 // that an object is scannable before consulting its bitmap by
  50 // checking either the noscan bit in the span or by consulting its
  51 // type's information.
  52 //
  53 // Checkmarks
  54 //
  55 // In a concurrent garbage collector, one worries about failing to mark
  56 // a live object due to mutations without write barriers or bugs in the
  57 // collector implementation. As a sanity check, the GC has a 'checkmark'
  58 // mode that retraverses the object graph with the world stopped, to make
  59 // sure that everything that should be marked is marked.
  60 // In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
  61 // for the second word of the object holds the checkmark bit.
  62 // When not in checkmark mode, this bit is set to 1.
  63 //
  64 // The smallest possible allocation is 8 bytes. On a 32-bit machine, that
  65 // means every allocated object has two words, so there is room for the
  66 // checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
  67 // just one word, so the second bit pair is not available for encoding the
  68 // checkmark. However, because non-pointer allocations are combined
  69 // into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
  70 // must be a pointer, so the type bit in the first word is not actually needed.
  71 // It is still used in general, except in checkmark the type bit is repurposed
  72 // as the checkmark bit and then reinitialized (to 1) as the type bit when
  73 // finished.
  74 //
  75
  76 package runtime
  77
  78 import (
  79         "runtime/internal/atomic"
  80         "runtime/internal/sys"
  81         "unsafe"
  82 )
  83
  84 const (
  85         bitPointer = 1 << 0
  86         bitScan    = 1 << 4
  87
  88         heapBitsShift   = 1                     // shift offset between successive bitPointer or bitScan entries
  89         heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
  90
  91         // all scan/pointer bits in a byte
  92         bitScanAll    = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
  93         bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
  94 )
  95
  96 // addb returns the byte pointer p+n.
  97 //go:nowritebarrier
  98 //go:nosplit
  99 func addb(p *byte, n uintptr) *byte {
 100         // Note: wrote out full expression instead of calling add(p, n)
 101         // to reduce the number of temporaries generated by the
 102         // compiler for this trivial expression during inlining.
 103         return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
 104 }
 105
 106 // subtractb returns the byte pointer p-n.
 107 // subtractb is typically used when traversing the pointer tables referred to by hbits
 108 // which are arranged in reverse order.
 109 //go:nowritebarrier
 110 //go:nosplit
 111 func subtractb(p *byte, n uintptr) *byte {
 112         // Note: wrote out full expression instead of calling add(p, -n)
 113         // to reduce the number of temporaries generated by the
 114         // compiler for this trivial expression during inlining.
 115         return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
 116 }
 117
 118 // add1 returns the byte pointer p+1.
 119 //go:nowritebarrier
 120 //go:nosplit
 121 func add1(p *byte) *byte {
 122         // Note: wrote out full expression instead of calling addb(p, 1)
 123         // to reduce the number of temporaries generated by the
 124         // compiler for this trivial expression during inlining.
 125         return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
 126 }
 127
 128 // subtract1 returns the byte pointer p-1.
 129 // subtract1 is typically used when traversing the pointer tables referred to by hbits
 130 // which are arranged in reverse order.
 131 //go:nowritebarrier
 132 //
 133 // nosplit because it is used during write barriers and must not be preempted.
 134 //go:nosplit
 135 func subtract1(p *byte) *byte {
 136         // Note: wrote out full expression instead of calling subtractb(p, 1)
 137         // to reduce the number of temporaries generated by the
 138         // compiler for this trivial expression during inlining.
 139         return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
 140 }
 141
 142 // mapBits maps any additional bitmap memory needed for the new arena memory.
 143 //
 144 // Don't call this directly. Call mheap.setArenaUsed.
 145 //
 146 //go:nowritebarrier
 147 func (h *mheap) mapBits(arena_used uintptr) {
 148         // Caller has added extra mappings to the arena.
 149         // Add extra mappings of bitmap words as needed.
 150         // We allocate extra bitmap pieces in chunks of bitmapChunk.
 151         const bitmapChunk = 8192
 152
 153         n := (arena_used - mheap_.arena_start) / heapBitmapScale
 154         n = round(n, bitmapChunk)
 155         n = round(n, physPageSize)
 156         if h.bitmap_mapped >= n {
 157                 return
 158         }
 159
 160         sysMap(unsafe.Pointer(h.bitmap-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
 161         h.bitmap_mapped = n
 162 }
 163
 164 // heapBits provides access to the bitmap bits for a single heap word.
 165 // The methods on heapBits take value receivers so that the compiler
 166 // can more easily inline calls to those methods and registerize the
 167 // struct fields independently.
 168 type heapBits struct {
 169         bitp  *uint8
 170         shift uint32
 171 }
 172
 173 // markBits provides access to the mark bit for an object in the heap.
 174 // bytep points to the byte holding the mark bit.
 175 // mask is a byte with a single bit set that can be &ed with *bytep
 176 // to see if the bit has been set.
 177 // *m.byte&m.mask != 0 indicates the mark bit is set.
 178 // index can be used along with span information to generate
 179 // the address of the object in the heap.
 180 // We maintain one set of mark bits for allocation and one for
 181 // marking purposes.
 182 type markBits struct {
 183         bytep *uint8
 184         mask  uint8
 185         index uintptr
 186 }
 187
 188 //go:nosplit
 189 func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
 190         bytep, mask := s.allocBits.bitp(allocBitIndex)
 191         return markBits{bytep, mask, allocBitIndex}
 192 }
 193
 194 // refillaCache takes 8 bytes s.allocBits starting at whichByte
 195 // and negates them so that ctz (count trailing zeros) instructions
 196 // can be used. It then places these 8 bytes into the cached 64 bit
 197 // s.allocCache.
 198 func (s *mspan) refillAllocCache(whichByte uintptr) {
 199         bytes := (*[8]uint8)(unsafe.Pointer(s.allocBits.bytep(whichByte)))
 200         aCache := uint64(0)
 201         aCache |= uint64(bytes[0])
 202         aCache |= uint64(bytes[1]) << (1 * 8)
 203         aCache |= uint64(bytes[2]) << (2 * 8)
 204         aCache |= uint64(bytes[3]) << (3 * 8)
 205         aCache |= uint64(bytes[4]) << (4 * 8)
 206         aCache |= uint64(bytes[5]) << (5 * 8)
 207         aCache |= uint64(bytes[6]) << (6 * 8)
 208         aCache |= uint64(bytes[7]) << (7 * 8)
 209         s.allocCache = ^aCache
 210 }
 211
 212 // nextFreeIndex returns the index of the next free object in s at
 213 // or after s.freeindex.
 214 // There are hardware instructions that can be used to make this
 215 // faster if profiling warrants it.
 216 func (s *mspan) nextFreeIndex() uintptr {
 217         sfreeindex := s.freeindex
 218         snelems := s.nelems
 219         if sfreeindex == snelems {
 220                 return sfreeindex
 221         }
 222         if sfreeindex > snelems {
 223                 throw("s.freeindex > s.nelems")
 224         }
 225
 226         aCache := s.allocCache
 227
 228         bitIndex := sys.Ctz64(aCache)
 229         for bitIndex == 64 {
 230                 // Move index to start of next cached bits.
 231                 sfreeindex = (sfreeindex + 64) &^ (64 - 1)
 232                 if sfreeindex >= snelems {
 233                         s.freeindex = snelems
 234                         return snelems
 235                 }
 236                 whichByte := sfreeindex / 8
 237                 // Refill s.allocCache with the next 64 alloc bits.
 238                 s.refillAllocCache(whichByte)
 239                 aCache = s.allocCache
 240                 bitIndex = sys.Ctz64(aCache)
 241                 // nothing available in cached bits
 242                 // grab the next 8 bytes and try again.
 243         }
 244         result := sfreeindex + uintptr(bitIndex)
 245         if result >= snelems {
 246                 s.freeindex = snelems
 247                 return snelems
 248         }
 249
 250         s.allocCache >>= uint(bitIndex + 1)
 251         sfreeindex = result + 1
 252
 253         if sfreeindex%64 == 0 && sfreeindex != snelems {
 254                 // We just incremented s.freeindex so it isn't 0.
 255                 // As each 1 in s.allocCache was encountered and used for allocation
 256                 // it was shifted away. At this point s.allocCache contains all 0s.
 257                 // Refill s.allocCache so that it corresponds
 258                 // to the bits at s.allocBits starting at s.freeindex.
 259                 whichByte := sfreeindex / 8
 260                 s.refillAllocCache(whichByte)
 261         }
 262         s.freeindex = sfreeindex
 263         return result
 264 }
 265
 266 // isFree returns whether the index'th object in s is unallocated.
 267 func (s *mspan) isFree(index uintptr) bool {
 268         if index < s.freeindex {
 269                 return false
 270         }
 271         bytep, mask := s.allocBits.bitp(index)
 272         return *bytep&mask == 0
 273 }
 274
 275 func (s *mspan) objIndex(p uintptr) uintptr {
 276         byteOffset := p - s.base()
 277         if byteOffset == 0 {
 278                 return 0
 279         }
 280         if s.baseMask != 0 {
 281                 // s.baseMask is 0, elemsize is a power of two, so shift by s.divShift
 282                 return byteOffset >> s.divShift
 283         }
 284         return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
 285 }
 286
 287 func markBitsForAddr(p uintptr) markBits {
 288         s := spanOf(p)
 289         objIndex := s.objIndex(p)
 290         return s.markBitsForIndex(objIndex)
 291 }
 292
 293 func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
 294         bytep, mask := s.gcmarkBits.bitp(objIndex)
 295         return markBits{bytep, mask, objIndex}
 296 }
 297
 298 func (s *mspan) markBitsForBase() markBits {
 299         return markBits{(*uint8)(s.gcmarkBits), uint8(1), 0}
 300 }
 301
 302 // isMarked reports whether mark bit m is set.
 303 func (m markBits) isMarked() bool {
 304         return *m.bytep&m.mask != 0
 305 }
 306
 307 // setMarked sets the marked bit in the markbits, atomically. Some compilers
 308 // are not able to inline atomic.Or8 function so if it appears as a hot spot consider
 309 // inlining it manually.
 310 func (m markBits) setMarked() {
 311         // Might be racing with other updates, so use atomic update always.
 312         // We used to be clever here and use a non-atomic update in certain
 313         // cases, but it's not worth the risk.
 314         atomic.Or8(m.bytep, m.mask)
 315 }
 316
 317 // setMarkedNonAtomic sets the marked bit in the markbits, non-atomically.
 318 func (m markBits) setMarkedNonAtomic() {
 319         *m.bytep |= m.mask
 320 }
 321
 322 // clearMarked clears the marked bit in the markbits, atomically.
 323 func (m markBits) clearMarked() {
 324         // Might be racing with other updates, so use atomic update always.
 325         // We used to be clever here and use a non-atomic update in certain
 326         // cases, but it's not worth the risk.
 327         atomic.And8(m.bytep, ^m.mask)
 328 }
 329
 330 // markBitsForSpan returns the markBits for the span base address base.
 331 func markBitsForSpan(base uintptr) (mbits markBits) {
 332         if base < mheap_.arena_start || base >= mheap_.arena_used {
 333                 throw("markBitsForSpan: base out of range")
 334         }
 335         mbits = markBitsForAddr(base)
 336         if mbits.mask != 1 {
 337                 throw("markBitsForSpan: unaligned start")
 338         }
 339         return mbits
 340 }
 341
 342 // advance advances the markBits to the next object in the span.
 343 func (m *markBits) advance() {
 344         if m.mask == 1<<7 {
 345                 m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
 346                 m.mask = 1
 347         } else {
 348                 m.mask = m.mask << 1
 349         }
 350         m.index++
 351 }
 352
 353 // heapBitsForAddr returns the heapBits for the address addr.
 354 // The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
 355 //
 356 // nosplit because it is used during write barriers and must not be preempted.
 357 //go:nosplit
 358 func heapBitsForAddr(addr uintptr) heapBits {
 359         // 2 bits per work, 4 pairs per byte, and a mask is hard coded.
 360         off := (addr - mheap_.arena_start) / sys.PtrSize
 361         return heapBits{(*uint8)(unsafe.Pointer(mheap_.bitmap - off/4 - 1)), uint32(off & 3)}
 362 }
 363
 364 // heapBitsForSpan returns the heapBits for the span base address base.
 365 func heapBitsForSpan(base uintptr) (hbits heapBits) {
 366         if base < mheap_.arena_start || base >= mheap_.arena_used {
 367                 print("runtime: base ", hex(base), " not in range [", hex(mheap_.arena_start), ",", hex(mheap_.arena_used), ")\n")
 368                 throw("heapBitsForSpan: base out of range")
 369         }
 370         return heapBitsForAddr(base)
 371 }
 372
 373 // heapBitsForObject returns the base address for the heap object
 374 // containing the address p, the heapBits for base,
 375 // the object's span, and of the index of the object in s.
 376 // If p does not point into a heap object,
 377 // return base == 0
 378 // otherwise return the base of the object.
 379 //
 380 // For gccgo, the forStack parameter is true if the value came from the stack.
 381 // The stack is collected conservatively and may contain invalid pointers.
 382 //
 383 // refBase and refOff optionally give the base address of the object
 384 // in which the pointer p was found and the byte offset at which it
 385 // was found. These are used for error reporting.
 386 func heapBitsForObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) {
 387         arenaStart := mheap_.arena_start
 388         if p < arenaStart || p >= mheap_.arena_used {
 389                 return
 390         }
 391         off := p - arenaStart
 392         idx := off >> _PageShift
 393         // p points into the heap, but possibly to the middle of an object.
 394         // Consult the span table to find the block beginning.
 395         s = mheap_.spans[idx]
 396         if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
 397                 if s == nil || s.state == _MSpanManual || forStack {
 398                         // If s is nil, the virtual address has never been part of the heap.
 399                         // This pointer may be to some mmap'd region, so we allow it.
 400                         // Pointers into stacks are also ok, the runtime manages these explicitly.
 401                         return
 402                 }
 403
 404                 // The following ensures that we are rigorous about what data
 405                 // structures hold valid pointers.
 406                 if debug.invalidptr != 0 {
 407                         // Typically this indicates an incorrect use
 408                         // of unsafe or cgo to store a bad pointer in
 409                         // the Go heap. It may also indicate a runtime
 410                         // bug.
 411                         //
 412                         // TODO(austin): We could be more aggressive
 413                         // and detect pointers to unallocated objects
 414                         // in allocated spans.
 415                         printlock()
 416                         print("runtime: pointer ", hex(p))
 417                         if s.state != mSpanInUse {
 418                                 print(" to unallocated span")
 419                         } else {
 420                                 print(" to unused region of span")
 421                         }
 422                         print(" idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
 423                         if refBase != 0 {
 424                                 print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
 425                                 gcDumpObject("object", refBase, refOff)
 426                         }
 427                         getg().m.traceback = 2
 428                         throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
 429                 }
 430                 return
 431         }
 432
 433         if forStack {
 434                 // A span can be entered in mheap_.spans, and be set
 435                 // to mSpanInUse, before it is fully initialized.
 436                 // All we need in practice is allocBits and gcmarkBits,
 437                 // so make sure they are set.
 438                 if s.allocBits == nil || s.gcmarkBits == nil {
 439                         return
 440                 }
 441         }
 442
 443         // If this span holds object of a power of 2 size, just mask off the bits to
 444         // the interior of the object. Otherwise use the size to get the base.
 445         if s.baseMask != 0 {
 446                 // optimize for power of 2 sized objects.
 447                 base = s.base()
 448                 base = base + (p-base)&uintptr(s.baseMask)
 449                 objIndex = (base - s.base()) >> s.divShift
 450                 // base = p & s.baseMask is faster for small spans,
 451                 // but doesn't work for large spans.
 452                 // Overall, it's faster to use the more general computation above.
 453         } else {
 454                 base = s.base()
 455                 if p-base >= s.elemsize {
 456                         // n := (p - base) / s.elemsize, using division by multiplication
 457                         objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
 458                         base += objIndex * s.elemsize
 459                 }
 460         }
 461         // Now that we know the actual base, compute heapBits to return to caller.
 462         hbits = heapBitsForAddr(base)
 463         return
 464 }
 465
 466 // next returns the heapBits describing the next pointer-sized word in memory.
 467 // That is, if h describes address p, h.next() describes p+ptrSize.
 468 // Note that next does not modify h. The caller must record the result.
 469 //
 470 // nosplit because it is used during write barriers and must not be preempted.
 471 //go:nosplit
 472 func (h heapBits) next() heapBits {
 473         if h.shift < 3*heapBitsShift {
 474                 return heapBits{h.bitp, h.shift + heapBitsShift}
 475         }
 476         return heapBits{subtract1(h.bitp), 0}
 477 }
 478
 479 // forward returns the heapBits describing n pointer-sized words ahead of h in memory.
 480 // That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
 481 // h.forward(1) is equivalent to h.next(), just slower.
 482 // Note that forward does not modify h. The caller must record the result.
 483 // bits returns the heap bits for the current word.
 484 func (h heapBits) forward(n uintptr) heapBits {
 485         n += uintptr(h.shift) / heapBitsShift
 486         return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsShift}
 487 }
 488
 489 // The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
 490 // The result includes in its higher bits the bits for subsequent words
 491 // described by the same bitmap byte.
 492 //
 493 // nosplit because it is used during write barriers and must not be preempted.
 494 //go:nosplit
 495 func (h heapBits) bits() uint32 {
 496         // The (shift & 31) eliminates a test and conditional branch
 497         // from the generated code.
 498         return uint32(*h.bitp) >> (h.shift & 31)
 499 }
 500
 501 // morePointers returns true if this word and all remaining words in this object
 502 // are scalars.
 503 // h must not describe the second word of the object.
 504 func (h heapBits) morePointers() bool {
 505         return h.bits()&bitScan != 0
 506 }
 507
 508 // isPointer reports whether the heap bits describe a pointer word.
 509 //
 510 // nosplit because it is used during write barriers and must not be preempted.
 511 //go:nosplit
 512 func (h heapBits) isPointer() bool {
 513         return h.bits()&bitPointer != 0
 514 }
 515
 516 // isCheckmarked reports whether the heap bits have the checkmarked bit set.
 517 // It must be told how large the object at h is, because the encoding of the
 518 // checkmark bit varies by size.
 519 // h must describe the initial word of the object.
 520 func (h heapBits) isCheckmarked(size uintptr) bool {
 521         if size == sys.PtrSize {
 522                 return (*h.bitp>>h.shift)&bitPointer != 0
 523         }
 524         // All multiword objects are 2-word aligned,
 525         // so we know that the initial word's 2-bit pair
 526         // and the second word's 2-bit pair are in the
 527         // same heap bitmap byte, *h.bitp.
 528         return (*h.bitp>>(heapBitsShift+h.shift))&bitScan != 0
 529 }
 530
 531 // setCheckmarked sets the checkmarked bit.
 532 // It must be told how large the object at h is, because the encoding of the
 533 // checkmark bit varies by size.
 534 // h must describe the initial word of the object.
 535 func (h heapBits) setCheckmarked(size uintptr) {
 536         if size == sys.PtrSize {
 537                 atomic.Or8(h.bitp, bitPointer<<h.shift)
 538                 return
 539         }
 540         atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
 541 }
 542
 543 // bulkBarrierPreWrite executes a write barrier
 544 // for every pointer slot in the memory range [src, src+size),
 545 // using pointer/scalar information from [dst, dst+size).
 546 // This executes the write barriers necessary before a memmove.
 547 // src, dst, and size must be pointer-aligned.
 548 // The range [dst, dst+size) must lie within a single object.
 549 // It does not perform the actual writes.
 550 //
 551 // As a special case, src == 0 indicates that this is being used for a
 552 // memclr. bulkBarrierPreWrite will pass 0 for the src of each write
 553 // barrier.
 554 //
 555 // Callers should call bulkBarrierPreWrite immediately before
 556 // calling memmove(dst, src, size). This function is marked nosplit
 557 // to avoid being preempted; the GC must not stop the goroutine
 558 // between the memmove and the execution of the barriers.
 559 // The caller is also responsible for cgo pointer checks if this
 560 // may be writing Go pointers into non-Go memory.
 561 //
 562 // The pointer bitmap is not maintained for allocations containing
 563 // no pointers at all; any caller of bulkBarrierPreWrite must first
 564 // make sure the underlying allocation contains pointers, usually
 565 // by checking typ.kind&kindNoPointers.
 566 //
 567 //go:nosplit
 568 func bulkBarrierPreWrite(dst, src, size uintptr) {
 569         if (dst|src|size)&(sys.PtrSize-1) != 0 {
 570                 throw("bulkBarrierPreWrite: unaligned arguments")
 571         }
 572         if !writeBarrier.needed {
 573                 return
 574         }
 575         if !inheap(dst) {
 576                 // If dst is a global, use the data or BSS bitmaps to
 577                 // execute write barriers.
 578                 roots := gcRoots
 579                 for roots != nil {
 580                         for i := 0; i < roots.count; i++ {
 581                                 pr := roots.roots[i]
 582                                 addr := uintptr(pr.decl)
 583                                 if addr <= dst && dst < addr+pr.size {
 584                                         if dst < addr+pr.ptrdata {
 585                                                 bulkBarrierBitmap(dst, src, size, dst-addr, pr.gcdata)
 586                                         }
 587                                         return
 588                                 }
 589                         }
 590                         roots = roots.next
 591                 }
 592                 return
 593         }
 594
 595         buf := &getg().m.p.ptr().wbBuf
 596         h := heapBitsForAddr(dst)
 597         if src == 0 {
 598                 for i := uintptr(0); i < size; i += sys.PtrSize {
 599                         if h.isPointer() {
 600                                 dstx := (*uintptr)(unsafe.Pointer(dst + i))
 601                                 if !buf.putFast(*dstx, 0) {
 602                                         wbBufFlush(nil, 0)
 603                                 }
 604                         }
 605                         h = h.next()
 606                 }
 607         } else {
 608                 for i := uintptr(0); i < size; i += sys.PtrSize {
 609                         if h.isPointer() {
 610                                 dstx := (*uintptr)(unsafe.Pointer(dst + i))
 611                                 srcx := (*uintptr)(unsafe.Pointer(src + i))
 612                                 if !buf.putFast(*dstx, *srcx) {
 613                                         wbBufFlush(nil, 0)
 614                                 }
 615                         }
 616                         h = h.next()
 617                 }
 618         }
 619 }
 620
 621 // bulkBarrierBitmap executes write barriers for copying from [src,
 622 // src+size) to [dst, dst+size) using a 1-bit pointer bitmap. src is
 623 // assumed to start maskOffset bytes into the data covered by the
 624 // bitmap in bits (which may not be a multiple of 8).
 625 //
 626 // This is used by bulkBarrierPreWrite for writes to data and BSS.
 627 //
 628 //go:nosplit
 629 func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
 630         word := maskOffset / sys.PtrSize
 631         bits = addb(bits, word/8)
 632         mask := uint8(1) << (word % 8)
 633
 634         buf := &getg().m.p.ptr().wbBuf
 635         for i := uintptr(0); i < size; i += sys.PtrSize {
 636                 if mask == 0 {
 637                         bits = addb(bits, 1)
 638                         if *bits == 0 {
 639                                 // Skip 8 words.
 640                                 i += 7 * sys.PtrSize
 641                                 continue
 642                         }
 643                         mask = 1
 644                 }
 645                 if *bits&mask != 0 {
 646                         dstx := (*uintptr)(unsafe.Pointer(dst + i))
 647                         if src == 0 {
 648                                 if !buf.putFast(*dstx, 0) {
 649                                         wbBufFlush(nil, 0)
 650                                 }
 651                         } else {
 652                                 srcx := (*uintptr)(unsafe.Pointer(src + i))
 653                                 if !buf.putFast(*dstx, *srcx) {
 654                                         wbBufFlush(nil, 0)
 655                                 }
 656                         }
 657                 }
 658                 mask <<= 1
 659         }
 660 }
 661
 662 // typeBitsBulkBarrier executes writebarrierptr_prewrite for every
 663 // pointer that would be copied from [src, src+size) to [dst,
 664 // dst+size) by a memmove using the type bitmap to locate those
 665 // pointer slots.
 666 //
 667 // The type typ must correspond exactly to [src, src+size) and [dst, dst+size).
 668 // dst, src, and size must be pointer-aligned.
 669 // The type typ must have a plain bitmap, not a GC program.
 670 // The only use of this function is in channel sends, and the
 671 // 64 kB channel element limit takes care of this for us.
 672 //
 673 // Must not be preempted because it typically runs right before memmove,
 674 // and the GC must observe them as an atomic action.
 675 //
 676 //go:nosplit
 677 func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
 678         if typ == nil {
 679                 throw("runtime: typeBitsBulkBarrier without type")
 680         }
 681         if typ.size != size {
 682                 println("runtime: typeBitsBulkBarrier with type ", *typ.string, " of size ", typ.size, " but memory size", size)
 683                 throw("runtime: invalid typeBitsBulkBarrier")
 684         }
 685         if typ.kind&kindGCProg != 0 {
 686                 println("runtime: typeBitsBulkBarrier with type ", *typ.string, " with GC prog")
 687                 throw("runtime: invalid typeBitsBulkBarrier")
 688         }
 689         if !writeBarrier.needed {
 690                 return
 691         }
 692         ptrmask := typ.gcdata
 693         var bits uint32
 694         for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
 695                 if i&(sys.PtrSize*8-1) == 0 {
 696                         bits = uint32(*ptrmask)
 697                         ptrmask = addb(ptrmask, 1)
 698                 } else {
 699                         bits = bits >> 1
 700                 }
 701                 if bits&1 != 0 {
 702                         dstx := (*uintptr)(unsafe.Pointer(dst + i))
 703                         srcx := (*uintptr)(unsafe.Pointer(src + i))
 704                         writebarrierptr_prewrite(dstx, *srcx)
 705                 }
 706         }
 707 }
 708
 709 // The methods operating on spans all require that h has been returned
 710 // by heapBitsForSpan and that size, n, total are the span layout description
 711 // returned by the mspan's layout method.
 712 // If total > size*n, it means that there is extra leftover memory in the span,
 713 // usually due to rounding.
 714 //
 715 // TODO(rsc): Perhaps introduce a different heapBitsSpan type.
 716
 717 // initSpan initializes the heap bitmap for a span.
 718 // It clears all checkmark bits.
 719 // If this is a span of pointer-sized objects, it initializes all
 720 // words to pointer/scan.
 721 // Otherwise, it initializes all words to scalar/dead.
 722 func (h heapBits) initSpan(s *mspan) {
 723         size, n, total := s.layout()
 724
 725         // Init the markbit structures
 726         s.freeindex = 0
 727         s.allocCache = ^uint64(0) // all 1s indicating all free.
 728         s.nelems = n
 729         s.allocBits = nil
 730         s.gcmarkBits = nil
 731         s.gcmarkBits = newMarkBits(s.nelems)
 732         s.allocBits = newAllocBits(s.nelems)
 733
 734         // Clear bits corresponding to objects.
 735         if total%heapBitmapScale != 0 {
 736                 throw("initSpan: unaligned length")
 737         }
 738         nbyte := total / heapBitmapScale
 739         if sys.PtrSize == 8 && size == sys.PtrSize {
 740                 end := h.bitp
 741                 bitp := subtractb(end, nbyte-1)
 742                 for {
 743                         *bitp = bitPointerAll | bitScanAll
 744                         if bitp == end {
 745                                 break
 746                         }
 747                         bitp = add1(bitp)
 748                 }
 749                 return
 750         }
 751         memclrNoHeapPointers(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
 752 }
 753
 754 // initCheckmarkSpan initializes a span for being checkmarked.
 755 // It clears the checkmark bits, which are set to 1 in normal operation.
 756 func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
 757         // The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
 758         if sys.PtrSize == 8 && size == sys.PtrSize {
 759                 // Checkmark bit is type bit, bottom bit of every 2-bit entry.
 760                 // Only possible on 64-bit system, since minimum size is 8.
 761                 // Must clear type bit (checkmark bit) of every word.
 762                 // The type bit is the lower of every two-bit pair.
 763                 bitp := h.bitp
 764                 for i := uintptr(0); i < n; i += 4 {
 765                         *bitp &^= bitPointerAll
 766                         bitp = subtract1(bitp)
 767                 }
 768                 return
 769         }
 770         for i := uintptr(0); i < n; i++ {
 771                 *h.bitp &^= bitScan << (heapBitsShift + h.shift)
 772                 h = h.forward(size / sys.PtrSize)
 773         }
 774 }
 775
 776 // clearCheckmarkSpan undoes all the checkmarking in a span.
 777 // The actual checkmark bits are ignored, so the only work to do
 778 // is to fix the pointer bits. (Pointer bits are ignored by scanobject
 779 // but consulted by typedmemmove.)
 780 func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
 781         // The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
 782         if sys.PtrSize == 8 && size == sys.PtrSize {
 783                 // Checkmark bit is type bit, bottom bit of every 2-bit entry.
 784                 // Only possible on 64-bit system, since minimum size is 8.
 785                 // Must clear type bit (checkmark bit) of every word.
 786                 // The type bit is the lower of every two-bit pair.
 787                 bitp := h.bitp
 788                 for i := uintptr(0); i < n; i += 4 {
 789                         *bitp |= bitPointerAll
 790                         bitp = subtract1(bitp)
 791                 }
 792         }
 793 }
 794
 795 // oneBitCount is indexed by byte and produces the
 796 // number of 1 bits in that byte. For example 128 has 1 bit set
 797 // and oneBitCount[128] will holds 1.
 798 var oneBitCount = [256]uint8{
 799         0, 1, 1, 2, 1, 2, 2, 3,
 800         1, 2, 2, 3, 2, 3, 3, 4,
 801         1, 2, 2, 3, 2, 3, 3, 4,
 802         2, 3, 3, 4, 3, 4, 4, 5,
 803         1, 2, 2, 3, 2, 3, 3, 4,
 804         2, 3, 3, 4, 3, 4, 4, 5,
 805         2, 3, 3, 4, 3, 4, 4, 5,
 806         3, 4, 4, 5, 4, 5, 5, 6,
 807         1, 2, 2, 3, 2, 3, 3, 4,
 808         2, 3, 3, 4, 3, 4, 4, 5,
 809         2, 3, 3, 4, 3, 4, 4, 5,
 810         3, 4, 4, 5, 4, 5, 5, 6,
 811         2, 3, 3, 4, 3, 4, 4, 5,
 812         3, 4, 4, 5, 4, 5, 5, 6,
 813         3, 4, 4, 5, 4, 5, 5, 6,
 814         4, 5, 5, 6, 5, 6, 6, 7,
 815         1, 2, 2, 3, 2, 3, 3, 4,
 816         2, 3, 3, 4, 3, 4, 4, 5,
 817         2, 3, 3, 4, 3, 4, 4, 5,
 818         3, 4, 4, 5, 4, 5, 5, 6,
 819         2, 3, 3, 4, 3, 4, 4, 5,
 820         3, 4, 4, 5, 4, 5, 5, 6,
 821         3, 4, 4, 5, 4, 5, 5, 6,
 822         4, 5, 5, 6, 5, 6, 6, 7,
 823         2, 3, 3, 4, 3, 4, 4, 5,
 824         3, 4, 4, 5, 4, 5, 5, 6,
 825         3, 4, 4, 5, 4, 5, 5, 6,
 826         4, 5, 5, 6, 5, 6, 6, 7,
 827         3, 4, 4, 5, 4, 5, 5, 6,
 828         4, 5, 5, 6, 5, 6, 6, 7,
 829         4, 5, 5, 6, 5, 6, 6, 7,
 830         5, 6, 6, 7, 6, 7, 7, 8}
 831
 832 // countAlloc returns the number of objects allocated in span s by
 833 // scanning the allocation bitmap.
 834 // TODO:(rlh) Use popcount intrinsic.
 835 func (s *mspan) countAlloc() int {
 836         count := 0
 837         maxIndex := s.nelems / 8
 838         for i := uintptr(0); i < maxIndex; i++ {
 839                 mrkBits := *s.gcmarkBits.bytep(i)
 840                 count += int(oneBitCount[mrkBits])
 841         }
 842         if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
 843                 mrkBits := *s.gcmarkBits.bytep(maxIndex)
 844                 mask := uint8((1 << bitsInLastByte) - 1)
 845                 bits := mrkBits & mask
 846                 count += int(oneBitCount[bits])
 847         }
 848         return count
 849 }
 850
 851 // heapBitsSetType records that the new allocation [x, x+size)
 852 // holds in [x, x+dataSize) one or more values of type typ.
 853 // (The number of values is given by dataSize / typ.size.)
 854 // If dataSize < size, the fragment [x+dataSize, x+size) is
 855 // recorded as non-pointer data.
 856 // It is known that the type has pointers somewhere;
 857 // malloc does not call heapBitsSetType when there are no pointers,
 858 // because all free objects are marked as noscan during
 859 // heapBitsSweepSpan.
 860 //
 861 // There can only be one allocation from a given span active at a time,
 862 // and the bitmap for a span always falls on byte boundaries,
 863 // so there are no write-write races for access to the heap bitmap.
 864 // Hence, heapBitsSetType can access the bitmap without atomics.
 865 //
 866 // There can be read-write races between heapBitsSetType and things
 867 // that read the heap bitmap like scanobject. However, since
 868 // heapBitsSetType is only used for objects that have not yet been
 869 // made reachable, readers will ignore bits being modified by this
 870 // function. This does mean this function cannot transiently modify
 871 // bits that belong to neighboring objects. Also, on weakly-ordered
 872 // machines, callers must execute a store/store (publication) barrier
 873 // between calling this function and making the object reachable.
 874 func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 875         const doubleCheck = false // slow but helpful; enable to test modifications to this code
 876
 877         // dataSize is always size rounded up to the next malloc size class,
 878         // except in the case of allocating a defer block, in which case
 879         // size is sizeof(_defer{}) (at least 6 words) and dataSize may be
 880         // arbitrarily larger.
 881         //
 882         // The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
 883         // assume that dataSize == size without checking it explicitly.
 884
 885         if sys.PtrSize == 8 && size == sys.PtrSize {
 886                 // It's one word and it has pointers, it must be a pointer.
 887                 // Since all allocated one-word objects are pointers
 888                 // (non-pointers are aggregated into tinySize allocations),
 889                 // initSpan sets the pointer bits for us. Nothing to do here.
 890                 if doubleCheck {
 891                         h := heapBitsForAddr(x)
 892                         if !h.isPointer() {
 893                                 throw("heapBitsSetType: pointer bit missing")
 894                         }
 895                         if !h.morePointers() {
 896                                 throw("heapBitsSetType: scan bit missing")
 897                         }
 898                 }
 899                 return
 900         }
 901
 902         h := heapBitsForAddr(x)
 903         ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)
 904
 905         // Heap bitmap bits for 2-word object are only 4 bits,
 906         // so also shared with objects next to it.
 907         // This is called out as a special case primarily for 32-bit systems,
 908         // so that on 32-bit systems the code below can assume all objects
 909         // are 4-word aligned (because they're all 16-byte aligned).
 910         if size == 2*sys.PtrSize {
 911                 if typ.size == sys.PtrSize {
 912                         // We're allocating a block big enough to hold two pointers.
 913                         // On 64-bit, that means the actual object must be two pointers,
 914                         // or else we'd have used the one-pointer-sized block.
 915                         // On 32-bit, however, this is the 8-byte block, the smallest one.
 916                         // So it could be that we're allocating one pointer and this was
 917                         // just the smallest block available. Distinguish by checking dataSize.
 918                         // (In general the number of instances of typ being allocated is
 919                         // dataSize/typ.size.)
 920                         if sys.PtrSize == 4 && dataSize == sys.PtrSize {
 921                                 // 1 pointer object. On 32-bit machines clear the bit for the
 922                                 // unused second word.
 923                                 *h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
 924                                 *h.bitp |= (bitPointer | bitScan) << h.shift
 925                         } else {
 926                                 // 2-element slice of pointer.
 927                                 *h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
 928                         }
 929                         return
 930                 }
 931                 // Otherwise typ.size must be 2*sys.PtrSize,
 932                 // and typ.kind&kindGCProg == 0.
 933                 if doubleCheck {
 934                         if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
 935                                 print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
 936                                 throw("heapBitsSetType")
 937                         }
 938                 }
 939                 b := uint32(*ptrmask)
 940                 hb := (b & 3) | bitScan
 941                 // bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
 942                 // 110011 is shifted h.shift and complemented.
 943                 // This clears out the bits that are about to be
 944                 // ored into *h.hbitp in the next instructions.
 945                 *h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
 946                 *h.bitp |= uint8(hb << h.shift)
 947                 return
 948         }
 949
 950         // Copy from 1-bit ptrmask into 2-bit bitmap.
 951         // The basic approach is to use a single uintptr as a bit buffer,
 952         // alternating between reloading the buffer and writing bitmap bytes.
 953         // In general, one load can supply two bitmap byte writes.
 954         // This is a lot of lines of code, but it compiles into relatively few
 955         // machine instructions.
 956
 957         var (
 958                 // Ptrmask input.
 959                 p     *byte   // last ptrmask byte read
 960                 b     uintptr // ptrmask bits already loaded
 961                 nb    uintptr // number of bits in b at next read
 962                 endp  *byte   // final ptrmask byte to read (then repeat)
 963                 endnb uintptr // number of valid bits in *endp
 964                 pbits uintptr // alternate source of bits
 965
 966                 // Heap bitmap output.
 967                 w     uintptr // words processed
 968                 nw    uintptr // number of words to process
 969                 hbitp *byte   // next heap bitmap byte to write
 970                 hb    uintptr // bits being prepared for *hbitp
 971         )
 972
 973         hbitp = h.bitp
 974
 975         // Handle GC program. Delayed until this part of the code
 976         // so that we can use the same double-checking mechanism
 977         // as the 1-bit case. Nothing above could have encountered
 978         // GC programs: the cases were all too small.
 979         if typ.kind&kindGCProg != 0 {
 980                 heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
 981                 if doubleCheck {
 982                         // Double-check the heap bits written by GC program
 983                         // by running the GC program to create a 1-bit pointer mask
 984                         // and then jumping to the double-check code below.
 985                         // This doesn't catch bugs shared between the 1-bit and 4-bit
 986                         // GC program execution, but it does catch mistakes specific
 987                         // to just one of those and bugs in heapBitsSetTypeGCProg's
 988                         // implementation of arrays.
 989                         lock(&debugPtrmask.lock)
 990                         if debugPtrmask.data == nil {
 991                                 debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
 992                         }
 993                         ptrmask = debugPtrmask.data
 994                         runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
 995                         goto Phase4
 996                 }
 997                 return
 998         }
 999
1000         // Note about sizes:
1001         //
1002         // typ.size is the number of words in the object,
1003         // and typ.ptrdata is the number of words in the prefix
1004         // of the object that contains pointers. That is, the final
1005         // typ.size - typ.ptrdata words contain no pointers.
1006         // This allows optimization of a common pattern where
1007         // an object has a small header followed by a large scalar
1008         // buffer. If we know the pointers are over, we don't have
1009         // to scan the buffer's heap bitmap at all.
1010         // The 1-bit ptrmasks are sized to contain only bits for
1011         // the typ.ptrdata prefix, zero padded out to a full byte
1012         // of bitmap. This code sets nw (below) so that heap bitmap
1013         // bits are only written for the typ.ptrdata prefix; if there is
1014         // more room in the allocated object, the next heap bitmap
1015         // entry is a 00, indicating that there are no more pointers
1016         // to scan. So only the ptrmask for the ptrdata bytes is needed.
1017         //
1018         // Replicated copies are not as nice: if there is an array of
1019         // objects with scalar tails, all but the last tail does have to
1020         // be initialized, because there is no way to say "skip forward".
1021         // However, because of the possibility of a repeated type with
1022         // size not a multiple of 4 pointers (one heap bitmap byte),
1023         // the code already must handle the last ptrmask byte specially
1024         // by treating it as containing only the bits for endnb pointers,
1025         // where endnb <= 4. We represent large scalar tails that must
1026         // be expanded in the replication by setting endnb larger than 4.
1027         // This will have the effect of reading many bits out of b,
1028         // but once the real bits are shifted out, b will supply as many
1029         // zero bits as we try to read, which is exactly what we need.
1030
1031         p = ptrmask
1032         if typ.size < dataSize {
1033                 // Filling in bits for an array of typ.
1034                 // Set up for repetition of ptrmask during main loop.
1035                 // Note that ptrmask describes only a prefix of
1036                 const maxBits = sys.PtrSize*8 - 7
1037                 if typ.ptrdata/sys.PtrSize <= maxBits {
1038                         // Entire ptrmask fits in uintptr with room for a byte fragment.
1039                         // Load into pbits and never read from ptrmask again.
1040                         // This is especially important when the ptrmask has
1041                         // fewer than 8 bits in it; otherwise the reload in the middle
1042                         // of the Phase 2 loop would itself need to loop to gather
1043                         // at least 8 bits.
1044
1045                         // Accumulate ptrmask into b.
1046                         // ptrmask is sized to describe only typ.ptrdata, but we record
1047                         // it as describing typ.size bytes, since all the high bits are zero.
1048                         nb = typ.ptrdata / sys.PtrSize
1049                         for i := uintptr(0); i < nb; i += 8 {
1050                                 b |= uintptr(*p) << i
1051                                 p = add1(p)
1052                         }
1053                         nb = typ.size / sys.PtrSize
1054
1055                         // Replicate ptrmask to fill entire pbits uintptr.
1056                         // Doubling and truncating is fewer steps than
1057                         // iterating by nb each time. (nb could be 1.)
1058                         // Since we loaded typ.ptrdata/sys.PtrSize bits
1059                         // but are pretending to have typ.size/sys.PtrSize,
1060                         // there might be no replication necessary/possible.
1061                         pbits = b
1062                         endnb = nb
1063                         if nb+nb <= maxBits {
1064                                 for endnb <= sys.PtrSize*8 {
1065                                         pbits |= pbits << endnb
1066                                         endnb += endnb
1067                                 }
1068                                 // Truncate to a multiple of original ptrmask.
1069                                 // Because nb+nb <= maxBits, nb fits in a byte.
1070                                 // Byte division is cheaper than uintptr division.
1071                                 endnb = uintptr(maxBits/byte(nb)) * nb
1072                                 pbits &= 1<<endnb - 1
1073                                 b = pbits
1074                                 nb = endnb
1075                         }
1076
1077                         // Clear p and endp as sentinel for using pbits.
1078                         // Checked during Phase 2 loop.
1079                         p = nil
1080                         endp = nil
1081                 } else {
1082                         // Ptrmask is larger. Read it multiple times.
1083                         n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
1084                         endp = addb(ptrmask, n)
1085                         endnb = typ.size/sys.PtrSize - n*8
1086                 }
1087         }
1088         if p != nil {
1089                 b = uintptr(*p)
1090                 p = add1(p)
1091                 nb = 8
1092         }
1093
1094         if typ.size == dataSize {
1095                 // Single entry: can stop once we reach the non-pointer data.
1096                 nw = typ.ptrdata / sys.PtrSize
1097         } else {
1098                 // Repeated instances of typ in an array.
1099                 // Have to process first N-1 entries in full, but can stop
1100                 // once we reach the non-pointer data in the final entry.
1101                 nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
1102         }
1103         if nw == 0 {
1104                 // No pointers! Caller was supposed to check.
1105                 println("runtime: invalid type ", *typ.string)
1106                 throw("heapBitsSetType: called with non-pointer type")
1107                 return
1108         }
1109         if nw < 2 {
1110                 // Must write at least 2 words, because the "no scan"
1111                 // encoding doesn't take effect until the third word.
1112                 nw = 2
1113         }
1114
1115         // Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
1116         // The leading byte is special because it contains the bits for word 1,
1117         // which does not have the scan bit set.
1118         // The leading half-byte is special because it's a half a byte,
1119         // so we have to be careful with the bits already there.
1120         switch {
1121         default:
1122                 throw("heapBitsSetType: unexpected shift")
1123
1124         case h.shift == 0:
1125                 // Ptrmask and heap bitmap are aligned.
1126                 // Handle first byte of bitmap specially.
1127                 //
1128                 // The first byte we write out covers the first four
1129                 // words of the object. The scan/dead bit on the first
1130                 // word must be set to scan since there are pointers
1131                 // somewhere in the object. The scan/dead bit on the
1132                 // second word is the checkmark, so we don't set it.
1133                 // In all following words, we set the scan/dead
1134                 // appropriately to indicate that the object contains
1135                 // to the next 2-bit entry in the bitmap.
1136                 //
1137                 // TODO: It doesn't matter if we set the checkmark, so
1138                 // maybe this case isn't needed any more.
1139                 hb = b & bitPointerAll
1140                 hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
1141                 if w += 4; w >= nw {
1142                         goto Phase3
1143                 }
1144                 *hbitp = uint8(hb)
1145                 hbitp = subtract1(hbitp)
1146                 b >>= 4
1147                 nb -= 4
1148
1149         case sys.PtrSize == 8 && h.shift == 2:
1150                 // Ptrmask and heap bitmap are misaligned.
1151                 // The bits for the first two words are in a byte shared
1152                 // with another object, so we must be careful with the bits
1153                 // already there.
1154                 // We took care of 1-word and 2-word objects above,
1155                 // so this is at least a 6-word object.
1156                 hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
1157                 // This is not noscan, so set the scan bit in the
1158                 // first word.
1159                 hb |= bitScan << (2 * heapBitsShift)
1160                 b >>= 2
1161                 nb -= 2
1162                 // Note: no bitScan for second word because that's
1163                 // the checkmark.
1164                 *hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
1165                 *hbitp |= uint8(hb)
1166                 hbitp = subtract1(hbitp)
1167                 if w += 2; w >= nw {
1168                         // We know that there is more data, because we handled 2-word objects above.
1169                         // This must be at least a 6-word object. If we're out of pointer words,
1170                         // mark no scan in next bitmap byte and finish.
1171                         hb = 0
1172                         w += 4
1173                         goto Phase3
1174                 }
1175         }
1176
1177         // Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
1178         // The loop computes the bits for that last write but does not execute the write;
1179         // it leaves the bits in hb for processing by phase 3.
1180         // To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
1181         // use in the first half of the loop right now, and then we only adjust nb explicitly
1182         // if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
1183         nb -= 4
1184         for {
1185                 // Emit bitmap byte.
1186                 // b has at least nb+4 bits, with one exception:
1187                 // if w+4 >= nw, then b has only nw-w bits,
1188                 // but we'll stop at the break and then truncate
1189                 // appropriately in Phase 3.
1190                 hb = b & bitPointerAll
1191                 hb |= bitScanAll
1192                 if w += 4; w >= nw {
1193                         break
1194                 }
1195                 *hbitp = uint8(hb)
1196                 hbitp = subtract1(hbitp)
1197                 b >>= 4
1198
1199                 // Load more bits. b has nb right now.
1200                 if p != endp {
1201                         // Fast path: keep reading from ptrmask.
1202                         // nb unmodified: we just loaded 8 bits,
1203                         // and the next iteration will consume 8 bits,
1204                         // leaving us with the same nb the next time we're here.
1205                         if nb < 8 {
1206                                 b |= uintptr(*p) << nb
1207                                 p = add1(p)
1208                         } else {
1209                                 // Reduce the number of bits in b.
1210                                 // This is important if we skipped
1211                                 // over a scalar tail, since nb could
1212                                 // be larger than the bit width of b.
1213                                 nb -= 8
1214                         }
1215                 } else if p == nil {
1216                         // Almost as fast path: track bit count and refill from pbits.
1217                         // For short repetitions.
1218                         if nb < 8 {
1219                                 b |= pbits << nb
1220                                 nb += endnb
1221                         }
1222                         nb -= 8 // for next iteration
1223                 } else {
1224                         // Slow path: reached end of ptrmask.
1225                         // Process final partial byte and rewind to start.
1226                         b |= uintptr(*p) << nb
1227                         nb += endnb
1228                         if nb < 8 {
1229                                 b |= uintptr(*ptrmask) << nb
1230                                 p = add1(ptrmask)
1231                         } else {
1232                                 nb -= 8
1233                                 p = ptrmask
1234                         }
1235                 }
1236
1237                 // Emit bitmap byte.
1238                 hb = b & bitPointerAll
1239                 hb |= bitScanAll
1240                 if w += 4; w >= nw {
1241                         break
1242                 }
1243                 *hbitp = uint8(hb)
1244                 hbitp = subtract1(hbitp)
1245                 b >>= 4
1246         }
1247
1248 Phase3:
1249         // Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
1250         if w > nw {
1251                 // Counting the 4 entries in hb not yet written to memory,
1252                 // there are more entries than possible pointer slots.
1253                 // Discard the excess entries (can't be more than 3).
1254                 mask := uintptr(1)<<(4-(w-nw)) - 1
1255                 hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
1256         }
1257
1258         // Change nw from counting possibly-pointer words to total words in allocation.
1259         nw = size / sys.PtrSize
1260
1261         // Write whole bitmap bytes.
1262         // The first is hb, the rest are zero.
1263         if w <= nw {
1264                 *hbitp = uint8(hb)
1265                 hbitp = subtract1(hbitp)
1266                 hb = 0 // for possible final half-byte below
1267                 for w += 4; w <= nw; w += 4 {
1268                         *hbitp = 0
1269                         hbitp = subtract1(hbitp)
1270                 }
1271         }
1272
1273         // Write final partial bitmap byte if any.
1274         // We know w > nw, or else we'd still be in the loop above.
1275         // It can be bigger only due to the 4 entries in hb that it counts.
1276         // If w == nw+4 then there's nothing left to do: we wrote all nw entries
1277         // and can discard the 4 sitting in hb.
1278         // But if w == nw+2, we need to write first two in hb.
1279         // The byte is shared with the next object, so be careful with
1280         // existing bits.
1281         if w == nw+2 {
1282                 *hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
1283         }
1284
1285 Phase4:
1286         // Phase 4: all done, but perhaps double check.
1287         if doubleCheck {
1288                 end := heapBitsForAddr(x + size)
1289                 if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
1290                         println("ended at wrong bitmap byte for", *typ.string, "x", dataSize/typ.size)
1291                         print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
1292                         print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
1293                         h0 := heapBitsForAddr(x)
1294                         print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
1295                         print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
1296                         throw("bad heapBitsSetType")
1297                 }
1298
1299                 // Double-check that bits to be written were written correctly.
1300                 // Does not check that other bits were not written, unfortunately.
1301                 h := heapBitsForAddr(x)
1302                 nptr := typ.ptrdata / sys.PtrSize
1303                 ndata := typ.size / sys.PtrSize
1304                 count := dataSize / typ.size
1305                 totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
1306                 for i := uintptr(0); i < size/sys.PtrSize; i++ {
1307                         j := i % ndata
1308                         var have, want uint8
1309                         have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
1310                         if i >= totalptr {
1311                                 want = 0 // deadmarker
1312                                 if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
1313                                         want = bitScan
1314                                 }
1315                         } else {
1316                                 if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
1317                                         want |= bitPointer
1318                                 }
1319                                 if i != 1 {
1320                                         want |= bitScan
1321                                 } else {
1322                                         have &^= bitScan
1323                                 }
1324                         }
1325                         if have != want {
1326                                 println("mismatch writing bits for", *typ.string, "x", dataSize/typ.size)
1327                                 print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
1328                                 print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
1329                                 print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
1330                                 h0 := heapBitsForAddr(x)
1331                                 print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
1332                                 print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
1333                                 print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
1334                                 println("at word", i, "offset", i*sys.PtrSize, "have", have, "want", want)
1335                                 if typ.kind&kindGCProg != 0 {
1336                                         println("GC program:")
1337                                         dumpGCProg(addb(typ.gcdata, 4))
1338                                 }
1339                                 throw("bad heapBitsSetType")
1340                         }
1341                         h = h.next()
1342                 }
1343                 if ptrmask == debugPtrmask.data {
1344                         unlock(&debugPtrmask.lock)
1345                 }
1346         }
1347 }
1348
1349 var debugPtrmask struct {
1350         lock mutex
1351         data *byte
1352 }
1353
1354 // heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
1355 // progSize is the size of the memory described by the program.
1356 // elemSize is the size of the element that the GC program describes (a prefix of).
1357 // dataSize is the total size of the intended data, a multiple of elemSize.
1358 // allocSize is the total size of the allocated memory.
1359 //
1360 // GC programs are only used for large allocations.
1361 // heapBitsSetType requires that allocSize is a multiple of 4 words,
1362 // so that the relevant bitmap bytes are not shared with surrounding
1363 // objects.
1364 func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
1365         if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
1366                 // Alignment will be wrong.
1367                 throw("heapBitsSetTypeGCProg: small allocation")
1368         }
1369         var totalBits uintptr
1370         if elemSize == dataSize {
1371                 totalBits = runGCProg(prog, nil, h.bitp, 2)
1372                 if totalBits*sys.PtrSize != progSize {
1373                         println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
1374                         throw("heapBitsSetTypeGCProg: unexpected bit count")
1375                 }
1376         } else {
1377                 count := dataSize / elemSize
1378
1379                 // Piece together program trailer to run after prog that does:
1380                 //      literal(0)
1381                 //      repeat(1, elemSize-progSize-1) // zeros to fill element size
1382                 //      repeat(elemSize, count-1) // repeat that element for count
1383                 // This zero-pads the data remaining in the first element and then
1384                 // repeats that first element to fill the array.
1385                 var trailer [40]byte // 3 varints (max 10 each) + some bytes
1386                 i := 0
1387                 if n := elemSize/sys.PtrSize - progSize/sys.PtrSize; n > 0 {
1388                         // literal(0)
1389                         trailer[i] = 0x01
1390                         i++
1391                         trailer[i] = 0
1392                         i++
1393                         if n > 1 {
1394                                 // repeat(1, n-1)
1395                                 trailer[i] = 0x81
1396                                 i++
1397                                 n--
1398                                 for ; n >= 0x80; n >>= 7 {
1399                                         trailer[i] = byte(n | 0x80)
1400                                         i++
1401                                 }
1402                                 trailer[i] = byte(n)
1403                                 i++
1404                         }
1405                 }
1406                 // repeat(elemSize/ptrSize, count-1)
1407                 trailer[i] = 0x80
1408                 i++
1409                 n := elemSize / sys.PtrSize
1410                 for ; n >= 0x80; n >>= 7 {
1411                         trailer[i] = byte(n | 0x80)
1412                         i++
1413                 }
1414                 trailer[i] = byte(n)
1415                 i++
1416                 n = count - 1
1417                 for ; n >= 0x80; n >>= 7 {
1418                         trailer[i] = byte(n | 0x80)
1419                         i++
1420                 }
1421                 trailer[i] = byte(n)
1422                 i++
1423                 trailer[i] = 0
1424                 i++
1425
1426                 runGCProg(prog, &trailer[0], h.bitp, 2)
1427
1428                 // Even though we filled in the full array just now,
1429                 // record that we only filled in up to the ptrdata of the
1430                 // last element. This will cause the code below to
1431                 // memclr the dead section of the final array element,
1432                 // so that scanobject can stop early in the final element.
1433                 totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
1434         }
1435         endProg := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
1436         endAlloc := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
1437         memclrNoHeapPointers(add(endAlloc, 1), uintptr(endProg)-uintptr(endAlloc))
1438 }
1439
1440 // progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
1441 // size the size of the region described by prog, in bytes.
1442 // The resulting bitvector will have no more than size/sys.PtrSize bits.
1443 func progToPointerMask(prog *byte, size uintptr) bitvector {
1444         n := (size/sys.PtrSize + 7) / 8
1445         x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
1446         x[len(x)-1] = 0xa1 // overflow check sentinel
1447         n = runGCProg(prog, nil, &x[0], 1)
1448         if x[len(x)-1] != 0xa1 {
1449                 throw("progToPointerMask: overflow")
1450         }
1451         return bitvector{int32(n), &x[0]}
1452 }
1453
1454 // Packed GC pointer bitmaps, aka GC programs.
1455 //
1456 // For large types containing arrays, the type information has a
1457 // natural repetition that can be encoded to save space in the
1458 // binary and in the memory representation of the type information.
1459 //
1460 // The encoding is a simple Lempel-Ziv style bytecode machine
1461 // with the following instructions:
1462 //
1463 //      00000000: stop
1464 //      0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
1465 //      10000000 n c: repeat the previous n bits c times; n, c are varints
1466 //      1nnnnnnn c: repeat the previous n bits c times; c is a varint
1467
1468 // runGCProg executes the GC program prog, and then trailer if non-nil,
1469 // writing to dst with entries of the given size.
1470 // If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
1471 // If size == 2, dst is the 2-bit heap bitmap, and writes move backward
1472 // starting at dst (because the heap bitmap does). In this case, the caller guarantees
1473 // that only whole bytes in dst need to be written.
1474 //
1475 // runGCProg returns the number of 1- or 2-bit entries written to memory.
1476 func runGCProg(prog, trailer, dst *byte, size int) uintptr {
1477         dstStart := dst
1478
1479         // Bits waiting to be written to memory.
1480         var bits uintptr
1481         var nbits uintptr
1482
1483         p := prog
1484 Run:
1485         for {
1486                 // Flush accumulated full bytes.
1487                 // The rest of the loop assumes that nbits <= 7.
1488                 for ; nbits >= 8; nbits -= 8 {
1489                         if size == 1 {
1490                                 *dst = uint8(bits)
1491                                 dst = add1(dst)
1492                                 bits >>= 8
1493                         } else {
1494                                 v := bits&bitPointerAll | bitScanAll
1495                                 *dst = uint8(v)
1496                                 dst = subtract1(dst)
1497                                 bits >>= 4
1498                                 v = bits&bitPointerAll | bitScanAll
1499                                 *dst = uint8(v)
1500                                 dst = subtract1(dst)
1501                                 bits >>= 4
1502                         }
1503                 }
1504
1505                 // Process one instruction.
1506                 inst := uintptr(*p)
1507                 p = add1(p)
1508                 n := inst & 0x7F
1509                 if inst&0x80 == 0 {
1510                         // Literal bits; n == 0 means end of program.
1511                         if n == 0 {
1512                                 // Program is over; continue in trailer if present.
1513                                 if trailer != nil {
1514                                         //println("trailer")
1515                                         p = trailer
1516                                         trailer = nil
1517                                         continue
1518                                 }
1519                                 //println("done")
1520                                 break Run
1521                         }
1522                         //println("lit", n, dst)
1523                         nbyte := n / 8
1524                         for i := uintptr(0); i < nbyte; i++ {
1525                                 bits |= uintptr(*p) << nbits
1526                                 p = add1(p)
1527                                 if size == 1 {
1528                                         *dst = uint8(bits)
1529                                         dst = add1(dst)
1530                                         bits >>= 8
1531                                 } else {
1532                                         v := bits&0xf | bitScanAll
1533                                         *dst = uint8(v)
1534                                         dst = subtract1(dst)
1535                                         bits >>= 4
1536                                         v = bits&0xf | bitScanAll
1537                                         *dst = uint8(v)
1538                                         dst = subtract1(dst)
1539                                         bits >>= 4
1540                                 }
1541                         }
1542                         if n %= 8; n > 0 {
1543                                 bits |= uintptr(*p) << nbits
1544                                 p = add1(p)
1545                                 nbits += n
1546                         }
1547                         continue Run
1548                 }
1549
1550                 // Repeat. If n == 0, it is encoded in a varint in the next bytes.
1551                 if n == 0 {
1552                         for off := uint(0); ; off += 7 {
1553                                 x := uintptr(*p)
1554                                 p = add1(p)
1555                                 n |= (x & 0x7F) << off
1556                                 if x&0x80 == 0 {
1557                                         break
1558                                 }
1559                         }
1560                 }
1561
1562                 // Count is encoded in a varint in the next bytes.
1563                 c := uintptr(0)
1564                 for off := uint(0); ; off += 7 {
1565                         x := uintptr(*p)
1566                         p = add1(p)
1567                         c |= (x & 0x7F) << off
1568                         if x&0x80 == 0 {
1569                                 break
1570                         }
1571                 }
1572                 c *= n // now total number of bits to copy
1573
1574                 // If the number of bits being repeated is small, load them
1575                 // into a register and use that register for the entire loop
1576                 // instead of repeatedly reading from memory.
1577                 // Handling fewer than 8 bits here makes the general loop simpler.
1578                 // The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
1579                 // the pattern to a bit buffer holding at most 7 bits (a partial byte)
1580                 // it will not overflow.
1581                 src := dst
1582                 const maxBits = sys.PtrSize*8 - 7
1583                 if n <= maxBits {
1584                         // Start with bits in output buffer.
1585                         pattern := bits
1586                         npattern := nbits
1587
1588                         // If we need more bits, fetch them from memory.
1589                         if size == 1 {
1590                                 src = subtract1(src)
1591                                 for npattern < n {
1592                                         pattern <<= 8
1593                                         pattern |= uintptr(*src)
1594                                         src = subtract1(src)
1595                                         npattern += 8
1596                                 }
1597                         } else {
1598                                 src = add1(src)
1599                                 for npattern < n {
1600                                         pattern <<= 4
1601                                         pattern |= uintptr(*src) & 0xf
1602                                         src = add1(src)
1603                                         npattern += 4
1604                                 }
1605                         }
1606
1607                         // We started with the whole bit output buffer,
1608                         // and then we loaded bits from whole bytes.
1609                         // Either way, we might now have too many instead of too few.
1610                         // Discard the extra.
1611                         if npattern > n {
1612                                 pattern >>= npattern - n
1613                                 npattern = n
1614                         }
1615
1616                         // Replicate pattern to at most maxBits.
1617                         if npattern == 1 {
1618                                 // One bit being repeated.
1619                                 // If the bit is 1, make the pattern all 1s.
1620                                 // If the bit is 0, the pattern is already all 0s,
1621                                 // but we can claim that the number of bits
1622                                 // in the word is equal to the number we need (c),
1623                                 // because right shift of bits will zero fill.
1624                                 if pattern == 1 {
1625                                         pattern = 1<<maxBits - 1
1626                                         npattern = maxBits
1627                                 } else {
1628                                         npattern = c
1629                                 }
1630                         } else {
1631                                 b := pattern
1632                                 nb := npattern
1633                                 if nb+nb <= maxBits {
1634                                         // Double pattern until the whole uintptr is filled.
1635                                         for nb <= sys.PtrSize*8 {
1636                                                 b |= b << nb
1637                                                 nb += nb
1638                                         }
1639                                         // Trim away incomplete copy of original pattern in high bits.
1640                                         // TODO(rsc): Replace with table lookup or loop on systems without divide?
1641                                         nb = maxBits / npattern * npattern
1642                                         b &= 1<<nb - 1
1643                                         pattern = b
1644                                         npattern = nb
1645                                 }
1646                         }
1647
1648                         // Add pattern to bit buffer and flush bit buffer, c/npattern times.
1649                         // Since pattern contains >8 bits, there will be full bytes to flush
1650                         // on each iteration.
1651                         for ; c >= npattern; c -= npattern {
1652                                 bits |= pattern << nbits
1653                                 nbits += npattern
1654                                 if size == 1 {
1655                                         for nbits >= 8 {
1656                                                 *dst = uint8(bits)
1657                                                 dst = add1(dst)
1658                                                 bits >>= 8
1659                                                 nbits -= 8
1660                                         }
1661                                 } else {
1662                                         for nbits >= 4 {
1663                                                 *dst = uint8(bits&0xf | bitScanAll)
1664                                                 dst = subtract1(dst)
1665                                                 bits >>= 4
1666                                                 nbits -= 4
1667                                         }
1668                                 }
1669                         }
1670
1671                         // Add final fragment to bit buffer.
1672                         if c > 0 {
1673                                 pattern &= 1<<c - 1
1674                                 bits |= pattern << nbits
1675                                 nbits += c
1676                         }
1677                         continue Run
1678                 }
1679
1680                 // Repeat; n too large to fit in a register.
1681                 // Since nbits <= 7, we know the first few bytes of repeated data
1682                 // are already written to memory.
1683                 off := n - nbits // n > nbits because n > maxBits and nbits <= 7
1684                 if size == 1 {
1685                         // Leading src fragment.
1686                         src = subtractb(src, (off+7)/8)
1687                         if frag := off & 7; frag != 0 {
1688                                 bits |= uintptr(*src) >> (8 - frag) << nbits
1689                                 src = add1(src)
1690                                 nbits += frag
1691                                 c -= frag
1692                         }
1693                         // Main loop: load one byte, write another.
1694                         // The bits are rotating through the bit buffer.
1695                         for i := c / 8; i > 0; i-- {
1696                                 bits |= uintptr(*src) << nbits
1697                                 src = add1(src)
1698                                 *dst = uint8(bits)
1699                                 dst = add1(dst)
1700                                 bits >>= 8
1701                         }
1702                         // Final src fragment.
1703                         if c %= 8; c > 0 {
1704                                 bits |= (uintptr(*src) & (1<<c - 1)) << nbits
1705                                 nbits += c
1706                         }
1707                 } else {
1708                         // Leading src fragment.
1709                         src = addb(src, (off+3)/4)
1710                         if frag := off & 3; frag != 0 {
1711                                 bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
1712                                 src = subtract1(src)
1713                                 nbits += frag
1714                                 c -= frag
1715                         }
1716                         // Main loop: load one byte, write another.
1717                         // The bits are rotating through the bit buffer.
1718                         for i := c / 4; i > 0; i-- {
1719                                 bits |= (uintptr(*src) & 0xf) << nbits
1720                                 src = subtract1(src)
1721                                 *dst = uint8(bits&0xf | bitScanAll)
1722                                 dst = subtract1(dst)
1723                                 bits >>= 4
1724                         }
1725                         // Final src fragment.
1726                         if c %= 4; c > 0 {
1727                                 bits |= (uintptr(*src) & (1<<c - 1)) << nbits
1728                                 nbits += c
1729                         }
1730                 }
1731         }
1732
1733         // Write any final bits out, using full-byte writes, even for the final byte.
1734         var totalBits uintptr
1735         if size == 1 {
1736                 totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
1737                 nbits += -nbits & 7
1738                 for ; nbits > 0; nbits -= 8 {
1739                         *dst = uint8(bits)
1740                         dst = add1(dst)
1741                         bits >>= 8
1742                 }
1743         } else {
1744                 totalBits = (uintptr(unsafe.Pointer(dstStart))-uintptr(unsafe.Pointer(dst)))*4 + nbits
1745                 nbits += -nbits & 3
1746                 for ; nbits > 0; nbits -= 4 {
1747                         v := bits&0xf | bitScanAll
1748                         *dst = uint8(v)
1749                         dst = subtract1(dst)
1750                         bits >>= 4
1751                 }
1752         }
1753         return totalBits
1754 }
1755
1756 func dumpGCProg(p *byte) {
1757         nptr := 0
1758         for {
1759                 x := *p
1760                 p = add1(p)
1761                 if x == 0 {
1762                         print("\t", nptr, " end\n")
1763                         break
1764                 }
1765                 if x&0x80 == 0 {
1766                         print("\t", nptr, " lit ", x, ":")
1767                         n := int(x+7) / 8
1768                         for i := 0; i < n; i++ {
1769                                 print(" ", hex(*p))
1770                                 p = add1(p)
1771                         }
1772                         print("\n")
1773                         nptr += int(x)
1774                 } else {
1775                         nbit := int(x &^ 0x80)
1776                         if nbit == 0 {
1777                                 for nb := uint(0); ; nb += 7 {
1778                                         x := *p
1779                                         p = add1(p)
1780                                         nbit |= int(x&0x7f) << nb
1781                                         if x&0x80 == 0 {
1782                                                 break
1783                                         }
1784                                 }
1785                         }
1786                         count := 0
1787                         for nb := uint(0); ; nb += 7 {
1788                                 x := *p
1789                                 p = add1(p)
1790                                 count |= int(x&0x7f) << nb
1791                                 if x&0x80 == 0 {
1792                                         break
1793                                 }
1794                         }
1795                         print("\t", nptr, " repeat ", nbit, " × ", count, "\n")
1796                         nptr += nbit * count
1797                 }
1798         }
1799 }
1800
1801 // Testing.
1802
1803 // gcbits returns the GC type info for x, for testing.
1804 // The result is the bitmap entries (0 or 1), one entry per byte.
1805 //go:linkname reflect_gcbits reflect.gcbits
1806 func reflect_gcbits(x interface{}) []byte {
1807         ret := getgcmask(x)
1808         typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
1809         nptr := typ.ptrdata / sys.PtrSize
1810         for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
1811                 ret = ret[:len(ret)-1]
1812         }
1813         return ret
1814 }
1815
1816 // Returns GC type info for object p for testing.
1817 func getgcmask(ep interface{}) (mask []byte) {
1818         e := *efaceOf(&ep)
1819         p := e.data
1820         t := e._type
1821         // data or bss
1822         roots := gcRoots
1823         for roots != nil {
1824                 for i := 0; i < roots.count; i++ {
1825                         pr := roots.roots[i]
1826                         addr := uintptr(pr.decl)
1827                         if addr <= uintptr(p) && uintptr(p) < addr+pr.size {
1828                                 n := (*ptrtype)(unsafe.Pointer(t)).elem.size
1829                                 mask = make([]byte, n/sys.PtrSize)
1830                                 copy(mask, (*[1 << 29]uint8)(unsafe.Pointer(pr.gcdata))[:pr.ptrdata])
1831                         }
1832                         return
1833                 }
1834                 roots = roots.next
1835         }
1836
1837         // heap
1838         var n uintptr
1839         var base uintptr
1840         if mlookup(uintptr(p), &base, &n, nil) != 0 {
1841                 mask = make([]byte, n/sys.PtrSize)
1842                 for i := uintptr(0); i < n; i += sys.PtrSize {
1843                         hbits := heapBitsForAddr(base + i)
1844                         if hbits.isPointer() {
1845                                 mask[i/sys.PtrSize] = 1
1846                         }
1847                         if i != 1*sys.PtrSize && !hbits.morePointers() {
1848                                 mask = mask[:i/sys.PtrSize]
1849                                 break
1850                         }
1851                 }
1852                 return
1853         }
1854
1855         // otherwise, not something the GC knows about.
1856         // possibly read-only data, like malloc(0).
1857         // must not have pointers
1858         // For gccgo, may live on the stack, which is collected conservatively.
1859         return
1860 }