//===-- tsan_clock.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of ThreadSanitizer (TSan), a race detector.
//
//===----------------------------------------------------------------------===//
#include "tsan_clock.h"
#include "tsan_rtl.h"
#include "sanitizer_common/sanitizer_placement_new.h"
// SyncClock and ThreadClock implement vector clocks for sync variables
// (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
// ThreadClock contains a fixed-size vector clock for the maximum number of
// threads.
// SyncClock contains a growable vector clock for the currently necessary
// number of threads.
// Together they implement a very simple model of operations, namely:
//
// void ThreadClock::acquire(const SyncClock *src) {
//   for (int i = 0; i < kMaxThreads; i++)
//     clock[i] = max(clock[i], src->clock[i]);
// }
//
// void ThreadClock::release(SyncClock *dst) const {
//   for (int i = 0; i < kMaxThreads; i++)
//     dst->clock[i] = max(dst->clock[i], clock[i]);
// }
//
// void ThreadClock::releaseStoreAcquire(SyncClock *sc) {
//   for (int i = 0; i < kMaxThreads; i++) {
//     tmp = clock[i];
//     clock[i] = max(clock[i], sc->clock[i]);
//     sc->clock[i] = tmp;
//   }
// }
//
// void ThreadClock::ReleaseStore(SyncClock *dst) const {
//   for (int i = 0; i < kMaxThreads; i++)
//     dst->clock[i] = clock[i];
// }
//
// void ThreadClock::acq_rel(SyncClock *dst) {
//   acquire(dst);
//   release(dst);
// }
//
// Conformance to this model is extensively verified in tsan_clock_test.cpp.
// However, the implementation is significantly more complex. The complexity
// allows implementing important classes of use cases in O(1) instead of O(N).
//
// The use cases are:
// 1. Singleton/once atomic that has a single release-store operation followed
//    by zillions of acquire-loads (the acquire-load is O(1)); sketched below.
// 2. Thread-local mutex (both lock and unlock can be O(1)).
// 3. Leaf mutex (unlock is O(1)).
// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
// 5. An atomic with a single writer (writes can be O(1)).
// The implementation dynamically adapts to the workload. So if an atomic is in
// read-only phase, these reads will be O(1); if it later switches to read/write
// phase, the implementation will correctly handle that by switching to O(N).
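//
// For example, use case 1 corresponds to user code like this (a hypothetical
// sketch; g_obj is an illustrative global, not part of the runtime):
//
//   // Writer, executed once:
//   Obj *obj = new Obj();
//   atomic_store(&g_obj, obj, memory_order_release);     // one release-store
//   // Readers, executed many times:
//   Obj *o = atomic_load(&g_obj, memory_order_acquire);  // acquire-load
//
// Each reading thread pays O(N) only for its first acquire-load (it must merge
// the whole vector clock); later acquire-loads find the thread's "acquired"
// flag already set and only need to check the two dirty_ entries.
//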
// Thread-safety note: all const operations on SyncClock's are conducted under
// a shared lock; all non-const operations on SyncClock's are conducted under
// an exclusive lock; ThreadClock's are private to respective threads and so
// do not need any protection.
//
// Description of SyncClock state:
// clk_ - variable size vector clock, low kClkBits hold timestamp,
//   the remaining bits hold "acquired" flag (the actual value is thread's
//   reused counter);
//   if acquired == thr->reused_, then the respective thread has already
//   acquired this clock (except possibly for dirty elements).
// dirty_ - holds up to two indices in the vector clock that other threads
//   need to acquire regardless of "acquired" flag value;
// release_store_tid_ - denotes that the clock state is a result of
//   release-store operation by the thread with release_store_tid_ index.
// release_store_reused_ - reuse count of release_store_tid_.
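//
// For example, a single element of a SyncClock packs its two fields into one
// u64 roughly like this (an equivalent bit-level sketch; the real code uses
// the ClockElem bitfields and kClkBits declared in the runtime headers):
//
//   u64 packed = (reused << kClkBits) | epoch;   // epoch in the low kClkBits
//   u64 epoch  = packed & ((1ull << kClkBits) - 1);
//   u64 reused = packed >> kClkBits;
//
// The ThreadClock constructor below CHECKs the matching invariant, namely
// that reused_ fits into the bits left above the epoch.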
namespace __tsan {

static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
}

// Drop reference to the first level block idx.
static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  atomic_uint32_t *ref = ref_ptr(cb);
  u32 v = atomic_load(ref, memory_order_acquire);
  for (;;) {
    CHECK_GT(v, 0);
    if (v == 1)
      break;
    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
      return;
  }
  // The first level block owns the second level blocks, so free them as well.
  for (uptr i = 0; i < blocks; i++)
    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
  ctx->clock_alloc.Free(c, idx);
}

ThreadClock::ThreadClock(unsigned tid, unsigned reused)
    : tid_(tid)
    , reused_(reused + 1)  // 0 has special meaning
    , last_acquire_()
    , global_acquire_()
    , cached_idx_()
    , cached_size_()
    , cached_blocks_() {
  CHECK_LT(tid, kMaxTidInClock);
  CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
  nclk_ = tid_ + 1;
  internal_memset(clk_, 0, sizeof(clk_));
}

void ThreadClock::ResetCached(ClockCache *c) {
  if (cached_idx_) {
    UnrefClockBlock(c, cached_idx_, cached_blocks_);
    cached_idx_ = 0;
    cached_size_ = 0;
    cached_blocks_ = 0;
  }
}

void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(src->size_, kMaxTid);

  // Check if it's empty -> no need to do anything.
  const uptr nclk = src->size_;
  if (nclk == 0)
    return;

  bool acquired = false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    unsigned tid = dirty.tid();
    if (tid != kInvalidTid) {
      if (clk_[tid] < dirty.epoch) {
        clk_[tid] = dirty.epoch;
        acquired = true;
      }
    }
  }

  // Check if we've already acquired src after the last release operation on src.
  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
    // O(N) acquire.
    nclk_ = max(nclk_, nclk);
    u64 *dst_pos = &clk_[0];
    for (ClockElem &src_elem : *src) {
      u64 epoch = src_elem.epoch;
      if (*dst_pos < epoch) {
        *dst_pos = epoch;
        acquired = true;
      }
      dst_pos++;
    }

    // Remember that this thread has acquired this clock.
    if (nclk > tid_)
      src->elem(tid_).reused = reused_;
  }

  if (acquired) {
    last_acquire_ = clk_[tid_];
    ResetCached(c);
  }
}
void ThreadClock::releaseStoreAcquire(ClockCache *c, SyncClock *sc) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(sc->size_, kMaxTid);

  if (sc->size_ == 0) {
    // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
    ReleaseStore(c, sc);
    return;
  }

  nclk_ = max(nclk_, (uptr) sc->size_);

  // Check if we need to resize sc.
  if (sc->size_ < nclk_)
    sc->Resize(c, nclk_);

  bool acquired = false;

  sc->Unshare(c);
  // Update sc->clk_.
  sc->FlushDirty();
  uptr i = 0;
  for (ClockElem &ce : *sc) {
    u64 tmp = clk_[i];
    if (clk_[i] < ce.epoch) {
      clk_[i] = ce.epoch;
      acquired = true;
    }
    ce.epoch = tmp;
    ce.reused = 0;
    i++;
  }
  sc->release_store_tid_ = kInvalidTid;
  sc->release_store_reused_ = 0;

  if (acquired) {
    last_acquire_ = clk_[tid_];
    ResetCached(c);
  }
}

void ThreadClock::release(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);

  if (dst->size_ == 0) {
    // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
    ReleaseStore(c, dst);
    return;
  }

  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  // Check if we had not acquired anything from other threads
  // since the last release on dst. If so, we need to update
  // only dst->elem(tid_).
  if (!HasAcquiredAfterRelease(dst)) {
    UpdateCurrentThread(c, dst);
    if (dst->release_store_tid_ != tid_ ||
        dst->release_store_reused_ != reused_)
      dst->release_store_tid_ = kInvalidTid;
    return;
  }

  // O(N) release.
  dst->Unshare(c);
  // First, remember whether we've acquired dst.
  bool acquired = IsAlreadyAcquired(dst);
  // Update dst->clk_.
  dst->FlushDirty();
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = max(ce.epoch, clk_[i]);
    ce.reused = 0;
    i++;
  }
  // Clear 'acquired' flag in the remaining elements.
  dst->release_store_tid_ = kInvalidTid;
  dst->release_store_reused_ = 0;
  // If we've acquired dst, remember this fact,
  // so that we don't need to acquire it on next acquire.
  if (acquired)
    dst->elem(tid_).reused = reused_;
}

void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);

  if (dst->size_ == 0 && cached_idx_ != 0) {
    // Reuse the cached clock.
    // Note: we could reuse/cache the cached clock in more cases:
    // we could update the existing clock and cache it, or replace it with the
    // currently cached clock and release the old one. And for a shared
    // existing clock, we could replace it with the currently cached;
    // or unshare, update and cache. But, for simplicity, we currently reuse
    // cached clock only when the target clock is empty.
    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
    dst->tab_idx_ = cached_idx_;
    dst->size_ = cached_size_;
    dst->blocks_ = cached_blocks_;
    CHECK_EQ(dst->dirty_[0].tid(), kInvalidTid);
    // The cached clock is shared (immutable),
    // so this is where we store the current clock.
    dst->dirty_[0].set_tid(tid_);
    dst->dirty_[0].epoch = clk_[tid_];
    dst->release_store_tid_ = tid_;
    dst->release_store_reused_ = reused_;
    // Remember that we don't need to acquire it in future.
    dst->elem(tid_).reused = reused_;
    // Grab a reference.
    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    return;
  }

  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  if (dst->release_store_tid_ == tid_ &&
      dst->release_store_reused_ == reused_ &&
      !HasAcquiredAfterRelease(dst)) {
    UpdateCurrentThread(c, dst);
    return;
  }

  // O(N) release-store.
  dst->Unshare(c);
  // Note: dst can be larger than this ThreadClock.
  // This is fine since clk_ beyond size is all zeros.
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = clk_[i];
    ce.reused = 0;
    i++;
  }
  for (uptr i = 0; i < kDirtyTids; i++) dst->dirty_[i].set_tid(kInvalidTid);
  dst->release_store_tid_ = tid_;
  dst->release_store_reused_ = reused_;
  // Remember that we don't need to acquire it in future.
  dst->elem(tid_).reused = reused_;

  // If the resulting clock is cachable, cache it for future release operations.
  // The clock is always cachable if we released to an empty sync object.
  if (cached_idx_ == 0 && dst->Cachable()) {
    // Grab a reference to the ClockBlock.
    atomic_uint32_t *ref = ref_ptr(dst->tab_);
    if (atomic_load(ref, memory_order_acquire) == 1)
      atomic_store_relaxed(ref, 2);
    else
      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    cached_idx_ = dst->tab_idx_;
    cached_size_ = dst->size_;
    cached_blocks_ = dst->blocks_;
  }
}

void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
  acquire(c, dst);
  ReleaseStore(c, dst);
}
// Updates only the single element related to the current thread in dst->clk_.
void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
  // Update the thread's time, but preserve the 'acquired' flag.
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty *dirty = &dst->dirty_[i];
    const unsigned tid = dirty->tid();
    if (tid == tid_ || tid == kInvalidTid) {
      dirty->set_tid(tid_);
      dirty->epoch = clk_[tid_];
      return;
    }
  }
  // Reset all 'acquired' flags, O(N).
  // We are going to touch dst elements, so we need to unshare it.
  dst->Unshare(c);
  dst->elem(tid_).epoch = clk_[tid_];
  for (uptr i = 0; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->FlushDirty();
}

// Checks whether the current thread has already acquired src.
bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
  if (src->elem(tid_).reused != reused_)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    if (dirty.tid() != kInvalidTid) {
      if (clk_[dirty.tid()] < dirty.epoch)
        return false;
    }
  }
  return true;
}

// Checks whether the current thread has acquired anything
// from other clocks after releasing to dst (directly or indirectly).
bool ThreadClock::HasAcquiredAfterRelease(const SyncClock *dst) const {
  const u64 my_epoch = dst->elem(tid_).epoch;
  return my_epoch <= last_acquire_ ||
      my_epoch <= atomic_load_relaxed(&global_acquire_);
}

// Sets a single element in the vector clock.
// This function is called only from weird places like AcquireGlobal.
void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
  DCHECK_LT(tid, kMaxTid);
  DCHECK_GE(v, clk_[tid]);
  clk_[tid] = v;
  if (nclk_ <= tid)
    nclk_ = tid + 1;
  last_acquire_ = clk_[tid_];
  ResetCached(c);
}

void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < nclk_; i++)
    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
}

SyncClock::SyncClock() {
  ResetImpl();
}

SyncClock::~SyncClock() {
  // Reset must be called before dtor.
  CHECK_EQ(size_, 0);
  CHECK_EQ(blocks_, 0);
  CHECK_EQ(tab_, 0);
  CHECK_EQ(tab_idx_, 0);
}

void SyncClock::Reset(ClockCache *c) {
  if (size_)
    UnrefClockBlock(c, tab_idx_, blocks_);
  ResetImpl();
}

void SyncClock::ResetImpl() {
  tab_ = 0;
  tab_idx_ = 0;
  size_ = 0;
  blocks_ = 0;
  release_store_tid_ = kInvalidTid;
  release_store_reused_ = 0;
  for (uptr i = 0; i < kDirtyTids; i++) dirty_[i].set_tid(kInvalidTid);
}
void SyncClock::Resize(ClockCache *c, uptr nclk) {
  Unshare(c);
  if (nclk <= capacity()) {
    // Memory is already allocated, just increase the size.
    size_ = nclk;
    return;
  }
  if (size_ == 0) {
    // Grow from 0 to one-level table.
    CHECK_EQ(size_, 0);
    CHECK_EQ(blocks_, 0);
    CHECK_EQ(tab_, 0);
    CHECK_EQ(tab_idx_, 0);
    tab_idx_ = ctx->clock_alloc.Alloc(c);
    tab_ = ctx->clock_alloc.Map(tab_idx_);
    internal_memset(tab_, 0, sizeof(*tab_));
    atomic_store_relaxed(ref_ptr(tab_), 1);
    size_ = 1;
  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
    CHECK_LT(top, ClockBlock::kClockCount);
    const uptr move = top * sizeof(tab_->clock[0]);
    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
    internal_memset(tab_->clock, 0, move);
    append_block(idx);
  }
  // At this point we have the first level table allocated and all clock
  // elements are evacuated from it to a second level block.
  // Add second level tables as necessary.
  while (nclk > capacity()) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    internal_memset(cb, 0, sizeof(*cb));
    append_block(idx);
  }
  size_ = nclk;
}
// Flushes all dirty elements into the main clock array.
void SyncClock::FlushDirty() {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty *dirty = &dirty_[i];
    if (dirty->tid() != kInvalidTid) {
      CHECK_LT(dirty->tid(), size_);
      elem(dirty->tid()).epoch = dirty->epoch;
      dirty->set_tid(kInvalidTid);
    }
  }
}

bool SyncClock::IsShared() const {
  if (size_ == 0)
    return false;
  atomic_uint32_t *ref = ref_ptr(tab_);
  u32 v = atomic_load(ref, memory_order_acquire);
  CHECK_GT(v, 0);
  return v > 1;
}

// Unshares the current clock if it's shared.
// Shared clocks are immutable, so they need to be unshared before any updates.
// Note: this does not apply to dirty entries as they are not shared.
void SyncClock::Unshare(ClockCache *c) {
  if (!IsShared())
    return;
  // First, copy current state into old.
  SyncClock old;
  old.tab_ = tab_;
  old.tab_idx_ = tab_idx_;
  old.size_ = size_;
  old.blocks_ = blocks_;
  old.release_store_tid_ = release_store_tid_;
  old.release_store_reused_ = release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    old.dirty_[i] = dirty_[i];
  // Then, clear current object.
  ResetImpl();
  // Allocate brand new clock in the current object.
  Resize(c, old.size_);
  // Now copy state back into this object.
  Iter old_iter(&old);
  for (ClockElem &ce : *this) {
    ce = *old_iter;
    ++old_iter;
  }
  release_store_tid_ = old.release_store_tid_;
  release_store_reused_ = old.release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    dirty_[i] = old.dirty_[i];
  // Drop reference to old and delete if necessary.
  old.Reset(c);
}

// Can we cache this clock for future release operations?
ALWAYS_INLINE bool SyncClock::Cachable() const {
  if (size_ == 0)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    if (dirty_[i].tid() != kInvalidTid)
      return false;
  }
  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
}
// elem linearizes the two-level structure into a linear array.
// Note: this is used only for one time accesses, vector operations use
// the iterator as it is much faster.
ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
  DCHECK_LT(tid, size_);
  const uptr block = tid / ClockBlock::kClockCount;
  DCHECK_LE(block, blocks_);
  tid %= ClockBlock::kClockCount;
  if (block == blocks_)
    return tab_->clock[tid];
  u32 idx = get_block(block);
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  return cb->clock[tid];
}

ALWAYS_INLINE uptr SyncClock::capacity() const {
  if (size_ == 0)
    return 0;
  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
  // How many clock elements we can fit into the first level block.
  // +1 for ref counter.
  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
  return blocks_ * ClockBlock::kClockCount + top;
}
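// A worked example for capacity() above (a sketch; it assumes an 8-byte
// ClockElem and 4-byte table entries, i.e. ratio == 2, and kClockCount == 64;
// see ClockBlock for the real constants):
//   blocks_ == 0: top = 64 - RoundUpTo(1, 2)/2 = 63, capacity = 63
//     (the ref counter displaces one clock element in the first level block);
//   blocks_ == 1: top = 64 - RoundUpTo(2, 2)/2 = 63, capacity = 64 + 63 = 127
//     (the ref counter and one block index share a single element slot).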
ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
  DCHECK(size_);
  DCHECK_LT(bi, blocks_);
  return tab_->table[ClockBlock::kBlockIdx - bi];
}

ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
  uptr bi = blocks_++;
  CHECK_EQ(get_block(bi), 0);
  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
}

// Used only by tests.
u64 SyncClock::get(unsigned tid) const {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty dirty = dirty_[i];
    if (dirty.tid() == tid)
      return dirty.epoch;
  }
  return elem(tid).epoch;
}

// Used only by Iter test.
u64 SyncClock::get_clean(unsigned tid) const {
  return elem(tid).epoch;
}

void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
  printf("] reused=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
      release_store_tid_, release_store_reused_, dirty_[0].tid(),
      dirty_[0].epoch, dirty_[1].tid(), dirty_[1].epoch);
}

void SyncClock::Iter::Next() {
  // Finished with the current block, move on to the next one.
  block_++;
  if (block_ < parent_->blocks_) {
    // Iterate over the next second level block.
    u32 idx = parent_->get_block(block_);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    pos_ = &cb->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  if (block_ == parent_->blocks_ &&
      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
    // Iterate over elements in the first level block.
    pos_ = &parent_->tab_->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  parent_ = nullptr;  // denotes end
}

}  // namespace __tsan