//===-- tsan_clock.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of ThreadSanitizer (TSan), a race detector.
//
//===----------------------------------------------------------------------===//
#include "tsan_clock.h"
#include "tsan_rtl.h"
#include "sanitizer_common/sanitizer_placement_new.h"
// SyncClock and ThreadClock implement vector clocks for sync variables
// (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
// ThreadClock contains a fixed-size vector clock for the maximum number of
// threads. SyncClock contains a growable vector clock for the currently
// necessary number of threads.
// Together they implement a very simple model of operations, namely:
//
//   void ThreadClock::acquire(const SyncClock *src) {
//     for (int i = 0; i < kMaxThreads; i++)
//       clock[i] = max(clock[i], src->clock[i]);
//   }
//
//   void ThreadClock::release(SyncClock *dst) const {
//     for (int i = 0; i < kMaxThreads; i++)
//       dst->clock[i] = max(dst->clock[i], clock[i]);
//   }
//
//   void ThreadClock::releaseStoreAcquire(SyncClock *sc) const {
//     for (int i = 0; i < kMaxThreads; i++) {
//       tmp = clock[i];
//       clock[i] = max(clock[i], sc->clock[i]);
//       sc->clock[i] = tmp;
//     }
//   }
//
//   void ThreadClock::ReleaseStore(SyncClock *dst) const {
//     for (int i = 0; i < kMaxThreads; i++)
//       dst->clock[i] = clock[i];
//   }
//
//   void ThreadClock::acq_rel(SyncClock *dst) {
//     acquire(dst);
//     release(dst);
//   }
//
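// As a concrete illustration of the model above (illustrative only, not part
// of the implementation): if thread 0 has clock [5,0] and thread 1 has clock
// [0,3], then after thread 0 releases to a sync variable m (m.clock becomes
// [5,0]) and thread 1 acquires m, thread 1's clock becomes
// max([0,3], [5,0]) = [5,3], i.e. everything thread 1 does from then on
// happens-after thread 0's first five epochs.
//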
// Conformance to this model is extensively verified in tsan_clock_test.cpp.
// However, the implementation is significantly more complex. The complexity
// allows handling important classes of use cases in O(1) instead of O(N).
//
// The use cases are:
// 1. Singleton/once atomic that has a single release-store operation followed
//    by zillions of acquire-loads (the acquire-load is O(1)).
// 2. Thread-local mutex (both lock and unlock can be O(1)).
// 3. Leaf mutex (unlock is O(1)).
// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
// 5. An atomic with a single writer (writes can be O(1)).
// The implementation dynamically adapts to the workload. So if an atomic is
// in a read-only phase, these reads will be O(1); if it later switches to a
// read/write phase, the implementation will correctly handle that by
// switching to O(N).
//
// Thread-safety note: all const operations on SyncClock's are conducted under
// a shared lock; all non-const operations on SyncClock's are conducted under
// an exclusive lock; ThreadClock's are private to respective threads and so
// do not need any protection.
//
// Description of SyncClock state:
// clk_ - variable size vector clock, low kClkBits hold timestamp,
//   the remaining bits hold "acquired" flag (the actual value is the thread's
//   reused counter);
//   if acquired == thr->reused_, then the respective thread has already
//   acquired this clock (except possibly for dirty elements).
// dirty_ - holds up to two indices in the vector clock that other threads
//   need to acquire regardless of "acquired" flag value;
// release_store_tid_ - denotes that the clock state is a result of
//   release-store operation by the thread with release_store_tid_ index.
// release_store_reused_ - reuse count of release_store_tid_.

// We don't have ThreadState in these methods, so this is an ugly hack that
// works only in C++.
#if !SANITIZER_GO
# define CPP_STAT_INC(typ) StatInc(cur_thread(), typ)
#else
# define CPP_STAT_INC(typ) (void)0
#endif

namespace __tsan {

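// The first level ClockBlock stores its reference counter in
// table[ClockBlock::kRefIdx]; ref_ptr returns it as an atomic.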
static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
}

// Drop reference to the first level block idx.
static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  atomic_uint32_t *ref = ref_ptr(cb);
  u32 v = atomic_load(ref, memory_order_acquire);
  for (;;) {
    CHECK_GT(v, 0);
    if (v == 1)
      break;
    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
      return;
  }
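  // A failed CAS reloads v with the current count and the loop retries;
  // reaching this point means we held the last reference.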
  // The first level block owns the second level blocks, so free them as well.
  for (uptr i = 0; i < blocks; i++)
    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
  ctx->clock_alloc.Free(c, idx);
}

ThreadClock::ThreadClock(unsigned tid, unsigned reused)
    : tid_(tid)
    , reused_(reused + 1)  // 0 has special meaning
    , last_acquire_()
    , global_acquire_()
    , cached_idx_()
    , cached_size_()
    , cached_blocks_() {
  CHECK_LT(tid, kMaxTidInClock);
  CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
  nclk_ = tid_ + 1;
  internal_memset(clk_, 0, sizeof(clk_));
}

void ThreadClock::ResetCached(ClockCache *c) {
  if (cached_idx_) {
    UnrefClockBlock(c, cached_idx_, cached_blocks_);
    cached_idx_ = 0;
    cached_size_ = 0;
    cached_blocks_ = 0;
  }
}

void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(src->size_, kMaxTid);
  CPP_STAT_INC(StatClockAcquire);

  // Check if it's empty -> no need to do anything.
  const uptr nclk = src->size_;
  if (nclk == 0) {
    CPP_STAT_INC(StatClockAcquireEmpty);
    return;
  }
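
  // First fold in the dirty entries: per the state description at the top of
  // the file, these must be acquired even if this thread has already acquired
  // the main body of the clock.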
  bool acquired = false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    unsigned tid = dirty.tid;
    if (tid != kInvalidTid) {
      if (clk_[tid] < dirty.epoch) {
        clk_[tid] = dirty.epoch;
        acquired = true;
      }
    }
  }

  // Check if we've already acquired src after the last release operation on src
  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
    // O(N) acquire.
    CPP_STAT_INC(StatClockAcquireFull);
    nclk_ = max(nclk_, nclk);
    u64 *dst_pos = &clk_[0];
    for (ClockElem &src_elem : *src) {
      u64 epoch = src_elem.epoch;
      if (*dst_pos < epoch) {
        *dst_pos = epoch;
        acquired = true;
      }
      dst_pos++;
    }

    // Remember that this thread has acquired this clock.
    if (nclk > tid_)
      src->elem(tid_).reused = reused_;
  }

  if (acquired) {
    CPP_STAT_INC(StatClockAcquiredSomething);
    last_acquire_ = clk_[tid_];
    ResetCached(c);
  }
}

void ThreadClock::releaseStoreAcquire(ClockCache *c, SyncClock *sc) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(sc->size_, kMaxTid);

  if (sc->size_ == 0) {
    // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
    ReleaseStore(c, sc);
    return;
  }

  nclk_ = max(nclk_, (uptr)sc->size_);

  // Check if we need to resize sc.
  if (sc->size_ < nclk_)
    sc->Resize(c, nclk_);

  bool acquired = false;
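
  // What follows is the swap from the model comment: unshare sc (shared
  // clocks are immutable), flush its dirty entries, then exchange it
  // element-wise with this thread's clock, acquiring the old values while
  // release-storing the new ones.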
  sc->Unshare(c);
  sc->FlushDirty();
  uptr i = 0;
  for (ClockElem &ce : *sc) {
    u64 tmp = clk_[i];
    if (clk_[i] < ce.epoch) {
      clk_[i] = ce.epoch;
      acquired = true;
    }
    ce.epoch = tmp;
    ce.reused = 0;
    i++;
  }
  sc->release_store_tid_ = kInvalidTid;
  sc->release_store_reused_ = 0;

  if (acquired) {
    CPP_STAT_INC(StatClockAcquiredSomething);
    last_acquire_ = clk_[tid_];
    ResetCached(c);
  }
}

void ThreadClock::release(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);

  if (dst->size_ == 0) {
    // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
    ReleaseStore(c, dst);
    return;
  }

  CPP_STAT_INC(StatClockRelease);
  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  // Check if we had not acquired anything from other threads
  // since the last release on dst. If so, we need to update
  // only dst->elem(tid_).
  if (!HasAcquiredAfterRelease(dst)) {
    UpdateCurrentThread(c, dst);
    if (dst->release_store_tid_ != tid_ ||
        dst->release_store_reused_ != reused_)
      dst->release_store_tid_ = kInvalidTid;
    return;
  }

  // O(N) release.
  CPP_STAT_INC(StatClockReleaseFull);
  dst->Unshare(c);
  // First, remember whether we've acquired dst.
  bool acquired = IsAlreadyAcquired(dst);
  if (acquired)
    CPP_STAT_INC(StatClockReleaseAcquired);
  // Update dst->clk_.
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = max(ce.epoch, clk_[i]);
    ce.reused = 0;
    i++;
  }
  // Clear 'acquired' flag in the remaining elements.
  if (nclk_ < dst->size_)
    CPP_STAT_INC(StatClockReleaseClearTail);
  dst->release_store_tid_ = kInvalidTid;
  dst->release_store_reused_ = 0;
  // If we've acquired dst, remember this fact,
  // so that we don't need to acquire it on next acquire.
  if (acquired)
    dst->elem(tid_).reused = reused_;
}

void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);
  CPP_STAT_INC(StatClockStore);

  if (dst->size_ == 0 && cached_idx_ != 0) {
    // Reuse the cached clock.
    // Note: we could reuse/cache the cached clock in more cases:
    // we could update the existing clock and cache it, or replace it with the
    // currently cached clock and release the old one. And for a shared
    // existing clock, we could replace it with the currently cached;
    // or unshare, update and cache. But, for simplicity, we currently reuse
    // the cached clock only when the target clock is empty.
    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
    dst->tab_idx_ = cached_idx_;
    dst->size_ = cached_size_;
    dst->blocks_ = cached_blocks_;
    CHECK_EQ(dst->dirty_[0].tid, kInvalidTid);
    // The cached clock is shared (immutable),
    // so this is where we store the current clock.
    dst->dirty_[0].tid = tid_;
    dst->dirty_[0].epoch = clk_[tid_];
    dst->release_store_tid_ = tid_;
    dst->release_store_reused_ = reused_;
    // Remember that we don't need to acquire it in future.
    dst->elem(tid_).reused = reused_;
    // Grab a reference.
    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    return;
  }

  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  if (dst->release_store_tid_ == tid_ &&
      dst->release_store_reused_ == reused_ &&
      !HasAcquiredAfterRelease(dst)) {
    CPP_STAT_INC(StatClockStoreFast);
    UpdateCurrentThread(c, dst);
    return;
  }

  // O(N) release-store.
  CPP_STAT_INC(StatClockStoreFull);
  dst->Unshare(c);
  // Note: dst can be larger than this ThreadClock.
  // This is fine since clk_ beyond size is all zeros.
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = clk_[i];
    ce.reused = 0;
    i++;
  }
  for (uptr i = 0; i < kDirtyTids; i++)
    dst->dirty_[i].tid = kInvalidTid;
  dst->release_store_tid_ = tid_;
  dst->release_store_reused_ = reused_;
  // Remember that we don't need to acquire it in future.
  dst->elem(tid_).reused = reused_;

  // If the resulting clock is cachable, cache it for future release operations.
  // The clock is always cachable if we released to an empty sync object.
  if (cached_idx_ == 0 && dst->Cachable()) {
    // Grab a reference to the ClockBlock.
    atomic_uint32_t *ref = ref_ptr(dst->tab_);
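    // When the count is exactly 1, this thread is the sole owner of the
    // block, so a plain relaxed store (1 -> 2) suffices, which avoids an
    // atomic RMW; otherwise fall back to fetch_add.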
    if (atomic_load(ref, memory_order_acquire) == 1)
      atomic_store_relaxed(ref, 2);
    else
      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    cached_idx_ = dst->tab_idx_;
    cached_size_ = dst->size_;
    cached_blocks_ = dst->blocks_;
  }
}

void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
  CPP_STAT_INC(StatClockAcquireRelease);
  acquire(c, dst);
  ReleaseStore(c, dst);
}

// Updates only a single element related to the current thread in dst->clk_.
void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
  // Update the thread's time, but preserve the 'acquired' flag.
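  // Fast path: stash the update into one of the dirty slots - either the
  // slot already holding this thread's entry or a currently unused one.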
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty *dirty = &dst->dirty_[i];
    const unsigned tid = dirty->tid;
    if (tid == tid_ || tid == kInvalidTid) {
      CPP_STAT_INC(StatClockReleaseFast);
      dirty->tid = tid_;
      dirty->epoch = clk_[tid_];
      return;
    }
  }
  // Reset all 'acquired' flags, O(N).
  // We are going to touch dst elements, so we need to unshare it.
  dst->Unshare(c);
  CPP_STAT_INC(StatClockReleaseSlow);
  dst->elem(tid_).epoch = clk_[tid_];
  for (uptr i = 0; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->FlushDirty();
}

// Checks whether the current thread has already acquired src.
bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
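  // True only if our 'acquired' stamp in src is current and no dirty entry
  // is ahead of our clock.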
  if (src->elem(tid_).reused != reused_)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    if (dirty.tid != kInvalidTid) {
      if (clk_[dirty.tid] < dirty.epoch)
        return false;
    }
  }
  return true;
}

// Checks whether the current thread has acquired anything
// from other clocks after releasing to dst (directly or indirectly).
bool ThreadClock::HasAcquiredAfterRelease(const SyncClock *dst) const {
  const u64 my_epoch = dst->elem(tid_).epoch;
  return my_epoch <= last_acquire_ ||
      my_epoch <= atomic_load_relaxed(&global_acquire_);
}

// Sets a single element in the vector clock.
// This function is called only from weird places like AcquireGlobal.
void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
  DCHECK_LT(tid, kMaxTid);
  DCHECK_GE(v, clk_[tid]);
  clk_[tid] = v;
  if (nclk_ <= tid)
    nclk_ = tid + 1;
  last_acquire_ = clk_[tid_];
  ResetCached(c);
}

void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < nclk_; i++)
    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
}

SyncClock::SyncClock() {
  ResetImpl();
}

SyncClock::~SyncClock() {
  // Reset must be called before dtor.
  CHECK_EQ(size_, 0);
  CHECK_EQ(blocks_, 0);
  CHECK_EQ(tab_, 0);
  CHECK_EQ(tab_idx_, 0);
}

void SyncClock::Reset(ClockCache *c) {
  if (size_)
    UnrefClockBlock(c, tab_idx_, blocks_);
  ResetImpl();
}

void SyncClock::ResetImpl() {
  tab_ = 0;
  tab_idx_ = 0;
  size_ = 0;
  blocks_ = 0;
  release_store_tid_ = kInvalidTid;
  release_store_reused_ = 0;
  for (uptr i = 0; i < kDirtyTids; i++)
    dirty_[i].tid = kInvalidTid;
}

void SyncClock::Resize(ClockCache *c, uptr nclk) {
  CPP_STAT_INC(StatClockReleaseResize);
  Unshare(c);
  if (nclk <= capacity()) {
    // Memory is already allocated, just increase the size.
    size_ = nclk;
    return;
  }
  if (size_ == 0) {
    // Grow from 0 to one-level table.
    CHECK_EQ(blocks_, 0);
    CHECK_EQ(tab_, 0);
    CHECK_EQ(tab_idx_, 0);
    tab_idx_ = ctx->clock_alloc.Alloc(c);
    tab_ = ctx->clock_alloc.Map(tab_idx_);
    internal_memset(tab_, 0, sizeof(*tab_));
    atomic_store_relaxed(ref_ptr(tab_), 1);
    size_ = 1;
  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
    CHECK_LT(top, ClockBlock::kClockCount);
    const uptr move = top * sizeof(tab_->clock[0]);
    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
    internal_memset(tab_->clock, 0, move);
    append_block(idx);
  }
  // At this point we have the first level table allocated and all clock
  // elements are evacuated from it to a second level block.
  // Add second level tables as necessary.
  while (nclk > capacity()) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    internal_memset(cb, 0, sizeof(*cb));
    append_block(idx);
  }
  size_ = nclk;
}

// Flushes all dirty elements into the main clock array.
void SyncClock::FlushDirty() {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty *dirty = &dirty_[i];
    if (dirty->tid != kInvalidTid) {
      CHECK_LT(dirty->tid, size_);
      elem(dirty->tid).epoch = dirty->epoch;
      dirty->tid = kInvalidTid;
    }
  }
}

bool SyncClock::IsShared() const {
  if (size_ == 0)
    return false;
  atomic_uint32_t *ref = ref_ptr(tab_);
  u32 v = atomic_load(ref, memory_order_acquire);
  CHECK_GT(v, 0);
  return v > 1;
}

// Unshares the current clock if it's shared.
// Shared clocks are immutable, so they need to be unshared before any updates.
// Note: this does not apply to dirty entries as they are not shared.
void SyncClock::Unshare(ClockCache *c) {
  if (!IsShared())
    return;
  // First, copy current state into old.
  SyncClock old;
  old.tab_ = tab_;
  old.tab_idx_ = tab_idx_;
  old.size_ = size_;
  old.blocks_ = blocks_;
  old.release_store_tid_ = release_store_tid_;
  old.release_store_reused_ = release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    old.dirty_[i] = dirty_[i];
  // Then, clear current object.
  ResetImpl();
  // Allocate brand new clock in the current object.
  Resize(c, old.size_);
  // Now copy state back into this object.
  Iter old_iter(&old);
  for (ClockElem &ce : *this) {
    ce = *old_iter;
    ++old_iter;
  }
  release_store_tid_ = old.release_store_tid_;
  release_store_reused_ = old.release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    dirty_[i] = old.dirty_[i];
  // Drop reference to old and delete if necessary.
  old.Reset(c);
}

// Can we cache this clock for future release operations?
ALWAYS_INLINE bool SyncClock::Cachable() const {
  if (size_ == 0)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    if (dirty_[i].tid != kInvalidTid)
      return false;
  }
  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
}

// elem linearizes the two-level structure into a linear array.
// Note: this is used only for one-time accesses; vector operations use
// the iterator as it is much faster.
ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
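  // Elements [0, blocks_ * kClockCount) live in second level blocks, in
  // order; any remaining elements live in the first level block itself.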
  DCHECK_LT(tid, size_);
  const uptr block = tid / ClockBlock::kClockCount;
  DCHECK_LE(block, blocks_);
  tid %= ClockBlock::kClockCount;
  if (block == blocks_)
    return tab_->clock[tid];
  u32 idx = get_block(block);
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  return cb->clock[tid];
}

ALWAYS_INLINE uptr SyncClock::capacity() const {
  if (size_ == 0)
    return 0;
  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
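  // ratio is how many table[] slots (block indices or the ref count) fit in
  // the space of one clock element.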
  // How many clock elements we can fit into the first level block.
  // +1 for ref counter.
  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
  return blocks_ * ClockBlock::kClockCount + top;
}

ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
  DCHECK(size_);
  DCHECK_LT(bi, blocks_);
  return tab_->table[ClockBlock::kBlockIdx - bi];
}

ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
  uptr bi = blocks_++;
  CHECK_EQ(get_block(bi), 0);
  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
}

// Used only by tests.
u64 SyncClock::get(unsigned tid) const {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty dirty = dirty_[i];
    if (dirty.tid == tid)
      return dirty.epoch;
  }
  return elem(tid).epoch;
}

// Used only by Iter test.
u64 SyncClock::get_clean(unsigned tid) const {
  return elem(tid).epoch;
}

void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
  printf("] reused=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
      release_store_tid_, release_store_reused_,
      dirty_[0].tid, dirty_[0].epoch,
      dirty_[1].tid, dirty_[1].epoch);
}

void SyncClock::Iter::Next() {
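  // Iteration order matches elem(): second level blocks in order first,
  // then the elements remaining in the first level block.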
  // Finished with the current block, move on to the next one.
  block_++;
  if (block_ < parent_->blocks_) {
    // Iterate over the next second level block.
    u32 idx = parent_->get_block(block_);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    pos_ = &cb->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  if (block_ == parent_->blocks_ &&
      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
    // Iterate over elements in the first level block.
    pos_ = &parent_->tab_->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  parent_ = nullptr;  // denotes end
}

}  // namespace __tsan