//===-- tsan_clock.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of ThreadSanitizer (TSan), a race detector.
//
//===----------------------------------------------------------------------===//
#include "tsan_clock.h"
#include "tsan_rtl.h"
#include "sanitizer_common/sanitizer_placement_new.h"

// SyncClock and ThreadClock implement vector clocks for sync variables
// (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
// ThreadClock contains a fixed-size vector clock for the maximum number of
// threads. SyncClock contains a growable vector clock for the currently
// necessary number of threads.
// Together they implement a very simple model of operations, namely:
//
// void ThreadClock::acquire(const SyncClock *src) {
//   for (int i = 0; i < kMaxThreads; i++)
//     clock[i] = max(clock[i], src->clock[i]);
// }
//
// void ThreadClock::release(SyncClock *dst) const {
//   for (int i = 0; i < kMaxThreads; i++)
//     dst->clock[i] = max(dst->clock[i], clock[i]);
// }
//
// void ThreadClock::releaseStoreAcquire(SyncClock *sc) const {
//   for (int i = 0; i < kMaxThreads; i++) {
//     tmp = clock[i];
//     clock[i] = max(clock[i], sc->clock[i]);
//     sc->clock[i] = tmp;
//   }
// }
//
// void ThreadClock::ReleaseStore(SyncClock *dst) const {
//   for (int i = 0; i < kMaxThreads; i++)
//     dst->clock[i] = clock[i];
// }
//
// void ThreadClock::acq_rel(SyncClock *dst) {
//   acquire(dst);
//   release(dst);
// }
//
// Conformance to this model is extensively verified in tsan_clock_test.cpp.
// However, the implementation is significantly more complex. The complexity
// makes it possible to implement important classes of use cases in O(1)
// instead of O(N).
//
// The use cases are:
// 1. Singleton/once atomic that has a single release-store operation followed
//    by zillions of acquire-loads (the acquire-load is O(1)).
// 2. Thread-local mutex (both lock and unlock can be O(1)).
// 3. Leaf mutex (unlock is O(1)).
// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
// 5. An atomic with a single writer (writes can be O(1)).
// The implementation dynamically adapts to the workload. So if an atomic is in
// read-only phase, these reads will be O(1); if it later switches to read/write
// phase, the implementation will correctly handle that by switching to O(N).
//
// Thread-safety note: all const operations on SyncClock's are conducted under
// a shared lock; all non-const operations on SyncClock's are conducted under
// an exclusive lock; ThreadClock's are private to respective threads and so
// do not need any protection.
//
// Description of SyncClock state:
// clk_ - variable size vector clock, low kClkBits hold timestamp,
//   the remaining bits hold "acquired" flag (the actual value is thread's
//   reused counter);
//   if acquired == thr->reused_, then the respective thread has already
//   acquired this clock (except possibly for dirty elements).
// dirty_ - holds up to two indices in the vector clock that other threads
//   need to acquire regardless of "acquired" flag value;
// release_store_tid_ - denotes that the clock state is a result of
//   release-store operation by the thread with release_store_tid_ index.
// release_store_reused_ - reuse count of release_store_tid_.
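//
// For illustration, the O(1) acquire fast path enabled by this state roughly
// amounts to the following pseudocode (simplified from ThreadClock::acquire
// and ThreadClock::IsAlreadyAcquired below; not literal code):
//
// bool acquire_fast(const SyncClock *src) {
//   if (src->elem(tid_).reused != reused_)
//     return false;  // fall back to the full O(N) merge
//   // Only the (at most kDirtyTids) dirty entries need to be merged.
//   for (const Dirty &d : src->dirty_)
//     if (d.tid() != kInvalidTid && clk_[d.tid()] < d.epoch)
//       clk_[d.tid()] = d.epoch;
//   return true;
// }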

namespace __tsan {

static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
}

// Drop reference to the first level block idx.
static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  atomic_uint32_t *ref = ref_ptr(cb);
  u32 v = atomic_load(ref, memory_order_acquire);
  for (;;) {
    CHECK_GT(v, 0);
    if (v == 1)
      break;
    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
      return;
  }
  // First level block owns second level blocks, so free them as well.
  for (uptr i = 0; i < blocks; i++)
    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
  ctx->clock_alloc.Free(c, idx);
}

ThreadClock::ThreadClock(unsigned tid, unsigned reused)
    : tid_(tid)
    , reused_(reused + 1)  // 0 has special meaning
    , last_acquire_()
    , global_acquire_()
    , cached_idx_()
    , cached_size_()
    , cached_blocks_() {
  CHECK_LT(tid, kMaxTidInClock);
  CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
  nclk_ = tid_ + 1;
  internal_memset(clk_, 0, sizeof(clk_));
}

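// Drops the thread's cached release clock (if any), releasing its reference
// to the underlying ClockBlock.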
void ThreadClock::ResetCached(ClockCache *c) {
  if (cached_idx_) {
    UnrefClockBlock(c, cached_idx_, cached_blocks_);
    cached_idx_ = 0;
    cached_size_ = 0;
    cached_blocks_ = 0;
  }
}

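// Acquire: merges the sync object's clock into this thread's clock,
// establishing a happens-before edge from all threads that previously
// released to src.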
void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(src->size_, kMaxTid);

  // Check if it's empty -> no need to do anything.
  const uptr nclk = src->size_;
  if (nclk == 0)
    return;

  bool acquired = false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    unsigned tid = dirty.tid();
    if (tid != kInvalidTid) {
      if (clk_[tid] < dirty.epoch) {
        clk_[tid] = dirty.epoch;
        acquired = true;
      }
    }
  }

  // Check if we've already acquired src after the last release operation on src.
  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
    // O(N) acquire.
    nclk_ = max(nclk_, nclk);
    u64 *dst_pos = &clk_[0];
    for (ClockElem &src_elem : *src) {
      u64 epoch = src_elem.epoch;
      if (*dst_pos < epoch) {
        *dst_pos = epoch;
        acquired = true;
      }
      dst_pos++;
    }

    // Remember that this thread has acquired this clock.
    if (nclk > tid_)
      src->elem(tid_).reused = reused_;
  }

  if (acquired) {
    last_acquire_ = clk_[tid_];
    ResetCached(c);
  }
}

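// Release-store-acquire: stores this thread's current clock into sc while
// simultaneously acquiring sc's previous contents (see the model pseudocode
// at the top of this file).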
void ThreadClock::releaseStoreAcquire(ClockCache *c, SyncClock *sc) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(sc->size_, kMaxTid);

  if (sc->size_ == 0) {
    // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
    ReleaseStore(c, sc);
    return;
  }

  nclk_ = max(nclk_, (uptr) sc->size_);

  // Check if we need to resize sc.
  if (sc->size_ < nclk_)
    sc->Resize(c, nclk_);

  bool acquired = false;

  sc->Unshare(c);
  // Update sc->clk_ with this thread's old clock while acquiring it.
  sc->FlushDirty();
  uptr i = 0;
  for (ClockElem &ce : *sc) {
    u64 tmp = clk_[i];
    if (clk_[i] < ce.epoch) {
      clk_[i] = ce.epoch;
      acquired = true;
    }
    ce.epoch = tmp;
    ce.reused = 0;
    i++;
  }
  sc->release_store_tid_ = kInvalidTid;
  sc->release_store_reused_ = 0;

  if (acquired) {
    last_acquire_ = clk_[tid_];
    ResetCached(c);
  }
}

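// Release: merges this thread's clock into dst (dst becomes the element-wise
// maximum of the two), so that a subsequent acquire of dst happens-after
// everything this thread has done so far.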
void ThreadClock::release(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);

  if (dst->size_ == 0) {
    // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
    ReleaseStore(c, dst);
    return;
  }

  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  // Check if we had not acquired anything from other threads
  // since the last release on dst. If so, we need to update
  // only dst->elem(tid_).
  if (!HasAcquiredAfterRelease(dst)) {
    UpdateCurrentThread(c, dst);
    if (dst->release_store_tid_ != tid_ ||
        dst->release_store_reused_ != reused_)
      dst->release_store_tid_ = kInvalidTid;
    return;
  }

  // O(N) release.
  // First, remember whether we've acquired dst.
  bool acquired = IsAlreadyAcquired(dst);
  // Update dst->clk_.
  dst->Unshare(c);
  dst->FlushDirty();
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = max(ce.epoch, clk_[i]);
    ce.reused = 0;
    i++;
  }
  // Clear 'acquired' flag in the remaining elements.
  dst->release_store_tid_ = kInvalidTid;
  dst->release_store_reused_ = 0;
  // If we've acquired dst, remember this fact,
  // so that we don't need to acquire it on next acquire.
  if (acquired)
    dst->elem(tid_).reused = reused_;
}

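// Release-store: overwrites dst with this thread's clock (rather than
// merging), reusing a cached copy of the clock when possible.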
void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
  DCHECK_LE(nclk_, kMaxTid);
  DCHECK_LE(dst->size_, kMaxTid);

  if (dst->size_ == 0 && cached_idx_ != 0) {
    // Reuse the cached clock.
    // Note: we could reuse/cache the cached clock in more cases:
    // we could update the existing clock and cache it, or replace it with the
    // currently cached clock and release the old one. And for a shared
    // existing clock, we could replace it with the currently cached;
    // or unshare, update and cache. But, for simplicity, we currently reuse
    // cached clock only when the target clock is empty.
    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
    dst->tab_idx_ = cached_idx_;
    dst->size_ = cached_size_;
    dst->blocks_ = cached_blocks_;
    CHECK_EQ(dst->dirty_[0].tid(), kInvalidTid);
    // The cached clock is shared (immutable),
    // so this is where we store the current clock.
    dst->dirty_[0].set_tid(tid_);
    dst->dirty_[0].epoch = clk_[tid_];
    dst->release_store_tid_ = tid_;
    dst->release_store_reused_ = reused_;
    // Remember that we don't need to acquire it in future.
    dst->elem(tid_).reused = reused_;
    // Grab a reference.
    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    return;
  }

  // Check if we need to resize dst.
  if (dst->size_ < nclk_)
    dst->Resize(c, nclk_);

  if (dst->release_store_tid_ == tid_ &&
      dst->release_store_reused_ == reused_ &&
      !HasAcquiredAfterRelease(dst)) {
    UpdateCurrentThread(c, dst);
    return;
  }

  // O(N) release-store.
  dst->Unshare(c);
  // Note: dst can be larger than this ThreadClock.
  // This is fine since clk_ beyond size is all zeros.
  uptr i = 0;
  for (ClockElem &ce : *dst) {
    ce.epoch = clk_[i];
    ce.reused = 0;
    i++;
  }
  for (uptr i = 0; i < kDirtyTids; i++) dst->dirty_[i].set_tid(kInvalidTid);
  dst->release_store_tid_ = tid_;
  dst->release_store_reused_ = reused_;
  // Remember that we don't need to acquire it in future.
  dst->elem(tid_).reused = reused_;

  // If the resulting clock is cachable, cache it for future release operations.
  // The clock is always cachable if we released to an empty sync object.
  if (cached_idx_ == 0 && dst->Cachable()) {
    // Grab a reference to the ClockBlock.
    atomic_uint32_t *ref = ref_ptr(dst->tab_);
    if (atomic_load(ref, memory_order_acquire) == 1)
      atomic_store_relaxed(ref, 2);
    else
      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
    cached_idx_ = dst->tab_idx_;
    cached_size_ = dst->size_;
    cached_blocks_ = dst->blocks_;
  }
}

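// Combined acquire and release-store on the same sync object.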
void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
  acquire(c, dst);
  ReleaseStore(c, dst);
}

// Updates only the single element related to the current thread in dst->clk_.
void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
  // Update the thread's time, but preserve 'acquired' flag.
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty *dirty = &dst->dirty_[i];
    const unsigned tid = dirty->tid();
    if (tid == tid_ || tid == kInvalidTid) {
      dirty->set_tid(tid_);
      dirty->epoch = clk_[tid_];
      return;
    }
  }
  // Reset all 'acquired' flags, O(N).
  // We are going to touch dst elements, so we need to unshare it.
  dst->Unshare(c);
  dst->elem(tid_).epoch = clk_[tid_];
  for (uptr i = 0; i < dst->size_; i++)
    dst->elem(i).reused = 0;
  dst->FlushDirty();
}

// Checks whether the current thread has already acquired src.
bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
  if (src->elem(tid_).reused != reused_)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    SyncClock::Dirty dirty = src->dirty_[i];
    if (dirty.tid() != kInvalidTid) {
      if (clk_[dirty.tid()] < dirty.epoch)
        return false;
    }
  }
  return true;
}

// Checks whether the current thread has acquired anything
// from other clocks after releasing to dst (directly or indirectly).
bool ThreadClock::HasAcquiredAfterRelease(const SyncClock *dst) const {
  const u64 my_epoch = dst->elem(tid_).epoch;
  return my_epoch <= last_acquire_ ||
      my_epoch <= atomic_load_relaxed(&global_acquire_);
}

// Sets a single element in the vector clock.
// This function is called only from weird places like AcquireGlobal.
void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
  DCHECK_LT(tid, kMaxTid);
  DCHECK_GE(v, clk_[tid]);
  clk_[tid] = v;
  if (nclk_ <= tid)
    nclk_ = tid + 1;
  last_acquire_ = clk_[tid_];
  ResetCached(c);
}

void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < nclk_; i++)
    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
}

SyncClock::SyncClock() {
  ResetImpl();
}

SyncClock::~SyncClock() {
  // Reset must be called before dtor.
  CHECK_EQ(size_, 0);
  CHECK_EQ(blocks_, 0);
  CHECK_EQ(tab_, 0);
  CHECK_EQ(tab_idx_, 0);
}

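// Returns the clock to the empty state and gives its storage back to the
// cache.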
void SyncClock::Reset(ClockCache *c) {
  if (size_)
    UnrefClockBlock(c, tab_idx_, blocks_);
  ResetImpl();
}

void SyncClock::ResetImpl() {
  tab_ = 0;
  tab_idx_ = 0;
  size_ = 0;
  blocks_ = 0;
  release_store_tid_ = kInvalidTid;
  release_store_reused_ = 0;
  for (uptr i = 0; i < kDirtyTids; i++) dirty_[i].set_tid(kInvalidTid);
}

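// Grows the clock to hold at least nclk elements. Storage is a first-level
// ClockBlock; once it fills up, elements are moved into second-level blocks
// referenced from the tail of the first-level table.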
void SyncClock::Resize(ClockCache *c, uptr nclk) {
  if (nclk <= capacity()) {
    // Memory is already allocated, just increase the size.
    size_ = nclk;
    return;
  }
  if (size_ == 0) {
    // Grow from 0 to one-level table.
    CHECK_EQ(blocks_, 0);
    CHECK_EQ(tab_, 0);
    CHECK_EQ(tab_idx_, 0);
    tab_idx_ = ctx->clock_alloc.Alloc(c);
    tab_ = ctx->clock_alloc.Map(tab_idx_);
    internal_memset(tab_, 0, sizeof(*tab_));
    atomic_store_relaxed(ref_ptr(tab_), 1);
    size_ = 1;
  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
    CHECK_LT(top, ClockBlock::kClockCount);
    const uptr move = top * sizeof(tab_->clock[0]);
    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
    internal_memset(tab_->clock, 0, move);
    append_block(idx);
  }
  // At this point we have first level table allocated and all clock elements
  // are evacuated from it to a second level block.
  // Add second level tables as necessary.
  while (nclk > capacity()) {
    u32 idx = ctx->clock_alloc.Alloc(c);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    internal_memset(cb, 0, sizeof(*cb));
    append_block(idx);
  }
  size_ = nclk;
}

// Flushes all dirty elements into the main clock array.
void SyncClock::FlushDirty() {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty *dirty = &dirty_[i];
    if (dirty->tid() != kInvalidTid) {
      CHECK_LT(dirty->tid(), size_);
      elem(dirty->tid()).epoch = dirty->epoch;
      dirty->set_tid(kInvalidTid);
    }
  }
}

bool SyncClock::IsShared() const {
  if (size_ == 0)
    return false;
  atomic_uint32_t *ref = ref_ptr(tab_);
  u32 v = atomic_load(ref, memory_order_acquire);
  CHECK_GT(v, 0);
  return v > 1;
}

// Unshares the current clock if it's shared.
// Shared clocks are immutable, so they need to be unshared before any updates.
// Note: this does not apply to dirty entries as they are not shared.
void SyncClock::Unshare(ClockCache *c) {
  if (!IsShared())
    return;
  // First, copy current state into old.
  SyncClock old;
  old.tab_ = tab_;
  old.tab_idx_ = tab_idx_;
  old.size_ = size_;
  old.blocks_ = blocks_;
  old.release_store_tid_ = release_store_tid_;
  old.release_store_reused_ = release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    old.dirty_[i] = dirty_[i];
  // Then, clear current object.
  ResetImpl();
  // Allocate brand new clock in the current object.
  Resize(c, old.size_);
  // Now copy state back into this object.
  Iter old_iter(&old);
  for (ClockElem &ce : *this) {
    ce = *old_iter;
    ++old_iter;
  }
  release_store_tid_ = old.release_store_tid_;
  release_store_reused_ = old.release_store_reused_;
  for (unsigned i = 0; i < kDirtyTids; i++)
    dirty_[i] = old.dirty_[i];
  // Drop reference to old and delete if necessary.
  old.Reset(c);
}

// Can we cache this clock for future release operations?
ALWAYS_INLINE bool SyncClock::Cachable() const {
  if (size_ == 0)
    return false;
  for (unsigned i = 0; i < kDirtyTids; i++) {
    if (dirty_[i].tid() != kInvalidTid)
      return false;
  }
  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
}

// elem linearizes the two-level structure into a linear array.
// Note: this is used only for one-time accesses; vector operations use
// the iterator as it is much faster.
ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
  DCHECK_LT(tid, size_);
  const uptr block = tid / ClockBlock::kClockCount;
  DCHECK_LE(block, blocks_);
  tid %= ClockBlock::kClockCount;
  if (block == blocks_)
    return tab_->clock[tid];
  u32 idx = get_block(block);
  ClockBlock *cb = ctx->clock_alloc.Map(idx);
  return cb->clock[tid];
}

ALWAYS_INLINE uptr SyncClock::capacity() const {
  if (size_ == 0)
    return 0;
  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
  // How many clock elements we can fit into the first level block.
  // +1 for ref counter.
  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
  return blocks_ * ClockBlock::kClockCount + top;
}

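// The tail of the first-level table stores indices of second-level blocks
// (growing down from kBlockIdx); the reference counter lives at kRefIdx.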
ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
  DCHECK(size_);
  DCHECK_LT(bi, blocks_);
  return tab_->table[ClockBlock::kBlockIdx - bi];
}

ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
  uptr bi = blocks_++;
  CHECK_EQ(get_block(bi), 0);
  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
}

// Used only by tests.
u64 SyncClock::get(unsigned tid) const {
  for (unsigned i = 0; i < kDirtyTids; i++) {
    Dirty dirty = dirty_[i];
    if (dirty.tid() == tid)
      return dirty.epoch;
  }
  return elem(tid).epoch;
}

// Used only by Iter test.
u64 SyncClock::get_clean(unsigned tid) const {
  return elem(tid).epoch;
}

void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
  printf("clock=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
  printf("] reused=[");
  for (uptr i = 0; i < size_; i++)
    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
      release_store_tid_, release_store_reused_, dirty_[0].tid(),
      dirty_[0].epoch, dirty_[1].tid(), dirty_[1].epoch);
}

void SyncClock::Iter::Next() {
  // Finished with the current block, move on to the next one.
  block_++;
  if (block_ < parent_->blocks_) {
    // Iterate over the next second level block.
    u32 idx = parent_->get_block(block_);
    ClockBlock *cb = ctx->clock_alloc.Map(idx);
    pos_ = &cb->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  if (block_ == parent_->blocks_ &&
      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
    // Iterate over elements in the first level block.
    pos_ = &parent_->tab_->clock[0];
    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
        ClockBlock::kClockCount);
    return;
  }
  parent_ = nullptr;  // denotes end
}

}  // namespace __tsan