1 //===-- tsan_clock.cc -----------------------------------------------------===//
3 // This file is distributed under the University of Illinois Open Source
4 // License. See LICENSE.TXT for details.
6 //===----------------------------------------------------------------------===//
8 // This file is a part of ThreadSanitizer (TSan), a race detector.
10 //===----------------------------------------------------------------------===//
11 #include "tsan_clock.h"
13 #include "sanitizer_common/sanitizer_placement_new.h"
15 // SyncClock and ThreadClock implement vector clocks for sync variables
16 // (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
17 // ThreadClock contains fixed-size vector clock for maximum number of threads.
18 // SyncClock contains growable vector clock for currently necessary number of
20 // Together they implement very simple model of operations, namely:
22 // void ThreadClock::acquire(const SyncClock *src) {
23 // for (int i = 0; i < kMaxThreads; i++)
24 // clock[i] = max(clock[i], src->clock[i]);
27 // void ThreadClock::release(SyncClock *dst) const {
28 // for (int i = 0; i < kMaxThreads; i++)
29 // dst->clock[i] = max(dst->clock[i], clock[i]);
32 // void ThreadClock::ReleaseStore(SyncClock *dst) const {
33 // for (int i = 0; i < kMaxThreads; i++)
34 // dst->clock[i] = clock[i];
37 // void ThreadClock::acq_rel(SyncClock *dst) {
42 // Conformance to this model is extensively verified in tsan_clock_test.cc.
43 // However, the implementation is significantly more complex. The complexity
44 // allows implementing important classes of use cases in O(1) instead of O(N).
47 // 1. Singleton/once atomic that has a single release-store operation followed
48 // by zillions of acquire-loads (the acquire-load is O(1)).
49 // 2. Thread-local mutex (both lock and unlock can be O(1)).
50 // 3. Leaf mutex (unlock is O(1)).
51 // 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
52 // 5. An atomic with a single writer (writes can be O(1)).
53 // The implementation dynamically adapts to the workload. So if an atomic is in
54 // read-only phase, these reads will be O(1); if it later switches to read/write
55 // phase, the implementation will correctly handle that by switching to O(N).
57 // Thread-safety note: all const operations on SyncClock's are conducted under
58 // a shared lock; all non-const operations on SyncClock's are conducted under
59 // an exclusive lock; ThreadClock's are private to respective threads and so
60 // do not need any protection.
62 // Description of SyncClock state:
63 // clk_ - variable size vector clock, low kClkBits hold timestamp,
64 // the remaining bits hold "acquired" flag (the actual value is thread's
66 // if acquired == thr->reused_, then the respective thread has already
67 // acquired this clock (except possibly for dirty elements).
68 // dirty_ - holds up to two indices in the vector clock that other threads
69 // need to acquire regardless of "acquired" flag value;
70 // release_store_tid_ - denotes that the clock state is a result of
71 // release-store operation by the thread with release_store_tid_ index.
72 // release_store_reused_ - reuse count of release_store_tid_.
74 // We don't have ThreadState in these methods, so this is an ugly hack that
77 # define CPP_STAT_INC(typ) StatInc(cur_thread(), typ)
79 # define CPP_STAT_INC(typ) (void)0
84 static atomic_uint32_t
*ref_ptr(ClockBlock
*cb
) {
85 return reinterpret_cast<atomic_uint32_t
*>(&cb
->table
[ClockBlock::kRefIdx
]);
88 // Drop reference to the first level block idx.
89 static void UnrefClockBlock(ClockCache
*c
, u32 idx
, uptr blocks
) {
90 ClockBlock
*cb
= ctx
->clock_alloc
.Map(idx
);
91 atomic_uint32_t
*ref
= ref_ptr(cb
);
92 u32 v
= atomic_load(ref
, memory_order_acquire
);
97 if (atomic_compare_exchange_strong(ref
, &v
, v
- 1, memory_order_acq_rel
))
100 // First level block owns second level blocks, so them as well.
101 for (uptr i
= 0; i
< blocks
; i
++)
102 ctx
->clock_alloc
.Free(c
, cb
->table
[ClockBlock::kBlockIdx
- i
]);
103 ctx
->clock_alloc
.Free(c
, idx
);
106 ThreadClock::ThreadClock(unsigned tid
, unsigned reused
)
108 , reused_(reused
+ 1) // 0 has special meaning
112 CHECK_LT(tid
, kMaxTidInClock
);
113 CHECK_EQ(reused_
, ((u64
)reused_
<< kClkBits
) >> kClkBits
);
116 internal_memset(clk_
, 0, sizeof(clk_
));
119 void ThreadClock::ResetCached(ClockCache
*c
) {
121 UnrefClockBlock(c
, cached_idx_
, cached_blocks_
);
128 void ThreadClock::acquire(ClockCache
*c
, SyncClock
*src
) {
129 DCHECK_LE(nclk_
, kMaxTid
);
130 DCHECK_LE(src
->size_
, kMaxTid
);
131 CPP_STAT_INC(StatClockAcquire
);
133 // Check if it's empty -> no need to do anything.
134 const uptr nclk
= src
->size_
;
136 CPP_STAT_INC(StatClockAcquireEmpty
);
140 bool acquired
= false;
141 for (unsigned i
= 0; i
< kDirtyTids
; i
++) {
142 SyncClock::Dirty dirty
= src
->dirty_
[i
];
143 unsigned tid
= dirty
.tid
;
144 if (tid
!= kInvalidTid
) {
145 if (clk_
[tid
] < dirty
.epoch
) {
146 clk_
[tid
] = dirty
.epoch
;
152 // Check if we've already acquired src after the last release operation on src
153 if (tid_
>= nclk
|| src
->elem(tid_
).reused
!= reused_
) {
155 CPP_STAT_INC(StatClockAcquireFull
);
156 nclk_
= max(nclk_
, nclk
);
157 u64
*dst_pos
= &clk_
[0];
158 for (ClockElem
&src_elem
: *src
) {
159 u64 epoch
= src_elem
.epoch
;
160 if (*dst_pos
< epoch
) {
167 // Remember that this thread has acquired this clock.
169 src
->elem(tid_
).reused
= reused_
;
173 CPP_STAT_INC(StatClockAcquiredSomething
);
174 last_acquire_
= clk_
[tid_
];
179 void ThreadClock::release(ClockCache
*c
, SyncClock
*dst
) {
180 DCHECK_LE(nclk_
, kMaxTid
);
181 DCHECK_LE(dst
->size_
, kMaxTid
);
183 if (dst
->size_
== 0) {
184 // ReleaseStore will correctly set release_store_tid_,
185 // which can be important for future operations.
186 ReleaseStore(c
, dst
);
190 CPP_STAT_INC(StatClockRelease
);
191 // Check if we need to resize dst.
192 if (dst
->size_
< nclk_
)
193 dst
->Resize(c
, nclk_
);
195 // Check if we had not acquired anything from other threads
196 // since the last release on dst. If so, we need to update
197 // only dst->elem(tid_).
198 if (dst
->elem(tid_
).epoch
> last_acquire_
) {
199 UpdateCurrentThread(c
, dst
);
200 if (dst
->release_store_tid_
!= tid_
||
201 dst
->release_store_reused_
!= reused_
)
202 dst
->release_store_tid_
= kInvalidTid
;
207 CPP_STAT_INC(StatClockReleaseFull
);
209 // First, remember whether we've acquired dst.
210 bool acquired
= IsAlreadyAcquired(dst
);
212 CPP_STAT_INC(StatClockReleaseAcquired
);
216 for (ClockElem
&ce
: *dst
) {
217 ce
.epoch
= max(ce
.epoch
, clk_
[i
]);
221 // Clear 'acquired' flag in the remaining elements.
222 if (nclk_
< dst
->size_
)
223 CPP_STAT_INC(StatClockReleaseClearTail
);
224 for (uptr i
= nclk_
; i
< dst
->size_
; i
++)
225 dst
->elem(i
).reused
= 0;
226 dst
->release_store_tid_
= kInvalidTid
;
227 dst
->release_store_reused_
= 0;
228 // If we've acquired dst, remember this fact,
229 // so that we don't need to acquire it on next acquire.
231 dst
->elem(tid_
).reused
= reused_
;
234 void ThreadClock::ReleaseStore(ClockCache
*c
, SyncClock
*dst
) {
235 DCHECK_LE(nclk_
, kMaxTid
);
236 DCHECK_LE(dst
->size_
, kMaxTid
);
237 CPP_STAT_INC(StatClockStore
);
239 if (dst
->size_
== 0 && cached_idx_
!= 0) {
240 // Reuse the cached clock.
241 // Note: we could reuse/cache the cached clock in more cases:
242 // we could update the existing clock and cache it, or replace it with the
243 // currently cached clock and release the old one. And for a shared
244 // existing clock, we could replace it with the currently cached;
245 // or unshare, update and cache. But, for simplicity, we currnetly reuse
246 // cached clock only when the target clock is empty.
247 dst
->tab_
= ctx
->clock_alloc
.Map(cached_idx_
);
248 dst
->tab_idx_
= cached_idx_
;
249 dst
->size_
= cached_size_
;
250 dst
->blocks_
= cached_blocks_
;
251 CHECK_EQ(dst
->dirty_
[0].tid
, kInvalidTid
);
252 // The cached clock is shared (immutable),
253 // so this is where we store the current clock.
254 dst
->dirty_
[0].tid
= tid_
;
255 dst
->dirty_
[0].epoch
= clk_
[tid_
];
256 dst
->release_store_tid_
= tid_
;
257 dst
->release_store_reused_
= reused_
;
258 // Rememeber that we don't need to acquire it in future.
259 dst
->elem(tid_
).reused
= reused_
;
261 atomic_fetch_add(ref_ptr(dst
->tab_
), 1, memory_order_relaxed
);
265 // Check if we need to resize dst.
266 if (dst
->size_
< nclk_
)
267 dst
->Resize(c
, nclk_
);
269 if (dst
->release_store_tid_
== tid_
&&
270 dst
->release_store_reused_
== reused_
&&
271 dst
->elem(tid_
).epoch
> last_acquire_
) {
272 CPP_STAT_INC(StatClockStoreFast
);
273 UpdateCurrentThread(c
, dst
);
277 // O(N) release-store.
278 CPP_STAT_INC(StatClockStoreFull
);
280 // Note: dst can be larger than this ThreadClock.
281 // This is fine since clk_ beyond size is all zeros.
283 for (ClockElem
&ce
: *dst
) {
288 for (uptr i
= 0; i
< kDirtyTids
; i
++)
289 dst
->dirty_
[i
].tid
= kInvalidTid
;
290 dst
->release_store_tid_
= tid_
;
291 dst
->release_store_reused_
= reused_
;
292 // Rememeber that we don't need to acquire it in future.
293 dst
->elem(tid_
).reused
= reused_
;
295 // If the resulting clock is cachable, cache it for future release operations.
296 // The clock is always cachable if we released to an empty sync object.
297 if (cached_idx_
== 0 && dst
->Cachable()) {
298 // Grab a reference to the ClockBlock.
299 atomic_uint32_t
*ref
= ref_ptr(dst
->tab_
);
300 if (atomic_load(ref
, memory_order_acquire
) == 1)
301 atomic_store_relaxed(ref
, 2);
303 atomic_fetch_add(ref_ptr(dst
->tab_
), 1, memory_order_relaxed
);
304 cached_idx_
= dst
->tab_idx_
;
305 cached_size_
= dst
->size_
;
306 cached_blocks_
= dst
->blocks_
;
310 void ThreadClock::acq_rel(ClockCache
*c
, SyncClock
*dst
) {
311 CPP_STAT_INC(StatClockAcquireRelease
);
313 ReleaseStore(c
, dst
);
316 // Updates only single element related to the current thread in dst->clk_.
317 void ThreadClock::UpdateCurrentThread(ClockCache
*c
, SyncClock
*dst
) const {
318 // Update the threads time, but preserve 'acquired' flag.
319 for (unsigned i
= 0; i
< kDirtyTids
; i
++) {
320 SyncClock::Dirty
*dirty
= &dst
->dirty_
[i
];
321 const unsigned tid
= dirty
->tid
;
322 if (tid
== tid_
|| tid
== kInvalidTid
) {
323 CPP_STAT_INC(StatClockReleaseFast
);
325 dirty
->epoch
= clk_
[tid_
];
329 // Reset all 'acquired' flags, O(N).
330 // We are going to touch dst elements, so we need to unshare it.
332 CPP_STAT_INC(StatClockReleaseSlow
);
333 dst
->elem(tid_
).epoch
= clk_
[tid_
];
334 for (uptr i
= 0; i
< dst
->size_
; i
++)
335 dst
->elem(i
).reused
= 0;
339 // Checks whether the current thread has already acquired src.
340 bool ThreadClock::IsAlreadyAcquired(const SyncClock
*src
) const {
341 if (src
->elem(tid_
).reused
!= reused_
)
343 for (unsigned i
= 0; i
< kDirtyTids
; i
++) {
344 SyncClock::Dirty dirty
= src
->dirty_
[i
];
345 if (dirty
.tid
!= kInvalidTid
) {
346 if (clk_
[dirty
.tid
] < dirty
.epoch
)
353 // Sets a single element in the vector clock.
354 // This function is called only from weird places like AcquireGlobal.
355 void ThreadClock::set(ClockCache
*c
, unsigned tid
, u64 v
) {
356 DCHECK_LT(tid
, kMaxTid
);
357 DCHECK_GE(v
, clk_
[tid
]);
361 last_acquire_
= clk_
[tid_
];
365 void ThreadClock::DebugDump(int(*printf
)(const char *s
, ...)) {
367 for (uptr i
= 0; i
< nclk_
; i
++)
368 printf("%s%llu", i
== 0 ? "" : ",", clk_
[i
]);
369 printf("] tid=%u/%u last_acq=%llu", tid_
, reused_
, last_acquire_
);
372 SyncClock::SyncClock() {
376 SyncClock::~SyncClock() {
377 // Reset must be called before dtor.
379 CHECK_EQ(blocks_
, 0);
381 CHECK_EQ(tab_idx_
, 0);
384 void SyncClock::Reset(ClockCache
*c
) {
386 UnrefClockBlock(c
, tab_idx_
, blocks_
);
390 void SyncClock::ResetImpl() {
395 release_store_tid_
= kInvalidTid
;
396 release_store_reused_
= 0;
397 for (uptr i
= 0; i
< kDirtyTids
; i
++)
398 dirty_
[i
].tid
= kInvalidTid
;
401 void SyncClock::Resize(ClockCache
*c
, uptr nclk
) {
402 CPP_STAT_INC(StatClockReleaseResize
);
404 if (nclk
<= capacity()) {
405 // Memory is already allocated, just increase the size.
410 // Grow from 0 to one-level table.
412 CHECK_EQ(blocks_
, 0);
414 CHECK_EQ(tab_idx_
, 0);
415 tab_idx_
= ctx
->clock_alloc
.Alloc(c
);
416 tab_
= ctx
->clock_alloc
.Map(tab_idx_
);
417 internal_memset(tab_
, 0, sizeof(*tab_
));
418 atomic_store_relaxed(ref_ptr(tab_
), 1);
420 } else if (size_
> blocks_
* ClockBlock::kClockCount
) {
421 u32 idx
= ctx
->clock_alloc
.Alloc(c
);
422 ClockBlock
*new_cb
= ctx
->clock_alloc
.Map(idx
);
423 uptr top
= size_
- blocks_
* ClockBlock::kClockCount
;
424 CHECK_LT(top
, ClockBlock::kClockCount
);
425 const uptr move
= top
* sizeof(tab_
->clock
[0]);
426 internal_memcpy(&new_cb
->clock
[0], tab_
->clock
, move
);
427 internal_memset(&new_cb
->clock
[top
], 0, sizeof(*new_cb
) - move
);
428 internal_memset(tab_
->clock
, 0, move
);
431 // At this point we have first level table allocated and all clock elements
432 // are evacuated from it to a second level block.
433 // Add second level tables as necessary.
434 while (nclk
> capacity()) {
435 u32 idx
= ctx
->clock_alloc
.Alloc(c
);
436 ClockBlock
*cb
= ctx
->clock_alloc
.Map(idx
);
437 internal_memset(cb
, 0, sizeof(*cb
));
443 // Flushes all dirty elements into the main clock array.
444 void SyncClock::FlushDirty() {
445 for (unsigned i
= 0; i
< kDirtyTids
; i
++) {
446 Dirty
*dirty
= &dirty_
[i
];
447 if (dirty
->tid
!= kInvalidTid
) {
448 CHECK_LT(dirty
->tid
, size_
);
449 elem(dirty
->tid
).epoch
= dirty
->epoch
;
450 dirty
->tid
= kInvalidTid
;
455 bool SyncClock::IsShared() const {
458 atomic_uint32_t
*ref
= ref_ptr(tab_
);
459 u32 v
= atomic_load(ref
, memory_order_acquire
);
464 // Unshares the current clock if it's shared.
465 // Shared clocks are immutable, so they need to be unshared before any updates.
466 // Note: this does not apply to dirty entries as they are not shared.
467 void SyncClock::Unshare(ClockCache
*c
) {
470 // First, copy current state into old.
473 old
.tab_idx_
= tab_idx_
;
475 old
.blocks_
= blocks_
;
476 old
.release_store_tid_
= release_store_tid_
;
477 old
.release_store_reused_
= release_store_reused_
;
478 for (unsigned i
= 0; i
< kDirtyTids
; i
++)
479 old
.dirty_
[i
] = dirty_
[i
];
480 // Then, clear current object.
482 // Allocate brand new clock in the current object.
483 Resize(c
, old
.size_
);
484 // Now copy state back into this object.
486 for (ClockElem
&ce
: *this) {
490 release_store_tid_
= old
.release_store_tid_
;
491 release_store_reused_
= old
.release_store_reused_
;
492 for (unsigned i
= 0; i
< kDirtyTids
; i
++)
493 dirty_
[i
] = old
.dirty_
[i
];
494 // Drop reference to old and delete if necessary.
498 // Can we cache this clock for future release operations?
499 ALWAYS_INLINE
bool SyncClock::Cachable() const {
502 for (unsigned i
= 0; i
< kDirtyTids
; i
++) {
503 if (dirty_
[i
].tid
!= kInvalidTid
)
506 return atomic_load_relaxed(ref_ptr(tab_
)) == 1;
509 // elem linearizes the two-level structure into linear array.
510 // Note: this is used only for one time accesses, vector operations use
511 // the iterator as it is much faster.
512 ALWAYS_INLINE ClockElem
&SyncClock::elem(unsigned tid
) const {
513 DCHECK_LT(tid
, size_
);
514 const uptr block
= tid
/ ClockBlock::kClockCount
;
515 DCHECK_LE(block
, blocks_
);
516 tid
%= ClockBlock::kClockCount
;
517 if (block
== blocks_
)
518 return tab_
->clock
[tid
];
519 u32 idx
= get_block(block
);
520 ClockBlock
*cb
= ctx
->clock_alloc
.Map(idx
);
521 return cb
->clock
[tid
];
524 ALWAYS_INLINE uptr
SyncClock::capacity() const {
527 uptr ratio
= sizeof(ClockBlock::clock
[0]) / sizeof(ClockBlock::table
[0]);
528 // How many clock elements we can fit into the first level block.
529 // +1 for ref counter.
530 uptr top
= ClockBlock::kClockCount
- RoundUpTo(blocks_
+ 1, ratio
) / ratio
;
531 return blocks_
* ClockBlock::kClockCount
+ top
;
534 ALWAYS_INLINE u32
SyncClock::get_block(uptr bi
) const {
536 DCHECK_LT(bi
, blocks_
);
537 return tab_
->table
[ClockBlock::kBlockIdx
- bi
];
540 ALWAYS_INLINE
void SyncClock::append_block(u32 idx
) {
542 CHECK_EQ(get_block(bi
), 0);
543 tab_
->table
[ClockBlock::kBlockIdx
- bi
] = idx
;
546 // Used only by tests.
547 u64
SyncClock::get(unsigned tid
) const {
548 for (unsigned i
= 0; i
< kDirtyTids
; i
++) {
549 Dirty dirty
= dirty_
[i
];
550 if (dirty
.tid
== tid
)
553 return elem(tid
).epoch
;
556 // Used only by Iter test.
557 u64
SyncClock::get_clean(unsigned tid
) const {
558 return elem(tid
).epoch
;
561 void SyncClock::DebugDump(int(*printf
)(const char *s
, ...)) {
563 for (uptr i
= 0; i
< size_
; i
++)
564 printf("%s%llu", i
== 0 ? "" : ",", elem(i
).epoch
);
565 printf("] reused=[");
566 for (uptr i
= 0; i
< size_
; i
++)
567 printf("%s%llu", i
== 0 ? "" : ",", elem(i
).reused
);
568 printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
569 release_store_tid_
, release_store_reused_
,
570 dirty_
[0].tid
, dirty_
[0].epoch
,
571 dirty_
[1].tid
, dirty_
[1].epoch
);
574 void SyncClock::Iter::Next() {
575 // Finished with the current block, move on to the next one.
577 if (block_
< parent_
->blocks_
) {
578 // Iterate over the next second level block.
579 u32 idx
= parent_
->get_block(block_
);
580 ClockBlock
*cb
= ctx
->clock_alloc
.Map(idx
);
581 pos_
= &cb
->clock
[0];
582 end_
= pos_
+ min(parent_
->size_
- block_
* ClockBlock::kClockCount
,
583 ClockBlock::kClockCount
);
586 if (block_
== parent_
->blocks_
&&
587 parent_
->size_
> parent_
->blocks_
* ClockBlock::kClockCount
) {
588 // Iterate over elements in the first level block.
589 pos_
= &parent_
->tab_
->clock
[0];
590 end_
= pos_
+ min(parent_
->size_
- block_
* ClockBlock::kClockCount
,
591 ClockBlock::kClockCount
);
594 parent_
= nullptr; // denotes end
596 } // namespace __tsan