1 /* record-replay.cpp -*-C++-*-
3 *************************************************************************
6 * Copyright (C) 2012-2013, Intel Corporation
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
14 * * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * * Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in
18 * the documentation and/or other materials provided with the
20 * * Neither the name of Intel Corporation nor the names of its
21 * contributors may be used to endorse or promote products derived
22 * from this software without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
30 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
31 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
32 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
33 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
35 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
38 **************************************************************************/
41 * Implementation of the record/replay functionality for Cilk Plus
48 // clang is really strict about printf formats, so use the annoying integer
49 // printf macros. Unfortunately they're not avaiable on Windows
53 #define __STDC_FORMAT_MACROS 1
57 #include "record-replay.h"
59 #include "internal/abi.h"
60 #include "local_state.h"
61 #include "full_frame.h"
62 #include "global_state.h"
63 #include "cilk_malloc.h"
64 #include "os.h" // for cilkos_error()
67 #pragma message ("*** Record on Replay is enabled!")
70 // Defined to write sequence number to the logs. Note that you cannot
71 // diff logs with sequence numbers because the numbers may increment in
73 //#define INCLUDE_SEQUENCE_NUMBER 1
75 const int PED_VERSION
= 1; // Log recording version
84 ped_type_last
// Flags end of the list
88 #define PED_TYPE_STR_STEAL "Steal"
89 #define PED_TYPE_STR_SYNC "Sync"
90 #define PED_TYPE_STR_WORKERS "Workers"
91 #define PED_TYPE_STR_ORPHANED "Orphaned"
93 #define PED_TYPE_SIZE 16 // Buffer size for the type of pedigree. Must
94 // hold largest pedigree record type string.
95 #define PEDIGREE_BUFF_SIZE 512 // Buffer size for the string representation
99 * Data we store for a replay log entry
101 typedef struct replay_entry_t
103 uint64_t *m_reverse_pedigree
; /**< Reverse pedigree for replay log entry */
104 ped_type_t m_type
; /**< Type of replay log entry */
105 int16_t m_pedigree_len
; /**< Number of terms in reverse pedigree */
106 int16_t m_value
; /**< Victim for STEALs, 0 if matching steal found for ORPHANs */
109 * Load data read from the log into the entry
111 bool load(const char *type
, const char *pedigee_str
, int32_t value1
, int32_t value2
)
113 // Convert the type into an enum
114 if (0 == strcmp(type
, PED_TYPE_STR_STEAL
))
116 m_type
= ped_type_steal
;
117 m_value
= (int16_t)value1
; // Victim
121 m_value
= -1; // Victim not valid
122 if (0 == strcmp(type
, PED_TYPE_STR_SYNC
))
123 m_type
= ped_type_sync
;
124 else if (0 == strcmp(type
, PED_TYPE_STR_ORPHANED
))
125 m_type
= ped_type_orphaned
;
128 m_type
= ped_type_unknown
;
133 // Parse the pedigree
136 const char *p
= pedigee_str
;
139 uint64_t temp_pedigree
[PEDIGREE_BUFF_SIZE
/2];
143 temp_pedigree
[m_pedigree_len
++] = (uint64_t)strtol(p
, &end
, 10);
149 // Allocate memory to hold the pedigree.
150 // Copy the pedigree in reverse order since that's the order we'll
153 (uint64_t *)__cilkrts_malloc(sizeof(int64_t) * m_pedigree_len
);
154 for (int n
= 0; n
< m_pedigree_len
; n
++)
155 m_reverse_pedigree
[n
] = temp_pedigree
[(m_pedigree_len
- 1) - n
];
161 * Match this entry against the data supplied. This includes walking the
162 * pedigree from the specified node.
164 bool match (ped_type_t type
, const __cilkrts_pedigree
*node
, int victim
= -1)
168 // If the type isn't what they're seeking, we don't have a match
172 // If we're looking for a STEAL, then the victim must match
173 if ((type
== ped_type_steal
) && (victim
!= m_value
))
176 // Compare the current pedigree against what was recorded
177 while ((NULL
!= node
) && (i
< m_pedigree_len
))
179 // If we've got a pedigree rank difference, then we don't have
181 if (node
->rank
!= m_reverse_pedigree
[i
])
187 // Make sure we exhausted both the pedigree chain and the recorded
189 return ((NULL
== node
) && (i
== m_pedigree_len
));
193 * Advance to the next entry, skipping any ORPHANED records we didn't see
194 * a matching STEAL for
196 replay_entry_t
*next_entry()
198 replay_entry_t
*entry
= this;
200 // You can't go beyond the end
201 if (ped_type_last
== entry
->m_type
)
204 // Advance to the next entry
207 // Skip any ORPHANED records that don't have a matching steal. We
208 // initialized the value field to -1 for ORPHANED. After loading all
209 // the log data, we iterated through all the STEAL records setting the
210 // matching ORPHANED record's value field to 0. So if an ORPHANED
211 // record's value field is still -1, it doesn't have a matching STEAL
212 // record, and I don't know why we chose not to return from the
214 while ((ped_type_orphaned
== entry
->m_type
) && (-1 == entry
->m_value
))
223 * Release any allocated resources
227 __cilkrts_free(m_reverse_pedigree
);
228 m_reverse_pedigree
= NULL
;
233 __CILKRTS_BEGIN_EXTERN_C
236 * Walk the pedigree and generate a string representation with underscores
237 * between terms. Currently does a recursive walk to generate a forward
240 * @param p The buffer that is to be filled. Assumed to be PEDIGREE_BUFF_SIZE
242 * @param pnode The initial pedigree term to be written.
244 * @return A pointer into the pedigree string buffer after a term has been
248 char * walk_pedigree_nodes(char *p
, const __cilkrts_pedigree
*pnode
)
253 p
= walk_pedigree_nodes(p
, pnode
->parent
);
254 p
+= sprintf(p
, "_");
257 return p
+ sprintf(p
, "%" PRIu64
, pnode
->rank
);
261 * Write a record to a replay log file.
263 * @param w The worker we're writing the pedigree for.
264 * @param type The type of the pedigree record, as a string
265 * @param initial_node The initial pedigree node to be written, or NULL if
266 * there is no pedigree for this record type.
267 * @param i1 First integer value to be written to the record.
268 * @param i2 Second integer value to be written to the record. Only applies
269 * to STEAL records. Defaults to -1 (unused). The second value is always
270 * written to make parsing easier.
273 void write_to_replay_log (__cilkrts_worker
*w
, const char *type
,
274 const __cilkrts_pedigree
*initial_node
,
275 int i1
= -1, int i2
= -1)
277 char pedigree
[PEDIGREE_BUFF_SIZE
];
279 // If we don't have an initial pedigree node, just use "0" to fill the slot
280 if (NULL
== initial_node
)
281 strcpy(pedigree
, "0");
283 walk_pedigree_nodes(pedigree
, initial_node
);
285 #ifndef INCLUDE_SEQUENCE_NUMBER
286 // Simply write the record
287 fprintf(w
->l
->record_replay_fptr
, "%s %s %d %d\n",
288 type
, pedigree
, i1
, i2
);
290 // Write the record with a sequence number. The sequence number should
291 // always be the last term, and ignored on read
293 static long volatile seq_num
= 0;
296 // Atomic increment functions are compiler/OS-specific
298 write_num
= _InterlockedIncrement(&seq_num
);
300 write_num
= __sync_add_and_fetch(&seq_num
, 1);
303 fprintf(w
->l
->record_replay_fptr
, "%s %s %d %d %ld\n",
304 type
, pedigree
, i1
, i2
, write_num
);
305 #endif // INCLUDE_SEQUENCE_NUMBER
307 fflush(w
->l
->record_replay_fptr
);
311 * Record data for a successful steal.
313 * The pedigree for a STEAL record is the pedigree of the stolen frame.
315 * @note It's assumed that replay_record_steal() has already checked that we're
316 * recording a log and that the record/replay functionality has not been
319 * @param w The worker stealing a frame.
320 * @param victim_id The ID of the worker which had it's frame stolen.
322 void replay_record_steal_internal(__cilkrts_worker
*w
, int32_t victim_id
)
324 // Follow the pedigree chain using worker's stack frame
325 CILK_ASSERT(w
->l
->next_frame_ff
);
326 CILK_ASSERT(w
->l
->next_frame_ff
->call_stack
);
328 // Record steal: STEAL pedigree victim_id thief_id
329 write_to_replay_log (w
, PED_TYPE_STR_STEAL
,
330 &(w
->l
->next_frame_ff
->call_stack
->parent_pedigree
),
335 * Record data for the worker that continues from a sync
337 * The pedigree for a SYNC record is the pedigree at the sync.
339 * @note It's assumed that replay_record_sync() has already checked that we're
340 * recording a log and that the record/replay functionality has not been
343 * @param w The worker continuing from a sync.
345 void replay_record_sync_internal(__cilkrts_worker
*w
)
347 // Record sync: SYNC pedigree last_worker_id
348 write_to_replay_log (w
, PED_TYPE_STR_SYNC
, &w
->pedigree
);
352 * Record the pedigree of an attempt to return to a stolen parent
354 * The pedigree for an ORPHANED record is the pedigree of our parent
356 * @note It's assumed that replay_record_orphaned() has already checked that
357 * we're recording a log and that the record/replay functionality has not
360 * @param w The worker continuing noting that it has been orphaned.
362 void replay_record_orphaned_internal(__cilkrts_worker
*w
)
364 // Record steal: ORPHANED pedigree self
365 write_to_replay_log (w
, PED_TYPE_STR_ORPHANED
, w
->pedigree
.parent
);
369 * Attempt to match a SYNC record. We have a match when this worker was
370 * recorded returning from the current call to __cilkrts_sync() with the
371 * same pedigree and this was the worker that continued from the sync, since
372 * it was the last to sync.
374 * If we find a match, the caller is expected to stall it is the last worker
375 * to reach a sync so it will be the worker to continue from the sync.
377 * @note It's assumed that replay_match_sync_pedigree() has already returned
378 * if we're not replaying a log, or if record/replay functionality has
381 * @param w The worker we're checking to see if we've got a match
383 int replay_match_sync_pedigree_internal(__cilkrts_worker
*w
)
385 // Return true if we have a match
386 if (w
->l
->replay_list_entry
->match(ped_type_sync
, &w
->pedigree
))
393 * Advance to the next log entry from a SYNC record. Consume the current
394 * SYNC record on this worker and advance to the next one.
396 * @note It's assumed that replay_advance_from_sync() has already returned if
397 * we're not replaying a log, or if record/replay functionality has been
400 * @param w The worker whose replay log we're advancing.
402 void replay_advance_from_sync_internal (__cilkrts_worker
*w
)
404 // The current replay entry must be a SYNC
405 CILK_ASSERT(ped_type_sync
== w
->l
->replay_list_entry
->m_type
);
407 // Advance to the next entry
408 w
->l
->replay_list_entry
= w
->l
->replay_list_entry
->next_entry();
412 * Called from random_steal() to override the ID of the randomly chosen victim
413 * worker which this worker will attempt to steal from. Returns the worker id
414 * of the next victim this worker was recorded stealing from, or -1 if the
415 * next record in the log is not a STEAL.
417 * @note This call does NOT attempt to match the pedigree. That will be done
418 * by replay_match_victim_pedigree() after random_steal() has locked the victim
421 * @param w The __cilkrts_worker we're executing on. The worker's replay log
422 * is checked for a STEAL record. If we've got one, the stolen worker ID is
425 * @return -1 if the next record is not a STEAL
426 * @return recorded stolen worker ID if we've got a matching STEAL record
428 int replay_get_next_recorded_victim_internal(__cilkrts_worker
*w
)
430 // If the next record isn't a STEAL, abort the attempt to steal work
431 if (ped_type_steal
!= w
->l
->replay_list_entry
->m_type
)
434 // Return the victim's worker ID from the STEAL record. We'll check
435 // the pedigree after random_steal has locked the victim worker.
436 return w
->l
->replay_list_entry
->m_value
;
440 * Called from random_steal() to determine if we have a STEAL record that
441 * matches the pedigree at the head of the victim worker. If we do have a
442 * match, the STEAL record is consumed.
444 * @note It's assumed that replay_match_victim_pedigree() has already returned if
445 * we're not replaying a log, or if record/replay functionality has been
448 * @return 1 if we have a match
449 * @return 0 if the current replay record isn't a STEAL record, or the victim
450 * isn't correct, or the pedigree doesn't match.
452 int replay_match_victim_pedigree_internal(__cilkrts_worker
*w
, __cilkrts_worker
*victim
)
454 // If we don't have a match, return 0
455 if (! w
->l
->replay_list_entry
->match(ped_type_steal
,
456 &((*victim
->head
)->parent_pedigree
),
460 // Consume this entry
461 w
->l
->replay_list_entry
= w
->l
->replay_list_entry
->next_entry();
468 * If the frame we're about to return to was recorded as being stolen,
471 * @note It's assumed that replay_wait_for_steal_if_parent_was_stolen() has
472 * already returned if we're not replaying a log, or if record/replay
473 * functionality has been compiled out.
475 * @param w The worker we're executing on.
477 void replay_wait_for_steal_if_parent_was_stolen_internal(__cilkrts_worker
*w
)
479 // If our parent wasn't recorded orphanen, return now
480 if (! w
->l
->replay_list_entry
->match (ped_type_orphaned
,
484 // Stall until our parent is stolen. Note that we're comparing head
485 // and tail, not head and exc. The steal is not completed until tail
487 while (!((w
->tail
- 1) < w
->head
))
491 w
->l
->replay_list_entry
= w
->l
->replay_list_entry
->next_entry();
495 * Allocate memory for the list of logged events.
497 * This function will read through the file and count the number of records
498 * so it can estimate how big a buffer to allocate for the array or replay
499 * entries. It will then rewind the file to the beginning so it can be
500 * loaded into memory.
502 * @param w The worker we're loading the file for.
503 * @param f The file of replay data we're scanning.
506 void allocate_replay_list(__cilkrts_worker
*w
, FILE *f
)
508 // Count the number of entries - yeah, it's a hack, but it lets me
509 // allocate the space all at once instead of in chunks
511 int entries
= 1; // Include "LAST" node
515 if (fgets(buf
, 1024, f
))
517 // Skip the Workers record - should only be in file for Worker 0
518 if (0 != strncmp(PED_TYPE_STR_WORKERS
, buf
, sizeof(PED_TYPE_STR_WORKERS
)-1))
523 w
->l
->replay_list_root
=
524 (replay_entry_t
*)__cilkrts_malloc(entries
* sizeof(replay_entry_t
));
525 w
->l
->replay_list_root
[entries
- 1].m_type
= ped_type_last
;
527 // Reset the file to the beginning
532 * Load the replay log for a worker into memory.
534 * @param w The worker we're loading the replay for.
537 void load_recorded_log(__cilkrts_worker
*w
)
539 char ped_type
[PED_TYPE_SIZE
];
540 char ped_str
[PEDIGREE_BUFF_SIZE
];
541 int32_t i1
= -1, i2
= -1;
543 char local_replay_file_name
[512];
546 // Open the log for reading
547 sprintf(local_replay_file_name
, "%s%d.cilklog", w
->g
->record_replay_file_name
, w
->self
);
548 f
= fopen(local_replay_file_name
, "r");
550 // Make sure we found a log!
551 CILK_ASSERT (NULL
!= f
);
553 // Initialize the replay_list
554 allocate_replay_list(w
, f
);
555 replay_entry_t
*entry
= w
->l
->replay_list_root
;
557 // Read the data out and add it to our tables
560 #ifndef INCLUDE_SEQUENCE_NUMBER
561 fret
= fscanf(f
, "%s %s %d %d\n", ped_type
, ped_str
, &i1
, &i2
);
565 // We must have read 4 fields
566 CILK_ASSERT(4 == fret
);
569 fret
= fscanf(f
, "%s %s %d %d %d\n", ped_type
, ped_str
,
570 &i1
, &i2
, &write_num
);
574 // We must have read 5 fields
575 CILK_ASSERT(5 == fret
);
576 #endif // INCLUDE_SEQUENCE_NUMBER
578 // Load the data into the entry
579 if (0 == strcmp(ped_type
, PED_TYPE_STR_WORKERS
))
581 // Verify we're replaying with the same number of workers we recorded with
584 // Fatal error - does not return
585 cilkos_error("Cannot continue replay: number of workers(%d) doesn't match "
586 "that from the recording(%d).\n", w
->g
->P
, i1
);
589 // Verify that we understand this version of the pedigree file
590 if (PED_VERSION
!= i2
)
592 // Fatal error - does not return
593 cilkos_error("Pedigree file version %d doesn't match current "
594 "version %d - cannot continue.\n",
600 entry
->load(ped_type
, ped_str
, i1
, i2
);
605 // Make sure we've filled the allocated memory. We initialized the last
607 CILK_ASSERT(ped_type_last
== entry
->m_type
);
608 w
->l
->replay_list_entry
= w
->l
->replay_list_root
;
610 // Close the log and return
615 * Scan a recorded log to match STEALs againsted ORPHANED records.
617 * @param g Cilk Runtime global state. Passed to access the worker array so
618 * we can scan a worker's ORPHANED entries for one that matches a STEAL entry.
619 * @param entry The root of a replay_list for a worker.
622 void scan_for_matching_steals(global_state_t
*g
, replay_entry_t
*entry
)
624 // Iterate over all of the entries
625 while (ped_type_last
!= entry
->m_type
)
627 // Look for STEALs. That will tell us which worker the frame was
629 if (ped_type_steal
== entry
->m_type
)
633 // Validate the worker ID and make sure we've got a list
634 CILK_ASSERT((entry
->m_value
>= 0) && (entry
->m_value
< g
->total_workers
));
635 replay_entry_t
*victim_entry
= g
->workers
[entry
->m_value
]->l
->replay_list_root
;
636 CILK_ASSERT(NULL
!= victim_entry
);
638 // Scan the victim's list for the matching ORPHANED record
639 while ((ped_type_last
!= victim_entry
->m_type
) && ! found
)
641 if (ped_type_orphaned
== victim_entry
->m_type
)
643 if (entry
->m_pedigree_len
== victim_entry
->m_pedigree_len
)
645 if (0 == memcmp(entry
->m_reverse_pedigree
,
646 victim_entry
->m_reverse_pedigree
,
647 entry
->m_pedigree_len
* sizeof(int64_t)))
649 // Note that this ORPHANED record has a matching steal
650 victim_entry
->m_value
= 0;
664 * Initialize per-worker data for record or replay - See record-replay.h
665 * for full routine header.
667 void replay_init_workers(global_state_t
*g
)
670 char worker_file_name
[512];
672 // If we're not recording or replaying a log, we're done. All of the
673 // fields in the global_state_t or local_state_t are already initialized
674 // to default values.
675 if (RECORD_REPLAY_NONE
== g
->record_or_replay
)
678 // If we're replaying a log, read each worker's log and construct the
680 if (REPLAY_LOG
== g
->record_or_replay
)
682 // Read all of the data
683 for (i
= 0; i
< g
->total_workers
; ++i
)
685 // This function will also initialize and fill the worker's
687 load_recorded_log(g
->workers
[i
]);
690 // Scan for orphans with no matching steal. Mark them so they'll be
691 // skipped as we advance through the log.
692 for (i
= 0; i
< g
->total_workers
; ++i
)
694 scan_for_matching_steals(g
, g
->workers
[i
]->l
->replay_list_root
);
697 // If we're recording the logs while replaying, create the log files.
698 // This will only be used for debugging. Create the logs in the
699 // current directory. It should be as good a place as any...
701 for(i
= 0; i
< g
->total_workers
; ++i
)
703 __cilkrts_worker
*w
= g
->workers
[i
];
704 sprintf(worker_file_name
, "replay_log_%d.cilklog", w
->self
);
705 w
->l
->record_replay_fptr
= fopen(worker_file_name
, "w+");
706 CILK_ASSERT(NULL
!= w
->l
->record_replay_fptr
);
709 // Record the number of workers, file version in Worker 0's file
710 write_to_replay_log (g
->workers
[0], PED_TYPE_STR_WORKERS
, NULL
, g
->P
, PED_VERSION
);
711 #endif // RECORD_ON_REPLAY
714 // If we're recording, create the log files
715 if (RECORD_LOG
== g
->record_or_replay
)
717 for(i
= 0; i
< g
->total_workers
; ++i
)
719 __cilkrts_worker
*w
= g
->workers
[i
];
720 sprintf(worker_file_name
, "%s%d.cilklog",
721 g
->record_replay_file_name
,
723 w
->l
->record_replay_fptr
= fopen(worker_file_name
, "w+");
724 CILK_ASSERT(NULL
!= w
->l
->record_replay_fptr
);
727 // Record the number of workers, file version in Worker 0's file
728 write_to_replay_log (g
->workers
[0], PED_TYPE_STR_WORKERS
, NULL
, g
->P
, PED_VERSION
);
733 * Do any necessary cleanup for the logs - See record-replay.h for full
736 void replay_term(global_state_t
*g
)
738 // Free memory for the record/replay log file name, if we've got one
739 if (g
->record_replay_file_name
)
740 __cilkrts_free(g
->record_replay_file_name
);
742 // Per-worker cleanup
743 for(int i
= 0; i
< g
->total_workers
; ++i
)
745 __cilkrts_worker
*w
= g
->workers
[i
];
747 // Close the log files, if we've opened them
748 if(w
->l
->record_replay_fptr
)
749 fclose(w
->l
->record_replay_fptr
);
751 if (w
->l
->replay_list_root
)
753 // We should have consumed the entire list
754 CILK_ASSERT(ped_type_last
== w
->l
->replay_list_entry
->m_type
);
756 replay_entry_t
*entry
= w
->l
->replay_list_root
;
757 while (ped_type_last
!= entry
->m_type
)
759 // Free the pedigree memory for each entry
763 __cilkrts_free(w
->l
->replay_list_root
);
764 w
->l
->replay_list_root
= NULL
;
765 w
->l
->replay_list_entry
= NULL
;
770 __CILKRTS_END_EXTERN_C