2 Charm++ File: Checkpoint Library
3 added 01/03/2003 by Chao Huang, chuang10@uiuc.edu
5 More documentation goes here...
6 --- Updated 12/14/2003 by Gengbin, gzheng@uiuc.edu
7 see ckcheckpoint.h for change log
14 using std::ostringstream;
18 #include "ckcheckpoint.h"
19 #include "CkCheckpoint.decl.h"
// Variadic function with a printf-style signature; its body is not visible
// in this excerpt.
// NOTE(review): presumably the do-nothing target that DEBCHK maps to when
// debug tracing is disabled -- confirm against the DEBCHK definition.
21 void noopit(const char*, ...)
// Enable this define to send DEBCHK traces through CkPrintf.
24 //#define DEBCHK CkPrintf
// Group ID of the system checkpoint manager; assigned in CkCheckpointInit's
// constructor and used by CkStartCheckpoint to address the manager group.
30 CkGroupID _sysChkptMgr;
// Per-group header record. CkPupPerPlaceData fills in gID, present, MigCtor
// and name for every group and PUPs these records ahead of the group objects.
32 typedef struct _GroupInfo{
39 PUPmarshall(GroupInfo)
// Set to 1 by rank 0 in CkRestartMain's plain-chare restore path.
44 int _chareRestored = 0;
// File-scope timestamp read by CkResumeRestartMain when it reports elapsed
// time. CkCheckpointMgr declares a member with the same name, which is the
// one used inside the manager's own methods.
45 double chkptStartTimer = 0;
// NOTE(review): not referenced in the visible part of this file.
47 int originalnumGroups = -1;
// Defined elsewhere; CkResumeRestartMain uses them to decide whether, and
// from which old PE's file, array elements are restored.
48 extern int Cmi_isOldProcess;
49 extern int Cmi_myoldpe;
// NOTE(review): not referenced in the visible part of this file.
50 extern char *_shrinkexpand_basedir;
// Forward declaration; CkPupChareData calls this with a chare's migration
// constructor index when recreating plain chares on restart.
53 void CkCreateLocalChare(int epIdx, envelope *env);
55 // helper class to get number of array elements
// Each addLocation() callback increments the counter, so after a location
// manager's iterate() has been run with this object, getCount() is the
// number of locations that were visited.
56 class ElementCounter : public CkLocIterator {
// Start counting from zero.
60 ElementCounter():count(0){};
// The location itself is ignored; only the number of callbacks matters.
61 void addLocation(CkLocation &loc) { count++; }
62 int getCount() { return count; }
65 // helper class to pup all elements that belong to same ckLocMgr
// Bound to one location manager and one PUP::er at construction; each
// addLocation() callback serializes data for that location into the PUP::er.
66 class ElementCheckpointer : public CkLocIterator {
71 ElementCheckpointer(CkLocMgr* mgr_, PUP::er &p_):locMgr(mgr_),p(p_){};
72 void addLocation(CkLocation &loc) {
// Gather the identifiers for this element: its array index, the group ID of
// the owning location manager, and the location's ID.
73 CkArrayIndex idx=loc.getIndex();
74 CkGroupID gID = locMgr->ckGetGroupID();
75 CmiUInt8 id = loc.getID();
// The group ID is the first item written for each element; what follows it
// in the stream is not visible in this excerpt.
76 p|gID; // store loc mgr's GID as well for easier restore
80 //CkPrintf("[%d] addLocation: ", CkMyPe()), idx.print();
// Defined elsewhere; CkRestartMain calls it on rank 0 once restore is done.
85 extern void _initDone();
// Packs every entry of _readonlyTable into a single RODataMsg envelope and
// broadcasts it with the _roRestartHandlerIdx handler. The table is walked
// twice: once through `ps` to learn the packed size, and once through a
// PUP::toMem that writes into the freshly allocated envelope.
87 static void bdcastRO(void){
89 // Determine the size of the RODataMessage
// `ps` is declared on a line not visible here; NOTE(review): presumably a
// PUP::sizer, since only ps.size() is consumed afterwards.
91 for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(ps);
93 // Allocate and fill out the RODataMessage
94 envelope *env = _allocEnv(RODataMsg, ps.size());
95 PUP::toMem pp((char *)EnvToUsr(env));
96 for(i=0;i<_readonlyTable.size();i++) _readonlyTable[i]->pupData(pp);
// Stamp the envelope with the incremented init-message count and this PE as
// the source before handing it to Converse.
98 env->setCount(++_numInitMsgs);
99 env->setSrcPe(CkMyPe());
100 CmiSetHandler(env, _roRestartHandlerIdx);
// The "AndFree" broadcast variant takes the buffer; env is not used after
// this call in the visible code.
101 CmiSyncBroadcastAndFree(env->getTotalsize(), (char *)env);
104 #if CMK_SHRINK_EXPAND
// Builds one raw Converse message and broadcasts it with the
// _ROGroupRestartHandlerIdx handler. Message layout, as written below:
//   [Converse header][int: ps.size()][int: ps1.size()][packed payload]
// CkResumeRestartMain reads the payload starting at the same offset
// (header + 2*sizeof(int)).
105 static void bdcastROGroupData(void){
107 //Determine the size of the RODataMessage
// `ps` and `ps1` are declared and filled on lines not visible here.
// NOTE(review): ROSize and GroupSize are not used in the visible lines; the
// code below calls ps.size()/ps1.size() directly.
110 int ROSize = ps.size();
113 int GroupSize = ps1.size();
115 char *msg = (char *)CmiAlloc(CmiMsgHeaderSizeBytes + 2*sizeof(int) + ps.size() + ps1.size());
116 char *payloadOffset = msg + CmiMsgHeaderSizeBytes;
118 // how much data to send
// The two sizes are stored as native ints directly after the header.
119 *(int*)payloadOffset = ps.size();
120 payloadOffset += sizeof(int);
121 *(int*)payloadOffset = ps1.size();
122 payloadOffset += sizeof(int);
124 //Allocate and fill out the RODataMessage
125 PUP::toMem pp((char *)payloadOffset);
130 CmiSetHandler(msg, _ROGroupRestartHandlerIdx);
// The sent length is derived from what was actually packed (pp.size()), not
// from the allocation size computed above.
131 CmiSyncBroadcastAllAndFree(CmiMsgHeaderSizeBytes + 2*sizeof(int) + pp.size(), msg);
135 // Print out an array index to this string as decimal fields
136 // separated by underscores.
// idx  : index whose nInts integer fields are formatted.
// dest : caller-supplied output buffer.
// NOTE(review): sprintf is unbounded, so dest must be large enough for all
// fields; no size is passed in. The statement that advances dest between
// iterations is not visible in this excerpt -- confirm it exists, otherwise
// each field would overwrite the previous one.
137 void printIndex(const CkArrayIndex &idx,char *dest) {
138 const int *idxData=idx.data();
139 for (int i=0;i<idx.nInts;i++) {
// First field has no prefix; every later field is preceded by "_".
140 sprintf(dest,"%s%d",i==0?"":"_", idxData[i]);
// Forward declaration; defined further down and asserted to run on PE 0.
145 static bool checkpointOne(const char* dirname, CkCallback& cb, bool requestStatus);
// Appends "/part-<partition number>/" to `path` when the job has more than
// one partition; with a single partition `path` is left untouched.
147 static void addPartitionDirectory(ostringstream &path) {
148 if (CmiNumPartitions() > 1) {
149 path << "/part-" << CmiMyPartition() << '/';
// Builds a checkpoint file path under `dirname` (plus the partition
// sub-directory, if any) and opens it with CmiFopen using `mode`.
// dirname  : checkpoint directory.
// basename : logical file name, e.g. "RO", "Groups", "arr".
// mode     : fopen-style mode string; callers in this file pass "wb" or "rb".
// id       : defaults to -1. Callers pass -1 for the global RO/MainChares
//            files and a PE or node number for per-PE/per-node files.
//            NOTE(review): how basename and id are combined into the file
//            name is on lines not visible here.
// On the failure path the function aborts via CkAbort with a message naming
// the PE, the path, the mode and strerror(errno).
153 static FILE* openCheckpointFile(const char *dirname, const char *basename,
154 const char *mode, int id = -1) {
156 out << dirname << '/';
157 addPartitionDirectory(out);
163 FILE *fp = CmiFopen(out.str().c_str(), mode);
// NOTE(review): the condition guarding this error path is not visible;
// presumably it is taken when fp is NULL.
166 error << "PE " << CkMyPe() << " failed to open checkpoint file: " << out.str()
167 << ", mode: " << mode << " status: " << strerror(errno);
168 CkAbort(error.str().c_str());
174 * There is only one Checkpoint Manager in the whole system
// Group that performs the on-disk checkpoint. CkStartCheckpoint invokes
// Checkpoint() through the group proxy; SendRestartCB() is the target of the
// barrier joined at the end of Checkpoint().
// NOTE(review): Checkpoint() and SendRestartCB() also use members named
// requestStatus and chkpStatus whose declarations are not visible here.
176 class CkCheckpointMgr : public CBase_CkCheckpointMgr {
// Callback associated with a checkpoint; it is the only state this class
// PUPs (see pup() below).
178 CkCallback restartCB;
// Wall-clock time at which the current checkpoint started.
179 double chkptStartTimer;
183 CkCheckpointMgr() { }
// Migration constructor; forwards the message to the generated base class.
184 CkCheckpointMgr(CkMigrateMessage *m):CBase_CkCheckpointMgr(m) { }
185 void Checkpoint(const char *dirname,CkCallback& cb, bool requestStatus = false);
186 void SendRestartCB(void);
187 void pup(PUP::er& p){ p|restartCB; }
// Writes this PE's share of a checkpoint under `dirname`.
// dirname        : checkpoint directory.
// cb             : callback passed on to checkpointOne.
// _requestStatus : stored in the requestStatus member.
// Visible steps: record the start time, create the partition sub-directory
// when there are several partitions, call checkpointOne, write the per-PE
// Chares/Groups/arr files and the per-node NodeGroups file, record the
// overall status in chkpStatus, then join a barrier whose callback is
// SendRestartCB.
// NOTE(review): checkpointOne asserts CkMyPe()==0; the guard around its two
// call sites below is not visible in this excerpt.
191 void CkCheckpointMgr::Checkpoint(const char *dirname, CkCallback& cb, bool _requestStatus){
192 chkptStartTimer = CmiWallTimer();
193 requestStatus = _requestStatus;
194 // make dir on all PEs in case it is a local directory
197 if (CmiNumPartitions() > 1) {
198 ostringstream partDir;
200 addPartitionDirectory(partDir);
201 CmiMkdir(partDir.str().c_str());
205 #if CMK_SHRINK_EXPAND
206 if (pending_realloc_state == REALLOC_IN_PROGRESS) {
207 // After restarting from this AtSync checkpoint, resume execution along the
208 // normal path (i.e. whatever the user defined as ResumeFromSync.)
// In this case the caller's cb is replaced by a callback to
// LBDatabase::ResumeClients on the _lbdb group.
209 CkCallback resumeFromSyncCB(CkIndex_LBDatabase::ResumeClients(), _lbdb);
210 success &= checkpointOne(dirname, resumeFromSyncCB, requestStatus);
214 success &= checkpointOne(dirname, cb, requestStatus);
218 #ifndef CMK_CHARE_USE_PTR
219 // save plain singleton chares into Chares.dat
220 FILE* fChares = openCheckpointFile(dirname, "Chares", "wb", CkMyPe());
221 PUP::toDisk pChares(fChares);
222 CkPupChareData(pChares);
// Both the PUP::er error flag and the close result are checked; the
// statements executed on failure are not visible here.
223 if(pChares.checkError())
225 if(CmiFclose(fChares)!=0)
229 // save groups into Groups.dat
230 // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
231 FILE* fGroups = openCheckpointFile(dirname, "Groups", "wb", CkMyPe());
232 PUP::toDisk pGroups(fGroups);
233 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
234 CkPupGroupData(pGroups, true);
236 CkPupGroupData(pGroups);
238 if(pGroups.checkError())
240 if(CmiFclose(fGroups)!=0)
243 // save nodegroups into NodeGroups.dat
244 // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
// Only rank 0 of each node writes the node-group file, and the file is
// keyed by node number rather than PE number.
245 if (CkMyRank() == 0) {
246 FILE* fNodeGroups = openCheckpointFile(dirname, "NodeGroups", "wb", CkMyNode());
247 PUP::toDisk pNodeGroups(fNodeGroups);
248 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
249 CkPupNodeGroupData(pNodeGroups, true);
251 CkPupNodeGroupData(pNodeGroups);
253 if(pNodeGroups.checkError())
255 if(CmiFclose(fNodeGroups)!=0)
259 //DEBCHK("[%d]CkCheckpointMgr::Checkpoint called dirname={%s}\n",CkMyPe(),dirname);
// Array elements resident on this PE go into this PE's "arr" file.
260 FILE *datFile = openCheckpointFile(dirname, "arr", "wb", CkMyPe());
261 PUP::toDisk p(datFile);
262 CkPupArrayElementsData(p);
265 if(CmiFclose(datFile)!=0)
268 #if ! CMK_DISABLE_SYNC
269 #if CMK_HAS_SYNC_FUNC
// Collapse the accumulated success flag into the status later reported by
// SendRestartCB.
275 chkpStatus = success?CK_CHECKPOINT_SUCCESS:CK_CHECKPOINT_FAILURE;
277 DEBCHK("[%d]restartCB installed\n",CkMyPe());
279 // Use barrier instead of contribute here:
280 // barrier is stateless and multiple calls to it do not overlap.
281 barrier(CkCallback(CkReductionTarget(CkCheckpointMgr, SendRestartCB), 0, thisgroup));
// Barrier target registered at the end of Checkpoint(). Reports how long the
// checkpoint took (measured from the chkptStartTimer set in Checkpoint()) and
// builds a status message from chkpStatus.
// NOTE(review): the lines that decide when the message is built and that
// actually send the callback are not visible in this excerpt.
284 void CkCheckpointMgr::SendRestartCB(void){
285 DEBCHK("[%d]Sending out the cb\n",CkMyPe());
286 CkPrintf("Checkpoint to disk finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
289 CkCheckpointStatusMsg * m = new CkCheckpointStatusMsg(chkpStatus);
// PUPs all readonly variables and then all readonly messages through `p`.
// The counts are taken from the live tables when not unpacking. When
// unpacking, the readonly count must equal the current _readonlyTable size,
// otherwise the run aborts.
// NOTE(review): the statements that PUP the two counts are not visible here.
296 void CkPupROData(PUP::er &p)
298 int _numReadonlies = 0;
299 int _numReadonlyMsgs = 0;
300 if (!p.isUnpacking()) _numReadonlies=_readonlyTable.size();
304 if (p.isUnpacking()) {
// A mismatch means the set of readonlies changed since the checkpoint.
305 if (_numReadonlies != _readonlyTable.size())
306 CkAbort("You cannot add readonlies and restore from checkpoint...");
308 for(int i=0;i<_numReadonlies;i++) _readonlyTable[i]->pupData(p);
309 if (!p.isUnpacking()) _numReadonlyMsgs=_readonlyMsgs.size();
311 for(int i=0;i<_numReadonlyMsgs; i++){
312 ReadonlyMsgInfo *c = _readonlyMsgs[i];
313 CkPupMessage(p,c->pMsg);
// Walks every entry of _mainTable and handles those whose chare type has a
// migration constructor (getMigCtor() != -1).
// p    : PUP::er; direction decides between restore and save.
// args : passed to the migration constructor when unpacking.
// Unpacking: raw storage of entry->size bytes is malloc'ed, registered with
// setObj(), and the migration constructor is invoked on it.
// Otherwise: the existing object is fetched with getObj().
// NOTE(review): the statement that PUPs the object itself is not visible in
// this excerpt.
318 void CkPupMainChareData(PUP::er &p, CkArgMsg *args)
320 int nMains=_mainTable.size();
321 DEBCHK("[%d] CkPupMainChareData %s: nMains = %d\n", CkMyPe(),p.typeString(),nMains);
322 for(int i=0;i<nMains;i++){ /* Create all mainchares */
323 ChareInfo *entry = _chareTable[_mainTable[i]->chareIdx];
324 int entryMigCtor = entry->getMigCtor();
325 if(entryMigCtor!=-1) {
327 if (p.isUnpacking()) {
328 int size = entry->size;
329 DEBCHK("MainChare PUP'ed: name = %s, idx = %d, size = %d\n", entry->name, i, size);
330 obj = (Chare*)malloc(size);
332 _mainTable[i]->setObj(obj);
333 //void *m = CkAllocSysMsg();
334 _entryTable[entryMigCtor]->call(args, obj);
337 obj = (Chare *)_mainTable[i]->getObj();
341 // to update mainchare proxy
342 // only readonly variables of Chare Proxy are taken care of here;
343 // in general, if chare proxy is contained in some data structure,
344 // such as CkCallback, it is user's responsibility to
345 // update them after restarting
346 #if !CMK_SHRINK_EXPAND
// NOTE(review): the guarded statement is not visible; a comment in
// CkRestartMain says the bdcastRO() call was moved into this function, so
// presumably that is what runs here on PE 0 -- confirm.
347 if (p.isUnpacking() && CkMyPe()==0)
353 #ifndef CMK_CHARE_USE_PTR
// Per-PE tables of plain chares, their type indices, and VidBlocks; declared
// elsewhere and walked by CkPupChareData below.
355 CkpvExtern(CkVec<void *>, chare_objs);
356 CkpvExtern(CkVec<int>, chare_types);
357 CkpvExtern(CkVec<VidBlock *>, vidblocks);
359 // handle plain non-migratable chare
// Two passes: first over chare_objs, then over vidblocks. In each pass the
// element count comes from the live table when not unpacking, and a
// per-element pup_flag is derived from whether the slot is NULL.
// When unpacking a chare, its migration constructor is looked up in
// _chareTable and the object is recreated with CkCreateLocalChare.
// NOTE(review): the statements that PUP the counts, types, flags and objects
// are not visible in this excerpt.
360 void CkPupChareData(PUP::er &p)
363 if (!p.isUnpacking()) n = CkpvAccess(chare_objs).size();
365 for (i=0; i<n; i++) {
367 if (!p.isUnpacking()) {
368 chare_type = CkpvAccess(chare_types)[i];
371 bool pup_flag = true;
372 if (!p.isUnpacking()) {
373 if(CkpvAccess(chare_objs)[i] == NULL){
380 if (p.isUnpacking()) {
381 int migCtor = _chareTable[chare_type]->migCtor;
// Message text for the case where the chare type cannot be restored; the
// condition and the use of buf are on lines not visible here.
384 sprintf(buf,"Chare %s needs a migration constructor and PUP'er routine for restart.\n", _chareTable[chare_type]->name);
387 void *m = CkAllocSysMsg();
388 envelope* env = UsrToEnv((CkMessage *)m);
389 CkCreateLocalChare(migCtor, env);
392 Chare *obj = (Chare*)CkpvAccess(chare_objs)[i];
397 CkpvAccess(chare_objs)[i] = NULL;
// Second pass: VidBlocks.
401 if (!p.isUnpacking()) n = CkpvAccess(vidblocks).size();
403 for (i=0; i<n; i++) {
405 bool pup_flag = true;
406 if (!p.isUnpacking()) {
407 if(CkpvAccess(vidblocks)[i]==NULL)
415 if (p.isUnpacking()) {
// On restore the block is appended to the table; on save the existing entry
// at index i is used.
417 CkpvAccess(vidblocks).push_back(v);
420 v = CkpvAccess(vidblocks)[i];
// Second definition with the same signature; the preprocessor line selecting
// it and its body are not visible in this excerpt.
427 void CkPupChareData(PUP::er &p)
// Signature shared by the group and node-group creation routines passed to
// CkPupPerPlaceData.
433 typedef void GroupCreationFn(CkGroupID groupID, int constructorIdx, envelope *env);
// Common implementation behind CkPupGroupData and CkPupNodeGroupData.
// p                   : PUP::er; direction decides between save and restore.
// idTable             : ordered list of group IDs.
// objectTable         : maps a group ID to its table entry / object.
// numObjects          : group counter; set to numGroups+1 when unpacking.
// constructionMsgType : message type stamped on the envelope handed to
//                       creationFn.
// creationFn          : creates the local (node)group from its migration
//                       constructor.
// Stream order as visible here: a GroupInfo record per group, then each
// present group's own virtual_pup().
435 static void CkPupPerPlaceData(PUP::er &p, GroupIDTable *idTable, GroupTable *objectTable,
436 unsigned int &numObjects, int constructionMsgType,
437 GroupCreationFn creationFn
438 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
443 int numGroups = 0, i;
445 if (!p.isUnpacking()) {
446 numGroups = idTable->size();
449 if (p.isUnpacking()) {
451 numObjects = numGroups+1;
455 DEBCHK("[%d] CkPupPerPlaceData %s: numGroups = %d\n", CkMyPe(),p.typeString(),numGroups);
// NOTE(review): the matching delete[] for tmpInfo is not visible in this
// excerpt -- confirm the array is released before returning.
457 GroupInfo *tmpInfo = new GroupInfo [numGroups];
458 if (!p.isUnpacking()) {
// Saving: describe every group (ID, whether an object exists, migration
// constructor index, chare type name).
459 for (i = 0; i < numGroups; i++) {
460 tmpInfo[i].gID = (*idTable)[i];
461 TableEntry ent = objectTable->find(tmpInfo[i].gID);
462 tmpInfo[i].present = ent.getObj() != NULL;
463 tmpInfo[i].MigCtor = _chareTable[ent.getcIdx()]->migCtor;
// NOTE(review): strncpy does not NUL-terminate when the source has 255 or
// more characters; no terminating store is visible here -- confirm.
464 strncpy(tmpInfo[i].name,_chareTable[ent.getcIdx()]->name,255);
465 //CkPrintf("[%d] CkPupPerPlaceData: %s group %s \n", CkMyPe(), p.typeString(), tmpInfo[i].name);
467 if(tmpInfo[i].MigCtor==-1) {
469 sprintf(buf,"(Node)Group %s needs a migration constructor and PUP'er routine for restart.\n", tmpInfo[i].name);
474 for (i = 0; i < numGroups; i++) p|tmpInfo[i];
476 for (i = 0; i < numGroups; i++)
// Groups with no object on this PE/node are handled separately; the
// statement taken in that case is not visible here.
478 if (!tmpInfo[i].present)
481 CkGroupID gID = tmpInfo[i].gID;
482 if (p.isUnpacking()) {
483 int eIdx = tmpInfo[i].MigCtor;
485 CkPrintf("[%d] ERROR> (Node)Group %s's migration constructor is not defined!\n", CkMyPe(), tmpInfo[i].name);
// Recreate the group locally: a system message provides the envelope that
// the creation routine expects.
488 void *m = CkAllocSysMsg();
489 envelope* env = UsrToEnv((CkMessage *)m);
490 env->setMsgtype(constructionMsgType);
492 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
496 creationFn(gID, eIdx, env);
498 } // end of unPacking
499 IrrGroup *gobj = objectTable->find(gID).getObj();
501 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
502 if(creationFn == CkCreateLocalGroup && !create)
504 gobj->mlogData->teamRecoveryFlag = 1;
508 // if using migration constructor, you'd better have a pup
509 gobj->virtual_pup(p);
// PUPs this PE's groups by delegating to CkPupPerPlaceData with the per-PE
// group ID table, group table and group counter, the BocInitMsg message type
// and CkCreateLocalGroup as the creation routine. An extra parameter exists
// only in the _FAULT_MLOG_/_FAULT_CAUSAL_ builds (not visible here).
515 void CkPupGroupData(PUP::er &p
516 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
521 CkPupPerPlaceData(p, CkpvAccess(_groupIDTable), CkpvAccess(_groupTable),
522 CkpvAccess(_numGroups), BocInitMsg, &CkCreateLocalGroup
523 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
// Node-level counterpart of CkPupGroupData: delegates to CkPupPerPlaceData
// with the node-group ID table, node-group table and node-group counter
// (Cksv variables), the NodeBocInitMsg message type and
// CkCreateLocalNodeGroup as the creation routine. An extra parameter exists
// only in the _FAULT_MLOG_/_FAULT_CAUSAL_ builds (not visible here).
529 void CkPupNodeGroupData(PUP::er &p
530 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
535 CkPupPerPlaceData(p, &CksvAccess(_nodeGroupIDTable),
536 CksvAccess(_nodeGroupTable), CksvAccess(_numNodeGroups),
537 NodeBocInitMsg, &CkCreateLocalNodeGroup
538 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
544 // handle chare array elements for this processor
// p               : PUP::er; direction decides between save and restore.
// notifyListeners : when nonzero, elements are restored through
//                   CkLocMgr::resume(); otherwise through CkLocMgr::restore().
// Saving: the number of elements is obtained with an ElementCounter, then an
// ElementCheckpointer per location manager writes every element.
// Restoring: the function loops numElements times, looks up the owning
// location manager by group ID and lets it rebuild the element from `p`.
// Afterwards groups are visited and ckJustMigrated() is called.
// NOTE(review): the reads of gID/idx/id from `p`, and the condition guarding
// ckJustMigrated(), are on lines not visible in this excerpt.
545 void CkPupArrayElementsData(PUP::er &p, int notifyListeners)
548 // safe in both packing/unpacking at this stage
549 int numGroups = CkpvAccess(_groupIDTable)->size();
551 // number of array elements on this processor
553 if (!p.isUnpacking()) {
554 ElementCounter counter;
555 CKLOCMGR_LOOP(mgr->iterate(counter););
556 numElements = counter.getCount();
560 DEBCHK("[%d] CkPupArrayElementsData %s numGroups:%d numElements:%d \n",CkMyPe(),p.typeString(), numGroups, numElements);
562 if (!p.isUnpacking())
564 // let CkLocMgr iterate over and store every array element
565 CKLOCMGR_LOOP(ElementCheckpointer chk(mgr, p); mgr->iterate(chk););
568 // loop and create all array elements ourselves
569 //CkPrintf("total chare array cnts: %d\n", numElements);
570 for (int i=0; i<numElements; i++) {
// The group ID stored with each element identifies its location manager.
577 CkLocMgr *mgr = (CkLocMgr*)CkpvAccess(_groupTable)->find(gID).getObj();
578 if (notifyListeners){
579 mgr->resume(idx, id, p, true);
582 mgr->restore(idx, id, p);
588 for(i=0;i<numGroups;i++) {
589 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
591 obj->ckJustMigrated();
// Counts array elements on this PE by running one ElementCounter over the
// location managers visited by CKLOCMGR_LOOP.
// NOTE(review): numGroups is not used by any visible statement; presumably
// the CKLOCMGR_LOOP macro refers to it -- confirm against the macro. The
// return statement is not visible in this excerpt.
596 int CkCountArrayElements(){
597 int numGroups = CkpvAccess(_groupIDTable)->size();
599 ElementCounter counter;
600 CKLOCMGR_LOOP(mgr->iterate(counter););
601 int numElements = counter.getCount();
// PUPs a processor's checkpointable state through a single PUP::er. Visible
// calls, in order: mainchares (with no CkArgMsg), groups, node groups, and
// array elements. The section comments also mention readonlies and plain
// chares; those calls are on lines not visible in this excerpt.
606 void CkPupProcessorData(PUP::er &p)
608 // save readonlys, and callback BTW
613 // save mainchares into MainChares.dat
615 CkPupMainChareData(p, NULL);
618 // save non-migratable chare
622 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
623 CkPupGroupData(p,true);
630 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
631 CkPupNodeGroupData(p,true);
633 CkPupNodeGroupData(p);
637 // pup array elements
638 CkPupArrayElementsData(p);
641 // called only on pe 0
// Writes the two global checkpoint files, "RO" and "MainChares", both opened
// with id -1 (no per-PE number).
// dirname       : checkpoint directory.
// cb            : callback supplied by the caller.
// requestStatus : flag supplied by the caller.
// Returns a bool; Checkpoint() ANDs it into its success flag.
// NOTE(review): what is PUP'ed into the RO file besides the PE/node counts,
// and the statements run on the error paths, are not visible in this excerpt.
642 static bool checkpointOne(const char* dirname, CkCallback& cb, bool requestStatus){
// Enforces the "PE 0 only" contract stated above.
643 CmiAssert(CkMyPe()==0);
646 // save readonlys, and callback BTW
647 FILE* fRO = openCheckpointFile(dirname, "RO", "wb", -1);
648 PUP::toDisk pRO(fRO);
// Current PE and node counts; CkRestartMain compares its own counts against
// variables of the same names when choosing which files to read.
649 int _numPes = CkNumPes();
651 int _numNodes = CkNumNodes();
663 if(CmiFclose(fRO)!=0)
668 // save mainchares into MainChares.dat
670 FILE* fMain = openCheckpointFile(dirname, "MainChares", "wb", -1);
671 PUP::toDisk pMain(fMain);
// No CkArgMsg is needed when saving.
672 CkPupMainChareData(pMain, NULL);
673 if(pMain.checkError())
677 if(CmiFclose(fMain) != 0)
// Calls flushAllRecs() on every location manager visited by CKLOCMGR_LOOP.
// The text after the loop is an older, commented-out version that walked the
// group table by hand and picked out location managers with isLocMgr().
// NOTE(review): numGroups is not used by any visible live statement;
// presumably the CKLOCMGR_LOOP macro refers to it -- confirm.
685 void CkRemoveArrayElements()
688 int numGroups = CkpvAccess(_groupIDTable)->size();
689 CKLOCMGR_LOOP(mgr->flushAllRecs(););
690 /* GroupTable *gTbl = CkpvAccess(_groupTable);
691 for(i=0; i<numGroups; i++){
692 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
693 if(obj->isLocMgr()) {
694 CkLocMgr *mgr = (CkLocMgr *)obj;
// Debug helper: for every group ID on this PE, looks up the group object and
// prints its slot number, its address, and whether it is a location manager.
// NOTE(review): obj is dereferenced (obj->isLocMgr()) immediately after the
// lookup with no NULL check, while other code in this file treats a NULL
// getObj() result as possible -- confirm this cannot crash.
// NOTE(review): gTbl is not used by any visible statement.
701 void CkTestArrayElements()
704 int numGroups = CkpvAccess(_groupIDTable)->size();
705 //CKLOCMGR_LOOP(mgr->flushAllRecs(););
706 GroupTable *gTbl = CkpvAccess(_groupTable);
707 for(i=0; i<numGroups; i++){
708 IrrGroup *obj = CkpvAccess(_groupTable)->find((*CkpvAccess(_groupIDTable))[i]).getObj();
709 CkPrintf("An object at [%d]: %p | isLocMgr: %d\n", i, obj, obj->isLocMgr());
// Public entry point for starting a checkpoint.
// dirname       : directory the checkpoint is written to.
// cb            : callback associated with the checkpoint; must not contain
//                 a pointer, otherwise the run aborts.
// requestStatus : forwarded unchanged to the checkpoint manager.
// After validation and a start-up message, the work is handed to
// CkCheckpointMgr::Checkpoint through the _sysChkptMgr group proxy.
// NOTE(review): the condition guarding the first CkAbort is not visible in
// this excerpt.
714 void CkStartCheckpoint(const char* dirname,const CkCallback& cb, bool requestStatus)
717 CkAbort("callback after checkpoint is not set properly");
719 if(cb.containsPointer())
720 CkAbort("Cannot restart from a callback based on a pointer");
723 CkPrintf("[%d] Checkpoint starting in %s\n", CkMyPe(), dirname);
725 // hand over to checkpoint managers for per-processor checkpointing
726 CProxy_CkCheckpointMgr(_sysChkptMgr).Checkpoint(dirname, cb, requestStatus);
730 * Restart: no restart-manager object is created,
731 * because a group cannot restore itself anyway.
732 * The mechanism exists as Converse code and gets invoked by
// Restores program state from the checkpoint files under `dirname`.
// dirname : checkpoint directory.
// args    : passed to mainchare migration constructors via
//           CkPupMainChareData.
// Visible order of work: readonlies (rank 0 only), mainchares (PE 0 only),
// plain chares (only when the PE count is unchanged), groups, node groups,
// array elements, then _initDone() on rank 0 and a success status message.
// NOTE(review): _numPes and _numNodes are compared against the current
// counts below; the statements that read them from the RO file, and the
// code that sends the final callback, are not visible in this excerpt.
737 void CkRestartMain(const char* dirname, CkArgMsg *args){
741 if (CmiMyRank() == 0) {
// Flag is cleared again near the end of this function.
744 CkMemCheckPT::inRestarting = 1;
748 FILE* fRO = openCheckpointFile(dirname, "RO", "rb", -1);
750 PUP::fromDisk pRO(fRO);
755 if (CmiMyRank() == 0) CkPupROData(pRO);
756 bool requestStatus = false;
759 DEBCHK("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
760 _oldNumPes = _numPes;
764 // restore mainchares
765 FILE* fMain = openCheckpointFile(dirname, "MainChares", "rb");
766 if(fMain && CkMyPe()==0){ // only main chares have been checkpointed, we restart on PE0
767 PUP::fromDisk pMain(fMain);
768 CkPupMainChareData(pMain, args);
770 DEBCHK("[%d]CkRestartMain: mainchares restored\n",CkMyPe());
771 //bdcastRO(); // moved to CkPupMainChareData()
774 #ifndef CMK_CHARE_USE_PTR
775 // restore chares only when number of pes is the same
776 if(CkNumPes() == _numPes) {
777 FILE* fChares = openCheckpointFile(dirname, "Chares", "rb", CkMyPe());
778 PUP::fromDisk pChares(fChares);
779 CkPupChareData(pChares);
781 if (CmiMyRank() == 0) _chareRestored = 1;
786 // content of the file: numGroups, GroupInfo[numGroups], _groupTable(PUP'ed), groups(PUP'ed)
787 // restore from PE0's copy if shrink/expand
// Same PE count: read this PE's own file. Different PE count: every PE reads
// the file that PE 0 wrote.
788 FILE* fGroups = openCheckpointFile(dirname, "Groups", "rb",
789 (CkNumPes() == _numPes) ? CkMyPe() : 0);
790 PUP::fromDisk pGroups(fGroups);
791 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
792 CkPupGroupData(pGroups,true);
794 CkPupGroupData(pGroups);
798 // restore nodegroups
799 // content of the file: numNodeGroups, GroupInfo[numNodeGroups], _nodeGroupTable(PUP'ed), nodegroups(PUP'ed)
// Same rule as for groups, but keyed on the node count and node number.
801 FILE* fNodeGroups = openCheckpointFile(dirname, "NodeGroups", "rb",
802 (CkNumNodes() == _numNodes) ? CkMyNode() : 0);
803 PUP::fromDisk pNodeGroups(fNodeGroups);
804 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
805 CkPupNodeGroupData(pNodeGroups,true);
807 CkPupNodeGroupData(pNodeGroups);
809 CmiFclose(fNodeGroups);
812 // for each location, restore arrays
813 //DEBCHK("[%d]Trying to find location manager\n",CkMyPe());
814 DEBCHK("[%d]Number of PE: %d -> %d\n",CkMyPe(),_numPes,CkNumPes());
// Old PE i's array file is read by current PE (i mod CkNumPes()), so with
// fewer PEs than before one PE reads several files; PEs numbered at or
// above the old PE count read none.
815 if(CkMyPe() < _numPes) // in normal range: restore, otherwise, do nothing
816 for (i=0; i<_numPes;i++) {
817 if (i%CkNumPes() == CkMyPe()) {
818 FILE *datFile = openCheckpointFile(dirname, "arr", "rb", i);
819 PUP::fromDisk p(datFile);
820 CkPupArrayElementsData(p);
827 if (CmiMyRank()==0) _initDone(); // this rank will trigger other ranks
829 CkMemCheckPT::inRestarting = 0;
831 CmiPrintf("[%d]CkRestartMain done. sending out callback.\n",CkMyPe());
834 CkCheckpointStatusMsg * m = new CkCheckpointStatusMsg(CK_CHECKPOINT_SUCCESS);
844 #if CMK_SHRINK_EXPAND
845 // after resume and getting message
// msg : raw Converse message. The payload is read from offset
//       CmiMsgHeaderSizeBytes + 2*sizeof(int), which matches the layout
//       written by bdcastROGroupData (header, two int sizes, packed data).
// Visible work: unpack from the message (group data in the fault-tolerant
// build; other unpack calls are not visible here), and, when this process
// existed before the shrink/expand (Cmi_isOldProcess), restore array
// elements from the "arr" file of its old PE number.
846 void CkResumeRestartMain(char * msg) {
// NOTE(review): dirname starts as the empty string and is later passed to
// openCheckpointFile; any reassignment before that call is not visible in
// this excerpt -- confirm the intended directory.
849 const char * dirname = "";
852 CkMemCheckPT::inRestarting = 1;
853 CmiPrintf("[%d]CkResumeRestartMain: Inside Resume Restart\n",CkMyPe());
854 CmiPrintf("[%d]CkResumeRestartMain: Group restored %d\n",CkMyPe(), CkpvAccess(_numGroups)-1);
858 PUP::fromMem pRO((char *)(msg+CmiMsgHeaderSizeBytes+2*sizeof(int)));
861 CmiPrintf("[%d]CkRestartMain: readonlys restored\n",CkMyPe());
863 #if (defined(_FAULT_MLOG_) || defined(_FAULT_CAUSAL_))
864 CkPupGroupData(pRO,true);
868 CmiPrintf("[%d]CkResumeRestartMain: Group restored %d\n",CkMyPe(), CkpvAccess(_numGroups)-1);
873 if(Cmi_isOldProcess) {
874 /* CmiPrintf("[%d] For shrinkexpand newpe=%d, oldpe=%d \n",Cmi_myoldpe, CkMyPe(), Cmi_myoldpe); */
875 // non-shrink files would be empty since LB would take care
876 FILE *datFile = openCheckpointFile(dirname, "arr", "rb", Cmi_myoldpe);
877 PUP::fromDisk p(datFile);
878 CkPupArrayElementsData(p);
883 CkMemCheckPT::inRestarting = 0;
885 CmiPrintf("[%d]CkResumeRestartMain done. sending out callback.\n",CkMyPe());
// Elapsed time is measured against the file-scope chkptStartTimer.
886 CkPrintf("Restart from shared memory finished in %fs, sending out the cb...\n", CmiWallTimer() - chkptStartTimer);
892 // Main chare: initialize system checkpoint manager
893 class CkCheckpointInit : public Chare {
// Creates the CkCheckpointMgr group and records its ID in _sysChkptMgr,
// which CkStartCheckpoint later uses to reach the manager.
// NOTE(review): what happens to `msg` is not visible in this excerpt.
895 CkCheckpointInit(CkArgMsg *msg) {
896 _sysChkptMgr = CProxy_CkCheckpointMgr::ckNew();
// Migration constructor: nothing to restore, so it only frees the message.
899 CkCheckpointInit(CkMigrateMessage *m) {delete m;}
902 #include "CkCheckpoint.def.h"
903 #include "CkCheckpointStatus.def.h"