1 #ifndef _CK_MEM_CHECKPT_
2 #define _CK_MEM_CHECKPT_
4 #include "CkMemCheckpoint.decl.h"
// Group ID object declared with external linkage; the definition lives in
// another translation unit.
// NOTE(review): presumably identifies the CkMemCheckPT group — confirm at the definition.
6 extern CkGroupID ckCheckPTGroupID
;
// Message type derived from the generated CMessage_CkArrayCheckPTReqMessage base.
// Only the empty default constructor is visible in this excerpt; the rest of
// the class body (including its closing brace) is not shown here.
7 class CkArrayCheckPTReqMessage
: public CMessage_CkArrayCheckPTReqMessage
{
9 CkArrayCheckPTReqMessage() {}
// Message type derived from the generated CMessage_CkArrayCheckPTMessage base.
// It is the type stored by the checkpoint table entries in this header
// (see updateBuffer()/getCopy()). Most members are not visible in this excerpt.
12 class CkArrayCheckPTMessage
: public CMessage_CkArrayCheckPTMessage
{
// Origin flag for the message; the value key is given on the field itself.
20 int cp_flag
; // 1: from checkpoint, 0: from recover
// Message type derived from the generated CMessage_CkProcCheckPTMessage base.
// Only two integer fields are visible in this excerpt.
24 class CkProcCheckPTMessage
: public CMessage_CkProcCheckPTMessage
{
// PE number of the processor that started the checkpoint.
27 int reportPe
; // chkpt starter
// NOTE(review): presumably the restart phase counter at the time the message
// was built — confirm against the code that fills it in.
29 int cur_restart_phase
;
34 // table entry base class
// Abstract interface (all operations below are pure virtual) for one entry of
// the checkpoint table. The class header line and several member declarations
// (aid, locMgr, index) are not visible in this excerpt.
// CkMemCheckPT is granted friend access to this class's members.
36 friend class CkMemCheckPT
;
41 int pNo
; // another buddy
// Stores the four constructor arguments in the members aid, locMgr, index and pNo.
44 CkCheckPTInfo(CkArrayID a
, CkGroupID loc
, CkArrayIndex idx
, int pno
):
45 aid(a
), locMgr(loc
), index(idx
), pNo(pno
) {}
// Virtual destructor so derived entries can be destroyed through a base pointer.
46 virtual ~CkCheckPTInfo() {}
// Replace the stored checkpoint with the given message (implemented by subclasses).
47 virtual void updateBuffer(CkArrayCheckPTMessage
*data
) = 0;
// Return a checkpoint message for this entry (implemented by subclasses).
48 virtual CkArrayCheckPTMessage
* getCopy() = 0;
// Update the two buddy values recorded for this entry (implemented by subclasses).
49 virtual void updateBuddy(int b1
, int b2
) = 0;
// Return the entry's size (implemented by subclasses).
// NOTE(review): the unit (bytes vs. elements) is not visible here — confirm in the overrides.
50 virtual int getSize() = 0;
53 /// memory or disk checkpointing
// Storage-mode selectors; the `where` member of a checkpoint entry is compared
// against these values in init(), updateBuffer() and getCopy().
54 #define CkCheckPoint_inMEM 1
55 #define CkCheckPoint_inDISK 2
// Checkpoint message held by this entry. updateBuffer() deletes any previous
// value in memory mode, and getCopy() returns a CkCopyMsg() copy of it.
58 CkArrayCheckPTMessage
*data
;
// Prepare this entry for use.
//   _where: storage mode; NOTE(review): presumably CkCheckPoint_inMEM or
//           CkCheckPoint_inDISK and assigned to `where` on a line not visible here.
//   idx:    integer embedded in the temporary file name.
// Several lines of this function (braces, conditionals, the temp-file call)
// are not visible in this excerpt.
63 void init(int _where
, int idx
)
// Disk mode only: build the checkpoint file name template in fname.
67 if(where
== CkCheckPoint_inDISK
)
// Template including the partition number; the condition that selects between
// this variant and the next one is not visible in this excerpt.
72 sprintf(fname
, "/tmp/ckpt%d-%d-%d-XXXXXX",CmiMyPartition(), CkMyPe(), idx
);
// Template with only the PE number and idx.
74 sprintf(fname
, "/tmp/ckpt%d-%d-XXXXXX", CkMyPe(), idx
);
// NOTE(review): sprintf is unbounded and the size of fname is not visible here —
// confirm the buffer is large enough or switch to snprintf.
// Abort the program when creating the temporary file fails (the failing call
// itself is on a line not visible in this excerpt).
78 CmiAbort("mkstemp fail in checkpoint");
// Store a new checkpoint message in this entry.
//   msg: the checkpoint message to store; asserted non-null in memory mode.
// Parts of the body (braces, declaration of `p`, file close, final assignment)
// are not visible in this excerpt.
86 void updateBuffer(CkArrayCheckPTMessage
* msg
)
// Disk mode: write the message out to the file named by fname.
88 if(where
== CkCheckPoint_inDISK
)
// Unpack the message through its envelope; CkUnpackMessage takes the envelope
// pointer by address, so env is re-read afterwards to obtain the user message.
90 envelope
*env
= UsrToEnv(msg
);
91 CkUnpackMessage(&env
);
92 data
= (CkArrayCheckPTMessage
*)EnvToUsr(env
);
// Open the checkpoint file for binary writing.
// NOTE(review): no check of the fopen result is visible in this excerpt — confirm.
93 FILE *f
= fopen(fname
,"wb");
// Serialize the message through the PUP object `p` (declared on a line not visible here).
95 CkPupMessage(p
, (void **)&msg
);
96 // delay sync to the end because otherwise the messages are blocked
// Memory mode: check the mode and the argument, then release the previous checkpoint.
104 CmiAssert(where
== CkCheckPoint_inMEM
);
105 CmiAssert(msg
!=NULL
);
106 if (data
) delete data
;
// Return a checkpoint message for this entry.
// Disk mode reads the message back from the file named by fname; memory mode
// returns a copy of `data` made with CkCopyMsg().
// Parts of the body (braces, declaration of `p`, file close, the disk-mode
// return, the condition guarding the error print) are not visible in this excerpt.
113 CkArrayCheckPTMessage
* getCopy()
115 if(where
== CkCheckPoint_inDISK
)
117 CkArrayCheckPTMessage
*msg
;
// Open the checkpoint file for binary reading.
// NOTE(review): no check of the fopen result is visible in this excerpt — confirm.
118 FILE *f
= fopen(fname
,"rb");
// Deserialize the message through the PUP object `p` (declared on a line not visible here).
120 CkPupMessage(p
, (void **)&msg
);
// Overwrite the restored message's bud1 with this entry's current bud1.
122 msg
->bud1
= bud1
; // update the buddies
// Memory mode from here on.
127 CmiAssert(where
== CkCheckPoint_inMEM
);
// Diagnostic for an entry without checkpoint data.
// NOTE(review): the message has no trailing newline — confirm that is intended.
129 CmiPrintf("[%d] recoverArrayElements: element does not have checkpoint data.", CkMyPe());
// Hand back a copy so the stored checkpoint stays with this entry.
132 return (CkArrayCheckPTMessage
*)CkCopyMsg((void **)&data
);
// Checkpoint/restart manager class derived from the generated CBase_CkMemCheckPT base.
// This excerpt shows declarations only; access specifiers, some members and
// the closing brace are not visible here.
138 class CkMemCheckPT
: public CBase_CkMemCheckPT
{
// Migration constructor: forwards the CkMigrateMessage to the base class.
141 CkMemCheckPT(CkMigrateMessage
*m
):CBase_CkMemCheckPT(m
) {};
142 virtual ~CkMemCheckPT();
// Pack/unpack this object through the given PUP::er.
143 void pup(PUP::er
& p
);
// NOTE(review): presumably maps a PE to its buddy PE — confirm in the definition.
144 inline int BuddyPE(int pe
);
145 void doItNow(int sp
, CkCallback
&);
146 void restart(int diePe
);
147 void removeArrayElements();
148 void createEntry(CkArrayID aid
, CkGroupID loc
, CkArrayIndex index
, int buddy
);
// Receivers for the two checkpoint message types declared earlier in this header.
149 void recvData(CkArrayCheckPTMessage
*);
151 void recvProcData(CkProcCheckPTMessage
*);
// Takes a reduction message, i.e. it is shaped like a reduction target.
153 void syncFiles(CkReductionMsg
*);
155 void recoverBuddies();
156 void recoverEntry(CkArrayCheckPTMessage
*msg
);
157 void recoverArrayElements();
158 void quiescence(CkCallback
&);
159 void resetReductionMgr();
162 void inmem_restore(CkArrayCheckPTMessage
*m
);
// g and idx are pointers used together with the count n.
// NOTE(review): presumably parallel arrays of length n — confirm in the definition.
163 void updateLocations(int n
, CkGroupID
*g
, CkArrayIndex
*idx
,int nowOnPe
);
164 void resetLB(int diepe
);
165 int isFailed(int pe
);
166 void pupAllElements(PUP::er
&p
);
167 void startArrayCheckpoint();
168 void recvArrayCheckpoint(CkArrayCheckPTMessage
*m
);
// gmap and imap are optional output/lookup vectors; both default to NULL.
169 void recoverAll(CkArrayCheckPTMessage
* msg
, CkVec
<CkGroupID
> * gmap
=NULL
, CkVec
<CkArrayIndex
> * imap
=NULL
);
// Static state shared by all instances of this class.
171 static CkCallback cpCallback
;
173 static int inRestarting
;
174 static int inCheckpointing
;
175 static int inLoadbalancing
;
176 static double startTime
;
// Table of pointers to the abstract per-entry records (CkCheckPTInfo).
179 CkVec
<CkCheckPTInfo
*> ckTable
;
// Two CkCheckPTEntry slots.
180 CkCheckPTEntry chkpTable
[2];
182 int recvCount
, peCount
;
183 int expectCount
, ackCount
;
184 /// the processor that initiates the checkpointing
// (the member the comment above documents is not visible in this excerpt)
// List of PE numbers; NOTE(review): presumably the PEs queried by isFailed() — confirm.
186 CkVec
<int> failedPes
;
189 /// to use memory or disk checkpointing
// (the member the comment above documents is not visible in this excerpt)
193 inline int isMaster(int pe
);
// Free-function entry points of the checkpoint/restart module.
201 // called in initCharm
202 void CkMemRestart(const char *, CkArgMsg
*);
204 // called by user applications
205 // to start a checkpoint; takes the callback by non-const reference
206 void CkStartMemCheckpoint(CkCallback
&cb
);
208 // true if inside a restarting phase
209 extern "C" int CkInRestarting();
// C-linkage helpers. NOTE(review): "Ldb" presumably refers to load balancing
// (compare the inLoadbalancing flag of CkMemCheckPT) — confirm in the definitions.
210 extern "C" int CkInLdb();
211 extern "C" void CkSetInLdb();
212 extern "C" void CkResetInLdb();
214 extern "C" int CkHasCheckpoints();
216 extern "C" void CkDieNow();