LRTS Comm Thread Tracing in message receive
[charm.git] / src / ck-core / ckmemcheckpoint.h
blob26888fb17573acd3189dc984bba68c94efc66a53
1 #ifndef _CK_MEM_CHECKPT_
2 #define _CK_MEM_CHECKPT_
4 #include "CkMemCheckpoint.decl.h"
6 extern CkGroupID ckCheckPTGroupID;
7 class CkArrayCheckPTReqMessage: public CMessage_CkArrayCheckPTReqMessage {
8 public:
9 CkArrayCheckPTReqMessage() {}
12 class CkArrayCheckPTMessage: public CMessage_CkArrayCheckPTMessage {
13 public:
14 CkArrayID aid;
15 CkGroupID locMgr;
16 CkArrayIndex index;
17 double *packData;
18 int bud1, bud2;
19 int len;
20 int cp_flag; // 1: from checkpoint 0: from recover
24 class CkProcCheckPTMessage: public CMessage_CkProcCheckPTMessage {
25 public:
26 int pe;
27 int reportPe; // chkpt starter
28 int failedpe;
29 int cur_restart_phase;
30 int len;
31 char *packData;
34 // table entry base class
35 class CkCheckPTInfo {
36 friend class CkMemCheckPT;
37 protected:
38 CkArrayID aid;
39 CkGroupID locMgr;
40 CkArrayIndex index;
41 int pNo; //another buddy
42 public:
43 CkCheckPTInfo();
44 CkCheckPTInfo(CkArrayID a, CkGroupID loc, CkArrayIndex idx, int pno):
45 aid(a), locMgr(loc), index(idx), pNo(pno) {}
46 virtual ~CkCheckPTInfo() {}
47 virtual void updateBuffer(CkArrayCheckPTMessage *data) = 0;
48 virtual CkArrayCheckPTMessage * getCopy() = 0;
49 virtual void updateBuddy(int b1, int b2) = 0;
50 virtual int getSize() = 0;
/// Selects where a checkpoint is kept: in memory or on disk.
/// CkCheckPTEntry::where is compared against these two values.
#define CkCheckPoint_inMEM   1
#define CkCheckPoint_inDISK  2
57 class CkCheckPTEntry{
58 CkArrayCheckPTMessage *data;
59 char * fname;
60 public:
61 int bud1, bud2;
62 int where;
63 void init(int _where, int idx)
65 data = NULL;
66 where = _where;
67 if(where == CkCheckPoint_inDISK)
69 #if CMK_USE_MKSTEMP
70 fname = new char[64];
71 #if CMK_CONVERSE_MPI
72 sprintf(fname, "/tmp/ckpt%d-%d-%d-XXXXXX",CmiMyPartition(), CkMyPe(), idx);
73 #else
74 sprintf(fname, "/tmp/ckpt%d-%d-XXXXXX", CkMyPe(), idx);
75 #endif
76 if(mkstemp(fname)<0)
78 CmiAbort("mkstemp fail in checkpoint");
80 #else
81 fname=tmpnam(NULL);
82 #endif
86 void updateBuffer(CkArrayCheckPTMessage * msg)
88 if(where == CkCheckPoint_inDISK)
90 envelope *env = UsrToEnv(msg);
91 CkUnpackMessage(&env);
92 data = (CkArrayCheckPTMessage *)EnvToUsr(env);
93 FILE *f = fopen(fname,"wb");
94 PUP::toDisk p(f);
95 CkPupMessage(p, (void **)&msg);
96 // delay sync to the end because otherwise the messages are blocked
97 // fsync(fileno(f));
98 fclose(f);
99 bud1 = msg->bud1;
100 bud2 = msg->bud2;
101 delete msg;
102 }else
104 CmiAssert(where == CkCheckPoint_inMEM);
105 CmiAssert(msg!=NULL);
106 if (data) delete data;
107 data = msg;
108 bud1 = msg->bud1;
109 bud2 = msg->bud2;
113 CkArrayCheckPTMessage * getCopy()
115 if(where == CkCheckPoint_inDISK)
117 CkArrayCheckPTMessage *msg;
118 FILE *f = fopen(fname,"rb");
119 PUP::fromDisk p(f);
120 CkPupMessage(p, (void **)&msg);
121 fclose(f);
122 msg->bud1 = bud1; // update the buddies
123 msg->bud2 = bud2;
124 return msg;
125 }else
127 CmiAssert(where == CkCheckPoint_inMEM);
128 if (data == NULL) {
129 CmiPrintf("[%d] recoverArrayElements: element does not have checkpoint data.", CkMyPe());
130 CmiAbort("Abort!");
132 return (CkArrayCheckPTMessage *)CkCopyMsg((void **)&data);
138 class CkMemCheckPT: public CBase_CkMemCheckPT {
139 public:
140 CkMemCheckPT(int w);
141 CkMemCheckPT(CkMigrateMessage *m):CBase_CkMemCheckPT(m) {};
142 virtual ~CkMemCheckPT();
143 void pup(PUP::er& p);
144 inline int BuddyPE(int pe);
145 void doItNow(int sp, CkCallback &);
146 void restart(int diePe);
147 void removeArrayElements();
148 void createEntry(CkArrayID aid, CkGroupID loc, CkArrayIndex index, int buddy);
149 void recvData(CkArrayCheckPTMessage *);
150 void gotData();
151 void recvProcData(CkProcCheckPTMessage *);
152 void cpFinish();
153 void syncFiles(CkReductionMsg *);
154 void report();
155 void recoverBuddies();
156 void recoverEntry(CkArrayCheckPTMessage *msg);
157 void recoverArrayElements();
158 void quiescence(CkCallback &);
159 void resetReductionMgr();
160 void finishUp();
161 void gotReply();
162 void inmem_restore(CkArrayCheckPTMessage *m);
163 void updateLocations(int n, CkGroupID *g, CkArrayIndex *idx,int nowOnPe);
164 void resetLB(int diepe);
165 int isFailed(int pe);
166 void pupAllElements(PUP::er &p);
167 void startArrayCheckpoint();
168 void recvArrayCheckpoint(CkArrayCheckPTMessage *m);
169 void recoverAll(CkArrayCheckPTMessage * msg, CkVec<CkGroupID> * gmap=NULL, CkVec<CkArrayIndex> * imap=NULL);
170 public:
171 static CkCallback cpCallback;
173 static int inRestarting;
174 static int inCheckpointing;
175 static int inLoadbalancing;
176 static double startTime;
177 static char* stage;
178 private:
179 CkVec<CkCheckPTInfo *> ckTable;
180 CkCheckPTEntry chkpTable[2];
182 int recvCount, peCount;
183 int expectCount, ackCount;
184 /// the processor who initiate the checkpointing
185 int cpStarter;
186 CkVec<int> failedPes;
187 int thisFailedPe;
189 /// to use memory or disk checkpointing
190 int where;
191 private:
192 void initEntry();
193 inline int isMaster(int pe);
195 void failed(int pe);
196 int totalFailed();
198 void sendProcData();
201 // called in initCharm
202 void CkMemRestart(const char *, CkArgMsg *);
204 // called by user applications
205 // to start a checkpointing
206 void CkStartMemCheckpoint(CkCallback &cb);
208 // true if inside a restarting phase
209 extern "C" int CkInRestarting();
210 extern "C" int CkInLdb();
211 extern "C" void CkSetInLdb();
212 extern "C" void CkResetInLdb();
214 extern "C" int CkHasCheckpoints();
216 extern "C" void CkDieNow();
218 #endif