Bug #1062: Fix linking errors by moving definition of userDrivenMode to machine-commo...
[charm.git] / src / ck-ldb / TempAwareRefineLB.C
blob26e54911aac9db321e9f0e6eecdd80000b8d4745
1 //#define NO_TEMP_LB
2 //#define ORG_VERSION
3 //#define MAX_MIN
4 #define MAX_TEMP 49
5 //#define tolerance 0.03
6 /** \file TempAwareRefineLB.C
7  *
8  *  Written by Osman Sarood
9  *  Temperature aware load balancer. Needs frequency control access to work.
10  */
/**
 * \addtogroup CkLdb
 */
/*@{*/
17 #include "TempAwareRefineLB.h"
18 #include "ckgraph.h"
19 #include <algorithm>
21 CreateLBFunc_Def(TempAwareRefineLB, "always assign the heaviest obj onto lightest loaded processor.")
23 #ifdef TEMP_LDB
26 static int cpufreq_sysfs_write (
27                      const char *setting,int proc
28                      )
30 char path[100];
31 sprintf(path,"/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed",proc);
32                 FILE *fd = fopen (path, "w");
34                 if (!fd) {
35                         printf("PROC#%d ooooooo666 FILE OPEN ERROR file=%s\n",CkMyPe(),path);
36                         return -1;
37                 }
38 //                else CkPrintf("PROC#%d opened freq file=%s\n",proc,path);
40         fseek ( fd , 0 , SEEK_SET );
41         int numw=fprintf (fd, setting);
42         if (numw <= 0) {
44                 fclose (fd);
45                 printf("FILE WRITING ERROR\n");
46                 return 0;
47         }
48 //        else CkPrintf("Freq for Proc#%d set to %s numw=%d\n",proc,setting,numw);
49         fclose(fd);
50         return 1;
53 float TempAwareRefineLB::getTemp(int cpu)
55         char val[10];
56         FILE *f;
57                 char path[100];
58                 sprintf(path,"/sys/devices/platform/coretemp.%d/temp1_input",cpu);
59                 f=fopen(path,"r");
60                 if (!f) {
61                         printf("777 FILE OPEN ERROR file=%s\n",path);
62                         exit(0);
63                 }
65         if(f==NULL) {printf("ddddddddddddddddddddddddddd\n");exit(0);}
66         fgets(val,10,f);
67         fclose(f);
68         return atof(val)/1000;
/**
 * Read the current value of a core's cpufreq `scaling_setspeed` sysfs file.
 *
 * @param proc  CPU index substituted into the sysfs path.
 * @return The integer parsed from the file, or 0 if the file cannot be opened
 *         or yields no data.
 */
static int cpufreq_sysfs_read (int proc)
{
        char path[100];
        snprintf(path,sizeof(path),"/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed",proc);

        FILE *fd = fopen (path, "r");
        if (!fd) {
                printf("22 FILE OPEN ERROR file=%s\n",path);
                return 0;
        }

        // An empty buffer makes atoi() return 0 instead of parsing garbage
        // when fgets() reads nothing.
        char val[10] = "";
        if (fgets(val,sizeof(val),fd)==NULL) val[0]='\0';
        fclose (fd);

        return atoi(val);
}
92 void printCurrentTemperature(void *LB, double curWallTime)
94   TempAwareRefineLB *taalb = static_cast<TempAwareRefineLB *>(LB);
95   int pe = CkMyPe();
96   float temp = taalb->getTemp(pe % taalb->physicalCoresPerNode);
97   int freq = cpufreq_sysfs_read (pe % taalb->logicalCoresPerNode);
98   fprintf(taalb->logFD, "%f, %d, %f, %d\n", curWallTime, pe, temp, freq);
/**
 * Find the position of a frequency in a frequency table.
 *
 * @param freqs     Table of frequencies.
 * @param numAvail  Number of entries in `freqs`.
 * @param freq      Frequency to look up.
 * @return Index of the first entry equal to `freq`, or -1 if there is none.
 */
int getProcFreqPtr(int *freqs,int numAvail,int freq)
{
        for(int i=0;i<numAvail;i++) if(freqs[i]==freq) return i;
        // Not found: return an explicit sentinel rather than falling off the
        // end of a non-void function.
        return -1;
}
105 #endif
106 FILE *migFile;
107 double starting;
108 TempAwareRefineLB::TempAwareRefineLB(const CkLBOptions &opt): CBase_TempAwareRefineLB(opt)
110 #ifdef TEMP_LDB
111 starting=CmiWallTimer();
112 //  procsPerNode=4;
113 migFile=fopen("migInfo","w");
114   numAvailFreqs = 11;
115 //numAvailFreqs = 14;
116 //numAvailFreqs = 7;  
117 freqs=new int[numAvailFreqs];
118 freqsEffect=new int[numAvailFreqs];
119 // for might (lab machine)
121   freqs[0] = 2262000;
122   freqs[1] = 2261000;
123   freqs[2] = 2128000;
124   freqs[3] = 1995000;
125   freqs[4] = 1862000;
126   freqs[5] = 1729000;
127   freqs[6] = 1596000;
130 // for tarekc cluster
131   freqs[0] = 2395000;
132   freqs[1] = 2394000;
133   freqs[2] = 2261000;
134   freqs[3] = 2128000;
135   freqs[4] = 1995000;
136   freqs[5] = 1862000;
137   freqs[6] = 1729000;
138   freqs[7] = 1596000;
139   freqs[8] = 1463000;
140   freqs[9] = 1330000;
141   freqs[10] = 1197000;
143         freqsEffect[0] = 1979886;
144   freqsEffect[1] = 1943017;
145   freqsEffect[2] = 1910989;
146   freqsEffect[3] = 1876619;
147   freqsEffect[4] = 1824126;
148   freqsEffect[5] = 1763990;
149   freqsEffect[6] = 1666773;
150   freqsEffect[7] = 1560224;
151   freqsEffect[8] = 1443154;
152   freqsEffect[9] = 1317009;
153   freqsEffect[10] = 1200000;
157 // for grace, humility etc (lab i7 machines)
158   freqs[0] = 2801000;
159   freqs[1] = 2800000;
160   freqs[2] = 2667000;
161   freqs[3] = 2533000;
162   freqs[4] = 2400000;
163   freqs[5] = 2267000;
164 freqs[6] = 2133000;
165 freqs[7] = 2000000;
166 freqs[8] = 1867000;
167 freqs[9] = 1733000;
168 freqs[10] = 1600000;
169 freqs[11] = 1467000;
170 freqs[12] = 1333000;
171   freqs[13] = 1200000;
174   procFreqPtr = new int[CkNumPes()];
176   for(int i=0;i<CkNumPes();i++)
177   {
178         char newfreq[10];
179         sprintf(newfreq,"%d",freqs[0]);
180         cpufreq_sysfs_write(newfreq,i%physicalCoresPerNode);    
181         procFreqPtr[i]=0;
182   }
183 //  logicalCoresPerChip=4;
184   procFreq=NULL;
185   procTemp=NULL;
186         procFreqNew=NULL;
187         procFreqNewEffect = NULL;
188         avgChipTemp=NULL;
189   lbname = "TempAwareRefineLB";
190   if (CkMyPe()==0)
191     CkPrintf("[%d] TempAwareRefineLB created\n",CkMyPe());
193   char logFile[100];
194   snprintf(logFile, sizeof(logFile), "temp_freq.log.%d", CkMyPe());
195   if ((logFD = fopen(logFile, "a"))) {
196     fprintf(logFD, "Time, PE, Temperature, Frequency\n");
197   } else {
198     CkAbort("Couldn't open temperature/frequency log file");
199   }
202   CcdCallOnConditionKeep(CcdPERIODIC_1second, &printCurrentTemperature, this);
203 #else
204         CmiAbort("TEMPLB ERROR: not supported without TEMP_LDB flag.\n");
205 #endif
209 void TempAwareRefineLB::populateEffectiveFreq(int numProcs)
211 #ifdef TEMP_LDB
212         for(int i=0;i<numProcs;i++)
213         {
214                 for(int j=0;j<numAvailFreqs;j++)
215                 {
216                         if(freqs[j] == procFreqNew[i]) // same freq . copy effective freq
217                         {
218                                 procFreqNewEffect[i] = freqsEffect[j];
219 //                              CkPrintf("** Proc%d j:%d NEWFreq:%d\n",i,j,procFreqNewEffect[i]);
220                         }
221                         if(freqs[j] == procFreq[i]) 
222                         {
223                                 procFreqEffect[i] = freqsEffect[j];
224 //                              CkPrintf("-- Proc%d j:%d OLDFreq:%d procFreq:%d \n",i,j,procFreqEffect[i],procFreq[i]);
225                         }
226                 }
227         }
228 #endif
231 bool TempAwareRefineLB::QueryBalanceNow(int _step)
233   //  CkPrintf("[%d] Balancing on step %d\n",CkMyPe(),_step);
234   return true;
237 class ProcLoadGreater {
238   public:
239     bool operator()(ProcInfo p1, ProcInfo p2) {
240       return (p1.getTotalLoad() > p2.getTotalLoad());
241     }
244 class ProcLoadLesser {
245   public:
246     bool operator()(ProcInfo p1, ProcInfo p2) {
247       return (p1.getTotalLoad() < p2.getTotalLoad());
248     }
251 class ObjLoadGreater {
252   public:
253     bool operator()(Vertex v1, Vertex v2) {
254       return (v1.getVertexLoad() > v2.getVertexLoad());
255     }
258 void TempAwareRefineLB::changeFreq(int nFreq)
260 #ifdef TEMP_LDB
261         //CkPrintf("PROC#%d in changeFreq numProcs=%d\n",CkMyPe(),nFreq);
262 //  for(int i=0;i<numProcs;i++)
263   {
264 //        if(procFreq[i]!=procFreqNew[i])
265         {
266               char newfreq[10];
267               sprintf(newfreq,"%d",nFreq);
268               cpufreq_sysfs_write(newfreq,CkMyPe()%physicalCoresPerNode);//i%physicalCoresPerNode);
269 //            CkPrintf("PROC#%d freq changing from %d to %d temp=%f\n",i,procFreq[i],procFreqNew[i],procTemp[i]);
270         }
271   }
272 #endif
275 #ifdef TEMP_LDB
276 int getTaskIdForMigration(ObjGraph *ogr,int pe,int start)
278         for(int vert = start; vert < ogr->vertices.size(); vert++)
279         {
280                 if(ogr->vertices[vert].getCurrentPe()==pe && ogr->vertices[vert].getNewPe()==-1) return vert;
281         }
282         CkPrintf("THERE IS A PROBLEM IN TEMPREFINELB 222 start=%d pe=%d objArraySize=%d!!!!!\n",start,pe,ogr->vertices.size());
283         CkExit();
286 int getNumTasks(ObjGraph *ogr,int pe)
288         int c=0;
289         for(int vert = 0; vert < ogr->vertices.size(); vert++)
290         {
291                 if(ogr->vertices[vert].getCurrentPe()==pe && ogr->vertices[vert].getNewPe()==-1) c++;
292         }
293         return c;
296 int getTaskIdForMigration(ObjGraph *ogr,int pe,std::vector<int> assTasks)
298         for(int vert = 0; vert < ogr->vertices.size(); vert++)
299         {
300                 if(ogr->vertices[vert].getCurrentPe()==pe  && ogr->vertices[vert].getNewPe()==-1)
301                 {
303    CkPrintf("======================= pe=%d vert=%d ========================\n",pe,vert);
305                         bool hasIt=false;
306                         for(int i=0;i<assTasks.size();i++)
307                         {
308                                 if(vert==assTasks[i]) 
309                                 {
310                                         hasIt=true;
311                                         break;
312                                 }
313                         }
314                         if(hasIt==false) return vert;
316                         return vert;
317                 }
318         }
319 return -1;
320 //      CkPrintf("THERE IS A PROBLEM IN TEMPREFINELB 111  pe=%d objArraySize=%d assTasks.size()=%d !!!!!\n",pe,ogr->vertices.size(),assTasks.size());
321 //        CmiPrintStackTrace(0);
322 //      CkExit();
325 bool saneFreqNormLds(double *loads, int numProcs)
327         double tot=0.0;
328         for(int i=0;i<numProcs;i++)
329         {
330                 tot+=loads[i];
331         }
332         double r=numProcs-tot;
333         if(r>0.01 || r<-0.01)
334         {
335                 CkPrintf("THere is a problem with LOADs!!! r=%f procs=%d loadSum=%f\n",r,numProcs,tot);
336                 return false;
337         }
338         else return true;
340 #endif
341 void TempAwareRefineLB::work(LDStats* stats)
343 #ifdef TEMP_LDB
344 ////////////////////////////////////////////////////
345   numProcs=stats->nprocs();
346   numChips=numProcs/logicalCoresPerChip;
347   avgChipTemp=new float[numChips];
348   if(procFreq!=NULL) delete [] procFreq;
349         if(procFreqEffect!=NULL) delete [] procFreqEffect;
350 //  if(procFreqPtr!=NULL) delete [] procFreqPtr;
351   if(procTemp!=NULL) delete [] procTemp;
352   if(procFreqNew!=NULL) delete [] procFreqNew;
353         if(procFreqNewEffect!=NULL) delete [] procFreqNewEffect;
354   if(avgChipTemp!=NULL) delete [] avgChipTemp;
356   procFreq = new int[numProcs];
357         procFreqEffect = new int[numProcs];
358 //  procFreqPtr = new int[numProcs];
359   procTemp = new float[numProcs];
360   procFreqNew = new int[numProcs];
361         procFreqNewEffect = new int[numProcs];
362   avgChipTemp = new float[numChips];
364   for(int i=0;i<numChips;i++) avgChipTemp[i]=0;
366   for(int i=0;i<numProcs;i++)
367   {
368         procFreq[i] = stats->procs[i].pe_speed;
369         procTemp[i] = stats->procs[i].pe_temp;
370 //      procFreqPtr[i] = getProcFreqPtr(freqs,numAvailFreqs,procFreq[i]);
371         avgChipTemp[i/logicalCoresPerChip] += procTemp[i];
372   }
374   for(int i=0;i<numChips;i++) 
375   {
376         avgChipTemp[i]/=logicalCoresPerChip;
377 //CkPrintf("---- CHIP#%d has temp=%f ----------\n",i,avgChipTemp[i]);
378   }
379   for(int i=0;i<numChips;i++)
380   {
381         int over=0,under=0;
382         if(avgChipTemp[i] > MAX_TEMP)
383         {
384                 over=1;
385                 if(procFreqPtr[i*logicalCoresPerChip]==numAvailFreqs-1)
386                 {
387                         for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]];
388                         CkPrintf("CHIP#%d RUNNING HOT EVEN WITH MIN FREQUENCY!!\n",i);
389                 }
390                 else
391                 {
392                         for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++)
393                         {
394                                 if(procFreqPtr[j]<numAvailFreqs-1) procFreqPtr[j]++;
395 #ifdef MAX_MIN
396 /// PLEASE COMMENT OUT .. TESTING ONLY
397 if(i==0) {procFreqPtr[j] = numAvailFreqs-1;/*CkPrintf("C for i:%d\n",j);*/}
398 //if(i<numChips-1) procFreqPtr[j]=0;
399 else  procFreqPtr[j]=0;
400 /////////////////////////
401 #endif
402                                 procFreqNew[j] = freqs[procFreqPtr[j]];
403                         }
404 #ifndef ORG_VERSION
405                         CkPrintf("!!!!! Chip#%d running HOT shifting from %d to %d temp=%f\n",i,procFreq[i*logicalCoresPerChip],procFreqNew[i*logicalCoresPerChip],avgChipTemp[i]);
406 #endif
407                 }
408         }
409         else
410 //      if(avgChipTemp[i] < MAX_TEMP-1)
411         {
412                 under=1;
413                 if(procFreqPtr[i*logicalCoresPerChip]>0)
414                 {
415                         for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++)
416                         {
417                                 if(procFreqPtr[j]>0)
418                                         procFreqPtr[j]--;
419 #ifdef MAX_MIN
420 /// PLEASE COMMENT OUT .. TESTING ONLY
421 if(i==0) procFreqPtr[j] = numAvailFreqs-1;
422 //if(i<numChips-1) procFreqPtr[j]=0;
423 else  procFreqPtr[j]=0;
424 /////////////////////////
425 #endif
426                                 procFreqNew[j] = freqs[procFreqPtr[j]];
427                         }
428 #ifndef ORG_VERSION
429                         CkPrintf("!!!!! Chip#%d running COLD shifting from %d to %d temp=%f\n",i,procFreq[i*logicalCoresPerChip],procFreqNew[i*logicalCoresPerChip],avgChipTemp[i]);
430 #endif
431                 }
432                 else
433                 {
434                         for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]];
435                 }
436         }
438         if(under==0 && over==0) 
439         {
440                 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]];
441         }
443 //if(i==5) for(int j=i*c(resPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[numAvailFreqs-1];
444 //else 
445 #ifdef ORG_VERSION
446 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[0];
447 #endif
448 //for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[0];
449   }
450 //for(int x=0;x<numProcs;x+=logicalCoresPerChip) if(procFreq[x]!=procFreqNew[x]) thisProxy[x].changeFreq(procFreqNew[x]);
451 //for(int x=0;x<numProcs;x++) CkPrintf("Procs#%d freq %d\n",x,procFreqNew[x]);
452 ////////////////////////////////////////////////////
454 #ifndef NO_TEMP_LB
455   int obj;
456   int n_pes = stats->nprocs();
458   //  CkPrintf("[%d] RefineLB strategy\n",CkMyPe());
460   // RemoveNonMigratable(stats, n_pes);
462   // get original object mapping
463   int* from_procs = RefinerTemp::AllocProcs(n_pes, stats);
464   for(obj=0;obj<stats->n_objs;obj++)  {
465     int pe = stats->from_proc[obj];
466     from_procs[obj] = pe;
467   }
468   // Get a new buffer to refine into
469         populateEffectiveFreq(numProcs);
470   int* to_procs = RefinerTemp::AllocProcs(n_pes, stats);
471 //  RefinerTemp refiner(1.03,procFreqEffect,procFreqNewEffect,n_pes);  // overload tolerance=1.05
472         RefinerTemp refiner(1.03,procFreq,procFreqNew,n_pes);
473   refiner.Refine(n_pes, stats, from_procs, to_procs);
474   // Save output
475         int migs=0;
476         int *numMigs = new int[numProcs];
477         int totE = 0;
478         for(int mm=0;mm<numProcs;mm++) numMigs[mm] = 0;
479   for(obj=0;obj<stats->n_objs;obj++) {
480       int pe = stats->from_proc[obj];
481                         numMigs[to_procs[obj]]++;
482 //stats->objData[obj].objID();
483   LDObjData &odata = stats->objData[obj];
484         computeInfo *c1 = new computeInfo();
485         c1->id = odata.objID();
486 //if(to_procs[obj]==3) CkPrintf("[%d,%d] going to 3 totE:%d\n",c1->id.getID()[0],c1->id.getID()[1],totE++);//,(stats->objData[obj].objID().getID())[1],totE++);
487       if (to_procs[obj] != pe) {
488         migs++;
489         //if (_lb_args.debug()>=2)  
490                                 {
491 //          CkPrintf("[%d,%d] Obj %d migrating from %d to %d\n",
492 //                 c1->id.getID()[0],c1->id.getID()[1],obj,pe,to_procs[obj]);
493         }
494         stats->to_proc[obj] = to_procs[obj];
495       }
496   }
498         for(int mm=0;mm<numProcs;mm++)
499         {
500                 //CkPrintf("PROC#%d freq:%d objs:%d ----------\n",mm,procFreqNew[mm],numMigs[mm]);
501         }
502   CkPrintf("TEMPLB INFO: Total Objs:%d migrations:%d time:%f \n",stats->n_objs,migs,CmiWallTimer()-starting);
503   fprintf(migFile,"%f %d\n",CmiWallTimer()-starting,migs);
504   // Free the refine buffers
505   RefinerTemp::FreeProcs(from_procs);
506   RefinerTemp::FreeProcs(to_procs);
508 #endif
509 //for(int x=0;x<numProcs;x++) CkPrintf("Procs#%d ------- freq %d\n",x,procFreqNew[x]);
511 for(int x=0;x<numProcs;x+=logicalCoresPerChip) 
513         if(procFreq[x]!=procFreqNew[x]) 
514         {
515                 CkPrintf("Chaning the freq for PROC#%d\n",x);
516                 thisProxy[x].changeFreq(procFreqNew[x]);
517         }
520 for(int x=0;x<numProcs;x++)
521   {
522 //CkPrintf("--------- Proc#%d %d numProcs=%d\n",x,procFreqNew[x],numProcs);
523 if(procFreq[x]!=procFreqNew[x]) thisProxy[x].changeFreq(procFreqNew[x]);
525 #endif // TEMP_LDB endif
527 #include "TempAwareRefineLB.def.h"
529 /*@}*/