5 //#define tolerance 0.03
6 /** \file TempAwareRefineLB.C
8 * Written by Osman Sarood
9 * Temperature aware load balancer. Needs frequency control access to work.
17 #include "TempAwareRefineLB.h"
// Invokes the CreateLBFunc_Def macro for TempAwareRefineLB with a one-line description string.
// NOTE(review): the description text does not mention temperature or frequency; it looks
// carried over from another balancer -- confirm it is the intended text.
21 CreateLBFunc_Def(TempAwareRefineLB, "always assign the heaviest obj onto lightest loaded processor.")
// Writes the string `setting` to the cpufreq scaling_setspeed sysfs file of CPU `proc`.
//   setting - text to write; the callers in this file pass a decimal frequency built with sprintf("%d").
//   proc    - CPU index substituted into the sysfs path.
// The two printf calls carry the open-failure and write-failure messages; their guarding
// conditions and the function's return statements are not among the lines shown here.
26 static int cpufreq_sysfs_write (
27 const char *setting,int proc
31 sprintf(path,"/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed",proc);
32 FILE *fd = fopen (path, "w");
35 printf("PROC#%d ooooooo666 FILE OPEN ERROR file=%s\n",CkMyPe(),path);
38 // else CkPrintf("PROC#%d opened freq file=%s\n",proc,path);
40 fseek ( fd , 0 , SEEK_SET );
// Emit `setting` through an explicit "%s" so it is written as data and is never
// interpreted as a format string (a '%' in the text would otherwise be undefined behavior).
41 int numw=fprintf (fd, "%s", setting);
45 printf("FILE WRITING ERROR\n");
48 // else CkPrintf("Freq for Proc#%d set to %s numw=%d\n",proc,setting,numw);
// Returns a temperature value for `cpu`.
// Builds the path /sys/devices/platform/coretemp.<cpu>/temp1_input and returns atof(val)/1000;
// the statements that open the file and fill `val` are not among the lines shown here.
// The division by 1000 presumably converts millidegrees to degrees -- confirm against the
// kernel coretemp documentation.
53 float TempAwareRefineLB::getTemp(int cpu)
58 sprintf(path,"/sys/devices/platform/coretemp.%d/temp1_input",cpu);
61 printf("777 FILE OPEN ERROR file=%s\n",path);
// NOTE(review): this failure path terminates the process with status 0, so a launcher
// would see a successful exit -- consider a non-zero status or CkAbort.
65 if(f==NULL) {printf("ddddddddddddddddddddddddddd\n");exit(0);}
68 return atof(val)/1000;
// Opens a cpufreq scaling_setspeed sysfs file for reading and returns an int.
// printCurrentTemperature logs that int as the frequency; the parsing and return
// statements are not among the lines shown here.
71 static int cpufreq_sysfs_read (int proc)
// NOTE(review): the path is formatted with `i`, not the parameter `proc`; the declaration
// of `i` is not visible here -- verify it is initialised from `proc`.
76 sprintf(path,"/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed",i);
78 fd = fopen (path, "r");
81 printf("22 FILE OPEN ERROR file=%s\n",path);
// Callback that writes one "time, pe, temperature, frequency" record to the balancer's log.
//   LB          - the TempAwareRefineLB instance, passed as void* (the constructor hands
//                 `this` to CcdCallOnConditionKeep together with this function).
//   curWallTime - value written in the first column of the record.
// `pe` is defined on a line that is not shown here.
92 void printCurrentTemperature(void *LB, double curWallTime)
94 TempAwareRefineLB *taalb = static_cast<TempAwareRefineLB *>(LB);
// NOTE(review): the temperature index is taken modulo physicalCoresPerNode while the
// frequency index is taken modulo logicalCoresPerNode -- confirm both are intended.
96 float temp = taalb->getTemp(pe % taalb->physicalCoresPerNode);
97 int freq = cpufreq_sysfs_read (pe % taalb->logicalCoresPerNode);
98 fprintf(taalb->logFD, "%f, %d, %f, %d\n", curWallTime, pe, temp, freq);
// Linear search of freqs[0 .. numAvail) for `freq`; returns the index of the first match.
// The value returned when no entry matches is not among the lines shown here.
101 int getProcFreqPtr(int *freqs,int numAvail,int freq)
103 for(int i=0;i<numAvail;i++) if(freqs[i]==freq) return i;
// Constructor: records the start time, opens the migration log "migInfo", allocates the
// frequency tables, writes freqs[0] to the cores, and opens a per-PE temperature/frequency
// log that printCurrentTemperature is registered to fill once per second.
//   opt - load balancer options, forwarded to the CBase_TempAwareRefineLB base.
// Several table assignments (freqs[...] and the alternative machine tables) are on lines
// not shown here.
108 TempAwareRefineLB::TempAwareRefineLB(const CkLBOptions &opt): CBase_TempAwareRefineLB(opt)
// Reference point for the elapsed times that work() prints and writes to migFile.
111 starting=CmiWallTimer();
// NOTE(review): no check of this fopen result is visible; work() writes to migFile.
113 migFile=fopen("migInfo","w");
115 //numAvailFreqs = 14;
// freqs holds the selectable frequency values; freqsEffect holds the value that
// populateEffectiveFreq substitutes for the freqs entry at the same index.
117 freqs=new int[numAvailFreqs];
118 freqsEffect=new int[numAvailFreqs];
119 // for might (lab machine)
130 // for tarekc cluster
143 freqsEffect[0] = 1979886;
144 freqsEffect[1] = 1943017;
145 freqsEffect[2] = 1910989;
146 freqsEffect[3] = 1876619;
147 freqsEffect[4] = 1824126;
148 freqsEffect[5] = 1763990;
149 freqsEffect[6] = 1666773;
150 freqsEffect[7] = 1560224;
151 freqsEffect[8] = 1443154;
152 freqsEffect[9] = 1317009;
153 freqsEffect[10] = 1200000;
157 // for grace, humility etc (lab i7 machines)
// Per-PE index into freqs; work() treats index numAvailFreqs-1 as the minimum frequency.
174 procFreqPtr = new int[CkNumPes()];
// For every PE, write freqs[0] to core (i % physicalCoresPerNode).
176 for(int i=0;i<CkNumPes();i++)
179 sprintf(newfreq,"%d",freqs[0]);
180 cpufreq_sysfs_write(newfreq,i%physicalCoresPerNode);
183 // logicalCoresPerChip=4;
187 procFreqNewEffect = NULL;
189 lbname = "TempAwareRefineLB";
191 CkPrintf("[%d] TempAwareRefineLB created\n",CkMyPe());
// One log file per PE, opened in append mode; a header row is written on success.
194 snprintf(logFile, sizeof(logFile), "temp_freq.log.%d", CkMyPe());
195 if ((logFD = fopen(logFile, "a"))) {
196 fprintf(logFD, "Time, PE, Temperature, Frequency\n");
198 CkAbort("Couldn't open temperature/frequency log file");
// Ask the runtime to call printCurrentTemperature(this, time) on the 1-second periodic condition.
202 CcdCallOnConditionKeep(CcdPERIODIC_1second, &printCurrentTemperature, this);
// Presumably the branch compiled when TEMP_LDB is not defined (the preprocessor lines are not shown).
204 CmiAbort("TEMPLB ERROR: not supported without TEMP_LDB flag.\n");
// Fills procFreqNewEffect[] and procFreqEffect[] for the first `numProcs` processors:
// for each processor i, every frequency level j is compared against procFreqNew[i] and
// procFreq[i], and on a match the corresponding freqsEffect[j] is copied.
// An entry whose frequency matches no freqs[j] is not written by the visible code.
209 void TempAwareRefineLB::populateEffectiveFreq(int numProcs)
212 for(int i=0;i<numProcs;i++)
214 for(int j=0;j<numAvailFreqs;j++)
216 if(freqs[j] == procFreqNew[i]) // same freq . copy effective freq
218 procFreqNewEffect[i] = freqsEffect[j];
219 // CkPrintf("** Proc%d j:%d NEWFreq:%d\n",i,j,procFreqNewEffect[i]);
// Same lookup for the current (pre-balancing) frequency.
221 if(freqs[j] == procFreq[i])
223 procFreqEffect[i] = freqsEffect[j];
224 // CkPrintf("-- Proc%d j:%d OLDFreq:%d procFreq:%d \n",i,j,procFreqEffect[i],procFreq[i]);
// Takes the step number `_step` and returns a bool; by its name, whether to balance at that step.
// The return statement is not among the lines shown here.
231 bool TempAwareRefineLB::QueryBalanceNow(int _step)
233 // CkPrintf("[%d] Balancing on step %d\n",CkMyPe(),_step);
// Comparison functor: true when p1's total load is strictly greater than p2's.
// Both ProcInfo arguments are taken by value.
237 class ProcLoadGreater {
239 bool operator()(ProcInfo p1, ProcInfo p2) {
240 return (p1.getTotalLoad() > p2.getTotalLoad());
// Comparison functor: true when p1's total load is strictly less than p2's.
// Both ProcInfo arguments are taken by value.
244 class ProcLoadLesser {
246 bool operator()(ProcInfo p1, ProcInfo p2) {
247 return (p1.getTotalLoad() < p2.getTotalLoad());
// Comparison functor: true when v1's vertex load is strictly greater than v2's.
// Both Vertex arguments are taken by value.
251 class ObjLoadGreater {
253 bool operator()(Vertex v1, Vertex v2) {
254 return (v1.getVertexLoad() > v2.getVertexLoad());
// Sets the frequency of the core this PE maps to: formats `nFreq` as a decimal string and
// passes it to cpufreq_sysfs_write for core (CkMyPe() % physicalCoresPerNode).
// work() invokes this through thisProxy[x].changeFreq(...) for processors whose frequency changed.
//   nFreq - frequency value to write.
258 void TempAwareRefineLB::changeFreq(int nFreq)
261 //CkPrintf("PROC#%d in changeFreq numProcs=%d\n",CkMyPe(),nFreq);
262 // for(int i=0;i<numProcs;i++)
264 // if(procFreq[i]!=procFreqNew[i])
267 sprintf(newfreq,"%d",nFreq);
268 cpufreq_sysfs_write(newfreq,CkMyPe()%physicalCoresPerNode);//i%physicalCoresPerNode);
269 // CkPrintf("PROC#%d freq changing from %d to %d temp=%f\n",i,procFreq[i],procFreqNew[i],procTemp[i]);
// Scans ogr->vertices from index `start` and returns the index of the first vertex whose
// current PE is `pe` and whose new PE is still -1.
// The CkPrintf below is the message for the case where the scan finds nothing; the value
// returned in that case is not among the lines shown here.
276 int getTaskIdForMigration(ObjGraph *ogr,int pe,int start)
278 for(int vert = start; vert < ogr->vertices.size(); vert++)
280 if(ogr->vertices[vert].getCurrentPe()==pe && ogr->vertices[vert].getNewPe()==-1) return vert;
// NOTE(review): vertices.size() is printed with %d; if it returns size_t (presumably a
// std::vector), the argument type does not match the conversion -- confirm and cast.
282 CkPrintf("THERE IS A PROBLEM IN TEMPREFINELB 222 start=%d pe=%d objArraySize=%d!!!!!\n",start,pe,ogr->vertices.size());
// Counts, in `c`, the vertices of ogr whose current PE is `pe` and whose new PE is still -1.
// The declaration of `c` and the return statement are not among the lines shown here.
286 int getNumTasks(ObjGraph *ogr,int pe)
289 for(int vert = 0; vert < ogr->vertices.size(); vert++)
291 if(ogr->vertices[vert].getCurrentPe()==pe && ogr->vertices[vert].getNewPe()==-1) c++;
// Overload that skips already-chosen tasks: among the vertices whose current PE is `pe`
// and whose new PE is still -1, returns the index of the first one that does not appear
// in `assTasks`. The value returned when none qualifies is not among the lines shown here.
// NOTE(review): `assTasks` is passed by value, so the vector is copied on every call.
296 int getTaskIdForMigration(ObjGraph *ogr,int pe,std::vector<int> assTasks)
298 for(int vert = 0; vert < ogr->vertices.size(); vert++)
300 if(ogr->vertices[vert].getCurrentPe()==pe && ogr->vertices[vert].getNewPe()==-1)
303 CkPrintf("======================= pe=%d vert=%d ========================\n",pe,vert);
// Linear membership test of `vert` in assTasks; `hasIt` is declared on a line not shown.
306 for(int i=0;i<assTasks.size();i++)
308 if(vert==assTasks[i])
314 if(hasIt==false) return vert;
320 // CkPrintf("THERE IS A PROBLEM IN TEMPREFINELB 111 pe=%d objArraySize=%d assTasks.size()=%d !!!!!\n",pe,ogr->vertices.size(),assTasks.size());
321 // CmiPrintStackTrace(0);
// Sanity check on `loads`: compares a total `tot` against numProcs and prints a diagnostic
// when they differ by more than 0.01 in either direction.
// The loop body that accumulates `tot` (presumably the sum of loads[0 .. numProcs)) and the
// return statements are not among the lines shown here.
325 bool saneFreqNormLds(double *loads, int numProcs)
328 for(int i=0;i<numProcs;i++)
332 double r=numProcs-tot;
333 if(r>0.01 || r<-0.01)
335 CkPrintf("THere is a problem with LOADs!!! r=%f procs=%d loadSum=%f\n",r,numProcs,tot);
// Strategy entry point. From the visible statements it:
//   1. reads each processor's speed and temperature from `stats` and averages the
//      temperature per chip (logicalCoresPerChip processors per chip);
//   2. chooses a new frequency per processor from the chip temperature and procFreqPtr;
//   3. runs RefinerTemp::Refine to map objects to processors and stores the result in
//      stats->to_proc;
//   4. prints/logs the migration count and sends changeFreq() to processors whose
//      frequency differs from the previous one.
//   stats - load database for this step; read through nprocs(), procs[i].pe_speed,
//           procs[i].pe_temp, n_objs, from_proc and objData, and written through to_proc.
// Many lines (braces, else branches, declarations such as obj/migs/under/over, and the
// opening #ifdef matching the final #endif) are not among the lines shown here.
341 void TempAwareRefineLB::work(LDStats* stats)
344 ////////////////////////////////////////////////////
345 numProcs=stats->nprocs();
346 numChips=numProcs/logicalCoresPerChip;
// NOTE(review): avgChipTemp is allocated here, deleted a few lines below, and allocated
// again; this first assignment overwrites whatever the previous call left in the pointer,
// which then looks like it is never freed. Confirm how avgChipTemp is initialised before
// removing this line.
347 avgChipTemp=new float[numChips];
// Release the per-processor arrays from the previous invocation.
348 if(procFreq!=NULL) delete [] procFreq;
349 if(procFreqEffect!=NULL) delete [] procFreqEffect;
350 // if(procFreqPtr!=NULL) delete [] procFreqPtr;
351 if(procTemp!=NULL) delete [] procTemp;
352 if(procFreqNew!=NULL) delete [] procFreqNew;
353 if(procFreqNewEffect!=NULL) delete [] procFreqNewEffect;
354 if(avgChipTemp!=NULL) delete [] avgChipTemp;
356 procFreq = new int[numProcs];
357 procFreqEffect = new int[numProcs];
358 // procFreqPtr = new int[numProcs];
359 procTemp = new float[numProcs];
360 procFreqNew = new int[numProcs];
361 procFreqNewEffect = new int[numProcs];
362 avgChipTemp = new float[numChips];
364 for(int i=0;i<numChips;i++) avgChipTemp[i]=0;
// Copy speed and temperature out of stats and accumulate temperatures per chip.
366 for(int i=0;i<numProcs;i++)
368 procFreq[i] = stats->procs[i].pe_speed;
369 procTemp[i] = stats->procs[i].pe_temp;
370 // procFreqPtr[i] = getProcFreqPtr(freqs,numAvailFreqs,procFreq[i]);
371 avgChipTemp[i/logicalCoresPerChip] += procTemp[i];
// Turn the per-chip sums into averages.
374 for(int i=0;i<numChips;i++)
376 avgChipTemp[i]/=logicalCoresPerChip;
377 //CkPrintf("---- CHIP#%d has temp=%f ----------\n",i,avgChipTemp[i]);
// Per chip: compare the average temperature with MAX_TEMP and pick the frequency level.
379 for(int i=0;i<numChips;i++)
382 if(avgChipTemp[i] > MAX_TEMP)
// Index numAvailFreqs-1 is the minimum frequency: nothing lower is available.
385 if(procFreqPtr[i*logicalCoresPerChip]==numAvailFreqs-1)
387 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]];
388 CkPrintf("CHIP#%d RUNNING HOT EVEN WITH MIN FREQUENCY!!\n",i);
392 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++)
// Move one level towards the minimum frequency.
394 if(procFreqPtr[j]<numAvailFreqs-1) procFreqPtr[j]++;
// NOTE(review): the testing block below overwrites the level chosen just above (chip 0
// forced to the last level, every other chip to level 0).
396 /// PLEASE COMMENT OUT .. TESTING ONLY
397 if(i==0) {procFreqPtr[j] = numAvailFreqs-1;/*CkPrintf("C for i:%d\n",j);*/}
398 //if(i<numChips-1) procFreqPtr[j]=0;
399 else procFreqPtr[j]=0;
400 /////////////////////////
402 procFreqNew[j] = freqs[procFreqPtr[j]];
405 CkPrintf("!!!!! Chip#%d running HOT shifting from %d to %d temp=%f\n",i,procFreq[i*logicalCoresPerChip],procFreqNew[i*logicalCoresPerChip],avgChipTemp[i]);
410 // if(avgChipTemp[i] < MAX_TEMP-1)
// Chip not at level 0: the "COLD" path; the same testing override appears here.
413 if(procFreqPtr[i*logicalCoresPerChip]>0)
415 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++)
420 /// PLEASE COMMENT OUT .. TESTING ONLY
421 if(i==0) procFreqPtr[j] = numAvailFreqs-1;
422 //if(i<numChips-1) procFreqPtr[j]=0;
423 else procFreqPtr[j]=0;
424 /////////////////////////
426 procFreqNew[j] = freqs[procFreqPtr[j]];
429 CkPrintf("!!!!! Chip#%d running COLD shifting from %d to %d temp=%f\n",i,procFreq[i*logicalCoresPerChip],procFreqNew[i*logicalCoresPerChip],avgChipTemp[i]);
434 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]];
// `under` and `over` are declared and set on lines not shown here.
438 if(under==0 && over==0)
440 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[procFreqPtr[j]];
443 //if(i==5) for(int j=i*c(resPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[numAvailFreqs-1];
// NOTE(review): sets every processor of the chip to freqs[0]; the condition under which
// this statement runs is not visible -- confirm it does not discard the choice made above.
446 for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[0];
448 //for(int j=i*logicalCoresPerChip;j<i*logicalCoresPerChip+logicalCoresPerChip;j++) procFreqNew[j] = freqs[0];
450 //for(int x=0;x<numProcs;x+=logicalCoresPerChip) if(procFreq[x]!=procFreqNew[x]) thisProxy[x].changeFreq(procFreqNew[x]);
451 //for(int x=0;x<numProcs;x++) CkPrintf("Procs#%d freq %d\n",x,procFreqNew[x]);
452 ////////////////////////////////////////////////////
456 int n_pes = stats->nprocs();
458 // CkPrintf("[%d] RefineLB strategy\n",CkMyPe());
460 // RemoveNonMigratable(stats, n_pes);
462 // get original object mapping
463 int* from_procs = RefinerTemp::AllocProcs(n_pes, stats);
464 for(obj=0;obj<stats->n_objs;obj++) {
465 int pe = stats->from_proc[obj];
466 from_procs[obj] = pe;
468 // Get a new buffer to refine into
469 populateEffectiveFreq(numProcs);
470 int* to_procs = RefinerTemp::AllocProcs(n_pes, stats);
// The refiner is given the raw procFreq/procFreqNew arrays; the effective-frequency arrays
// filled by populateEffectiveFreq are only used in the commented-out variant.
471 // RefinerTemp refiner(1.03,procFreqEffect,procFreqNewEffect,n_pes); // overload tolerance=1.05
472 RefinerTemp refiner(1.03,procFreq,procFreqNew,n_pes);
473 refiner.Refine(n_pes, stats, from_procs, to_procs);
// NOTE(review): no delete[] for numMigs is visible in this function -- confirm it is freed.
476 int *numMigs = new int[numProcs];
478 for(int mm=0;mm<numProcs;mm++) numMigs[mm] = 0;
// Count objects per destination processor and record the new placement in stats->to_proc.
479 for(obj=0;obj<stats->n_objs;obj++) {
480 int pe = stats->from_proc[obj];
481 numMigs[to_procs[obj]]++;
482 //stats->objData[obj].objID();
483 LDObjData &odata = stats->objData[obj];
// NOTE(review): one computeInfo is heap-allocated per object and no delete is visible;
// only c1->id is set here -- confirm it is needed and released.
484 computeInfo *c1 = new computeInfo();
485 c1->id = odata.objID();
486 //if(to_procs[obj]==3) CkPrintf("[%d,%d] going to 3 totE:%d\n",c1->id.getID()[0],c1->id.getID()[1],totE++);//,(stats->objData[obj].objID().getID())[1],totE++);
487 if (to_procs[obj] != pe) {
489 //if (_lb_args.debug()>=2)
491 // CkPrintf("[%d,%d] Obj %d migrating from %d to %d\n",
492 // c1->id.getID()[0],c1->id.getID()[1],obj,pe,to_procs[obj]);
494 stats->to_proc[obj] = to_procs[obj];
498 for(int mm=0;mm<numProcs;mm++)
500 //CkPrintf("PROC#%d freq:%d objs:%d ----------\n",mm,procFreqNew[mm],numMigs[mm]);
// Report the migration count (`migs`, maintained on lines not shown) with the time since construction.
502 CkPrintf("TEMPLB INFO: Total Objs:%d migrations:%d time:%f \n",stats->n_objs,migs,CmiWallTimer()-starting);
503 fprintf(migFile,"%f %d\n",CmiWallTimer()-starting,migs);
504 // Free the refine buffers
505 RefinerTemp::FreeProcs(from_procs);
506 RefinerTemp::FreeProcs(to_procs);
509 //for(int x=0;x<numProcs;x++) CkPrintf("Procs#%d ------- freq %d\n",x,procFreqNew[x]);
// Send changeFreq to the first processor of every chip whose frequency changed.
511 for(int x=0;x<numProcs;x+=logicalCoresPerChip)
513 if(procFreq[x]!=procFreqNew[x])
515 CkPrintf("Chaning the freq for PROC#%d\n",x);
516 thisProxy[x].changeFreq(procFreqNew[x]);
// NOTE(review): this second loop sends changeFreq to every changed processor, including
// those covered by the loop above; the two may sit in different conditional branches
// that are not visible -- confirm which one is meant to run.
520 for(int x=0;x<numProcs;x++)
522 //CkPrintf("--------- Proc#%d %d numProcs=%d\n",x,procFreqNew[x],numProcs);
523 if(procFreq[x]!=procFreqNew[x]) thisProxy[x].changeFreq(procFreqNew[x]);
525 #endif // TEMP_LDB endif
527 #include "TempAwareRefineLB.def.h"