3 A system for exposing application and runtime "control points"
4 to the dynamic optimization framework.
9 #ifndef __CONTROLPOINTS_H__
10 #define __CONTROLPOINTS_H__
12 #include "conv-config.h"
14 #if CMK_WITH_CONTROLPOINT
17 #include "ControlPoints.decl.h"
38 #include <charm-api.h>
40 #include "LBDatabase.h"
41 #include "arrayRedistributor.h"
42 #include "pathHistory.h"
45 #include <cp_effects.h>
48 * \addtogroup ControlPointFramework
54 /* readonly */ extern CProxy_controlPointManager controlPointManagerProxy
;
55 /* readonly */ extern int random_seed
;
56 /* readonly */ extern long controlPointSamplePeriod
;
57 /* readonly */ extern int whichTuningScheme
;
58 /* readonly */ extern bool writeDataFileAtShutdown
;
59 /* readonly */ extern bool shouldFilterOutputData
;
60 /* readonly */ extern bool loadDataFileAtStartup
;
61 /* readonly */ extern char CPDataFilename
[512];
65 void registerCPChangeCallback(CkCallback cb
, bool frameworkShouldAdvancePhase
);
67 void setFrameworkAdvancePhase(bool frameworkShouldAdvancePhase
);
70 void registerControlPointTiming(double time
);
72 /// Called once each application step. Can be used instead of registerControlPointTiming()
73 void controlPointTimingStamp();
74 FLINKAGE
void FTN_NAME(CONTROLPOINTTIMINGSTAMP
,controlpointtimingstamp
)();
77 /// The application specifies that it is ready to proceed to a new set of control point values.
78 /// This should be called after registerControlPointTiming()
79 /// This should be called before calling controlPoint()
81 FLINKAGE
void FTN_NAME(GOTONEXTPHASE
,gotonextphase
)();
83 /// Return an integral power of 2 between c1 and c2
84 /// The value returned will likely change between subsequent invocations
85 int controlPoint2Pow(const char *name
, int c1
, int c2
);
87 /// Return an integer between lb and ub inclusive
88 /// The value returned will likely change between subsequent invocations
89 int controlPoint(const char *name
, int lb
, int ub
);
91 /// A fortran callable one. I couldn't figure out how to pass a string from fortran to C++ yet
92 /// So far fortran can only have one control point
93 FLINKAGE
int FTN_NAME(CONTROLPOINT
,controlpoint
)(CMK_TYPEDEF_INT4
*lb
, CMK_TYPEDEF_INT4
*ub
);
96 /// Return an integer from the provided vector of values
97 /// The value returned will likely change between subsequent invocations
98 int controlPoint(const char *name
, std::vector
<int>& values
);
100 /// Write output data to disk. Callable from user program (for example, to periodically flush to disk if program might run out of time, or NAMD)
101 void ControlPointWriteOutputToDisk();
103 /// The application specifies that it is ready to proceed to a new set of control point values.
104 /// This should be called after registerControlPointTiming()
105 /// This should be called before calling controlPoint()
106 void gotoNextPhase();
111 /// A message used for signaling changes in control point values
112 class controlPointMsg
: public CMessage_controlPointMsg
{
121 /// A container that stores idle time statistics (min/max/avg etc.)
122 class idleTimeContainer
{
134 bool isValid() const{
135 return (min
>= 0.0 && avg
>= min
&& max
>= avg
&& max
<= 1.0);
140 CkPrintf("[%d] Idle Time is Min=%.2lf%% Avg=%.2lf%% Max=%.2lf%%\n", CkMyPe(), min
*100.0, avg
*100.0, max
*100.0);
142 CkPrintf("[%d] Idle Time is invalid\n", CkMyPe(), min
*100.0, avg
*100.0, max
*100.0);
148 /// A container that stores overhead statistics (min/max/avg etc.)
149 class overheadContainer
{
161 bool isValid() const{
162 return (min
>= 0.0 && avg
>= min
&& max
>= avg
&& max
<= 1.0);
167 CkPrintf("[%d] Overhead Time is Min=%.2lf%% Avg=%.2lf%% Max=%.2lf%%\n", CkMyPe(), min
*100.0, avg
*100.0, max
*100.0);
169 CkPrintf("[%d] Overhead Time is invalid\n", CkMyPe(), min
*100.0, avg
*100.0, max
*100.0);
176 /// Stores data for a phase (a time range in which a single set of control point values is used).
177 /// The data stored includes the control point values, a set of timings registered by the application,
178 /// The critical paths detected, the max memory usage, and the idle time.
179 class instrumentedPhase
{
181 std::map
<std::string
,int> controlPoints
; // The control point values for this phase(don't vary within the phase)
182 std::vector
<double> times
; // A list of times observed for iterations in this phase
184 #if USE_CRITICAL_PATH_HEADER_ARRAY
185 std::vector
<PathHistoryTableEntry
> criticalPaths
;
188 double memoryUsageMB
;
190 idleTimeContainer idleTime
;
191 overheadContainer overheadTime
;
194 /** Approximately records the average message size for an entry method. */
195 double bytesPerInvoke
;
197 /** Records the average grain size (might be off a bit due to non application entry methods), in seconds */
202 memoryUsageMB
= -1.0;
204 bytesPerInvoke
= -1.0;
208 controlPoints
.clear();
210 memoryUsageMB
= -1.0;
212 bytesPerInvoke
= -1.0;
213 // criticalPaths.clear();
216 // Provide a previously computed value, or a value from a previous run
217 bool haveValueForName(const char* name
){
219 return (controlPoints
.count(n
)>0);
222 void operator=(const instrumentedPhase
& p
){
223 controlPoints
= p
.controlPoints
;
225 memoryUsageMB
= p
.memoryUsageMB
;
230 bool operator<(const instrumentedPhase
& p
){
231 CkAssert(hasSameKeysAs(p
));
232 std::map
<std::string
,int>::iterator iter1
= controlPoints
.begin();
233 std::map
<std::string
,int>::const_iterator iter2
= p
.controlPoints
.begin();
234 for(;iter1
!= controlPoints
.end() && iter2
!= p
.controlPoints
.end(); iter1
++, iter2
++){
235 if(iter1
->second
< iter2
->second
){
243 // Determines if the control point values and other information exists
244 bool hasValidControlPointValues(){
245 std::map
<std::string
,int>::iterator iter
;
246 for(iter
= controlPoints
.begin();iter
!= controlPoints
.end(); iter
++){
247 if(iter
->second
== -1){
255 // int medianCriticalPathIdx() const{
256 // // Bubble sort the critical path indices by Time
257 // int numPaths = criticalPaths.size();
259 // int *sortedPaths = new int[numPaths];
260 // for(int i=0;i<numPaths;i++){
261 // sortedPaths[i] = i;
264 // for(int j=0;j<numPaths;j++){
265 // for(int i=0;i<numPaths-1;i++){
266 // if(criticalPaths[sortedPaths[i]].getTotalTime() < criticalPaths[sortedPaths[i+1]].getTotalTime()){
267 // // swap sortedPaths[i], sortedPaths[i+1]
268 // int tmp = sortedPaths[i+1];
269 // sortedPaths[i+1] = sortedPaths[i];
270 // sortedPaths[i] = tmp;
274 // int result = sortedPaths[numPaths/2];
275 // delete[] sortedPaths;
284 bool operator==(const instrumentedPhase
& p
){
285 CkAssert(hasSameKeysAs(p
));
286 std::map
<std::string
,int>::iterator iter1
= controlPoints
.begin();
287 std::map
<std::string
,int>::const_iterator iter2
= p
.controlPoints
.begin();
288 for(;iter1
!= controlPoints
.end() && iter2
!= p
.controlPoints
.end(); iter1
++, iter2
++){
289 if(iter1
->second
!= iter2
->second
){
296 /// Verify the names of the control points are consistent
297 /// note: std::map stores the pairs in a sorted order based on their first component
298 bool hasSameKeysAs(const instrumentedPhase
& p
){
300 if(controlPoints
.size() != p
.controlPoints
.size())
303 std::map
<std::string
,int>::iterator iter1
= controlPoints
.begin();
304 std::map
<std::string
,int>::const_iterator iter2
= p
.controlPoints
.begin();
306 for(;iter1
!= controlPoints
.end() && iter2
!= p
.controlPoints
.end(); iter1
++, iter2
++){
307 if(iter1
->first
!= iter2
->first
)
315 void addAllNames(std::set
<std::string
> names_
) {
317 std::set
<std::string
> names
= names_
;
319 // Remove all the names that we already have
320 std::map
<std::string
,int>::iterator iter
;
322 for(iter
= controlPoints
.begin(); iter
!= controlPoints
.end(); iter
++){
323 names
.erase(iter
->first
);
326 // Add -1 values for each name we didn't find
327 std::set
<std::string
>::iterator iter2
;
328 for(iter2
= names
.begin(); iter2
!= names
.end(); iter2
++){
329 controlPoints
.insert(std::make_pair(*iter2
,-1));
330 CkPrintf("One of the datasets was missing a value for %s, so -1 was used\n", iter2
->c_str());
338 std::map
<std::string
,int>::const_iterator iter
;
340 if(controlPoints
.size() == 0){
341 CkPrintf("no control point values found\n");
344 for(iter
= controlPoints
.begin(); iter
!= controlPoints
.end(); iter
++){
345 const std::string name
= iter
->first
;
346 const int val
= iter
->second
;
347 CkPrintf("%s ---> %d\n", name
.c_str(), val
);
352 /** Determine the median time for this phase */
354 std::vector
<double> sortedTimes
= times
;
355 std::sort(sortedTimes
.begin(), sortedTimes
.end());
356 if(sortedTimes
.size()>0){
357 return sortedTimes
[sortedTimes
.size() / 2];
359 CkAbort("Cannot compute medianTime for empty sortedTimes vector");
368 /// Stores and manipulate all known instrumented phases. One instance of this exists on each PE in its local controlPointManager
369 class instrumentedData
{
372 /// Stores all known instrumented phases(loaded from file, or from this run)
373 std::vector
<instrumentedPhase
*> phases
;
375 /// get control point names for all phases
376 std::set
<std::string
> getNames(){
377 std::set
<std::string
> names
;
379 std::vector
<instrumentedPhase
*>::iterator iter
;
380 for(iter
= phases
.begin();iter
!=phases
.end();iter
++) {
382 std::map
<std::string
,int>::iterator iter2
;
383 for(iter2
= (*iter
)->controlPoints
.begin(); iter2
!= (*iter
)->controlPoints
.end(); iter2
++){
384 names
.insert(iter2
->first
);
394 std::set
<std::string
> names
= getNames();
396 std::vector
<instrumentedPhase
*>::iterator iter
;
397 for(iter
= phases
.begin();iter
!=phases
.end();iter
++) {
398 (*iter
)->addAllNames(names
);
403 /// Remove one phase with invalid control point values if found
404 bool filterOutOnePhase(){
406 // Note: calling erase on a vector will invalidate any iterators beyond the deletion point
407 std::vector
<instrumentedPhase
*>::iterator iter
;
408 for(iter
= phases
.begin(); iter
!= phases
.end(); iter
++) {
409 if(! (*iter
)->hasValidControlPointValues() || (*iter
)->times
.size()==0){
410 // CkPrintf("Filtered out a phase with incomplete control point values\n");
414 // CkPrintf("Not filtering out some phase with good control point values\n");
421 /// Drop any phases that do not contain timings or control point values
422 void filterOutIncompletePhases(){
424 while(filterOutOnePhase()){
430 std::string
toString(){
431 std::ostringstream s
;
435 s
<< "# Data for use with Isaac Dooley's Control Point Framework\n";
436 s
<< "# Number of instrumented timings in this file:\n";
437 s
<< phases
.size() << "\n" ;
439 if(phases
.size() > 0){
441 std::map
<std::string
,int> &ps
= phases
[0]->controlPoints
;
442 std::map
<std::string
,int>::iterator cpiter
;
446 s
<< "# number of named control points:\n";
447 s
<< ps
.size() << "\n";
448 for(cpiter
= ps
.begin(); cpiter
!= ps
.end(); cpiter
++){
449 s
<< cpiter
->first
<< "\n";
454 s
<< "# There are " << ps
.size() << " control points\n";
455 s
<< "# number of recorded phases: " << phases
.size() << "\n";
457 s
<< "# Memory (MB)\tIdle Min\tIdle Avg\tIdle Max\tOverhead Min\tOverhead Avg\tOverhead Max\tByte Per Invoke\tGrain Size\t";
458 for(cpiter
= ps
.begin(); cpiter
!= ps
.end(); cpiter
++){
459 s
<< cpiter
->first
<< "\t";
461 s
<< "Median Timing\tTimings\n";
464 std::vector
<instrumentedPhase
*>::iterator runiter
;
465 for(runiter
=phases
.begin();runiter
!=phases
.end();runiter
++){
467 // Print the memory usage
468 s
<< (*runiter
)->memoryUsageMB
<< "\t";
470 s
<< (*runiter
)->idleTime
.min
<< "\t" << (*runiter
)->idleTime
.avg
<< "\t" << (*runiter
)->idleTime
.max
<< "\t";
471 s
<< (*runiter
)->overheadTime
.min
<< "\t" << (*runiter
)->overheadTime
.avg
<< "\t" << (*runiter
)->overheadTime
.max
<< "\t";
474 s
<< (*runiter
)->bytesPerInvoke
<< "\t";
476 s
<< (*runiter
)->grainSize
<< "\t";
479 // Print the control point values
480 for(cpiter
= (*runiter
)->controlPoints
.begin(); cpiter
!= (*runiter
)->controlPoints
.end(); cpiter
++){
481 s
<< cpiter
->second
<< "\t";
484 // Print the median time
485 if((*runiter
)->times
.size()>0){
486 s
<< (*runiter
)->medianTime() << "\t";
492 std::vector
<double>::iterator titer
;
493 for(titer
= (*runiter
)->times
.begin(); titer
!= (*runiter
)->times
.end(); titer
++){
508 /// Verify that all our phases of data have the same sets of control point names
510 if(phases
.size() > 1){
511 instrumentedPhase
*firstpoint
= phases
[0];
512 std::vector
<instrumentedPhase
*>::iterator iter
;
513 for(iter
= phases
.begin();iter
!=phases
.end();iter
++){
514 CkAssert( firstpoint
->hasSameKeysAs(*(*iter
)));
520 // Find the fastest time from previous runs
521 instrumentedPhase
* findBest(){
522 CkAssert(phases
.size()>1);
524 double total_time
= 0.0; // total for all times
527 instrumentedPhase
* best_phase
;
530 double best_phase_avgtime
= std::numeric_limits
<double>::max();
532 double best_phase_avgtime
= DBL_MAX
;
535 int valid_phase_count
= 0;
537 std::vector
<instrumentedPhase
*>::iterator iter
;
538 for(iter
= phases
.begin();iter
!=phases
.end();iter
++){
539 if((*iter
)->hasValidControlPointValues()){
542 double total_for_phase
= 0.0;
545 // iterate over all times for this control point configuration
546 std::vector
<double>::iterator titer
;
547 for(titer
= (*iter
)->times
.begin(); titer
!= (*iter
)->times
.end(); titer
++){
549 total_time
+= *titer
;
550 total_for_phase
+= *titer
;
554 double phase_average_time
= total_for_phase
/ (double)phase_count
;
556 if(phase_average_time
< best_phase_avgtime
){
558 best_phase_avgtime
= phase_average_time
;
564 CkAssert(total_count
> 0);
566 double avg
= total_time
/ total_count
;
569 CkPrintf("Best average time for a phase was %.1lf\n", best_phase_avgtime
);
570 CkPrintf("Mean time for all %d times in the %d valid recorded phases was %.1lf\n", total_count
, valid_phase_count
, avg
);
573 // Compute standard deviation
575 for(iter
= phases
.begin();iter
!=phases
.end();iter
++){
576 if((*iter
)->hasValidControlPointValues()){
577 std::vector
<double>::iterator titer
;
578 for(titer
= (*iter
)->times
.begin(); titer
!= (*iter
)->times
.end(); titer
++){
579 sumx
+= (avg
- *titer
)*(avg
- *titer
);
584 double std_dev
= sqrt(sumx
/ total_count
);
587 CkPrintf("Standard Deviation for previous runs was %.2lf or %.1lf%% of mean\n", std_dev
, std_dev
/avg
*100.0);
588 CkPrintf("The best phase average time was %.1lf%% faster than the mean\n", (avg
-best_phase_avgtime
)/avg
*100.0);
598 /** A class that implements the Nelder Mead Simplex Optimization Algorithm */
599 class simplexScheme
{
601 /** The states used by the Nelder Mead Simplex Algorithm.
602 The transitions between these states are displayed in the NelderMeadStateDiagram.pdf diagram.
604 typedef enum simplexStateEnumT
{beginning
, reflecting
, expanding
, contracting
, doneExpanding
, stillContracting
} simplexStateT
;
606 /// The indices into the allData->phases that correspond to the current simplex used one of the tuning schemes.
607 std::set
<int> simplexIndices
;
608 simplexStateT simplexState
;
610 /// Reflection coefficient
612 // Contraction coefficient
614 // Expansion coefficient
618 /// The first phase that was used by the this scheme. This helps us ignore a few startup phases that are out of our control.
619 int firstSimplexPhase
;
621 // Worst performing point in the simplex
624 std::vector
<double> worst
; // p_h
626 // Centroid of remaining points in the simplex
627 std::vector
<double> centroid
;
629 // Best performing point in the simplex
632 std::vector
<double> best
;
635 std::vector
<double> P
;
639 std::vector
<double> P2
;
642 // A set of phases for which (P_i+P_l)/2 ought to be evaluated
643 std::set
<int> stillMustContractList
;
647 void computeCentroidBestWorst(std::map
<std::string
, std::pair
<int,int> > & controlPointSpace
, std::map
<std::string
,int> &newControlPoints
, const int phase_id
, instrumentedData
&allData
);
649 void doReflection(std::map
<std::string
, std::pair
<int,int> > & controlPointSpace
, std::map
<std::string
,int> &newControlPoints
, const int phase_id
, instrumentedData
&allData
);
650 void doExpansion(std::map
<std::string
, std::pair
<int,int> > & controlPointSpace
, std::map
<std::string
,int> &newControlPoints
, const int phase_id
, instrumentedData
&allData
);
651 void doContraction(std::map
<std::string
, std::pair
<int,int> > & controlPointSpace
, std::map
<std::string
,int> &newControlPoints
, const int phase_id
, instrumentedData
&allData
);
654 std::vector
<double> pointCoords(instrumentedData
&allData
, int i
);
656 inline int keepInRange(int v
, int lb
, int ub
){
664 void printSimplex(instrumentedData
&allData
){
667 for(std::set
<int>::iterator iter
= simplexIndices
.begin(); iter
!= simplexIndices
.end(); ++iter
){
668 sprintf(s
+strlen(s
), "%d: ", *iter
);
670 for(std::map
<std::string
,int>::iterator citer
= allData
.phases
[*iter
]->controlPoints
.begin(); citer
!= allData
.phases
[*iter
]->controlPoints
.end(); ++citer
){
671 sprintf(s
+strlen(s
), " %d", citer
->second
);
674 sprintf(s
+strlen(s
), "\n");
676 CkPrintf("Current simplex is:\n%s\n", s
);
682 simplexState(beginning
),
683 alpha(1.0), beta(0.5), gamma(2.0),
684 firstSimplexPhase(-1),
687 // Make sure the coefficients are reasonable
688 CkAssert(alpha
>= 0);
689 CkAssert(beta
>= 0.0 && beta
<= 1.0);
690 CkAssert(gamma
>= 1.0);
693 void adapt(std::map
<std::string
, std::pair
<int,int> > & controlPointSpace
, std::map
<std::string
,int> &newControlPoints
, const int phase_id
, instrumentedData
&allData
);
699 class controlPointManager
: public CBase_controlPointManager
{
702 instrumentedData allData
;
704 /// The lower and upper bounds for each named control point
705 std::map
<std::string
, std::pair
<int,int> > controlPointSpace
;
707 /// A set of named control points whose values cannot change within a single run of an application
708 std::set
<std::string
> staticControlPoints
;
710 /// @deprecated Sets of entry point ids that are affected by some named control points
711 std::map
<std::string
, std::set
<int> > affectsPrioritiesEP
;
712 /// @deprecated Sets of entry array ids that are affected by some named control points
713 std::map
<std::string
, std::set
<int> > affectsPrioritiesArray
;
716 /// The control points to be used in the next phase. In gotoNextPhase(), these will be used
717 std::map
<std::string
,int> newControlPoints
;
718 int generatedPlanForStep
;
723 /// A user supplied callback to call when control point values are to be changed
724 CkCallback controlPointChangeCallback
;
725 bool haveControlPointChangeCallback
;
726 bool frameworkShouldAdvancePhase
;
730 bool alreadyRequestedMemoryUsage
;
731 bool alreadyRequestedIdleTime
;
732 bool alreadyRequestedAll
;
737 controlPointManager();
739 ~controlPointManager();
741 void pup(PUP::er
&p
);
743 controlPointManager(CkMigrateMessage
* m
) : CBase_controlPointManager(m
) {
744 // CkPrintf("Warning: control point framework likely won't work after checkpoint restart, please fix this problem if you need the functionality.");
748 /// Loads the previous run data file
751 /// Add the current data to allData and output it to a file
752 void writeDataFile();
754 /// User can register a callback that is called when application should advance to next phase
755 void setCPCallback(CkCallback cb
, bool _frameworkShouldAdvancePhase
);
757 void setFrameworkAdvancePhase(bool _frameworkShouldAdvancePhase
);
759 /// Called periodically by the runtime to handle the control points
760 /// Currently called on each PE
761 void processControlPoints();
763 /// Determine if any control point is known to affect an entry method
764 bool controlPointAffectsThisEP(int ep
);
766 /// Determine if any control point is known to affect a chare array
767 bool controlPointAffectsThisArray(int array
);
769 /// Generate a plan (new control point values) once per phase
772 /// The data for the current phase
773 instrumentedPhase
*currentPhaseData();
775 /// The data from the previous phase
776 instrumentedPhase
*previousPhaseData();
778 /// The data from two phases back
779 instrumentedPhase
*twoAgoPhaseData();
781 /// Called by either the application or the Control Point Framework to advance to the next phase
782 void gotoNextPhase();
784 /// An application uses this to register an instrumented timing for this phase
785 void setTiming(double time
);
787 /// Check to see if we are in the shutdown process, and handle it appropriately.
788 void checkForShutdown();
790 /// Start shutdown procedures for the controlPoints
791 /// module(s). CkContinueExit will be called once all outstanding
792 /// operations have completed (e.g. waiting for idle time & memory
793 /// usage to be gathered)
796 /// All outstanding operations have completed, so do the shutdown now. First write files to disk, and then call CkExit().
799 /// Write data to disk (when needed), likely at exit
800 void writeOutputToDisk();
803 /// Entry method called on all PEs to request memory usage
804 void requestMemoryUsage(CkCallback cb
);
805 /// All processors reduce their memory usages to this method
806 void gatherMemoryUsage(CkReductionMsg
*msg
);
809 /// Entry method called on all PEs to request memory usage
810 void requestIdleTime(CkCallback cb
);
811 /// All processors reduce their memory usages in requestIdleTime() to this method
812 void gatherIdleTime(CkReductionMsg
*msg
);
815 /// Entry method called on all PEs to request Idle, Overhead, and Memory measurements
816 void requestAll(CkCallback cb
);
817 /// All processors reduce their memory usages in requestIdleTime() to this method
818 void gatherAll(CkReductionMsg
*msg
);
821 /* /// Inform the control point framework that a named control point affects the priorities of some array */
822 /* void associatePriorityArray(const char *name, int groupIdx); */
824 /* /// Inform the control point framework that a named control point affects the priority of some entry method */
825 /* void associatePriorityEntry(const char *name, int idx); */