src/ck-ldb/TreeMatchLB.C

   1 /**
   2  * \addtogroup CkLdb
   3 */
   4 /*@{*/
   5
   6 #include <charm++.h>
   7 #include "tm_tree.h"
   8 #include "tm_mapping.h"
   9 #include "TreeMatchLB.h"
  10 #include "ckgraph.h"
  11 #include <algorithm>
  12
  13
  14
  15 CreateLBFunc_Def(TreeMatchLB, "TreeMatch load balancer, like a normal one but with empty strategy")
  16
  17 #include "TreeMatchLB.def.h"
  18
  19 TreeMatchLB::TreeMatchLB(const CkLBOptions &opt): CBase_TreeMatchLB(opt)
  20 {
  21   lbname = (char*)"TreeMatchLB";
  22   if (CkMyPe() == 0)
  23     CkPrintf("[%d] TreeMatchLB created\n",CkMyPe());
  24 }
  25
  26 bool TreeMatchLB::QueryBalanceNow(int _step)
  27 {
  28   return true;
  29 }
  30
  31
  32 double *get_comm_speed(int *depth){
  33   double *res;
  34   int i;
  35
  36   *depth=5;
  37
  38   res=(double*)malloc(sizeof(double)*(*depth));
  39   res[0]=1;
  40
  41   for(i=1;i<*depth;i++){
  42     res[i]=res[i-1]*2;
  43   }
  44   return res;
  45 }
  46
  47 tm_topology_t *build_abe_topology(int nb_procs){
  48   int arity[4]={nb_procs,2,2,2}; /*abe memory hierarchy. Should be given by HWLOC*/
  49   int nbring[8]={0,2,4,6,1,3,5,7}; /*abe cores numbering. Should be given by HWLOC*/
  50   return build_synthetic_topology(arity,5,nbring,8);
  51 }
  52
  53 class ProcLoadGreater {
  54   public:
  55     bool operator()(ProcInfo p1, ProcInfo p2) {
  56       return (p1.totalLoad() > p2.totalLoad());
  57     }
  58 };
  59
  60 class ObjLoadGreater {
  61   public:
  62     bool operator()(Vertex v1, Vertex v2) {
  63       return (v1.getVertexLoad() > v2.getVertexLoad());
  64     }
  65 };
  66
  67 void TreeMatchLB::work(BaseLB::LDStats* stats)
  68 {
  69   /** ========================= 1st Do Load Balancing =======================*/
  70
  71   /** ========================== INITIALIZATION ============================= */
  72   ProcArray *parr = new ProcArray(stats);       // Processor Array
  73   ObjGraph *ogr = new ObjGraph(stats);          // Object Graph
  74
  75   /** ============================= STRATEGY ================================ */
  76   parr->resetTotalLoad();
  77
  78   if (_lb_args.debug()>1)
  79     CkPrintf("[%d] In GreedyLB strategy\n",CkMyPe());
  80
  81   int vert;
  82
  83   // max heap of objects
  84   std::sort(ogr->vertices.begin(), ogr->vertices.end(), ObjLoadGreater());
  85   // min heap of processors
  86   std::make_heap(parr->procs.begin(), parr->procs.end(), ProcLoadGreater());
  87
  88   for(vert = 0; vert < ogr->vertices.size(); vert++) {
  89     // Pop the least loaded processor
  90     ProcInfo p = parr->procs.front();
  91     std::pop_heap(parr->procs.begin(), parr->procs.end(), ProcLoadGreater());
  92     parr->procs.pop_back();
  93
  94     // Increment the load of the least loaded processor by the load of the
  95     // 'heaviest' unmapped object
  96     p.totalLoad() += ogr->vertices[vert].getVertexLoad();
  97     ogr->vertices[vert].setNewPe(p.getProcId());
  98
  99     // Insert the least loaded processor with load updated back into the heap
 100     parr->procs.push_back(p);
 101     std::push_heap(parr->procs.begin(), parr->procs.end(), ProcLoadGreater());
 102   }
 103
 104   /** ============================== CLEANUP ================================ */
 105   ogr->convertDecisions(stats);         // Send decisions back to LDStats
 106
 107
 108   /** ====================== 2nd do Topology aware mapping ====================*/
 109
 110
 111
 112   int nb_procs;
 113   double **comm_mat;
 114   int i;
 115   int *object_mapping, *permutation;
 116
 117
 118   /* get number of processors and teh greedy load balancing*/
 119   nb_procs = stats->nprocs();
 120   object_mapping=stats->to_proc.getVec();
 121
 122
 123   stats->makeCommHash();
 124   // allocate communication matrix
 125   comm_mat=(double**)malloc(sizeof(double*)*nb_procs);
 126   for(i=0;i<nb_procs;i++){
 127     comm_mat[i]=(double*)calloc(nb_procs,sizeof(double));
 128   }
 129
 130   /* Build the communicartion matrix*/
 131   for(i=0;i<stats->n_comm;i++){
 132     LDCommData &commData = stats->commData[i];
 133     if((!commData.from_proc())&&(commData.recv_type()==LD_OBJ_MSG)){
 134       /* object_mapping[i] is the processors of object i*/
 135       int from = object_mapping[stats->getHash(commData.sender)];
 136       int to = object_mapping[stats->getHash(commData.receiver.get_destObj())];
 137       if(from!=to){
 138         comm_mat[from][to]+=commData.bytes;
 139         comm_mat[to][from]+=commData.bytes;
 140       }
 141     }
 142   }
 143
 144   /* build the topology of the hardware (abe machine here)*/
 145   tm_topology_t *topology=build_abe_topology(nb_procs);
 146   display_topology(topology);
 147   /* compute the affinity tree */
 148   tree_t *comm_tree=build_tree_from_topology(topology,comm_mat,nb_procs,NULL,NULL);
 149
 150   /* Compute the processor permutation*/
 151   permutation=(int*)malloc(sizeof(int)*nb_procs);
 152   map_topology_simple(topology,comm_tree,permutation,NULL);
 153
 154
 155   /*
 156      Apply this perutation to all objects
 157      Side effect: object_mapping points to the stats->to_proc.getVec()
 158      So, these lines change also stats->to_proc.getVec()
 159   */
 160   for(i=0;i<nb_procs;i++)
 161     object_mapping[i]=permutation[object_mapping[i]];
 162
 163   // free communication matrix;
 164   for(i=0;i<nb_procs;i++){
 165       free(comm_mat[i]);
 166   }
 167   free(comm_mat);
 168   free_topology(topology);
 169 }
 170
 171 /*@}*/