3 #include "sockRoutines.h"
6 #define DEBUGP(x) /** CmiPrintf x; */
8 /** This scheme relies on using IP address to identify physical nodes
9 * written by Gengbin Zheng 9/2008
11 * last updated 10/4/2009 Gengbin Zheng
12 * added function CmiCpuTopologyEnabled() which retuens 1 when supported
13 * when not supported return 0
14 * all functions when cputopology not support, now act like a normal non-smp
15 * case and all PEs are unique.
17 * major changes 10/28/09 Gengbin Zheng
18 * - parameters changed from pe to node to be consistent with the function name
19 * - two new functions: CmiPhysicalNodeID and CmiPhysicalRank
21 * 3/5/2010 Gengbin Zheng
22 * - use CmiReduce to optimize the collection of node info
34 #if CMK_BLUEGENEL || CMK_BLUEGENEP || CMK_BLUEGENEQ
35 #include "TopoManager.h"
38 #if CMK_CRAYXT || CMK_CRAYXE
39 extern "C" int getXTNodeID(int mpirank, int nummpiranks);
42 #if defined(__APPLE__) && CMK_HAS_MULTIPROCESSING_H
43 #include <Carbon/Carbon.h>
44 #include <Multiprocessing.h>
48 #include "middle-blue.h"
49 using namespace BGConverse;
// Query the number of CPU cores on the local machine, with a per-platform probe.
// The FORCECPUCOUNT environment variable overrides the detected count.
// NOTE(review): this extract is fragmented — the closing brace, the final
// return, and several #else/#endif lines are missing from view, and each line
// carries a stray leading line number from the extraction.
52 extern "C" int CmiNumCores(void) {
// Windows-only scratch struct for GetSystemInfo() below.
55 struct _SYSTEM_INFO sysinfo;
58 /* Allow the user to override the number of CPUs for use
59 in scalability testing, debugging, etc. */
60 char *forcecount = getenv("FORCECPUCOUNT");
61 if (forcecount != NULL) {
62 if (sscanf(forcecount, "%d", &a) == 1) {
63 return a; /* if we got a valid count, return it */
65 a = 1; /* otherwise use the real available hardware CPU count */
// Platform-specific probes follow; exactly one branch is compiled in.
69 #if defined(__APPLE__) && CMK_HAS_MULTIPROCESSING_H
70 a = MPProcessorsScheduled(); /* Number of active/running CPUs */
// Windows branch (presumably under a _WIN32-style guard — not visible here).
74 //struct _SYSTEM_INFO sysinfo;
75 GetSystemInfo(&sysinfo);
76 a = sysinfo.dwNumberOfProcessors; /* total number of CPUs */
// POSIX branch: prefer sysconf() where available.
80 #ifdef _SC_NPROCESSORS_ONLN
81 a = sysconf(_SC_NPROCESSORS_ONLN); /* number of active/running CPUs */
82 #elif defined(_SC_CRAY_NCPU)
83 a = sysconf(_SC_CRAY_NCPU);
84 #elif defined(_SC_NPROC_ONLN)
85 a = sysconf(_SC_NPROC_ONLN); /* number of active/running CPUs */
// HP-UX fallback uses mpctl().
89 #if defined(ARCH_HPUX11) || defined(ARCH_HPUX10)
90 a = mpctl(MPC_GETNUMSPUS, 0, 0); /* total number of CPUs */
// Converse handler indices for the two topology messages; both are
// registered in LrtsInitCpuTopo().
96 static int cpuTopoHandlerIdx;
97 static int cpuTopoRecvHandlerIdx;
// Message carrying per-PE host identification (IP address) to PE 0.
// NOTE(review): struct bodies are fragmented in this extract — the field
// declarations beyond the Converse header are missing from view.
107 typedef struct _hostnameMsg {
108 char core[CmiMsgHeaderSizeBytes];
// Broadcast message mapping each PE to its compacted physical-node ID.
113 typedef struct _nodeTopoMsg {
114 char core[CmiMsgHeaderSizeBytes];
// topomsg: reply built on PE 0 and broadcast to all PEs.
// hostTable: CMM table keyed by IP, holding the first _procInfo per host.
118 static nodeTopoMsg *topomsg = NULL;
119 static CmmTable hostTable;
// --- interior of class CpuTopology (the class header is not visible in this
// extract; method bodies are fragmented and lines carry stray leading
// numbers from the extraction) ---
121 // nodeIDs[pe] is the node number of processor pe
// bynodes[node] lists the PEs that live on physical node `node`;
// supported is set to 1 once topology info has been gathered.
127 static CkVec<int> *bynodes;
128 static int supported;
// numUniqNodes(): count the distinct physical nodes, caching the result
// in numNodes.
130 // return -1 when not supported
133 if (numNodes != 0) return numNodes;
135 for (int i=0; i<CmiNumPes(); i++)
// sort(): compact the raw nodeIDs into the dense range [0, numNodes)
// and build the per-node PE lists.
141 if (numNodes > 0) return numNodes; // already calculated
144 for (i=0; i<numPes; i++) unodes.push_back(nodeIDs[i]);
145 //unodes.bubbleSort(0, numPes-1);
148 std::map<int, int> nodemap; // nodeIDs can be out of range of [0,numNodes]
149 for (i=0; i<numPes; i++) {
150 if (unodes[i] != last) {
152 nodemap[unodes[i]] = numNodes;
// Fallback path (presumably when topology detection failed): treat each
// Converse logical node as its own "physical" node — TODO confirm.
157 numNodes = CmiNumNodes();
158 numPes = CmiNumPes();
161 // re-number nodeIDs, which may be necessary e.g. on BlueGene/P
162 for (i=0; i<numPes; i++) nodeIDs[i] = nodemap[nodeIDs[i]];
163 CpuTopology::supported = 1;
// Group PEs by (compacted) physical node.
172 bynodes = new CkVec<int>[numNodes];
174 for (i=0; i<numPes; i++){
175 CmiAssert(nodeIDs[i] >=0 && nodeIDs[i] <= numNodes); // Sanity check for bug that occurs on mpi-crayxt
176 bynodes[nodeIDs[i]].push_back(i);
179 else { /* not supported/enabled */
180 for (i=0;i<CmiNumPes();i++) bynodes[CmiNodeOf(i)].push_back(i);
// print(): dump the PE<->node maps for debugging.
186 CmiPrintf("Charm++> Cpu topology info:\n");
187 CmiPrintf("PE to node map: ");
188 for (i=0; i<CmiNumPes(); i++)
189 CmiPrintf("%d ", nodeIDs[i]);
191 CmiPrintf("Node to PE map:\n");
192 for (i=0; i<numNodes; i++) {
193 CmiPrintf("Chip #%d: ", i);
194 for (int j=0; j<bynodes[i].size(); j++)
195 CmiPrintf("%d ", bynodes[i][j]);
// Out-of-class storage for CpuTopology's static members (declared above).
202 int *CpuTopology::nodeIDs = NULL;
203 int CpuTopology::numPes = 0;
204 int CpuTopology::numNodes = 0;
205 CkVec<int> *CpuTopology::bynodes = NULL;
206 int CpuTopology::supported = 0;
// Process-wide singleton topology object, plus the lock guarding its
// one-time initialization by rank 0.
208 static CpuTopology cpuTopo;
209 static CmiNodeLock topoLock = 0; /* Not spelled 'NULL' to quiet warnings when CmiNodeLock is just 'int' */
// Runs on PE 0 with the reduced hostnameMsg: assigns a provisional node ID
// per distinct IP, then broadcasts the PE->node table to everyone.
// NOTE(review): this extract is fragmented — several declarations (i, tag,
// tag1, rec, pe, str) and control-flow lines are missing from view, and each
// line carries a stray leading line number.
213 static void cpuTopoHandler(void *m)
216 hostnameMsg *msg = (hostnameMsg *)m;
// Lazily allocate the broadcast reply; all slots start at -1 (unassigned).
220 if (topomsg == NULL) {
222 hostTable = CmmNew();
223 topomsg = (nodeTopoMsg *)CmiAlloc(sizeof(nodeTopoMsg)+CmiNumPes()*sizeof(int));
224 CmiSetHandler((char *)topomsg, cpuTopoRecvHandlerIdx);
225 topomsg->nodes = (int *)((char*)topomsg + sizeof(nodeTopoMsg));
226 for (i=0; i<CmiNumPes(); i++) topomsg->nodes[i] = -1;
228 CmiAssert(topomsg != NULL);
// Re-anchor the flexible array: procs traveled as part of the flat message.
230 msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg));
231 CmiAssert(msg->n == CmiNumPes());
232 for (int i=0; i<msg->n; i++)
234 _procInfo *proc = msg->procs+i;
// Debug print of each PE's resolved IP (guard not visible in this extract).
237 skt_print_ip(str, msg->ip);
238 printf("hostname: %d %s\n", msg->pe, str);
// Key the CMM table by the raw IP bits; first PE seen per IP becomes the
// provisional node ID, later PEs on the same IP reuse it.
240 tag = *(int*)&proc->ip;
242 if ((rec = (_procInfo *)CmmProbe(hostTable, 1, &tag, &tag1)) != NULL) {
245 proc->nodeID = pe; // we will compact the node ID later
247 CmmPut(hostTable, 1, &tag, proc);
249 topomsg->nodes[pe] = rec->nodeID;
253 // assume all nodes have same number of cores
254 int ncores = CmiNumCores();
256 sprintf(str, "Charm++> Running on %d unique compute nodes (%d-way SMP).\n", CmmEntries(hostTable), ncores);
258 sprintf(str, "Charm++> Running on %d unique compute nodes.\n", CmmEntries(hostTable));
// Drain the CMM table, then ship the completed PE->node map to all PEs.
263 while (tmpm = (hostnameMsg *)CmmGet(hostTable, 1, &tag, &tag1));
267 CmiSyncBroadcastAllAndFree(sizeof(nodeTopoMsg)+CmiNumPes()*sizeof(int), (char *)topomsg);
270 /* called on each processor */
// Receives the broadcast PE->node table and installs it into cpuTopo.
// NOTE(review): fragmented extract — locking and the barrier/cleanup lines
// are missing from view; lines carry stray leading numbers.
271 static void cpuTopoRecvHandler(void *msg)
273 nodeTopoMsg *m = (nodeTopoMsg *)msg;
// Re-anchor the nodes pointer; it traveled as part of the flat message.
274 m->nodes = (int *)((char*)m + sizeof(nodeTopoMsg));
// The first rank on this process adopts the message buffer as the node-ID
// table, so the buffer must not be freed on this path.
277 if (cpuTopo.nodeIDs == NULL) {
278 cpuTopo.nodeIDs = m->nodes;
286 //if (CmiMyPe() == 0) cpuTopo.print();
289 // reduction function
// Merges the local hostnameMsg with all contributions received so far into
// one message whose procs[] concatenates every _procInfo entry.
// NOTE(review): fragmented extract — declarations of i, j, n and the final
// return/free lines are missing from view; lines carry stray leading numbers.
290 static void * combineMessage(int *size, void *data, void **remote, int count)
293 int nprocs = ((hostnameMsg *)data)->n;
// Leaf of the reduction tree: nothing to merge, pass the local msg through.
294 if (count == 0) return data;
295 for (i=0; i<count; i++) nprocs += ((hostnameMsg *)remote[i])->n;
// Allocate one flat message big enough for all entries.
296 *size = sizeof(hostnameMsg)+sizeof(_procInfo)*nprocs;
297 hostnameMsg *msg = (hostnameMsg *)CmiAlloc(*size);
298 msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg));
300 CmiSetHandler((char *)msg, cpuTopoHandlerIdx);
// Copy local entries first, then each remote contribution; every message's
// procs pointer must be re-anchored after transit.
303 hostnameMsg *m = (hostnameMsg*)data;
304 m->procs = (_procInfo*)((char*)m + sizeof(hostnameMsg));
305 for (j=0; j<m->n; j++)
306 msg->procs[n++] = m->procs[j];
307 for (i=0; i<count; i++) {
308 m = (hostnameMsg*)remote[i];
309 m->procs = (_procInfo*)((char*)m + sizeof(hostnameMsg));
310 for (j=0; j<m->n; j++)
311 msg->procs[n++] = m->procs[j];
316 /****************** API implementation **********************/
318 extern "C" int LrtsCpuTopoEnabled()
320 return CpuTopology::supported;
323 extern "C" int LrtsPeOnSameNode(int pe1, int pe2)
325 int *nodeIDs = cpuTopo.nodeIDs;
326 if (!cpuTopo.supported || nodeIDs == NULL) return CmiNodeOf(pe1) == CmiNodeOf(pe2);
327 else return nodeIDs[pe1] == nodeIDs[pe2];
330 // return -1 when not supported
331 extern "C" int LrtsNumNodes()
333 if (!cpuTopo.supported) return CmiNumNodes();
334 else return cpuTopo.numUniqNodes();
337 extern "C" int LrtsNodeSize(int node)
339 return !cpuTopo.supported?CmiNodeSize(node):(int)cpuTopo.bynodes[node].size();
342 // pelist points to system memory, user should not free it
343 extern "C" void LrtsPeOnNode(int node, int **pelist, int *num)
345 *num = cpuTopo.bynodes[node].size();
346 if (pelist!=NULL && *num>0) *pelist = cpuTopo.bynodes[node].getVec();
// Rank of pe within its physical node's sorted PE list.
// NOTE(review): fragmented extract — the declarations/initializations of
// `rank` and `npes` and the final `return rank;` are missing from view;
// lines carry stray leading numbers.
349 extern "C" int LrtsRankOf(int pe)
351 if (!cpuTopo.supported) return CmiRankOf(pe);
352 const CkVec<int> &v = cpuTopo.bynodes[cpuTopo.nodeIDs[pe]];
// Linear scan is valid because each bynodes[] list holds PEs in order.
355 while (rank < npes && v[rank] < pe) rank++; // already sorted
356 CmiAssert(v[rank] == pe);
360 extern "C" int LrtsNodeOf(int pe)
362 if (!cpuTopo.supported) return CmiNodeOf(pe);
363 return cpuTopo.nodeIDs[pe];
366 // the least number processor on the same physical node
367 extern "C" int LrtsNodeFirst(int node)
369 if (!cpuTopo.supported) return CmiNodeFirst(node);
370 return cpuTopo.bynodes[node][0];
// Set when no usable IP could be obtained for this host.
374 static int _noip = 0;
// Gather physical-node topology at startup: parse flags, register handlers,
// fill cpuTopo.nodeIDs either from a machine-specific source (BigSim,
// BlueGene, Cray) or via an IP-based reduction to PE 0, then raise
// CcdTOPOLOGY_AVAIL.
// NOTE(review): this extract is heavily fragmented — many declarations,
// braces, and #if/#else/#endif lines are missing from view, and every line
// carries a stray leading line number from the extraction.
375 extern "C" void LrtsInitCpuTopo(char **argv)
377 static skt_ip_t myip;
381 int obtain_flag = 1; // default on
382 int show_flag = 0; // default not show topology
384 if (CmiMyRank() ==0) {
385 topoLock = CmiCreateLock();
// Command-line flags: force-enable, skip, or print the detected topology.
391 if(CmiGetArgFlagDesc(argv,"+obtain_cpu_topology",
392 "obtain cpu topology info"))
394 if (CmiGetArgFlagDesc(argv,"+skip_cpu_topology",
395 "skip the processof getting cpu topology info"))
397 if(CmiGetArgFlagDesc(argv,"+show_cpu_topology",
398 "Show cpu topology info"))
// BigSim: only the node's rank-0 worker performs setup (guard fragmented).
402 if (BgNodeRank() == 0)
// Register the two message handlers used by the gather/broadcast protocol.
406 CmiRegisterHandler((CmiHandler)cpuTopoHandler);
407 cpuTopoRecvHandlerIdx =
408 CmiRegisterHandler((CmiHandler)cpuTopoRecvHandler);
// Early-out path (presumably when topology gathering is disabled): compact
// whatever is known and signal availability — TODO confirm against full file.
412 if (CmiMyRank() == 0) cpuTopo.sort();
414 CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks
418 if (CmiMyPe() == 0) {
420 if (BgNodeRank() == 0)
422 startT = CmiWallTimer();
// __BIGSIM__ path: derive node IDs from emulated worker-thread layout.
426 if (BgNodeRank() == 0)
428 //int numPes = BgNumNodes()*BgGetNumWorkThread();
429 int numPes = cpuTopo.numPes = CkNumPes();
430 cpuTopo.nodeIDs = new int[numPes];
431 CpuTopology::supported = 1;
432 int wth = BgGetNumWorkThread();
433 for (int i=0; i<numPes; i++) {
435 cpuTopo.nodeIDs[i] = nid;
// Hostname probe; empty string on failure (surrounding guard fragmented).
448 if (gethostname(hostname, 999)!=0) {
449 strcpy(hostname, "");
// BlueGene/L and /P: node ID from 3D torus coordinates with t (core) = 0.
452 #if CMK_BLUEGENEL || CMK_BLUEGENEP
453 if (CmiMyRank() == 0) {
456 int numPes = cpuTopo.numPes = CmiNumPes();
457 cpuTopo.nodeIDs = new int[numPes];
458 CpuTopology::supported = 1;
461 for(int i=0; i<numPes; i++) {
462 tmgr.rankToCoordinates(i, x, y, z, t);
463 nid = tmgr.coordinatesToRank(x, y, z, 0);
464 cpuTopo.nodeIDs[i] = nid;
467 if (CmiMyPe()==0) CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n", cpuTopo.numNodes, CmiNumCores());
// BlueGene/Q (presumably): 5D coordinates a..e with t (core) = 0.
471 if (CmiMyRank() == 0) {
474 int numPes = cpuTopo.numPes = CmiNumPes();
475 cpuTopo.nodeIDs = new int[numPes];
476 CpuTopology::supported = 1;
478 int a, b, c, d, e, t, nid;
479 for(int i=0; i<numPes; i++) {
480 tmgr.rankToCoordinates(i, a, b, c, d, e, t);
481 nid = tmgr.coordinatesToRank(a, b, c, d, e, 0);
482 cpuTopo.nodeIDs[i] = nid;
485 if (CmiMyPe()==0) CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n", cpuTopo.numNodes, CmiNumCores());
// Cray XT/XE: node ID from the system's placement file via getXTNodeID().
488 #elif CMK_CRAYXT || CMK_CRAYXE
489 if(CmiMyRank() == 0) {
490 int numPes = cpuTopo.numPes = CmiNumPes();
491 int numNodes = CmiNumNodes();
492 cpuTopo.nodeIDs = new int[numPes];
493 CpuTopology::supported = 1;
496 for(int i=0; i<numPes; i++) {
497 nid = getXTNodeID(CmiNodeOf(i), numNodes);
498 cpuTopo.nodeIDs[i] = nid;
503 // this assumes that all cores on a node have consecutive MPI rank IDs
504 // and then changes nodeIDs to 0 to numNodes-1
505 for(int i=0; i<numPes; i++) {
506 if(cpuTopo.nodeIDs[i] != prev) {
507 prev = cpuTopo.nodeIDs[i];
508 cpuTopo.nodeIDs[i] = ++nid;
511 cpuTopo.nodeIDs[i] = nid;
514 if (CmiMyPe()==0) CmiPrintf("Charm++> Running on %d unique compute nodes (%d-way SMP).\n", cpuTopo.numNodes, CmiNumCores());
// Generic path. Communication threads only wait at the barrier and return.
520 if (CmiMyPe() >= CmiNumPes()) {
521 CmiNodeAllBarrier(); // comm thread waiting
522 #if CMK_MACHINE_PROGRESS_DEFINED
524 while (done < CmiMyNodeSize()) CmiNetworkProgress();
527 return; /* comm thread return */
530 /* get my ip address */
531 if (CmiMyRank() == 0)
533 #if CMK_HAS_GETHOSTNAME && !CMK_BLUEGENEQ
534 myip = skt_my_ip(); /* not thread safe, so only calls on rank 0 */
535 // fprintf(stderr, "[%d] IP is %d.%d.%d.%d\n", CmiMyPe(), myip.data[0],myip.data[1],myip.data[2],myip.data[3]);
537 myip = skt_innode_my_ip();
540 CmiPrintf("CmiInitCPUTopology Warning: Can not get unique name for the compute nodes. \n");
544 cpuTopo.numPes = CmiNumPes();
549 /* prepare a msg to send */
// Each PE contributes one _procInfo; combineMessage merges them on the way
// to PE 0, where cpuTopoHandler assigns node IDs.
550 msg = (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg)+sizeof(_procInfo));
552 msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg));
553 CmiSetHandler((char *)msg, cpuTopoHandlerIdx);
554 msg->procs[0].pe = CmiMyPe();
555 msg->procs[0].ip = myip;
556 msg->procs[0].ncores = CmiNumCores();
557 msg->procs[0].rank = 0;
558 msg->procs[0].nodeID = 0;
559 CmiReduce(msg, sizeof(hostnameMsg)+sizeof(_procInfo), combineMessage);
// Wait until every rank on this process has received the broadcast table.
562 while (done != CmiMyNodeSize())
565 if (CmiMyPe() == 0) {
567 if (BgNodeRank() == 0)
569 CmiPrintf("Charm++> cpu topology info is gathered in %.3f seconds.\n", CmiWallTimer()-startT);
573 #endif /* __BIGSIM__ */
575 // now every one should have the node info
576 CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks
577 if (CmiMyPe() == 0 && show_flag) cpuTopo.print();
580 #else /* not supporting cpu topology */
// Stub used when topology support is compiled out: still consume the
// command-line flags so they do not leak through as unknown arguments.
// NOTE(review): fragmented extract — the function's braces and any body
// beyond the flag parsing are missing from view.
582 extern "C" void LrtsInitCpuTopo(char **argv)
585 int obtain_flag = CmiGetArgFlagDesc(argv,"+obtain_cpu_topology",
586 "obtain cpu topology info");
587 CmiGetArgFlagDesc(argv,"+skip_cpu_topology",
588 "skip the processof getting cpu topology info");
589 CmiGetArgFlagDesc(argv,"+show_cpu_topology",
590 "Show cpu topology info");
595 extern "C" int CmiCpuTopologyEnabled()
597 return LrtsCpuTopoEnabled();
599 extern "C" int CmiPeOnSamePhysicalNode(int pe1, int pe2)
601 return LrtsPeOnSameNode(pe1, pe2);
603 extern "C" int CmiNumPhysicalNodes()
605 return LrtsNumNodes();
607 extern "C" int CmiNumPesOnPhysicalNode(int node)
609 return LrtsNodeSize(node);
611 extern "C" void CmiGetPesOnPhysicalNode(int node, int **pelist, int *num)
613 LrtsPeOnNode(node, pelist, num);
615 extern "C" int CmiPhysicalRank(int pe)
617 return LrtsRankOf(pe);
619 extern "C" int CmiPhysicalNodeID(int pe)
621 return LrtsNodeOf(pe);
623 extern "C" int CmiGetFirstPeOnPhysicalNode(int node)
625 return LrtsNodeFirst(node);
627 extern "C" void CmiInitCPUTopology(char **argv)
629 LrtsInitCpuTopo(argv);