3 This scheme relies on using IP address to identify nodes and assigning
6 when CMK_NO_SOCKETS, which is typically on cray xt3 and bluegene/L.
7 There is no hostname for the compute nodes.
9 * last updated 3/20/2010 Gengbin Zheng
10 * new options +pemap +commap take a complex pattern describing a list of cores
18 #include "sockRoutines.h"
22 #include <infiniband/verbs.h>
23 #include <hwloc/openfabrics-verbs.h>
/* Debug print macro: expands to nothing unless the CmiPrintf is uncommented. */
26 #define DEBUGP(x) /* CmiPrintf x; */
/* Per-PE record of the core this PE was bound to; -1 while unbound
 * (initialized in CmiInitCPUAffinityUtil, set in CmiSetCPUAffinity). */
27 CpvDeclare(int, myCPUAffToCore);
30 * /proc/<PID>/[task/<TID>]/stat file descriptor
31 * Used to retrieve the info about which physical
32 * core this process or thread is on.
34 CpvDeclare(void *, myProcStatFP);
/* Cached hardware counts (sockets/cores/PUs) filled in by CmiInitHwlocTopology(). */
37 CmiHwlocTopology CmiHwlocTopologyLocal;
/* Detect the local machine topology with hwloc and cache the number of
 * sockets (packages), cores, and processing units in CmiHwlocTopologyLocal.
 * NOTE(review): this view of the file is elided; braces and the `depth`
 * declaration between the numbered lines are not visible here. */
39 void CmiInitHwlocTopology(void)
41 hwloc_topology_t topology;
44 /* Allocate and initialize topology object. */
45 cmi_hwloc_topology_init(&topology);
46 /* Perform the topology detection. */
47 cmi_hwloc_topology_load(topology);
49 // packages == sockets
50 depth = cmi_hwloc_get_type_depth(topology, HWLOC_OBJ_PACKAGE);
/* Fall back to 1 when hwloc cannot determine the depth of this object type. */
51 CmiHwlocTopologyLocal.num_sockets = depth != HWLOC_TYPE_DEPTH_UNKNOWN ? cmi_hwloc_get_nbobjs_by_depth(topology, depth) : 1;
53 // ignore BG/Q's reserved socket
54 if (CmiHwlocTopologyLocal.num_sockets == 17)
55 CmiHwlocTopologyLocal.num_sockets = 16;
/* Same depth-or-1 fallback pattern for cores and PUs. */
59 depth = cmi_hwloc_get_type_depth(topology, HWLOC_OBJ_CORE);
60 CmiHwlocTopologyLocal.num_cores = depth != HWLOC_TYPE_DEPTH_UNKNOWN ? cmi_hwloc_get_nbobjs_by_depth(topology, depth) : 1;
63 depth = cmi_hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
64 CmiHwlocTopologyLocal.num_pus = depth != HWLOC_TYPE_DEPTH_UNKNOWN ? cmi_hwloc_get_nbobjs_by_depth(topology, depth) : 1;
66 cmi_hwloc_topology_destroy(topology);
69 #if CMK_HAS_SETAFFINITY || defined (_WIN32) || CMK_HAS_BINDPROCESSOR
81 //long sched_setaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr);
82 //long sched_getaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr);
86 #include <sys/syscall.h>
89 #if defined(__APPLE__)
90 #include <Carbon/Carbon.h> /* Carbon APIs for Multiprocessing */
/* File-scope state for affinity setup. */
93 #define MAX_EXCLUDE 64
/* Cores excluded via +excludecore; filled by add_exclude(). */
94 static int excludecore[MAX_EXCLUDE] = {-1};
95 static int excludecount = 0;
/* Count of PEs on this node that finished affinity setup
 * (see the wait loop in CmiInitCPUAffinity). */
97 static int affinity_doneflag = 0;
/* Starts at 1: PE0 apparently counts its own contribution directly
 * in CmiCheckAffinity -- TODO confirm against the elided handler body. */
100 static int affMsgsRecvd = 1; // number of affinity messages received at PE0
101 static cpu_set_t core_usage; // used to record union of CPUs used by every PE in physical node
102 static int aff_is_set = 0;
/* Return 1 if 'core' is in the +excludecore list; the (elided) tail
 * presumably returns 0 otherwise. */
105 static int in_exclude(int core)
108 for (i=0; i<excludecount; i++) if (core == excludecore[i]) return 1;
/* Add 'core' to the exclusion list, ignoring duplicates.
 * Aborts (CmiAssert) if more than MAX_EXCLUDE cores are excluded. */
112 static void add_exclude(int core)
114 if (in_exclude(core)) return;
115 CmiAssert(excludecount < MAX_EXCLUDE);
116 excludecore[excludecount++] = core;
119 #if CMK_HAS_BINDPROCESSOR
120 #include <sys/processor.h>
/* Bind the whole OS process to 'cpuset' via hwloc (strict binding).
 * Returns nonzero/0 per the elided branches -- lines between the numbered
 * statements (the #ifdef _WIN32 split, error handling, frees) are not
 * visible in this view. */
123 static int set_process_affinity(hwloc_topology_t topology, hwloc_cpuset_t cpuset)
/* Windows branch: process handle. */
126 HANDLE process = GetCurrentProcess();
/* POSIX branch: pid. */
128 pid_t process = getpid();
131 if (cmi_hwloc_set_proc_cpubind(topology, process, cpuset, HWLOC_CPUBIND_PROCESS|HWLOC_CPUBIND_STRICT))
/* Failure path: format the cpuset for the error message.
 * NOTE(review): asprintf allocates 'str'; the free() is presumably elided. */
135 cmi_hwloc_bitmap_asprintf(&str, cpuset);
136 CmiPrintf("HWLOC> Couldn't bind to cpuset %s: %s\n", str, strerror(error));
/* Success logging, only on physical node 0 to limit output volume. */
142 if (CmiPhysicalNodeID(CmiMyPe()) == 0)
145 cmi_hwloc_bitmap_asprintf(&str, cpuset);
146 CmiPrintf("HWLOC> [%d] Process %p bound to cpuset: %s\n", CmiMyPe(), process, str);
/* Bind only the calling thread to 'cpuset' via hwloc (strict binding).
 * Mirrors set_process_affinity; intermediate lines are elided in this view. */
155 static int set_thread_affinity(hwloc_topology_t topology, hwloc_cpuset_t cpuset)
158 HANDLE thread = GetCurrentThread();
160 pthread_t thread = pthread_self();
163 if (cmi_hwloc_set_thread_cpubind(topology, thread, cpuset, HWLOC_CPUBIND_THREAD|HWLOC_CPUBIND_STRICT))
/* Failure path; 'str' from asprintf is heap-allocated (free presumably elided). */
167 cmi_hwloc_bitmap_asprintf(&str, cpuset);
168 CmiPrintf("HWLOC> Couldn't bind to cpuset %s: %s\n", str, strerror(error));
/* Success logging restricted to physical node 0. */
174 if (CmiPhysicalNodeID(CmiMyPe()) == 0)
177 cmi_hwloc_bitmap_asprintf(&str, cpuset);
178 CmiPrintf("HWLOC> [%d] Thread %p bound to cpuset: %s\n", CmiMyPe(), thread, str);
/* Bind the calling PE to core 'mycore'.  A negative argument counts back
 * from the end (core = CmiNumCores() + core); an out-of-range result aborts.
 * Records the chosen core in CpvAccess(myCPUAffToCore).  Whether the thread
 * or whole process is bound depends on elided #if branches. */
188 int CmiSetCPUAffinity(int mycore)
/* Negative values index from the last core. */
192 core = CmiNumCores() + core;
195 CmiError("Error: Invalid cpu affinity core number: %d\n", mycore);
196 CmiAbort("CmiSetCPUAffinity failed");
/* Remember the binding so CmiOnCore()/diagnostics can report it. */
199 CpvAccess(myCPUAffToCore) = core;
201 hwloc_topology_t topology;
203 cmi_hwloc_topology_init(&topology);
204 cmi_hwloc_topology_load(topology);
/* Single-core cpuset for the target core. */
206 hwloc_cpuset_t cpuset = cmi_hwloc_bitmap_alloc();
207 cmi_hwloc_bitmap_set(cpuset, core);
/* One of these runs depending on elided build configuration (SMP vs not). */
210 set_thread_affinity(topology, cpuset);
212 set_process_affinity(topology, cpuset);
215 cmi_hwloc_bitmap_free(cpuset);
217 cmi_hwloc_topology_destroy(topology);
221 /* This implementation assumes the default x86 CPU mask size used by Linux */
222 /* For a large SMP machine, this code should be changed to use a variable sized */
223 /* CPU affinity mask buffer instead, as the present code will fail beyond 32 CPUs */
/* NOTE(review): the caveat above appears to predate the hwloc port below,
   which uses dynamically sized hwloc bitmaps -- confirm and drop if obsolete. */
/* Print this process's current CPU binding mask via hwloc.
 * Returns an elided status value; error path frees resources first. */
224 int print_cpu_affinity(void) {
225 hwloc_topology_t topology;
226 // Allocate and initialize topology object.
227 cmi_hwloc_topology_init(&topology);
228 // Perform the topology detection.
229 cmi_hwloc_topology_load(topology);
231 hwloc_cpuset_t cpuset = cmi_hwloc_bitmap_alloc();
232 // Query (not set) the current process binding into cpuset.
233 if (cmi_hwloc_get_cpubind(topology, cpuset, 0)) {
235 CmiPrintf("[%d] CPU affinity mask is unknown %s\n", CmiMyPe(), strerror(error));
236 cmi_hwloc_bitmap_free(cpuset);
237 cmi_hwloc_topology_destroy(topology);
/* NOTE(review): 'str' from asprintf is heap-allocated; free presumably elided. */
242 cmi_hwloc_bitmap_asprintf(&str, cpuset);
243 CmiPrintf("[%d] CPU affinity mask is %s\n", CmiMyPe(), str);
245 cmi_hwloc_bitmap_free(cpuset);
246 cmi_hwloc_topology_destroy(topology);
/* Print the calling thread's current CPU binding mask via hwloc.
 * Mirrors print_cpu_affinity but queries with HWLOC_CPUBIND_THREAD. */
251 int print_thread_affinity(void) {
252 hwloc_topology_t topology;
253 // Allocate and initialize topology object.
254 cmi_hwloc_topology_init(&topology);
255 // Perform the topology detection.
256 cmi_hwloc_topology_load(topology);
259 HANDLE thread = GetCurrentThread();
261 pthread_t thread = pthread_self();
264 hwloc_cpuset_t cpuset = cmi_hwloc_bitmap_alloc();
265 // Query the calling thread's binding into cpuset.
266 // if (cmi_hwloc_get_thread_cpubind(topology, thread, cpuset, HWLOC_CPUBIND_THREAD)) {
267 if (cmi_hwloc_get_cpubind(topology, cpuset, HWLOC_CPUBIND_THREAD) == -1) {
269 CmiPrintf("[%d] thread CPU affinity mask is unknown %s\n", CmiMyPe(), strerror(error));
270 cmi_hwloc_bitmap_free(cpuset);
271 cmi_hwloc_topology_destroy(topology);
/* NOTE(review): 'str' from asprintf is heap-allocated; free presumably elided. */
276 cmi_hwloc_bitmap_asprintf(&str, cpuset);
277 CmiPrintf("[%d] thread CPU affinity mask is %s\n", CmiMyPe(), str);
279 cmi_hwloc_bitmap_free(cpuset);
280 cmi_hwloc_topology_destroy(topology);
/* Public wrapper: report thread-level binding in one build configuration,
 * process-level binding in the other (the #if selecting between the two
 * returns is elided -- presumably CMK_SMP; confirm). */
286 int CmiPrintCPUAffinity(void)
289 return print_thread_affinity();
291 return print_cpu_affinity();
/**
 * Retrieve this process's CPU affinity mask into *cpuset.
 * Returns 0 on success, -1 on failure (callers in CmiCheckAffinity
 * test for == -1).
 *
 * BUG FIX: the original passed sizeof(cpuset) -- the size of the
 * pointer, typically 8 bytes -- as the mask length.  With that length
 * sched_getaffinity fails with EINVAL on kernels whose cpu mask is
 * wider than 64 CPUs, and even on success it fills only the first
 * 8 bytes of *cpuset, leaving the rest indeterminate.  The sibling
 * get_thread_affinity already passes sizeof(cpu_set_t); do the same
 * here, and zero the set first so every byte is defined.
 */
int get_cpu_affinity(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
  if (sched_getaffinity(0, sizeof(cpu_set_t), cpuset) < 0) {
    perror("sched_getaffinity");
    return -1;
  }
  return 0;
}
/* Retrieve the calling thread's CPU affinity mask into *cpuset.
 * Returns 0/-1 per the elided tail (callers test for == -1). */
306 int get_thread_affinity(cpu_set_t *cpuset) {
307 #if CMK_HAS_PTHREAD_SETAFFINITY
/* Assignment in the condition is intentional: pthread_getaffinity_np
 * returns an error number (not -1/errno), so it is stored into errno
 * to make the perror() below print the right message. */
309 if (errno = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset)) {
310 perror("pthread_getaffinity");
/* Dispatch to thread- or process-level affinity query depending on the
 * elided build-configuration #if (presumably SMP vs non-SMP; confirm). */
320 int get_affinity(cpu_set_t *cpuset) {
322 return get_thread_affinity(cpuset);
324 return get_cpu_affinity(cpuset);
/* Report which physical core this process/thread is currently running on,
 * by parsing field TASK_CPU_POS of /proc/<PID>[/task/<TID>]/stat
 * (file handle cached in myProcStatFP by CmiInitCPUAffinityUtil). */
329 int CmiOnCore(void) {
332 * The info (task_cpu) is read from the Linux /proc virtual file system.
333 * The /proc/<PID>/[task/<TID>]/stat is explained in the Linux
334 * kernel documentation. The online one could be found in:
335 * http://www.mjmwired.net/kernel/Documentation/filesystems/proc.txt
336 * Based on the documentation, task_cpu is found at the 39th field in
339 #define TASK_CPU_POS (39)
342 FILE *fp = (FILE *)CpvAccess(myProcStatFP);
344 printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");
/* Rewind and skip whitespace-separated fields up to the task_cpu field. */
347 fseek(fp, 0, SEEK_SET);
348 for (n=0; n<TASK_CPU_POS; n++) {
349 if (fscanf(fp, "%127s", str) != 1) {
350 CmiAbort("CPU affinity> reading from /proc/<PID>/[task/<TID>]/stat failed!");
355 printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");
/* Converse handler indices, registered in CmiInitCPUAffinity. */
361 static int cpuAffinityHandlerIdx;
362 static int cpuAffinityRecvHandlerIdx;
363 static int cpuPhyNodeAffinityRecvHandlerIdx;
/* Per-PE host identification message sent to PE0 (further fields --
 * ip, pe, ncores, rank, seq, judging by uses below -- are elided). */
365 typedef struct _hostnameMsg {
366 char core[CmiMsgHeaderSizeBytes];
/* PE0's reply: computed core rank and node number for every PE.  The two
 * arrays live in the same allocation right after the struct; the pointers
 * are re-fixed on receipt (see cpuAffinityRecvHandler). */
374 typedef struct _rankMsg {
375 char core[CmiMsgHeaderSizeBytes];
376 int *ranks; /* PE => core rank mapping */
377 int *nodes; /* PE => node number mapping */
/* A PE's actual affinity mask, reported to the first PE of its physical
 * node for the oversubscription check (affinity field elided). */
380 typedef struct _affMsg {
381 char core[CmiMsgHeaderSizeBytes];
387 static rankMsg *rankmsg = NULL;
/* Keyed by node IP; maps each distinct host to its record on PE0. */
388 static CmmTable hostTable;
389 static CmiNodeLock affLock = 0;
/* Runs on PE0 only: collects one hostnameMsg from every PE, groups PEs by
 * node (keyed by IP address), assigns each PE a core rank on its node
 * (round-robin, skipping +excludecore cores), and when all PEs have
 * reported, broadcasts the completed rankMsg to everyone.
 * NOTE(review): heavily elided view -- braces and several statements
 * between the numbered lines are missing. */
392 static void cpuAffinityHandler(void *m)
/* Static counters persist across invocations of this handler. */
394 static int count = 0;
395 static int nodecount = 0;
397 hostnameMsg *msg = (hostnameMsg *)m;
399 int tag, tag1, pe, myrank;
400 int npes = CmiNumPes();
404 skt_print_ip(str, msg->ip);
405 printf("hostname: %d %s\n", msg->pe, str);
407 CmiAssert(CmiMyPe()==0 && rankmsg != NULL);
/* Use the sender's IP as the CmmTable tag identifying its node. */
408 tag = *(int*)&msg->ip;
410 if ((rec = (hostnameMsg *)CmmProbe(hostTable, 1, &tag, &tag1)) != NULL) {
/* First message from this host: record it with a fresh node sequence no. */
415 rec->seq = nodecount;
416 nodecount++; /* a new node record */
417 CmmPut(hostTable, 1, &tag, msg);
/* Core rank wraps modulo the node's core count. */
419 myrank = rec->rank%rec->ncores;
420 while (in_exclude(myrank)) { /* skip excluded core */
421 myrank = (myrank+1)%rec->ncores;
424 rankmsg->ranks[pe] = myrank; /* core rank */
425 rankmsg->nodes[pe] = rec->seq; /* on which node */
/* All PEs have reported: clean up the table and publish the result. */
428 if (count == CmiNumPes()) {
429 DEBUGP(("Cpuaffinity> %d unique compute nodes detected! \n", CmmEntries(hostTable)));
431 while ((tmpm = CmmGet(hostTable, 1, &tag, &tag1))) CmiFree(tmpm);
434 /* bubble sort ranks on each node according to the PE number */
437 for (i=0; i<npes-1; i++)
438 for(j=i+1; j<npes; j++) {
439 if (rankmsg->nodes[i] == rankmsg->nodes[j] &&
440 rankmsg->ranks[i] > rankmsg->ranks[j])
442 int tmp = rankmsg->ranks[i];
443 rankmsg->ranks[i] = rankmsg->ranks[j];
444 rankmsg->ranks[j] = tmp;
449 CmiSyncBroadcastAllAndFree(sizeof(rankMsg)+CmiNumPes()*sizeof(int)*2, (void *)rankmsg);
453 /* called on each processor */
/* Receive PE0's broadcast rankMsg, look up this PE's assigned core rank
 * and node number, and apply the binding. */
454 static void cpuAffinityRecvHandler(void *msg)
457 rankMsg *m = (rankMsg *)msg;
/* Pointers inside the message are not valid after transfer; re-point them
 * at the two arrays packed immediately after the struct. */
458 m->ranks = (int *)((char*)m + sizeof(rankMsg));
459 m->nodes = (int *)((char*)m + sizeof(rankMsg) + CmiNumPes()*sizeof(int));
460 myrank = m->ranks[CmiMyPe()];
461 mynode = m->nodes[CmiMyPe()];
463 DEBUGP(("[%d %d] set to core #: %d\n", CmiMyNode(), CmiMyPe(), myrank));
465 if (-1 != CmiSetCPUAffinity(myrank)) {
466 DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank, mynode));
469 CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
470 CmiAbort("set cpu affinity abort!\n");
475 /* called on first PE in physical node, receive affinity set from other PEs in phy node */
/* Accumulate the sender's affinity mask into the node-wide union used by
 * CmiCheckAffinity's oversubscription test.  The affMsgsRecvd increment
 * and message free are presumably in the elided lines -- confirm. */
476 static void cpuPhyNodeAffinityRecvHandler(void *msg)
478 affMsg *m = (affMsg *)msg;
479 #if !defined(_WIN32) && defined(CPU_OR)
480 CPU_OR(&core_usage, &core_usage, &m->affinity);
487 /* strtok is thread safe in VC++ */
488 #define strtok_r(x,y,z) strtok(x,y)
/* Parse a +pemap/+commap specification string and return the core assigned
 * to PE 'pe'.  The comma-separated tokens support patterns of the form
 * [ITERx]START[-END[:STRIDE[.BLOCK]]][+OFFSET]... as dissected below.
 * NOTE(review): heavily elided view -- declarations (str, ptr, plusarr,
 * iter), braces, and the final map[pe] lookup/free are not visible. */
491 static int search_pemap(char *pecoremap, int pe)
/* One slot per (global) PE; filled in pattern order. */
493 int *map = (int *)malloc(CmiNumPesGlobal()*sizeof(int));
495 int h, i, j, k, count;
/* Work on a private copy since strtok_r mutates its input. */
499 char *mapstr = (char*)malloc(strlen(pecoremap)+1);
500 strcpy(mapstr, pecoremap);
502 str = strtok_r(mapstr, ",", &ptr);
504 while (str && count < CmiNumPesGlobal())
506 int hasdash=0, hascolon=0, hasdot=0, hasstar1=0, hasstar2=0, numplus=0;
507 int start, end, stride=1, block=1;
/* First pass: classify which syntax elements this token uses. */
510 for (i=0; i<strlen(str); i++) {
511 if (str[i] == '-' && i!=0) hasdash=1;
512 else if (str[i] == ':') hascolon=1;
513 else if (str[i] == '.') hasdot=1;
514 else if (str[i] == 'x') hasstar1=1;
515 else if (str[i] == 'X') hasstar2=1;
516 else if (str[i] == '+') {
517 if (str[i+1] == '+' || str[i+1] == '-') {
518 printf("Warning: Check the format of \"%s\".\n", str);
/* NOTE(review): pre-increment leaves plusarr[0] to its (elided)
 * initializer -- presumably 0 so the base value is always emitted. */
519 } else if (sscanf(&str[i], "+%d", &plusarr[++numplus]) != 1) {
520 printf("Warning: Check the format of \"%s\".\n", str);
/* "Nx..." / "NX..." prefix repeats the whole pattern N times. */
525 if (hasstar1 || hasstar2) {
526 if (hasstar1) sscanf(str, "%dx", &iter);
527 if (hasstar2) sscanf(str, "%dX", &iter);
528 while (*str!='x' && *str!='X') str++;
/* Parse the richest matching range form: start-end:stride.block, then
 * start-end:stride, then start-end, then a bare core number. */
534 if (sscanf(str, "%d-%d:%d.%d", &start, &end, &stride, &block) != 4)
535 printf("Warning: Check the format of \"%s\".\n", str);
538 if (sscanf(str, "%d-%d:%d", &start, &end, &stride) != 3)
539 printf("Warning: Check the format of \"%s\".\n", str);
543 if (sscanf(str, "%d-%d", &start, &end) != 2)
544 printf("Warning: Check the format of \"%s\".\n", str);
548 sscanf(str, "%d", &start);
551 if (block > stride) {
552 printf("Warning: invalid block size in \"%s\" ignored.\n", str);
555 //if (CmiMyPe() == 0) printf("iter: %d start: %d end: %d stride: %d, block: %d. plus %d \n", iter, start, end, stride, block, numplus);
/* Expand: iter repetitions x stride-spaced range x block width x +offsets,
 * stopping as soon as every global PE has a core. */
556 for (k = 0; k<iter; k++) {
557 for (i = start; i<=end; i+=stride) {
558 for (j=0; j<block; j++) {
560 for (h=0; h<=numplus; h++) {
561 map[count++] = i+j+plusarr[h];
562 if (count == CmiNumPesGlobal()) break;
564 if (count == CmiNumPesGlobal()) break;
566 if (count == CmiNumPesGlobal()) break;
568 if (count == CmiNumPesGlobal()) break;
570 str = strtok_r(NULL, ",", &ptr);
579 #if CMK_CRAYXE || CMK_CRAYXC
580 CLINKAGE int getXTNodeID(int mpirank, int nummpiranks);
584 * Check that there are not multiple PEs assigned to the same core.
585 * If a pemap has been computed by this module (or passed by the user) this
586 * function will print a warning if oversubscription detected. If no affinity
587 * has been set explicitly by this module, it will print error and abort if
588 * oversubscription detected.
/* Oversubscription check (physical node 0 only): PE0 collects every PE's
 * actual affinity mask, ORs them together, and compares the number of
 * distinct cores with the number of PEs on the node.  Aborts or warns
 * (elided branch, apparently depending on aff_is_set). */
590 void CmiCheckAffinity(void)
592 #if !defined(_WIN32) && CMK_SMP && CMK_HAS_PTHREAD_SETAFFINITY && defined(CPU_OR)
594 if (!CmiCpuTopologyEnabled()) return; // only works if cpu topology enabled
596 if (CmiMyPe() == 0) {
597 // wait for every PE affinity from my physical node (for now only done on phy node 0)
600 if (get_affinity(&my_aff) == -1) CmiAbort("get_affinity failed\n");
601 CPU_OR(&core_usage, &core_usage, &my_aff); // add my affinity (pe0)
602 int N = CmiNumPesOnPhysicalNode(0);
/* Block until every other PE's affMsg has been folded into core_usage. */
603 while (affMsgsRecvd < N)
604 CmiDeliverSpecificMsg(cpuPhyNodeAffinityRecvHandlerIdx);
606 // NOTE this test is simple and may not detect every possible case of
/* Fewer distinct cores than PEs => at least two PEs share a core. */
608 if (CPU_COUNT(&core_usage) < N) {
609 // TODO suggest command line arguments?
611 CmiAbort("Multiple PEs assigned to same core. Set affinity "
612 "options to correct or lower the number of threads, or pass +setcpuaffinity to ignore.\n");
614 CmiPrintf("WARNING: Multiple PEs assigned to same core, recommend "
615 "adjusting processor affinity or passing +CmiSleepOnIdle to reduce "
619 } else if ((CmiMyPe() < CmiNumPes()) && (CmiPhysicalNodeID(CmiMyPe()) == 0)) {
620 // send my affinity to first PE on physical node (only done on phy node 0 for now)
621 affMsg *m = (affMsg*)CmiAlloc(sizeof(affMsg));
622 CmiSetHandler((char *)m, cpuPhyNodeAffinityRecvHandlerIdx);
623 if (get_affinity(&m->affinity) == -1) { // put my affinity in msg
625 CmiAbort("get_affinity failed\n");
627 CmiSyncSendAndFree(0, sizeof(affMsg), (void *)m);
632 extern int CmiMyLocalRank;
/* Bind this OS process to one unit of type 'process_unit' (package, core,
 * or PU), chosen round-robin by the process's local rank on the host. */
634 static void bind_process_only(hwloc_obj_type_t process_unit)
636 hwloc_topology_t topology;
/* NOTE(review): 'cpuset' appears unused in the visible lines. */
637 hwloc_cpuset_t cpuset;
638 cmi_hwloc_topology_init(&topology);
639 cmi_hwloc_topology_load(topology);
642 int process_unitcount = cmi_hwloc_get_nbobjs_by_type(topology, process_unit);
644 // ignore BG/Q's reserved socket
645 if (process_unit == HWLOC_OBJ_PACKAGE && process_unitcount == 17)
646 process_unitcount = 16;
/* Round-robin placement over the available units. */
649 int process_assignment = CmiMyLocalRank % process_unitcount;
651 hwloc_obj_t process_obj = cmi_hwloc_get_obj_by_type(topology, process_unit, process_assignment);
652 set_process_affinity(topology, process_obj->cpuset);
655 cmi_hwloc_topology_destroy(topology);
/* Bind only the calling thread to one unit of type 'thread_unit', chosen
 * round-robin by the thread's rank within the process; the cpuset is
 * singlified so the thread lands on exactly one PU. */
659 static void bind_threads_only(hwloc_obj_type_t thread_unit)
661 hwloc_topology_t topology;
/* NOTE(review): 'cpuset' appears unused in the visible lines. */
662 hwloc_cpuset_t cpuset;
663 cmi_hwloc_topology_init(&topology);
664 cmi_hwloc_topology_load(topology);
667 int thread_unitcount = cmi_hwloc_get_nbobjs_by_type(topology, thread_unit);
669 // ignore BG/Q's reserved socket
670 if (thread_unit == HWLOC_OBJ_PACKAGE && thread_unitcount == 17)
671 thread_unitcount = 16;
674 int thread_assignment = CmiMyRank() % thread_unitcount;
676 hwloc_obj_t thread_obj = cmi_hwloc_get_obj_by_type(topology, thread_unit, thread_assignment);
/* Duplicate before singlify: the object's own cpuset must not be mutated. */
677 hwloc_cpuset_t thread_cpuset = cmi_hwloc_bitmap_dup(thread_obj->cpuset);
678 cmi_hwloc_bitmap_singlify(thread_cpuset);
679 set_thread_affinity(topology, thread_cpuset);
680 cmi_hwloc_bitmap_free(thread_cpuset);
683 cmi_hwloc_topology_destroy(topology);
/* Bind the process to a 'process_unit' (by local rank), then bind the
 * calling thread to a 'thread_unit' inside that process's cpuset (by
 * thread rank), singlified to a single PU.
 * NOTE(review): unlike the two siblings above, this one has no BG/Q
 * 17-socket adjustment -- possibly an intentional omission; confirm. */
686 static void bind_process_and_threads(hwloc_obj_type_t process_unit, hwloc_obj_type_t thread_unit)
688 hwloc_topology_t topology;
/* NOTE(review): 'cpuset' appears unused in the visible lines. */
689 hwloc_cpuset_t cpuset;
690 cmi_hwloc_topology_init(&topology);
691 cmi_hwloc_topology_load(topology);
694 int process_unitcount = cmi_hwloc_get_nbobjs_by_type(topology, process_unit);
696 int process_assignment = CmiMyLocalRank % process_unitcount;
698 hwloc_obj_t process_obj = cmi_hwloc_get_obj_by_type(topology, process_unit, process_assignment);
699 set_process_affinity(topology, process_obj->cpuset);
/* Thread units are counted only inside the cpuset this process now owns. */
701 int thread_unitcount = cmi_hwloc_get_nbobjs_inside_cpuset_by_type(topology, process_obj->cpuset, thread_unit);
703 int thread_assignment = CmiMyRank() % thread_unitcount;
705 hwloc_obj_t thread_obj = cmi_hwloc_get_obj_inside_cpuset_by_type(topology, process_obj->cpuset, thread_unit, thread_assignment);
706 hwloc_cpuset_t thread_cpuset = cmi_hwloc_bitmap_dup(thread_obj->cpuset);
707 cmi_hwloc_bitmap_singlify(thread_cpuset);
708 set_thread_affinity(topology, thread_cpuset);
709 cmi_hwloc_bitmap_free(thread_cpuset);
712 cmi_hwloc_topology_destroy(topology);
/* Apply a default binding policy selected by environment variables
 * (CmiProcessPerSocket/Core/PU x CmiOneWthPerSocket/Core/PU).
 * Returns a status consumed by CmiInitCPUAffinity ('done'); the value
 * itself is set in elided lines. */
716 static int set_default_affinity(void)
/* Process granularity: socket. */
721 if ((s = getenv("CmiProcessPerSocket")))
725 if (getenv("CmiOneWthPerCore"))
726 bind_process_and_threads(HWLOC_OBJ_PACKAGE, HWLOC_OBJ_CORE);
727 else if (getenv("CmiOneWthPerPU"))
728 bind_process_and_threads(HWLOC_OBJ_PACKAGE, HWLOC_OBJ_PU);
731 bind_process_only(HWLOC_OBJ_PACKAGE);
/* Process granularity: core. */
733 else if ((s = getenv("CmiProcessPerCore")))
737 if (getenv("CmiOneWthPerPU"))
738 bind_process_and_threads(HWLOC_OBJ_CORE, HWLOC_OBJ_PU);
741 bind_process_only(HWLOC_OBJ_CORE);
/* Process granularity: PU. */
743 else if ((s = getenv("CmiProcessPerPU")))
746 bind_process_only(HWLOC_OBJ_PU);
/* No process-level request: bind threads only. */
748 else // if ((s = getenv("CmiProcessPerHost")))
751 if (getenv("CmiOneWthPerSocket"))
754 bind_threads_only(HWLOC_OBJ_PACKAGE);
756 else if (getenv("CmiOneWthPerCore"))
759 bind_threads_only(HWLOC_OBJ_CORE);
761 else if (getenv("CmiOneWthPerPU"))
764 bind_threads_only(HWLOC_OBJ_PU);
/* Main affinity entry point, called on every PE during startup.
 * Parses +setcpuaffinity/+excludecore/+pemap/+pemapfile/+commap, registers
 * the message handlers, applies environment-variable defaults, then either
 * uses an explicit pemap/commap, a Cray-computed placement, or the
 * IP-based node-grouping protocol (hostnameMsg to PE0, rankMsg broadcast).
 * NOTE(review): this view is heavily elided; many braces, declarations,
 * and #if/#else arms between the numbered lines are missing. */
772 void CmiInitCPUAffinity(char **argv)
774 static skt_ip_t myip;
779 char *pemapfile = NULL;
781 int show_affinity_flag;
783 int affinity_flag = CmiGetArgFlagDesc(argv,"+setcpuaffinity",
/* Collect every +excludecore occurrence; only rank 0 mutates the shared list. */
786 while (CmiGetArgIntDesc(argv,"+excludecore", &exclude, "avoid core when setting cpuaffinity")) {
787 if (CmiMyRank() == 0) add_exclude(exclude);
/* +pemapfile: read the map string from the first line of a file. */
791 if (CmiGetArgStringDesc(argv, "+pemapfile", &pemapfile, "define pe to core mapping file")) {
794 pemap = (char*)malloc(1024);
795 fp = fopen(pemapfile, "r");
796 if (fp == NULL) CmiAbort("pemapfile does not exist");
798 if (fgets(buf, 128, fp)) {
/* Strip the trailing newline left by fgets. */
799 if (buf[strlen(buf)-1] == '\n') buf[strlen(buf)-1] = 0;
804 if (CmiMyPe()==0) CmiPrintf("Charm++> read from pemap file '%s': %s\n", pemapfile, pemap);
807 CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping");
808 if (pemap!=NULL && excludecount>0)
809 CmiAbort("Charm++> +pemap can not be used with +excludecore.\n");
811 CmiGetArgStringDesc(argv, "+commap", &commap, "define comm threads to core mapping");
/* An explicit map implies affinity even without +setcpuaffinity. */
813 if (pemap!=NULL || commap!=NULL) affinity_flag = 1;
815 show_affinity_flag = CmiGetArgFlagDesc(argv,"+showcpuaffinity", "print cpu affinity");
817 CmiAssignOnce(&cpuAffinityHandlerIdx, CmiRegisterHandler((CmiHandler)cpuAffinityHandler));
818 CmiAssignOnce(&cpuAffinityRecvHandlerIdx, CmiRegisterHandler((CmiHandler)cpuAffinityRecvHandler));
819 CmiAssignOnce(&cpuPhyNodeAffinityRecvHandlerIdx, CmiRegisterHandler((CmiHandler)cpuPhyNodeAffinityRecvHandler));
826 /* must bind the rank 0 which is the main thread first */
827 /* binding the main thread seems to change binding for all threads */
828 if (CmiMyRank() == 0) {
829 done = set_default_affinity();
834 if (CmiMyRank() != 0) {
835 done = set_default_affinity();
839 if (show_affinity_flag) CmiPrintCPUAffinity();
/* Per-node one-time setup of the lock and the usage-union accumulator. */
844 if (CmiMyRank() ==0) {
845 affLock = CmiCreateLock();
847 aff_is_set = affinity_flag;
848 CPU_ZERO(&core_usage);
/* BG/Q: affinity is fixed by the system; all options are no-ops. */
855 if(CmiMyPe()==0) CmiPrintf("Charm++> cpu affinity setting is not needed on Blue Gene/Q, thus ignored.\n");
857 if(show_affinity_flag){
858 show_affinity_flag = 0;
859 if(CmiMyPe()==0) CmiPrintf("Charm++> printing cpu affinity is not supported on Blue Gene/Q.\n");
863 if (!affinity_flag) {
864 if (show_affinity_flag) {
865 CmiPrintCPUAffinity();
866 CmiPrintf("Charm++> cpu affinity NOT enabled.\n");
/* Startup banner from PE0 describing the chosen options. */
871 if (CmiMyPe() == 0) {
872 CmiPrintf("Charm++> cpu affinity enabled. \n");
873 if (excludecount > 0) {
874 CmiPrintf("Charm++> cpuaffinity excludes core: %d", excludecore[0]);
875 for (i=1; i<excludecount; i++) CmiPrintf(" %d", excludecore[i]);
879 CmiPrintf("Charm++> cpuaffinity PE-core map : %s\n", pemap);
882 if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */
883 /* comm thread either can float around, or pin down to the last rank.
884 however it seems to be reportedly slower if it is floating */
886 if (commap != NULL) {
/* Comm threads index into commap by their position past the worker PEs. */
887 int mycore = search_pemap(commap, CmiMyPeGlobal()-CmiNumPesGlobal());
888 if (CmiPhysicalNodeID(CmiMyPe()) == 0) CmiPrintf("Charm++> set comm %d on node %d to core #%d\n", CmiMyPe()-CmiNumPes(), CmiMyNode(), mycore);
889 if (-1 == CmiSetCPUAffinity(mycore))
890 CmiAbort("set_cpu_affinity abort!");
892 if (show_affinity_flag) CmiPrintCPUAffinity();
893 return; /* comm thread return */
896 /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */
897 #if !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ && !CMK_PAMI_LINUX_PPC8
/* Unpinned comm thread must still pump the network until workers finish. */
899 #if CMK_MACHINE_PROGRESS_DEFINED
900 while (affinity_doneflag < CmiMyNodeSize()) CmiNetworkProgress();
903 #error "Machine progress call needs to be implemented for cpu affinity!"
908 #if CMK_CRAYXE || CMK_CRAYXC
909 /* if both pemap and commap are NULL, will compute one */
914 if (show_affinity_flag) CmiPrintCPUAffinity();
915 return; /* comm thread return */
/* Worker PEs with an explicit +pemap: bind directly. */
920 if (pemap != NULL && CmiMyPe()<CmiNumPes()) { /* work thread */
921 int mycore = search_pemap(pemap, CmiMyPeGlobal());
922 if(show_affinity_flag) CmiPrintf("Charm++> set PE %d on node %d to core #%d\n", CmiMyPe(), CmiMyNode(), mycore);
923 if (mycore >= CmiNumCores()) {
924 CmiPrintf("Error> Invalid core number %d, only have %d cores (0-%d) on the node. \n", mycore, CmiNumCores(), CmiNumCores()-1);
925 CmiAbort("Invalid core number");
927 if (CmiSetCPUAffinity(mycore) == -1) CmiAbort("set_cpu_affinity abort!");
930 /* if (show_affinity_flag) CmiPrintCPUAffinity(); */
/* Cray XE/XC: derive a rank from the machine's node layout instead of
 * exchanging hostname messages. */
934 #if CMK_CRAYXE || CMK_CRAYXC
936 int numCores = CmiNumCores();
938 int myid = getXTNodeID(CmiMyNodeGlobal(), CmiNumNodesGlobal());
940 int pe, mype = CmiMyPeGlobal();
941 int node = CmiMyNodeGlobal();
944 if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */
945 int node = CmiMyPe() - CmiNumPes();
946 mype = CmiGetPeGlobal(CmiNodeFirst(node) + CmiMyNodeSize() - 1, CmiMyPartition()); /* last pe on SMP node */
947 node = CmiGetNodeGlobal(node, CmiMyPartition());
/* Walk backwards over PEs sharing this physical (XT) node to find this
 * PE's position on it (loop header elided). */
952 int n = CmiNodeOf(pe);
953 if (n != node) { nnodes++; node = n; }
954 if (getXTNodeID(n, CmiNumNodesGlobal()) != myid) break;
957 CmiAssert(numCores > 0);
958 myrank = (mype - pe - 1 + nnodes)%numCores;
/* Comm thread takes the core after the last worker's. */
960 if (CmiMyPe() >= CmiNumPes())
961 myrank = (myrank + 1)%numCores;
964 if (-1 != CmiSetCPUAffinity(myrank)) {
965 DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank, mynode));
968 CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
969 CmiAbort("set cpu affinity abort!\n");
972 if (CmiMyPe() < CmiNumPes())
/* Generic path: identify this node by IP and let PE0 assign ranks. */
976 /* get my ip address */
977 if (CmiMyRank() == 0)
979 #if CMK_HAS_GETHOSTNAME
980 myip = skt_my_ip(); /* not thread safe, so only calls on rank 0 */
982 CmiAbort("Can not get unique name for the compute nodes. \n");
987 /* prepare a msg to send */
988 msg = (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg));
989 CmiSetHandler((char *)msg, cpuAffinityHandlerIdx);
992 msg->ncores = CmiNumCores();
993 DEBUGP(("PE %d's node has %d number of cores. \n", CmiMyPe(), msg->ncores));
995 CmiSyncSendAndFree(0, sizeof(hostnameMsg), (void *)msg);
997 if (CmiMyPe() == 0) {
999 hostTable = CmmNew();
/* rankMsg carries its two int arrays inline, right after the struct. */
1000 rankmsg = (rankMsg *)CmiAlloc(sizeof(rankMsg)+CmiNumPes()*sizeof(int)*2);
1001 CmiSetHandler((char *)rankmsg, cpuAffinityRecvHandlerIdx);
1002 rankmsg->ranks = (int *)((char*)rankmsg + sizeof(rankMsg));
1003 rankmsg->nodes = (int *)((char*)rankmsg + sizeof(rankMsg) + CmiNumPes()*sizeof(int));
1004 for (i=0; i<CmiNumPes(); i++) {
1005 rankmsg->ranks[i] = 0;
1006 rankmsg->nodes[i] = -1;
/* PE0 drains one hostnameMsg per PE before the handler broadcasts. */
1009 for (i=0; i<CmiNumPes(); i++) CmiDeliverSpecificMsg(cpuAffinityHandlerIdx);
1012 /* receive broadcast from PE 0 */
1013 CmiDeliverSpecificMsg(cpuAffinityRecvHandlerIdx);
1015 affinity_doneflag++;
1017 CmiNodeAllBarrier();
1020 if (show_affinity_flag) CmiPrintCPUAffinity();
1023 /* called in ConverseCommonInit to initialize basic variables */
/* Early per-PE setup (called from ConverseCommonInit): initialize
 * myCPUAffToCore to "unbound" and open the /proc stat file that
 * CmiOnCore() parses (per-task path in one elided branch, per-process
 * in the other). */
1024 void CmiInitCPUAffinityUtil(void){
1026 CpvInitialize(int, myCPUAffToCore);
1027 CpvAccess(myCPUAffToCore) = -1;
1029 CpvInitialize(void *, myProcStatFP);
/* Serialize fopen across threads of this process. */
1030 CmiLock(_smp_mutex);
/* NOTE(review): unbounded sprintf into 'fname' (size elided) -- paths are
 * short, but snprintf would be safer; consider in a future change. */
1032 sprintf(fname, "/proc/%d/task/%ld/stat", getpid(), syscall(SYS_gettid));
1034 sprintf(fname, "/proc/%d/stat", getpid());
1036 CpvAccess(myProcStatFP) = (void *)fopen(fname, "r");
1037 CmiUnlock(_smp_mutex);
/* A NULL handle just disables CmiOnCore(); warn once from PE0. */
1039 if(CmiMyPe()==0 && CpvAccess(myProcStatFP) == NULL){
1040 CmiPrintf("WARNING: ERROR IN OPENING FILE %s on PROC %d, CmiOnCore() SHOULDN'T BE CALLED\n", fname, CmiMyPe());
1046 #else /* not supporting affinity */
/* No-affinity fallback: stub for platforms without setaffinity support
 * (body elided in this view; presumably returns failure). */
1048 int CmiSetCPUAffinity(int mycore)
/* No-affinity fallback: printing is unsupported; just warn. */
1053 int CmiPrintCPUAffinity(void)
1055 CmiPrintf("Warning: CmiPrintCPUAffinity not supported.\n");
/* No-affinity fallback: nothing to check. */
1059 void CmiCheckAffinity(void) {
/* No-affinity fallback: still consume all affinity command-line options so
 * they are not reported as unknown, then warn (from PE0 only) that each
 * requested option is disabled on this platform. */
1062 void CmiInitCPUAffinity(char **argv)
1065 char *pemapfile = NULL;
1066 char *commap = NULL;
1067 int excludecore = -1;
1068 int affinity_flag = CmiGetArgFlagDesc(argv,"+setcpuaffinity",
1069 "set cpu affinity");
/* Loop with empty body: swallow every +excludecore occurrence. */
1070 while (CmiGetArgIntDesc(argv,"+excludecore",&excludecore, "avoid core when setting cpuaffinity"));
1071 CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping");
1072 CmiGetArgStringDesc(argv, "+pemapfile", &pemapfile, "define pe to core mapping file");
1073 CmiGetArgStringDesc(argv, "+commap", &commap, "define comm threads to core mapping");
1074 CmiGetArgFlagDesc(argv,"+showcpuaffinity", "print cpu affinity");
1075 if (affinity_flag && CmiMyPe()==0)
1076 CmiPrintf("sched_setaffinity() is not supported, +setcpuaffinity disabled.\n");
1077 if (excludecore != -1 && CmiMyPe()==0)
1078 CmiPrintf("sched_setaffinity() is not supported, +excludecore disabled.\n");
1079 if (pemap && CmiMyPe()==0)
1080 CmiPrintf("sched_setaffinity() is not supported, +pemap disabled.\n");
1081 if (pemapfile && CmiMyPe()==0)
1082 CmiPrintf("sched_setaffinity() is not supported, +pemapfile disabled.\n");
1083 if (commap && CmiMyPe()==0)
1084 CmiPrintf("sched_setaffinity() is not supported, +commap disabled.\n");
1087 /* called in ConverseCommonInit to initialize basic variables */
/* No-affinity fallback: mark this PE unbound and leave the /proc stat
 * handle NULL so CmiOnCore() reports unsupported. */
1088 void CmiInitCPUAffinityUtil(void){
1089 CpvInitialize(int, myCPUAffToCore);
1090 CpvAccess(myCPUAffToCore) = -1;
1092 CpvInitialize(void *, myProcStatFP);
1093 CpvAccess(myProcStatFP) = NULL;
1097 int CmiOnCore(void){
1098 printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");