build: Fix --no-build-shared option to properly cancel --build-shared
[charm.git] / src / conv-core / cpuaffinity.c
blob95b955d563e35d36940b2050a904e4928e6c33b6
/*
 This scheme relies on using the IP address to identify nodes and to assign
 cpu affinity.

 When CMK_NO_SOCKETS is set, which is typically the case on Cray XT3 and
 Blue Gene/L, there is no hostname for the compute nodes.
 *
 * last updated  3/20/2010   Gengbin Zheng
 * new options +pemap and +commap take a complex pattern describing a list of cores
*/
13 #define _GNU_SOURCE
15 #include "converse.h"
16 #include "sockRoutines.h"
17 #include "hwloc.h"
19 #if CMK_USE_IBVERBS
20 #include <infiniband/verbs.h>
21 #include <hwloc/openfabrics-verbs.h>
22 #endif
24 #define DEBUGP(x) /* CmiPrintf x; */
25 CpvDeclare(int, myCPUAffToCore);
26 #if CMK_OS_IS_LINUX
/*
 * /proc/<PID>/[task/<TID>]/stat file handle (a FILE *, stored as void *).
 * Used to retrieve the info about which physical
 * core this process or thread is on.
 **/
32 CpvDeclare(void *, myProcStatFP);
33 #endif
35 CmiHwlocTopology CmiHwlocTopologyLocal;
37 void CmiInitHwlocTopology(void)
39 hwloc_topology_t topology;
40 int depth;
42 /* Allocate and initialize topology object. */
43 cmi_hwloc_topology_init(&topology);
44 /* Perform the topology detection. */
45 cmi_hwloc_topology_load(topology);
47 // packages == sockets
48 depth = cmi_hwloc_get_type_depth(topology, HWLOC_OBJ_PACKAGE);
49 CmiHwlocTopologyLocal.num_sockets = depth != HWLOC_TYPE_DEPTH_UNKNOWN ? cmi_hwloc_get_nbobjs_by_depth(topology, depth) : 1;
50 #if CMK_BLUEGENEQ
51 // ignore BG/Q's reserved socket
52 if (CmiHwlocTopologyLocal.num_sockets == 17)
53 CmiHwlocTopologyLocal.num_sockets = 16;
54 #endif
56 // cores
57 depth = cmi_hwloc_get_type_depth(topology, HWLOC_OBJ_CORE);
58 CmiHwlocTopologyLocal.num_cores = depth != HWLOC_TYPE_DEPTH_UNKNOWN ? cmi_hwloc_get_nbobjs_by_depth(topology, depth) : 1;
60 // PUs
61 depth = cmi_hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
62 CmiHwlocTopologyLocal.num_pus = depth != HWLOC_TYPE_DEPTH_UNKNOWN ? cmi_hwloc_get_nbobjs_by_depth(topology, depth) : 1;
64 cmi_hwloc_topology_destroy(topology);
67 #if CMK_HAS_SETAFFINITY || defined (_WIN32) || CMK_HAS_BINDPROCESSOR
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
74 #ifdef _WIN32
75 #include <windows.h>
76 #include <winbase.h>
77 #else
78 #define _GNU_SOURCE
79 #include <sched.h>
80 //long sched_setaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr);
81 //long sched_getaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr);
82 #endif
84 #if CMK_OS_IS_LINUX
85 #include <sys/syscall.h>
86 #endif
88 #if defined(__APPLE__)
89 #include <Carbon/Carbon.h> /* Carbon APIs for Multiprocessing */
90 #endif
92 #define MAX_EXCLUDE 64
93 static int excludecore[MAX_EXCLUDE] = {-1};
94 static int excludecount = 0;
96 static int affinity_doneflag = 0;
98 #ifndef _WIN32
99 static int affMsgsRecvd = 1; // number of affinity messages received at PE0
100 static cpu_set_t core_usage; // used to record union of CPUs used by every PE in physical node
101 static int aff_is_set = 0;
102 #endif
104 static int in_exclude(int core)
106 int i;
107 for (i=0; i<excludecount; i++) if (core == excludecore[i]) return 1;
108 return 0;
111 static void add_exclude(int core)
113 if (in_exclude(core)) return;
114 CmiAssert(excludecount < MAX_EXCLUDE);
115 excludecore[excludecount++] = core;
118 #if CMK_HAS_BINDPROCESSOR
119 #include <sys/processor.h>
120 #endif
122 static int set_process_affinity(hwloc_topology_t topology, hwloc_cpuset_t cpuset)
124 #ifdef _WIN32
125 HANDLE process = GetCurrentProcess();
126 #else
127 pid_t process = getpid();
128 #endif
130 if (cmi_hwloc_set_proc_cpubind(topology, process, cpuset, HWLOC_CPUBIND_PROCESS|HWLOC_CPUBIND_STRICT))
132 char *str;
133 int error = errno;
134 cmi_hwloc_bitmap_asprintf(&str, cpuset);
135 CmiPrintf("HWLOC> Couldn't bind to cpuset %s: %s\n", str, strerror(error));
136 free(str);
137 return -1;
140 #if CMK_CHARMDEBUG
141 if (CmiPhysicalNodeID(CmiMyPe()) == 0)
143 char *str;
144 cmi_hwloc_bitmap_asprintf(&str, cpuset);
145 CmiPrintf("HWLOC> [%d] Process %p bound to cpuset: %s\n", CmiMyPe(), process, str);
146 free(str);
148 #endif
150 return 0;
153 #if CMK_SMP
154 static int set_thread_affinity(hwloc_topology_t topology, hwloc_cpuset_t cpuset)
156 #ifdef _WIN32
157 HANDLE thread = GetCurrentThread();
158 #else
159 pthread_t thread = pthread_self();
160 #endif
162 if (cmi_hwloc_set_thread_cpubind(topology, thread, cpuset, HWLOC_CPUBIND_THREAD|HWLOC_CPUBIND_STRICT))
164 char *str;
165 int error = errno;
166 cmi_hwloc_bitmap_asprintf(&str, cpuset);
167 CmiPrintf("HWLOC> Couldn't bind to cpuset %s: %s\n", str, strerror(error));
168 free(str);
169 return -1;
172 #if CMK_CHARMDEBUG
173 if (CmiPhysicalNodeID(CmiMyPe()) == 0)
175 char *str;
176 cmi_hwloc_bitmap_asprintf(&str, cpuset);
177 CmiPrintf("HWLOC> [%d] Thread %p bound to cpuset: %s\n", CmiMyPe(), thread, str);
178 free(str);
180 #endif
182 return 0;
184 #endif
187 int CmiSetCPUAffinity(int mycore)
189 int core = mycore;
190 if (core < 0) {
191 core = CmiNumCores() + core;
193 if (core < 0) {
194 CmiError("Error: Invalid cpu affinity core number: %d\n", mycore);
195 CmiAbort("CmiSetCPUAffinity failed");
198 CpvAccess(myCPUAffToCore) = core;
200 hwloc_topology_t topology;
202 cmi_hwloc_topology_init(&topology);
203 cmi_hwloc_topology_load(topology);
205 hwloc_cpuset_t cpuset = cmi_hwloc_bitmap_alloc();
206 cmi_hwloc_bitmap_set(cpuset, core);
208 #if CMK_SMP
209 set_thread_affinity(topology, cpuset);
210 #else
211 set_process_affinity(topology, cpuset);
212 #endif
214 cmi_hwloc_bitmap_free(cpuset);
216 cmi_hwloc_topology_destroy(topology);
217 return 0;
220 /* This implementation assumes the default x86 CPU mask size used by Linux */
221 /* For a large SMP machine, this code should be changed to use a variable sized */
222 /* CPU affinity mask buffer instead, as the present code will fail beyond 32 CPUs */
223 int print_cpu_affinity(void) {
224 hwloc_topology_t topology;
225 // Allocate and initialize topology object.
226 cmi_hwloc_topology_init(&topology);
227 // Perform the topology detection.
228 cmi_hwloc_topology_load(topology);
230 hwloc_cpuset_t cpuset = cmi_hwloc_bitmap_alloc();
231 // And try to bind ourself there. */
232 if (cmi_hwloc_get_cpubind(topology, cpuset, 0)) {
233 int error = errno;
234 CmiPrintf("[%d] CPU affinity mask is unknown %s\n", CmiMyPe(), strerror(error));
235 cmi_hwloc_bitmap_free(cpuset);
236 cmi_hwloc_topology_destroy(topology);
237 return -1;
240 char *str;
241 cmi_hwloc_bitmap_asprintf(&str, cpuset);
242 CmiPrintf("[%d] CPU affinity mask is %s\n", CmiMyPe(), str);
243 free(str);
244 cmi_hwloc_bitmap_free(cpuset);
245 cmi_hwloc_topology_destroy(topology);
246 return 0;
249 #if CMK_SMP
250 int print_thread_affinity(void) {
251 hwloc_topology_t topology;
252 // Allocate and initialize topology object.
253 cmi_hwloc_topology_init(&topology);
254 // Perform the topology detection.
255 cmi_hwloc_topology_load(topology);
257 #ifdef _WIN32
258 HANDLE thread = GetCurrentThread();
259 #else
260 pthread_t thread = pthread_self();
261 #endif
263 hwloc_cpuset_t cpuset = cmi_hwloc_bitmap_alloc();
264 // And try to bind ourself there. */
265 // if (cmi_hwloc_get_thread_cpubind(topology, thread, cpuset, HWLOC_CPUBIND_THREAD)) {
266 if (cmi_hwloc_get_cpubind(topology, cpuset, HWLOC_CPUBIND_THREAD) == -1) {
267 int error = errno;
268 CmiPrintf("[%d] thread CPU affinity mask is unknown %s\n", CmiMyPe(), strerror(error));
269 cmi_hwloc_bitmap_free(cpuset);
270 cmi_hwloc_topology_destroy(topology);
271 return -1;
274 char *str;
275 cmi_hwloc_bitmap_asprintf(&str, cpuset);
276 CmiPrintf("[%d] thread CPU affinity mask is %s\n", CmiMyPe(), str);
277 free(str);
278 cmi_hwloc_bitmap_free(cpuset);
279 cmi_hwloc_topology_destroy(topology);
280 return 0;
283 #endif
/*
 * Print the affinity of the caller: the thread binding in SMP builds,
 * the process binding otherwise. Returns what the selected printer returns.
 */
int CmiPrintCPUAffinity(void)
{
  int status;
#if CMK_SMP
  status = print_thread_affinity();
#else
  status = print_cpu_affinity();
#endif
  return status;
}
294 #ifndef _WIN32
/*
 * Store the CPU affinity mask of the calling process in *cpuset.
 * Returns 0 on success, -1 (after perror) if sched_getaffinity fails.
 */
int get_cpu_affinity(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
  /* Pass the size of the mask itself; sizeof(cpuset) would be the size of a pointer. */
  if (sched_getaffinity(0, sizeof(*cpuset), cpuset) < 0) {
    perror("sched_getaffinity");
    return -1;
  }
  return 0;
}
304 #if CMK_SMP
/*
 * Store the CPU affinity mask of the calling thread in *cpuset.
 * Returns 0 on success; -1 if pthread_getaffinity_np fails (after perror)
 * or when CMK_HAS_PTHREAD_SETAFFINITY is not set.
 */
int get_thread_affinity(cpu_set_t *cpuset) {
#if CMK_HAS_PTHREAD_SETAFFINITY
  int rc;
  CPU_ZERO(cpuset);
  rc = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset);
  /* The pthread call returns its error code; mirror it into errno for perror. */
  errno = rc;
  if (rc != 0) {
    perror("pthread_getaffinity");
    return -1;
  }
  return 0;
#else
  return -1;
#endif
}
317 #endif
/*
 * Fetch the affinity mask of the caller into *cpuset: the thread mask in
 * SMP builds, the process mask otherwise. Returns 0 on success, -1 on error.
 */
int get_affinity(cpu_set_t *cpuset) {
  int status;
#if CMK_SMP
  status = get_thread_affinity(cpuset);
#else
  status = get_cpu_affinity(cpuset);
#endif
  return status;
}
326 #endif
/*
 * Return the number of the CPU the calling process/thread last ran on,
 * or -1 (with a warning) when that information is not available.
 */
int CmiOnCore(void) {
#if CMK_OS_IS_LINUX
  /*
   * The info (task_cpu) is read from the Linux /proc virtual file system.
   * The /proc/<PID>/[task/<TID>]/stat is explained in the Linux
   * kernel documentation. The online one could be found in:
   * http://www.mjmwired.net/kernel/Documentation/filesystems/proc.txt
   * Based on the documentation, task_cpu is found at the 39th field in
   * the stat file.
   *
   * The second field is the command name in parentheses and may itself
   * contain spaces, so fields are counted from the last ')' on the line
   * instead of splitting the whole line on whitespace.
   */
#define TASK_CPU_POS (39)
  char line[1024];
  char *p;
  int field;
  FILE *fp = (FILE *)CpvAccess(myProcStatFP);
  if (fp == NULL){
    printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");
    return -1;
  }
  fseek(fp, 0, SEEK_SET);
  if (fgets(line, sizeof(line), fp) == NULL) {
    CmiAbort("CPU affinity> reading from /proc/<PID>/[task/<TID>]/stat failed!");
  }
  p = strrchr(line, ')');
  if (p == NULL) {
    CmiAbort("CPU affinity> reading from /proc/<PID>/[task/<TID>]/stat failed!");
  }
  p++;
  /* The text after ')' starts with field 3; walk forward to field TASK_CPU_POS. */
  for (field = 3; ; field++) {
    while (*p == ' ') p++;
    if (*p == '\0' || *p == '\n') {
      CmiAbort("CPU affinity> reading from /proc/<PID>/[task/<TID>]/stat failed!");
    }
    if (field == TASK_CPU_POS) break;
    while (*p != ' ' && *p != '\0' && *p != '\n') p++;
  }
  return atoi(p);
#else
  printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");
  return -1;
#endif
}
360 static int cpuAffinityHandlerIdx;
361 static int cpuAffinityRecvHandlerIdx;
362 static int cpuPhyNodeAffinityRecvHandlerIdx;
364 typedef struct _hostnameMsg {
365 char core[CmiMsgHeaderSizeBytes];
366 int pe;
367 skt_ip_t ip;
368 int ncores;
369 int rank;
370 int seq;
371 } hostnameMsg;
373 typedef struct _rankMsg {
374 char core[CmiMsgHeaderSizeBytes];
375 int *ranks; /* PE => core rank mapping */
376 int *nodes; /* PE => node number mapping */
377 } rankMsg;
379 typedef struct _affMsg {
380 char core[CmiMsgHeaderSizeBytes];
381 #ifndef _WIN32
382 cpu_set_t affinity;
383 #endif
384 } affMsg;
386 static rankMsg *rankmsg = NULL;
387 static CmmTable hostTable;
388 static CmiNodeLock affLock = 0;
390 /* called on PE 0 */
391 static void cpuAffinityHandler(void *m)
393 static int count = 0;
394 static int nodecount = 0;
395 hostnameMsg *rec;
396 hostnameMsg *msg = (hostnameMsg *)m;
397 void *tmpm;
398 int tag, tag1, pe, myrank;
399 int npes = CmiNumPes();
401 /* for debug
402 char str[128];
403 skt_print_ip(str, msg->ip);
404 printf("hostname: %d %s\n", msg->pe, str);
406 CmiAssert(CmiMyPe()==0 && rankmsg != NULL);
407 tag = *(int*)&msg->ip;
408 pe = msg->pe;
409 if ((rec = (hostnameMsg *)CmmProbe(hostTable, 1, &tag, &tag1)) != NULL) {
410 CmiFree(msg);
412 else {
413 rec = msg;
414 rec->seq = nodecount;
415 nodecount++; /* a new node record */
416 CmmPut(hostTable, 1, &tag, msg);
418 myrank = rec->rank%rec->ncores;
419 while (in_exclude(myrank)) { /* skip excluded core */
420 myrank = (myrank+1)%rec->ncores;
421 rec->rank ++;
423 rankmsg->ranks[pe] = myrank; /* core rank */
424 rankmsg->nodes[pe] = rec->seq; /* on which node */
425 rec->rank ++;
426 count ++;
427 if (count == CmiNumPes()) {
428 DEBUGP(("Cpuaffinity> %d unique compute nodes detected! \n", CmmEntries(hostTable)));
429 tag = CmmWildCard;
430 while ((tmpm = CmmGet(hostTable, 1, &tag, &tag1))) CmiFree(tmpm);
431 CmmFree(hostTable);
432 #if 1
433 /* bubble sort ranks on each node according to the PE number */
435 int i,j;
436 for (i=0; i<npes-1; i++)
437 for(j=i+1; j<npes; j++) {
438 if (rankmsg->nodes[i] == rankmsg->nodes[j] &&
439 rankmsg->ranks[i] > rankmsg->ranks[j])
441 int tmp = rankmsg->ranks[i];
442 rankmsg->ranks[i] = rankmsg->ranks[j];
443 rankmsg->ranks[j] = tmp;
447 #endif
448 CmiSyncBroadcastAllAndFree(sizeof(rankMsg)+CmiNumPes()*sizeof(int)*2, (void *)rankmsg);
452 /* called on each processor */
453 static void cpuAffinityRecvHandler(void *msg)
455 int myrank, mynode;
456 rankMsg *m = (rankMsg *)msg;
457 m->ranks = (int *)((char*)m + sizeof(rankMsg));
458 m->nodes = (int *)((char*)m + sizeof(rankMsg) + CmiNumPes()*sizeof(int));
459 myrank = m->ranks[CmiMyPe()];
460 mynode = m->nodes[CmiMyPe()];
462 DEBUGP(("[%d %d] set to core #: %d\n", CmiMyNode(), CmiMyPe(), myrank));
464 if (-1 != CmiSetCPUAffinity(myrank)) {
465 DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank, mynode));
467 else{
468 CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
469 CmiAbort("set cpu affinity abort!\n");
471 CmiFree(m);
474 /* called on first PE in physical node, receive affinity set from other PEs in phy node */
475 static void cpuPhyNodeAffinityRecvHandler(void *msg)
477 affMsg *m = (affMsg *)msg;
478 #if !defined(_WIN32) && defined(CPU_OR)
479 CPU_OR(&core_usage, &core_usage, &m->affinity);
480 affMsgsRecvd++;
481 #endif
482 CmiFree(m);
485 #if defined(_WIN32)
486 /* strtok is thread safe in VC++ */
487 #define strtok_r(x,y,z) strtok(x,y)
488 #endif
/*
 * Expand a +pemap/+commap pattern and return the core assigned to `pe`.
 *
 * pecoremap: comma separated list of items. Each item is a core number or
 *            a range "start-end[:stride[.block]]", optionally prefixed by
 *            an iteration count ("<iter>x" or "<iter>X") and followed by
 *            one or more "+offset" terms.
 * pe:        index to look up; the expanded list is reused cyclically when
 *            it is shorter than the number of PEs.
 *
 * At most CmiNumPesGlobal() entries are expanded. Malformed items produce a
 * warning; an item whose first number cannot be read is skipped. Aborts if
 * memory cannot be allocated or the pattern expands to no cores at all.
 */
static int search_pemap(char *pecoremap, int pe)
{
  const int npes = CmiNumPesGlobal();
  int *map = (int *)malloc(npes*sizeof(int));
  char *mapstr = (char*)malloc(strlen(pecoremap)+1);
  char *ptr = NULL;
  char *str;
  int h, i, j, k, count;
  int plusarr[128];
  const int maxplus = (int)(sizeof(plusarr)/sizeof(plusarr[0])) - 1;

  if (map == NULL || mapstr == NULL) {
    free(map);
    free(mapstr);
    CmiAbort("Charm++> out of memory while parsing the PE to core map.\n");
  }
  /* strtok_r modifies its input, so work on a private copy */
  strcpy(mapstr, pecoremap);

  str = strtok_r(mapstr, ",", &ptr);
  count = 0;
  while (str && count < npes)
  {
      int hasdash=0, hascolon=0, hasdot=0, hasstar1=0, hasstar2=0, numplus=0;
      int start=0, end=0, stride=1, block=1;
      int iter=1;
      int got, expected;
      int len = (int)strlen(str);
      plusarr[0] = 0;
      /* first pass: find out which separators the item contains and collect the +offsets */
      for (i=0; i<len; i++) {
          if (str[i] == '-' && i!=0) hasdash=1;
          else if (str[i] == ':') hascolon=1;
          else if (str[i] == '.') hasdot=1;
          else if (str[i] == 'x') hasstar1=1;
          else if (str[i] == 'X') hasstar2=1;
          else if (str[i] == '+') {
            if (str[i+1] == '+' || str[i+1] == '-') {
              printf("Warning: Check the format of \"%s\".\n", str);
            } else if (numplus >= maxplus) {
              /* plusarr is full: ignore further offsets instead of overrunning it */
              printf("Warning: Check the format of \"%s\".\n", str);
            } else if (sscanf(&str[i], "+%d", &plusarr[++numplus]) != 1) {
              printf("Warning: Check the format of \"%s\".\n", str);
              --numplus;
            }
          }
      }
      if (hasstar1 || hasstar2) {
          if (hasstar1) sscanf(str, "%dx", &iter);
          if (hasstar2) sscanf(str, "%dX", &iter);
          /* continue parsing after the iteration prefix */
          while (*str!='x' && *str!='X') str++;
          str++;
      }
      if (hasdash) {
          if (hascolon) {
            if (hasdot) {
              expected = 4;
              got = sscanf(str, "%d-%d:%d.%d", &start, &end, &stride, &block);
            }
            else {
              expected = 3;
              got = sscanf(str, "%d-%d:%d", &start, &end, &stride);
            }
          }
          else {
            expected = 2;
            got = sscanf(str, "%d-%d", &start, &end);
          }
          if (got != expected)
            printf("Warning: Check the format of \"%s\".\n", str);
          if (got == 1) end = start;   /* range end could not be read */
      }
      else {
          got = sscanf(str, "%d", &start);
          if (got != 1)
            printf("Warning: Check the format of \"%s\".\n", str);
          end = start;
      }
      if (got < 1) {
          /* not even a first core number: nothing usable in this item */
          str = strtok_r(NULL, ",", &ptr);
          continue;
      }
      if (stride < 1) {
          /* a non-positive stride cannot walk the range forward */
          printf("Warning: invalid stride in \"%s\" ignored.\n", str);
          stride=1;
      }
      if (block > stride) {
        printf("Warning: invalid block size in \"%s\" ignored.\n", str);
        block=1;
      }
      for (k = 0; k<iter; k++) {
        for (i = start; i<=end; i+=stride) {
          for (j=0; j<block; j++) {
            if (i+j>end) break;
            for (h=0; h<=numplus; h++) {
              map[count++] = i+j+plusarr[h];
              if (count == npes) break;
            }
            if (count == npes) break;
          }
          if (count == npes) break;
        }
        if (count == npes) break;
      }
      str = strtok_r(NULL, ",", &ptr);
  }
  if (count == 0) {
    /* nothing was expanded: pe % count below would divide by zero */
    free(map);
    free(mapstr);
    CmiAbort("Charm++> the PE to core map does not contain any core.\n");
  }
  i = map[pe % count];

  free(map);
  free(mapstr);
  return i;
}
578 #if CMK_CRAYXE || CMK_CRAYXC
579 CMI_EXTERNC
580 int getXTNodeID(int mpirank, int nummpiranks);
581 #endif
584 * Check that there are not multiple PEs assigned to the same core.
585 * If a pemap has been computed by this module (or passed by the user) this
586 * function will print a warning if oversubscription detected. If no affinity
587 * has been set explicitly by this module, it will print error and abort if
588 * oversubscription detected.
590 CMI_EXTERNC
591 void CmiCheckAffinity(void)
593 #if !defined(_WIN32) && CMK_SMP && CMK_HAS_PTHREAD_SETAFFINITY && defined(CPU_OR)
595 if (!CmiCpuTopologyEnabled()) return; // only works if cpu topology enabled
597 if (CmiMyPe() == 0) {
598 // wait for every PE affinity from my physical node (for now only done on phy node 0)
600 cpu_set_t my_aff;
601 if (get_affinity(&my_aff) == -1) CmiAbort("get_affinity failed\n");
602 CPU_OR(&core_usage, &core_usage, &my_aff); // add my affinity (pe0)
603 int N = CmiNumPesOnPhysicalNode(0);
604 while (affMsgsRecvd < N)
605 CmiDeliverSpecificMsg(cpuPhyNodeAffinityRecvHandlerIdx);
607 // NOTE this test is simple and may not detect every possible case of
608 // oversubscription
609 if (CPU_COUNT(&core_usage) < N) {
610 // TODO suggest command line arguments?
611 if (!aff_is_set) {
612 CmiAbort("Multiple PEs assigned to same core. Set affinity "
613 "options to correct or lower the number of threads, or pass +setcpuaffinity to ignore.\n");
614 } else {
615 CmiPrintf("WARNING: Multiple PEs assigned to same core, recommend "
616 "adjusting processor affinity or passing +CmiSleepOnIdle to reduce "
617 "interference.\n");
620 } else if ((CmiMyPe() < CmiNumPes()) && (CmiPhysicalNodeID(CmiMyPe()) == 0)) {
621 // send my affinity to first PE on physical node (only done on phy node 0 for now)
622 affMsg *m = (affMsg*)CmiAlloc(sizeof(affMsg));
623 CmiSetHandler((char *)m, cpuPhyNodeAffinityRecvHandlerIdx);
624 if (get_affinity(&m->affinity) == -1) { // put my affinity in msg
625 CmiFree(m);
626 CmiAbort("get_affinity failed\n");
628 CmiSyncSendAndFree(0, sizeof(affMsg), (void *)m);
630 #endif
633 CMI_EXTERNC_VARIABLE int CmiMyLocalRank;
635 static void bind_process_only(hwloc_obj_type_t process_unit)
637 hwloc_topology_t topology;
638 hwloc_cpuset_t cpuset;
639 cmi_hwloc_topology_init(&topology);
640 cmi_hwloc_topology_load(topology);
643 int process_unitcount = cmi_hwloc_get_nbobjs_by_type(topology, process_unit);
644 #if CMK_BLUEGENEQ
645 // ignore BG/Q's reserved socket
646 if (process_unit == HWLOC_OBJ_PACKAGE && process_unitcount == 17)
647 process_unitcount = 16;
648 #endif
650 int process_assignment = CmiMyLocalRank % process_unitcount;
652 hwloc_obj_t process_obj = cmi_hwloc_get_obj_by_type(topology, process_unit, process_assignment);
653 set_process_affinity(topology, process_obj->cpuset);
656 cmi_hwloc_topology_destroy(topology);
659 #if CMK_SMP
660 static void bind_threads_only(hwloc_obj_type_t thread_unit)
662 hwloc_topology_t topology;
663 hwloc_cpuset_t cpuset;
664 cmi_hwloc_topology_init(&topology);
665 cmi_hwloc_topology_load(topology);
668 int thread_unitcount = cmi_hwloc_get_nbobjs_by_type(topology, thread_unit);
669 #if CMK_BLUEGENEQ
670 // ignore BG/Q's reserved socket
671 if (thread_unit == HWLOC_OBJ_PACKAGE && thread_unitcount == 17)
672 thread_unitcount = 16;
673 #endif
675 int thread_assignment = CmiMyRank() % thread_unitcount;
677 hwloc_obj_t thread_obj = cmi_hwloc_get_obj_by_type(topology, thread_unit, thread_assignment);
678 hwloc_cpuset_t thread_cpuset = cmi_hwloc_bitmap_dup(thread_obj->cpuset);
679 cmi_hwloc_bitmap_singlify(thread_cpuset);
680 set_thread_affinity(topology, thread_cpuset);
681 cmi_hwloc_bitmap_free(thread_cpuset);
684 cmi_hwloc_topology_destroy(topology);
687 static void bind_process_and_threads(hwloc_obj_type_t process_unit, hwloc_obj_type_t thread_unit)
689 hwloc_topology_t topology;
690 hwloc_cpuset_t cpuset;
691 cmi_hwloc_topology_init(&topology);
692 cmi_hwloc_topology_load(topology);
695 int process_unitcount = cmi_hwloc_get_nbobjs_by_type(topology, process_unit);
697 int process_assignment = CmiMyLocalRank % process_unitcount;
699 hwloc_obj_t process_obj = cmi_hwloc_get_obj_by_type(topology, process_unit, process_assignment);
700 set_process_affinity(topology, process_obj->cpuset);
702 int thread_unitcount = cmi_hwloc_get_nbobjs_inside_cpuset_by_type(topology, process_obj->cpuset, thread_unit);
704 int thread_assignment = CmiMyRank() % thread_unitcount;
706 hwloc_obj_t thread_obj = cmi_hwloc_get_obj_inside_cpuset_by_type(topology, process_obj->cpuset, thread_unit, thread_assignment);
707 hwloc_cpuset_t thread_cpuset = cmi_hwloc_bitmap_dup(thread_obj->cpuset);
708 cmi_hwloc_bitmap_singlify(thread_cpuset);
709 set_thread_affinity(topology, thread_cpuset);
710 cmi_hwloc_bitmap_free(thread_cpuset);
713 cmi_hwloc_topology_destroy(topology);
715 #endif
717 static int set_default_affinity(void)
719 char *s;
720 int n = -1;
722 if ((s = getenv("CmiProcessPerSocket")))
724 n = atoi(s);
725 #if CMK_SMP
726 if (getenv("CmiOneWthPerCore"))
727 bind_process_and_threads(HWLOC_OBJ_PACKAGE, HWLOC_OBJ_CORE);
728 else if (getenv("CmiOneWthPerPU"))
729 bind_process_and_threads(HWLOC_OBJ_PACKAGE, HWLOC_OBJ_PU);
730 else
731 #endif
732 bind_process_only(HWLOC_OBJ_PACKAGE);
734 else if ((s = getenv("CmiProcessPerCore")))
736 n = atoi(s);
737 #if CMK_SMP
738 if (getenv("CmiOneWthPerPU"))
739 bind_process_and_threads(HWLOC_OBJ_CORE, HWLOC_OBJ_PU);
740 else
741 #endif
742 bind_process_only(HWLOC_OBJ_CORE);
744 else if ((s = getenv("CmiProcessPerPU")))
746 n = atoi(s);
747 bind_process_only(HWLOC_OBJ_PU);
749 else // if ((s = getenv("CmiProcessPerHost")))
751 #if CMK_SMP
752 if (getenv("CmiOneWthPerSocket"))
754 n = 0;
755 bind_threads_only(HWLOC_OBJ_PACKAGE);
757 else if (getenv("CmiOneWthPerCore"))
759 n = 0;
760 bind_threads_only(HWLOC_OBJ_CORE);
762 else if (getenv("CmiOneWthPerPU"))
764 n = 0;
765 bind_threads_only(HWLOC_OBJ_PU);
767 #endif
770 return n != -1;
773 CMI_EXTERNC
774 void CmiInitCPUAffinity(char **argv)
776 static skt_ip_t myip;
777 int ret, i, exclude;
778 hostnameMsg *msg;
779 char *pemap = NULL;
780 char *commap = NULL;
781 char *pemapfile = NULL;
783 int show_affinity_flag;
785 int affinity_flag = CmiGetArgFlagDesc(argv,"+setcpuaffinity",
786 "set cpu affinity");
788 while (CmiGetArgIntDesc(argv,"+excludecore", &exclude, "avoid core when setting cpuaffinity")) {
789 if (CmiMyRank() == 0) add_exclude(exclude);
790 affinity_flag = 1;
793 if (CmiGetArgStringDesc(argv, "+pemapfile", &pemapfile, "define pe to core mapping file")) {
794 FILE *fp;
795 char buf[128];
796 pemap = (char*)malloc(1024);
797 fp = fopen(pemapfile, "r");
798 if (fp == NULL) CmiAbort("pemapfile does not exist");
799 while (!feof(fp)) {
800 if (fgets(buf, 128, fp)) {
801 if (buf[strlen(buf)-1] == '\n') buf[strlen(buf)-1] = 0;
802 strcat(pemap, buf);
805 fclose(fp);
806 if (CmiMyPe()==0) CmiPrintf("Charm++> read from pemap file '%s': %s\n", pemapfile, pemap);
809 CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping");
810 if (pemap!=NULL && excludecount>0)
811 CmiAbort("Charm++> +pemap can not be used with +excludecore.\n");
813 CmiGetArgStringDesc(argv, "+commap", &commap, "define comm threads to core mapping");
815 if (pemap!=NULL || commap!=NULL) affinity_flag = 1;
817 show_affinity_flag = CmiGetArgFlagDesc(argv,"+showcpuaffinity", "print cpu affinity");
819 CmiAssignOnce(&cpuAffinityHandlerIdx, CmiRegisterHandler((CmiHandler)cpuAffinityHandler));
820 CmiAssignOnce(&cpuAffinityRecvHandlerIdx, CmiRegisterHandler((CmiHandler)cpuAffinityRecvHandler));
821 CmiAssignOnce(&cpuPhyNodeAffinityRecvHandlerIdx, CmiRegisterHandler((CmiHandler)cpuPhyNodeAffinityRecvHandler));
823 /* new style */
825 int done = 0;
826 CmiNodeAllBarrier();
828 /* must bind the rank 0 which is the main thread first */
829 /* binding the main thread seems to change binding for all threads */
830 if (CmiMyRank() == 0) {
831 done = set_default_affinity();
834 CmiNodeAllBarrier();
836 if (CmiMyRank() != 0) {
837 done = set_default_affinity();
840 if (done) {
841 if (show_affinity_flag) CmiPrintCPUAffinity();
842 return;
846 if (CmiMyRank() ==0) {
847 affLock = CmiCreateLock();
848 #ifndef _WIN32
849 aff_is_set = affinity_flag;
850 CPU_ZERO(&core_usage);
851 #endif
854 #if CMK_BLUEGENEQ
855 if(affinity_flag){
856 affinity_flag = 0;
857 if(CmiMyPe()==0) CmiPrintf("Charm++> cpu affinity setting is not needed on Blue Gene/Q, thus ignored.\n");
859 if(show_affinity_flag){
860 show_affinity_flag = 0;
861 if(CmiMyPe()==0) CmiPrintf("Charm++> printing cpu affinity is not supported on Blue Gene/Q.\n");
863 #endif
865 if (!affinity_flag) {
866 if (show_affinity_flag) {
867 CmiPrintCPUAffinity();
868 CmiPrintf("Charm++> cpu affinity NOT enabled.\n");
870 return;
873 if (CmiMyPe() == 0) {
874 CmiPrintf("Charm++> cpu affinity enabled. \n");
875 if (excludecount > 0) {
876 CmiPrintf("Charm++> cpuaffinity excludes core: %d", excludecore[0]);
877 for (i=1; i<excludecount; i++) CmiPrintf(" %d", excludecore[i]);
878 CmiPrintf(".\n");
880 if (pemap!=NULL)
881 CmiPrintf("Charm++> cpuaffinity PE-core map : %s\n", pemap);
884 if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */
885 /* comm thread either can float around, or pin down to the last rank.
886 however it seems to be reportedly slower if it is floating */
887 CmiNodeAllBarrier();
888 if (commap != NULL) {
889 int mycore = search_pemap(commap, CmiMyPeGlobal()-CmiNumPesGlobal());
890 if (CmiPhysicalNodeID(CmiMyPe()) == 0) CmiPrintf("Charm++> set comm %d on node %d to core #%d\n", CmiMyPe()-CmiNumPes(), CmiMyNode(), mycore);
891 if (-1 == CmiSetCPUAffinity(mycore))
892 CmiAbort("set_cpu_affinity abort!");
893 CmiNodeAllBarrier();
894 if (show_affinity_flag) CmiPrintCPUAffinity();
895 return; /* comm thread return */
897 else {
898 /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */
899 #if !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ && !CMK_PAMI_LINUX_PPC8
900 if (pemap == NULL) {
901 #if CMK_MACHINE_PROGRESS_DEFINED
902 while (affinity_doneflag < CmiMyNodeSize()) CmiNetworkProgress();
903 #else
904 #if CMK_SMP
905 #error "Machine progress call needs to be implemented for cpu affinity!"
906 #endif
907 #endif
909 #endif
910 #if CMK_CRAYXE || CMK_CRAYXC
911 /* if both pemap and commmap are NULL, will compute one */
912 if (pemap != NULL)
913 #endif
915 CmiNodeAllBarrier();
916 if (show_affinity_flag) CmiPrintCPUAffinity();
917 return; /* comm thread return */
922 if (pemap != NULL && CmiMyPe()<CmiNumPes()) { /* work thread */
923 int mycore = search_pemap(pemap, CmiMyPeGlobal());
924 if(show_affinity_flag) CmiPrintf("Charm++> set PE %d on node %d to core #%d\n", CmiMyPe(), CmiMyNode(), mycore);
925 if (mycore >= CmiNumCores()) {
926 CmiPrintf("Error> Invalid core number %d, only have %d cores (0-%d) on the node. \n", mycore, CmiNumCores(), CmiNumCores()-1);
927 CmiAbort("Invalid core number");
929 if (CmiSetCPUAffinity(mycore) == -1) CmiAbort("set_cpu_affinity abort!");
930 CmiNodeAllBarrier();
931 CmiNodeAllBarrier();
932 /* if (show_affinity_flag) CmiPrintCPUAffinity(); */
933 return;
936 #if CMK_CRAYXE || CMK_CRAYXC
938 int numCores = CmiNumCores();
940 int myid = getXTNodeID(CmiMyNodeGlobal(), CmiNumNodesGlobal());
941 int myrank;
942 int pe, mype = CmiMyPeGlobal();
943 int node = CmiMyNodeGlobal();
944 int nnodes = 0;
945 #if CMK_SMP
946 if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */
947 int node = CmiMyPe() - CmiNumPes();
948 mype = CmiGetPeGlobal(CmiNodeFirst(node) + CmiMyNodeSize() - 1, CmiMyPartition()); /* last pe on SMP node */
949 node = CmiGetNodeGlobal(node, CmiMyPartition());
951 #endif
952 pe = mype - 1;
953 while (pe >= 0) {
954 int n = CmiNodeOf(pe);
955 if (n != node) { nnodes++; node = n; }
956 if (getXTNodeID(n, CmiNumNodesGlobal()) != myid) break;
957 pe --;
959 CmiAssert(numCores > 0);
960 myrank = (mype - pe - 1 + nnodes)%numCores;
961 #if CMK_SMP
962 if (CmiMyPe() >= CmiNumPes())
963 myrank = (myrank + 1)%numCores;
964 #endif
966 if (-1 != CmiSetCPUAffinity(myrank)) {
967 DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank, mynode));
969 else{
970 CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
971 CmiAbort("set cpu affinity abort!\n");
974 if (CmiMyPe() < CmiNumPes())
975 CmiNodeAllBarrier();
976 CmiNodeAllBarrier();
977 #else
978 /* get my ip address */
979 if (CmiMyRank() == 0)
981 #if CMK_HAS_GETHOSTNAME
982 myip = skt_my_ip(); /* not thread safe, so only calls on rank 0 */
983 #else
984 CmiAbort("Can not get unique name for the compute nodes. \n");
985 #endif
987 CmiNodeAllBarrier();
989 /* prepare a msg to send */
990 msg = (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg));
991 CmiSetHandler((char *)msg, cpuAffinityHandlerIdx);
992 msg->pe = CmiMyPe();
993 msg->ip = myip;
994 msg->ncores = CmiNumCores();
995 DEBUGP(("PE %d's node has %d number of cores. \n", CmiMyPe(), msg->ncores));
996 msg->rank = 0;
997 CmiSyncSendAndFree(0, sizeof(hostnameMsg), (void *)msg);
999 if (CmiMyPe() == 0) {
1000 int i;
1001 hostTable = CmmNew();
1002 rankmsg = (rankMsg *)CmiAlloc(sizeof(rankMsg)+CmiNumPes()*sizeof(int)*2);
1003 CmiSetHandler((char *)rankmsg, cpuAffinityRecvHandlerIdx);
1004 rankmsg->ranks = (int *)((char*)rankmsg + sizeof(rankMsg));
1005 rankmsg->nodes = (int *)((char*)rankmsg + sizeof(rankMsg) + CmiNumPes()*sizeof(int));
1006 for (i=0; i<CmiNumPes(); i++) {
1007 rankmsg->ranks[i] = 0;
1008 rankmsg->nodes[i] = -1;
1011 for (i=0; i<CmiNumPes(); i++) CmiDeliverSpecificMsg(cpuAffinityHandlerIdx);
1014 /* receive broadcast from PE 0 */
1015 CmiDeliverSpecificMsg(cpuAffinityRecvHandlerIdx);
1016 CmiLock(affLock);
1017 affinity_doneflag++;
1018 CmiUnlock(affLock);
1019 CmiNodeAllBarrier();
1020 #endif
1022 if (show_affinity_flag) CmiPrintCPUAffinity();
1025 /* called in ConverseCommonInit to initialize basic variables */
/*
 * Initializes the per-PE variable myCPUAffToCore to -1 and, on Linux,
 * opens the stat file of this process (or of this thread in SMP builds)
 * and stores the FILE * in myProcStatFP; CmiOnCore reads from that handle.
 */
1026 void CmiInitCPUAffinityUtil(void){
1027 char fname[64];
1028 CpvInitialize(int, myCPUAffToCore);
1029 CpvAccess(myCPUAffToCore) = -1;
1030 #if CMK_OS_IS_LINUX
1031 CpvInitialize(void *, myProcStatFP);
/* the path construction and fopen are done while holding _smp_mutex */
1032 CmiLock(_smp_mutex);
1033 #if CMK_SMP
/* SMP build: per-thread stat file, addressed by the kernel thread id */
1034 sprintf(fname, "/proc/%d/task/%ld/stat", getpid(), syscall(SYS_gettid));
1035 #else
1036 sprintf(fname, "/proc/%d/stat", getpid());
1037 #endif
/* the result of fopen is stored as is; it is NULL if the file could not be opened */
1038 CpvAccess(myProcStatFP) = (void *)fopen(fname, "r");
1039 CmiUnlock(_smp_mutex);
/* NOTE(review): the lines around the warning below are missing from this view, so it is unclear whether this check is live or commented out - confirm against the full file. */
1041 if(CmiMyPe()==0 && CpvAccess(myProcStatFP) == NULL){
1042 CmiPrintf("WARNING: ERROR IN OPENING FILE %s on PROC %d, CmiOnCore() SHOULDN'T BE CALLED\n", fname, CmiMyPe());
1045 #endif
1048 #else /* not supporting affinity */
/* Affinity is not supported in this build: nothing is bound, always returns -1. */
int CmiSetCPUAffinity(int mycore)
{
  (void)mycore;   /* unused */
  return -1;
}
/* Affinity is not supported in this build: prints a warning and returns -1. */
int CmiPrintCPUAffinity(void)
{
  const int unsupported = -1;
  CmiPrintf("Warning: CmiPrintCPUAffinity not supported.\n");
  return unsupported;
}
1061 CMI_EXTERNC
1062 void CmiCheckAffinity(void) {
1065 CMI_EXTERNC
1066 void CmiInitCPUAffinity(char **argv)
1068 char *pemap = NULL;
1069 char *pemapfile = NULL;
1070 char *commap = NULL;
1071 int excludecore = -1;
1072 int affinity_flag = CmiGetArgFlagDesc(argv,"+setcpuaffinity",
1073 "set cpu affinity");
1074 while (CmiGetArgIntDesc(argv,"+excludecore",&excludecore, "avoid core when setting cpuaffinity"));
1075 CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping");
1076 CmiGetArgStringDesc(argv, "+pemapfile", &pemapfile, "define pe to core mapping file");
1077 CmiGetArgStringDesc(argv, "+commap", &commap, "define comm threads to core mapping");
1078 CmiGetArgFlagDesc(argv,"+showcpuaffinity", "print cpu affinity");
1079 if (affinity_flag && CmiMyPe()==0)
1080 CmiPrintf("sched_setaffinity() is not supported, +setcpuaffinity disabled.\n");
1081 if (excludecore != -1 && CmiMyPe()==0)
1082 CmiPrintf("sched_setaffinity() is not supported, +excludecore disabled.\n");
1083 if (pemap && CmiMyPe()==0)
1084 CmiPrintf("sched_setaffinity() is not supported, +pemap disabled.\n");
1085 if (pemapfile && CmiMyPe()==0)
1086 CmiPrintf("sched_setaffinity() is not supported, +pemapfile disabled.\n");
1087 if (commap && CmiMyPe()==0)
1088 CmiPrintf("sched_setaffinity() is not supported, +commap disabled.\n");
1091 /* called in ConverseCommonInit to initialize basic variables */
1092 void CmiInitCPUAffinityUtil(void){
1093 CpvInitialize(int, myCPUAffToCore);
1094 CpvAccess(myCPUAffToCore) = -1;
1095 #if CMK_OS_IS_LINUX
1096 CpvInitialize(void *, myProcStatFP);
1097 CpvAccess(myProcStatFP) = NULL;
1098 #endif
/* Affinity is not supported in this build: prints a warning on stdout and returns -1. */
int CmiOnCore(void)
{
  fputs("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n", stdout);
  return -1;
}
1105 #endif