3 This scheme relies on using IP address to identify nodes and assigning
6 when CMK_NO_SOCKETS is defined, which is typically the case on Cray XT3 and Blue Gene/L.
7 There is no hostname for the compute nodes.
9 * last updated 3/20/2010 Gengbin Zheng
10 * new options +pemap and +commap take a complex pattern describing a list of cores
16 #include "sockRoutines.h"
20 #include <infiniband/verbs.h>
21 #include <hwloc/openfabrics-verbs.h>
24 #define DEBUGP(x) /* CmiPrintf x; */
25 CpvDeclare(int, myCPUAffToCore
);
28 * /proc/<PID>/[task/<TID>]/stat file descriptor
29 * Used to retrieve information about which physical
30 * core this process or thread is running on.
32 CpvDeclare(void *, myProcStatFP
);
35 CmiHwlocTopology CmiHwlocTopologyLocal
;
37 void CmiInitHwlocTopology(void)
39 hwloc_topology_t topology
;
42 /* Allocate and initialize topology object. */
43 cmi_hwloc_topology_init(&topology
);
44 /* Perform the topology detection. */
45 cmi_hwloc_topology_load(topology
);
47 // packages == sockets
48 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_PACKAGE
);
49 CmiHwlocTopologyLocal
.num_sockets
= depth
!= HWLOC_TYPE_DEPTH_UNKNOWN
? cmi_hwloc_get_nbobjs_by_depth(topology
, depth
) : 1;
51 // ignore BG/Q's reserved socket
52 if (CmiHwlocTopologyLocal
.num_sockets
== 17)
53 CmiHwlocTopologyLocal
.num_sockets
= 16;
57 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_CORE
);
58 CmiHwlocTopologyLocal
.num_cores
= depth
!= HWLOC_TYPE_DEPTH_UNKNOWN
? cmi_hwloc_get_nbobjs_by_depth(topology
, depth
) : 1;
61 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_PU
);
62 CmiHwlocTopologyLocal
.num_pus
= depth
!= HWLOC_TYPE_DEPTH_UNKNOWN
? cmi_hwloc_get_nbobjs_by_depth(topology
, depth
) : 1;
64 cmi_hwloc_topology_destroy(topology
);
67 #if CMK_HAS_SETAFFINITY || defined (_WIN32) || CMK_HAS_BINDPROCESSOR
80 //long sched_setaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr);
81 //long sched_getaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr);
85 #include <sys/syscall.h>
88 #if defined(__APPLE__)
89 #include <Carbon/Carbon.h> /* Carbon APIs for Multiprocessing */
92 #define MAX_EXCLUDE 64
93 static int excludecore
[MAX_EXCLUDE
] = {-1};
94 static int excludecount
= 0;
96 static int affinity_doneflag
= 0;
99 static int affMsgsRecvd
= 1; // number of affinity messages received at PE0
100 static cpu_set_t core_usage
; // used to record union of CPUs used by every PE in physical node
101 static int aff_is_set
= 0;
104 static int in_exclude(int core
)
107 for (i
=0; i
<excludecount
; i
++) if (core
== excludecore
[i
]) return 1;
111 static void add_exclude(int core
)
113 if (in_exclude(core
)) return;
114 CmiAssert(excludecount
< MAX_EXCLUDE
);
115 excludecore
[excludecount
++] = core
;
118 #if CMK_HAS_BINDPROCESSOR
119 #include <sys/processor.h>
122 static int set_process_affinity(hwloc_topology_t topology
, hwloc_cpuset_t cpuset
)
125 HANDLE process
= GetCurrentProcess();
127 pid_t process
= getpid();
130 if (cmi_hwloc_set_proc_cpubind(topology
, process
, cpuset
, HWLOC_CPUBIND_PROCESS
|HWLOC_CPUBIND_STRICT
))
134 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
135 CmiPrintf("HWLOC> Couldn't bind to cpuset %s: %s\n", str
, strerror(error
));
141 if (CmiPhysicalNodeID(CmiMyPe()) == 0)
144 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
145 CmiPrintf("HWLOC> [%d] Process %p bound to cpuset: %s\n", CmiMyPe(), process
, str
);
154 static int set_thread_affinity(hwloc_topology_t topology
, hwloc_cpuset_t cpuset
)
157 HANDLE thread
= GetCurrentThread();
159 pthread_t thread
= pthread_self();
162 if (cmi_hwloc_set_thread_cpubind(topology
, thread
, cpuset
, HWLOC_CPUBIND_THREAD
|HWLOC_CPUBIND_STRICT
))
166 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
167 CmiPrintf("HWLOC> Couldn't bind to cpuset %s: %s\n", str
, strerror(error
));
173 if (CmiPhysicalNodeID(CmiMyPe()) == 0)
176 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
177 CmiPrintf("HWLOC> [%d] Thread %p bound to cpuset: %s\n", CmiMyPe(), thread
, str
);
187 int CmiSetCPUAffinity(int mycore
)
191 core
= CmiNumCores() + core
;
194 CmiError("Error: Invalid cpu affinity core number: %d\n", mycore
);
195 CmiAbort("CmiSetCPUAffinity failed");
198 CpvAccess(myCPUAffToCore
) = core
;
200 hwloc_topology_t topology
;
202 cmi_hwloc_topology_init(&topology
);
203 cmi_hwloc_topology_load(topology
);
205 hwloc_cpuset_t cpuset
= cmi_hwloc_bitmap_alloc();
206 cmi_hwloc_bitmap_set(cpuset
, core
);
209 set_thread_affinity(topology
, cpuset
);
211 set_process_affinity(topology
, cpuset
);
214 cmi_hwloc_bitmap_free(cpuset
);
216 cmi_hwloc_topology_destroy(topology
);
220 /* This implementation assumes the default x86 CPU mask size used by Linux */
221 /* For a large SMP machine, this code should be changed to use a variable sized */
222 /* CPU affinity mask buffer instead, as the present code will fail beyond 32 CPUs */
223 int print_cpu_affinity(void) {
224 hwloc_topology_t topology
;
225 // Allocate and initialize topology object.
226 cmi_hwloc_topology_init(&topology
);
227 // Perform the topology detection.
228 cmi_hwloc_topology_load(topology
);
230 hwloc_cpuset_t cpuset
= cmi_hwloc_bitmap_alloc();
231 // Query the current CPU binding into cpuset.
232 if (cmi_hwloc_get_cpubind(topology
, cpuset
, 0)) {
234 CmiPrintf("[%d] CPU affinity mask is unknown %s\n", CmiMyPe(), strerror(error
));
235 cmi_hwloc_bitmap_free(cpuset
);
236 cmi_hwloc_topology_destroy(topology
);
241 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
242 CmiPrintf("[%d] CPU affinity mask is %s\n", CmiMyPe(), str
);
244 cmi_hwloc_bitmap_free(cpuset
);
245 cmi_hwloc_topology_destroy(topology
);
250 int print_thread_affinity(void) {
251 hwloc_topology_t topology
;
252 // Allocate and initialize topology object.
253 cmi_hwloc_topology_init(&topology
);
254 // Perform the topology detection.
255 cmi_hwloc_topology_load(topology
);
258 HANDLE thread
= GetCurrentThread();
260 pthread_t thread
= pthread_self();
263 hwloc_cpuset_t cpuset
= cmi_hwloc_bitmap_alloc();
264 // Query the calling thread's current CPU binding into cpuset.
265 // if (cmi_hwloc_get_thread_cpubind(topology, thread, cpuset, HWLOC_CPUBIND_THREAD)) {
266 if (cmi_hwloc_get_cpubind(topology
, cpuset
, HWLOC_CPUBIND_THREAD
) == -1) {
268 CmiPrintf("[%d] thread CPU affinity mask is unknown %s\n", CmiMyPe(), strerror(error
));
269 cmi_hwloc_bitmap_free(cpuset
);
270 cmi_hwloc_topology_destroy(topology
);
275 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
276 CmiPrintf("[%d] thread CPU affinity mask is %s\n", CmiMyPe(), str
);
278 cmi_hwloc_bitmap_free(cpuset
);
279 cmi_hwloc_topology_destroy(topology
);
285 int CmiPrintCPUAffinity(void)
288 return print_thread_affinity();
290 return print_cpu_affinity();
295 int get_cpu_affinity(cpu_set_t
*cpuset
) {
297 if (sched_getaffinity(0, sizeof(cpuset
), cpuset
) < 0) {
298 perror("sched_getaffinity");
305 int get_thread_affinity(cpu_set_t
*cpuset
) {
306 #if CMK_HAS_PTHREAD_SETAFFINITY
308 if (errno
= pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t
), cpuset
)) {
309 perror("pthread_getaffinity");
319 int get_affinity(cpu_set_t
*cpuset
) {
321 return get_thread_affinity(cpuset
);
323 return get_cpu_affinity(cpuset
);
328 int CmiOnCore(void) {
331 * The info (task_cpu) is read from the Linux /proc virtual file system.
332 * The /proc/<PID>/[task/<TID>]/stat file is documented in the Linux
333 * kernel documentation. An online copy can be found at:
334 * http://www.mjmwired.net/kernel/Documentation/filesystems/proc.txt
335 * Based on the documentation, task_cpu is found at the 39th field in
338 #define TASK_CPU_POS (39)
341 FILE *fp
= (FILE *)CpvAccess(myProcStatFP
);
343 printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");
346 fseek(fp
, 0, SEEK_SET
);
347 for (n
=0; n
<TASK_CPU_POS
; n
++) {
348 if (fscanf(fp
, "%127s", str
) != 1) {
349 CmiAbort("CPU affinity> reading from /proc/<PID>/[task/<TID>]/stat failed!");
354 printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");
360 static int cpuAffinityHandlerIdx
;
361 static int cpuAffinityRecvHandlerIdx
;
362 static int cpuPhyNodeAffinityRecvHandlerIdx
;
364 typedef struct _hostnameMsg
{
365 char core
[CmiMsgHeaderSizeBytes
];
373 typedef struct _rankMsg
{
374 char core
[CmiMsgHeaderSizeBytes
];
375 int *ranks
; /* PE => core rank mapping */
376 int *nodes
; /* PE => node number mapping */
379 typedef struct _affMsg
{
380 char core
[CmiMsgHeaderSizeBytes
];
386 static rankMsg
*rankmsg
= NULL
;
387 static CmmTable hostTable
;
388 static CmiNodeLock affLock
= 0;
391 static void cpuAffinityHandler(void *m
)
393 static int count
= 0;
394 static int nodecount
= 0;
396 hostnameMsg
*msg
= (hostnameMsg
*)m
;
398 int tag
, tag1
, pe
, myrank
;
399 int npes
= CmiNumPes();
403 skt_print_ip(str, msg->ip);
404 printf("hostname: %d %s\n", msg->pe, str);
406 CmiAssert(CmiMyPe()==0 && rankmsg
!= NULL
);
407 tag
= *(int*)&msg
->ip
;
409 if ((rec
= (hostnameMsg
*)CmmProbe(hostTable
, 1, &tag
, &tag1
)) != NULL
) {
414 rec
->seq
= nodecount
;
415 nodecount
++; /* a new node record */
416 CmmPut(hostTable
, 1, &tag
, msg
);
418 myrank
= rec
->rank
%rec
->ncores
;
419 while (in_exclude(myrank
)) { /* skip excluded core */
420 myrank
= (myrank
+1)%rec
->ncores
;
423 rankmsg
->ranks
[pe
] = myrank
; /* core rank */
424 rankmsg
->nodes
[pe
] = rec
->seq
; /* on which node */
427 if (count
== CmiNumPes()) {
428 DEBUGP(("Cpuaffinity> %d unique compute nodes detected! \n", CmmEntries(hostTable
)));
430 while ((tmpm
= CmmGet(hostTable
, 1, &tag
, &tag1
))) CmiFree(tmpm
);
433 /* bubble sort ranks on each node according to the PE number */
436 for (i
=0; i
<npes
-1; i
++)
437 for(j
=i
+1; j
<npes
; j
++) {
438 if (rankmsg
->nodes
[i
] == rankmsg
->nodes
[j
] &&
439 rankmsg
->ranks
[i
] > rankmsg
->ranks
[j
])
441 int tmp
= rankmsg
->ranks
[i
];
442 rankmsg
->ranks
[i
] = rankmsg
->ranks
[j
];
443 rankmsg
->ranks
[j
] = tmp
;
448 CmiSyncBroadcastAllAndFree(sizeof(rankMsg
)+CmiNumPes()*sizeof(int)*2, (void *)rankmsg
);
452 /* called on each processor */
453 static void cpuAffinityRecvHandler(void *msg
)
456 rankMsg
*m
= (rankMsg
*)msg
;
457 m
->ranks
= (int *)((char*)m
+ sizeof(rankMsg
));
458 m
->nodes
= (int *)((char*)m
+ sizeof(rankMsg
) + CmiNumPes()*sizeof(int));
459 myrank
= m
->ranks
[CmiMyPe()];
460 mynode
= m
->nodes
[CmiMyPe()];
462 DEBUGP(("[%d %d] set to core #: %d\n", CmiMyNode(), CmiMyPe(), myrank
));
464 if (-1 != CmiSetCPUAffinity(myrank
)) {
465 DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank
, mynode
));
468 CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
469 CmiAbort("set cpu affinity abort!\n");
474 /* Called on the first PE of a physical node; receives the affinity sets sent by the other PEs on that node. */
475 static void cpuPhyNodeAffinityRecvHandler(void *msg
)
477 affMsg
*m
= (affMsg
*)msg
;
478 #if !defined(_WIN32) && defined(CPU_OR)
479 CPU_OR(&core_usage
, &core_usage
, &m
->affinity
);
486 /* strtok is thread safe in VC++ */
487 #define strtok_r(x,y,z) strtok(x,y)
490 static int search_pemap(char *pecoremap
, int pe
)
492 int *map
= (int *)malloc(CmiNumPesGlobal()*sizeof(int));
494 int h
, i
, j
, k
, count
;
498 char *mapstr
= (char*)malloc(strlen(pecoremap
)+1);
499 strcpy(mapstr
, pecoremap
);
501 str
= strtok_r(mapstr
, ",", &ptr
);
503 while (str
&& count
< CmiNumPesGlobal())
505 int hasdash
=0, hascolon
=0, hasdot
=0, hasstar1
=0, hasstar2
=0, numplus
=0;
506 int start
, end
, stride
=1, block
=1;
509 for (i
=0; i
<strlen(str
); i
++) {
510 if (str
[i
] == '-' && i
!=0) hasdash
=1;
511 else if (str
[i
] == ':') hascolon
=1;
512 else if (str
[i
] == '.') hasdot
=1;
513 else if (str
[i
] == 'x') hasstar1
=1;
514 else if (str
[i
] == 'X') hasstar2
=1;
515 else if (str
[i
] == '+') {
516 if (str
[i
+1] == '+' || str
[i
+1] == '-') {
517 printf("Warning: Check the format of \"%s\".\n", str
);
518 } else if (sscanf(&str
[i
], "+%d", &plusarr
[++numplus
]) != 1) {
519 printf("Warning: Check the format of \"%s\".\n", str
);
524 if (hasstar1
|| hasstar2
) {
525 if (hasstar1
) sscanf(str
, "%dx", &iter
);
526 if (hasstar2
) sscanf(str
, "%dX", &iter
);
527 while (*str
!='x' && *str
!='X') str
++;
533 if (sscanf(str
, "%d-%d:%d.%d", &start
, &end
, &stride
, &block
) != 4)
534 printf("Warning: Check the format of \"%s\".\n", str
);
537 if (sscanf(str
, "%d-%d:%d", &start
, &end
, &stride
) != 3)
538 printf("Warning: Check the format of \"%s\".\n", str
);
542 if (sscanf(str
, "%d-%d", &start
, &end
) != 2)
543 printf("Warning: Check the format of \"%s\".\n", str
);
547 sscanf(str
, "%d", &start
);
550 if (block
> stride
) {
551 printf("Warning: invalid block size in \"%s\" ignored.\n", str
);
554 //if (CmiMyPe() == 0) printf("iter: %d start: %d end: %d stride: %d, block: %d. plus %d \n", iter, start, end, stride, block, numplus);
555 for (k
= 0; k
<iter
; k
++) {
556 for (i
= start
; i
<=end
; i
+=stride
) {
557 for (j
=0; j
<block
; j
++) {
559 for (h
=0; h
<=numplus
; h
++) {
560 map
[count
++] = i
+j
+plusarr
[h
];
561 if (count
== CmiNumPesGlobal()) break;
563 if (count
== CmiNumPesGlobal()) break;
565 if (count
== CmiNumPesGlobal()) break;
567 if (count
== CmiNumPesGlobal()) break;
569 str
= strtok_r(NULL
, ",", &ptr
);
578 #if CMK_CRAYXE || CMK_CRAYXC
580 int getXTNodeID(int mpirank
, int nummpiranks
);
584 * Check that there are not multiple PEs assigned to the same core.
585 * If a pemap has been computed by this module (or passed by the user), this
586 * function will print a warning if oversubscription is detected. If no affinity
587 * has been set explicitly by this module, it will print an error and abort if
588 * oversubscription is detected.
591 void CmiCheckAffinity(void)
593 #if !defined(_WIN32) && CMK_SMP && CMK_HAS_PTHREAD_SETAFFINITY && defined(CPU_OR)
595 if (!CmiCpuTopologyEnabled()) return; // only works if cpu topology enabled
597 if (CmiMyPe() == 0) {
598 // wait for every PE affinity from my physical node (for now only done on phy node 0)
601 if (get_affinity(&my_aff
) == -1) CmiAbort("get_affinity failed\n");
602 CPU_OR(&core_usage
, &core_usage
, &my_aff
); // add my affinity (pe0)
603 int N
= CmiNumPesOnPhysicalNode(0);
604 while (affMsgsRecvd
< N
)
605 CmiDeliverSpecificMsg(cpuPhyNodeAffinityRecvHandlerIdx
);
607 // NOTE this test is simple and may not detect every possible case of
609 if (CPU_COUNT(&core_usage
) < N
) {
610 // TODO suggest command line arguments?
612 CmiAbort("Multiple PEs assigned to same core. Set affinity "
613 "options to correct or lower the number of threads, or pass +setcpuaffinity to ignore.\n");
615 CmiPrintf("WARNING: Multiple PEs assigned to same core, recommend "
616 "adjusting processor affinity or passing +CmiSleepOnIdle to reduce "
620 } else if ((CmiMyPe() < CmiNumPes()) && (CmiPhysicalNodeID(CmiMyPe()) == 0)) {
621 // send my affinity to first PE on physical node (only done on phy node 0 for now)
622 affMsg
*m
= (affMsg
*)CmiAlloc(sizeof(affMsg
));
623 CmiSetHandler((char *)m
, cpuPhyNodeAffinityRecvHandlerIdx
);
624 if (get_affinity(&m
->affinity
) == -1) { // put my affinity in msg
626 CmiAbort("get_affinity failed\n");
628 CmiSyncSendAndFree(0, sizeof(affMsg
), (void *)m
);
633 CMI_EXTERNC_VARIABLE
int CmiMyLocalRank
;
635 static void bind_process_only(hwloc_obj_type_t process_unit
)
637 hwloc_topology_t topology
;
638 hwloc_cpuset_t cpuset
;
639 cmi_hwloc_topology_init(&topology
);
640 cmi_hwloc_topology_load(topology
);
643 int process_unitcount
= cmi_hwloc_get_nbobjs_by_type(topology
, process_unit
);
645 // ignore BG/Q's reserved socket
646 if (process_unit
== HWLOC_OBJ_PACKAGE
&& process_unitcount
== 17)
647 process_unitcount
= 16;
650 int process_assignment
= CmiMyLocalRank
% process_unitcount
;
652 hwloc_obj_t process_obj
= cmi_hwloc_get_obj_by_type(topology
, process_unit
, process_assignment
);
653 set_process_affinity(topology
, process_obj
->cpuset
);
656 cmi_hwloc_topology_destroy(topology
);
660 static void bind_threads_only(hwloc_obj_type_t thread_unit
)
662 hwloc_topology_t topology
;
663 hwloc_cpuset_t cpuset
;
664 cmi_hwloc_topology_init(&topology
);
665 cmi_hwloc_topology_load(topology
);
668 int thread_unitcount
= cmi_hwloc_get_nbobjs_by_type(topology
, thread_unit
);
670 // ignore BG/Q's reserved socket
671 if (thread_unit
== HWLOC_OBJ_PACKAGE
&& thread_unitcount
== 17)
672 thread_unitcount
= 16;
675 int thread_assignment
= CmiMyRank() % thread_unitcount
;
677 hwloc_obj_t thread_obj
= cmi_hwloc_get_obj_by_type(topology
, thread_unit
, thread_assignment
);
678 hwloc_cpuset_t thread_cpuset
= cmi_hwloc_bitmap_dup(thread_obj
->cpuset
);
679 cmi_hwloc_bitmap_singlify(thread_cpuset
);
680 set_thread_affinity(topology
, thread_cpuset
);
681 cmi_hwloc_bitmap_free(thread_cpuset
);
684 cmi_hwloc_topology_destroy(topology
);
687 static void bind_process_and_threads(hwloc_obj_type_t process_unit
, hwloc_obj_type_t thread_unit
)
689 hwloc_topology_t topology
;
690 hwloc_cpuset_t cpuset
;
691 cmi_hwloc_topology_init(&topology
);
692 cmi_hwloc_topology_load(topology
);
695 int process_unitcount
= cmi_hwloc_get_nbobjs_by_type(topology
, process_unit
);
697 int process_assignment
= CmiMyLocalRank
% process_unitcount
;
699 hwloc_obj_t process_obj
= cmi_hwloc_get_obj_by_type(topology
, process_unit
, process_assignment
);
700 set_process_affinity(topology
, process_obj
->cpuset
);
702 int thread_unitcount
= cmi_hwloc_get_nbobjs_inside_cpuset_by_type(topology
, process_obj
->cpuset
, thread_unit
);
704 int thread_assignment
= CmiMyRank() % thread_unitcount
;
706 hwloc_obj_t thread_obj
= cmi_hwloc_get_obj_inside_cpuset_by_type(topology
, process_obj
->cpuset
, thread_unit
, thread_assignment
);
707 hwloc_cpuset_t thread_cpuset
= cmi_hwloc_bitmap_dup(thread_obj
->cpuset
);
708 cmi_hwloc_bitmap_singlify(thread_cpuset
);
709 set_thread_affinity(topology
, thread_cpuset
);
710 cmi_hwloc_bitmap_free(thread_cpuset
);
713 cmi_hwloc_topology_destroy(topology
);
717 static int set_default_affinity(void)
722 if ((s
= getenv("CmiProcessPerSocket")))
726 if (getenv("CmiOneWthPerCore"))
727 bind_process_and_threads(HWLOC_OBJ_PACKAGE
, HWLOC_OBJ_CORE
);
728 else if (getenv("CmiOneWthPerPU"))
729 bind_process_and_threads(HWLOC_OBJ_PACKAGE
, HWLOC_OBJ_PU
);
732 bind_process_only(HWLOC_OBJ_PACKAGE
);
734 else if ((s
= getenv("CmiProcessPerCore")))
738 if (getenv("CmiOneWthPerPU"))
739 bind_process_and_threads(HWLOC_OBJ_CORE
, HWLOC_OBJ_PU
);
742 bind_process_only(HWLOC_OBJ_CORE
);
744 else if ((s
= getenv("CmiProcessPerPU")))
747 bind_process_only(HWLOC_OBJ_PU
);
749 else // if ((s = getenv("CmiProcessPerHost")))
752 if (getenv("CmiOneWthPerSocket"))
755 bind_threads_only(HWLOC_OBJ_PACKAGE
);
757 else if (getenv("CmiOneWthPerCore"))
760 bind_threads_only(HWLOC_OBJ_CORE
);
762 else if (getenv("CmiOneWthPerPU"))
765 bind_threads_only(HWLOC_OBJ_PU
);
774 void CmiInitCPUAffinity(char **argv
)
776 static skt_ip_t myip
;
781 char *pemapfile
= NULL
;
783 int show_affinity_flag
;
785 int affinity_flag
= CmiGetArgFlagDesc(argv
,"+setcpuaffinity",
788 while (CmiGetArgIntDesc(argv
,"+excludecore", &exclude
, "avoid core when setting cpuaffinity")) {
789 if (CmiMyRank() == 0) add_exclude(exclude
);
793 if (CmiGetArgStringDesc(argv
, "+pemapfile", &pemapfile
, "define pe to core mapping file")) {
796 pemap
= (char*)malloc(1024);
797 fp
= fopen(pemapfile
, "r");
798 if (fp
== NULL
) CmiAbort("pemapfile does not exist");
800 if (fgets(buf
, 128, fp
)) {
801 if (buf
[strlen(buf
)-1] == '\n') buf
[strlen(buf
)-1] = 0;
806 if (CmiMyPe()==0) CmiPrintf("Charm++> read from pemap file '%s': %s\n", pemapfile
, pemap
);
809 CmiGetArgStringDesc(argv
, "+pemap", &pemap
, "define pe to core mapping");
810 if (pemap
!=NULL
&& excludecount
>0)
811 CmiAbort("Charm++> +pemap can not be used with +excludecore.\n");
813 CmiGetArgStringDesc(argv
, "+commap", &commap
, "define comm threads to core mapping");
815 if (pemap
!=NULL
|| commap
!=NULL
) affinity_flag
= 1;
817 show_affinity_flag
= CmiGetArgFlagDesc(argv
,"+showcpuaffinity", "print cpu affinity");
819 CmiAssignOnce(&cpuAffinityHandlerIdx
, CmiRegisterHandler((CmiHandler
)cpuAffinityHandler
));
820 CmiAssignOnce(&cpuAffinityRecvHandlerIdx
, CmiRegisterHandler((CmiHandler
)cpuAffinityRecvHandler
));
821 CmiAssignOnce(&cpuPhyNodeAffinityRecvHandlerIdx
, CmiRegisterHandler((CmiHandler
)cpuPhyNodeAffinityRecvHandler
));
828 /* must bind the rank 0 which is the main thread first */
829 /* binding the main thread seems to change binding for all threads */
830 if (CmiMyRank() == 0) {
831 done
= set_default_affinity();
836 if (CmiMyRank() != 0) {
837 done
= set_default_affinity();
841 if (show_affinity_flag
) CmiPrintCPUAffinity();
846 if (CmiMyRank() ==0) {
847 affLock
= CmiCreateLock();
849 aff_is_set
= affinity_flag
;
850 CPU_ZERO(&core_usage
);
857 if(CmiMyPe()==0) CmiPrintf("Charm++> cpu affinity setting is not needed on Blue Gene/Q, thus ignored.\n");
859 if(show_affinity_flag
){
860 show_affinity_flag
= 0;
861 if(CmiMyPe()==0) CmiPrintf("Charm++> printing cpu affinity is not supported on Blue Gene/Q.\n");
865 if (!affinity_flag
) {
866 if (show_affinity_flag
) {
867 CmiPrintCPUAffinity();
868 CmiPrintf("Charm++> cpu affinity NOT enabled.\n");
873 if (CmiMyPe() == 0) {
874 CmiPrintf("Charm++> cpu affinity enabled. \n");
875 if (excludecount
> 0) {
876 CmiPrintf("Charm++> cpuaffinity excludes core: %d", excludecore
[0]);
877 for (i
=1; i
<excludecount
; i
++) CmiPrintf(" %d", excludecore
[i
]);
881 CmiPrintf("Charm++> cpuaffinity PE-core map : %s\n", pemap
);
884 if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */
885 /* The comm thread can either float or be pinned to the last rank;
886 however, it is reportedly slower when left floating. */
888 if (commap
!= NULL
) {
889 int mycore
= search_pemap(commap
, CmiMyPeGlobal()-CmiNumPesGlobal());
890 if (CmiPhysicalNodeID(CmiMyPe()) == 0) CmiPrintf("Charm++> set comm %d on node %d to core #%d\n", CmiMyPe()-CmiNumPes(), CmiMyNode(), mycore
);
891 if (-1 == CmiSetCPUAffinity(mycore
))
892 CmiAbort("set_cpu_affinity abort!");
894 if (show_affinity_flag
) CmiPrintCPUAffinity();
895 return; /* comm thread return */
898 /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */
899 #if !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ && !CMK_PAMI_LINUX_PPC8
901 #if CMK_MACHINE_PROGRESS_DEFINED
902 while (affinity_doneflag
< CmiMyNodeSize()) CmiNetworkProgress();
905 #error "Machine progress call needs to be implemented for cpu affinity!"
910 #if CMK_CRAYXE || CMK_CRAYXC
911 /* if both pemap and commap are NULL, a mapping will be computed */
916 if (show_affinity_flag
) CmiPrintCPUAffinity();
917 return; /* comm thread return */
922 if (pemap
!= NULL
&& CmiMyPe()<CmiNumPes()) { /* work thread */
923 int mycore
= search_pemap(pemap
, CmiMyPeGlobal());
924 if(show_affinity_flag
) CmiPrintf("Charm++> set PE %d on node %d to core #%d\n", CmiMyPe(), CmiMyNode(), mycore
);
925 if (mycore
>= CmiNumCores()) {
926 CmiPrintf("Error> Invalid core number %d, only have %d cores (0-%d) on the node. \n", mycore
, CmiNumCores(), CmiNumCores()-1);
927 CmiAbort("Invalid core number");
929 if (CmiSetCPUAffinity(mycore
) == -1) CmiAbort("set_cpu_affinity abort!");
932 /* if (show_affinity_flag) CmiPrintCPUAffinity(); */
936 #if CMK_CRAYXE || CMK_CRAYXC
938 int numCores
= CmiNumCores();
940 int myid
= getXTNodeID(CmiMyNodeGlobal(), CmiNumNodesGlobal());
942 int pe
, mype
= CmiMyPeGlobal();
943 int node
= CmiMyNodeGlobal();
946 if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */
947 int node
= CmiMyPe() - CmiNumPes();
948 mype
= CmiGetPeGlobal(CmiNodeFirst(node
) + CmiMyNodeSize() - 1, CmiMyPartition()); /* last pe on SMP node */
949 node
= CmiGetNodeGlobal(node
, CmiMyPartition());
954 int n
= CmiNodeOf(pe
);
955 if (n
!= node
) { nnodes
++; node
= n
; }
956 if (getXTNodeID(n
, CmiNumNodesGlobal()) != myid
) break;
959 CmiAssert(numCores
> 0);
960 myrank
= (mype
- pe
- 1 + nnodes
)%numCores
;
962 if (CmiMyPe() >= CmiNumPes())
963 myrank
= (myrank
+ 1)%numCores
;
966 if (-1 != CmiSetCPUAffinity(myrank
)) {
967 DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank
, mynode
));
970 CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
971 CmiAbort("set cpu affinity abort!\n");
974 if (CmiMyPe() < CmiNumPes())
978 /* get my ip address */
979 if (CmiMyRank() == 0)
981 #if CMK_HAS_GETHOSTNAME
982 myip
= skt_my_ip(); /* not thread safe, so only calls on rank 0 */
984 CmiAbort("Can not get unique name for the compute nodes. \n");
989 /* prepare a msg to send */
990 msg
= (hostnameMsg
*)CmiAlloc(sizeof(hostnameMsg
));
991 CmiSetHandler((char *)msg
, cpuAffinityHandlerIdx
);
994 msg
->ncores
= CmiNumCores();
995 DEBUGP(("PE %d's node has %d number of cores. \n", CmiMyPe(), msg
->ncores
));
997 CmiSyncSendAndFree(0, sizeof(hostnameMsg
), (void *)msg
);
999 if (CmiMyPe() == 0) {
1001 hostTable
= CmmNew();
1002 rankmsg
= (rankMsg
*)CmiAlloc(sizeof(rankMsg
)+CmiNumPes()*sizeof(int)*2);
1003 CmiSetHandler((char *)rankmsg
, cpuAffinityRecvHandlerIdx
);
1004 rankmsg
->ranks
= (int *)((char*)rankmsg
+ sizeof(rankMsg
));
1005 rankmsg
->nodes
= (int *)((char*)rankmsg
+ sizeof(rankMsg
) + CmiNumPes()*sizeof(int));
1006 for (i
=0; i
<CmiNumPes(); i
++) {
1007 rankmsg
->ranks
[i
] = 0;
1008 rankmsg
->nodes
[i
] = -1;
1011 for (i
=0; i
<CmiNumPes(); i
++) CmiDeliverSpecificMsg(cpuAffinityHandlerIdx
);
1014 /* receive broadcast from PE 0 */
1015 CmiDeliverSpecificMsg(cpuAffinityRecvHandlerIdx
);
1017 affinity_doneflag
++;
1019 CmiNodeAllBarrier();
1022 if (show_affinity_flag
) CmiPrintCPUAffinity();
1025 /* called in ConverseCommonInit to initialize basic variables */
1026 void CmiInitCPUAffinityUtil(void){
1028 CpvInitialize(int, myCPUAffToCore
);
1029 CpvAccess(myCPUAffToCore
) = -1;
1031 CpvInitialize(void *, myProcStatFP
);
1032 CmiLock(_smp_mutex
);
1034 sprintf(fname
, "/proc/%d/task/%ld/stat", getpid(), syscall(SYS_gettid
));
1036 sprintf(fname
, "/proc/%d/stat", getpid());
1038 CpvAccess(myProcStatFP
) = (void *)fopen(fname
, "r");
1039 CmiUnlock(_smp_mutex
);
1041 if(CmiMyPe()==0 && CpvAccess(myProcStatFP) == NULL){
1042 CmiPrintf("WARNING: ERROR IN OPENING FILE %s on PROC %d, CmiOnCore() SHOULDN'T BE CALLED\n", fname, CmiMyPe());
1048 #else /* not supporting affinity */
1050 int CmiSetCPUAffinity(int mycore
)
1055 int CmiPrintCPUAffinity(void)
1057 CmiPrintf("Warning: CmiPrintCPUAffinity not supported.\n");
1062 void CmiCheckAffinity(void) {
1066 void CmiInitCPUAffinity(char **argv
)
1069 char *pemapfile
= NULL
;
1070 char *commap
= NULL
;
1071 int excludecore
= -1;
1072 int affinity_flag
= CmiGetArgFlagDesc(argv
,"+setcpuaffinity",
1073 "set cpu affinity");
1074 while (CmiGetArgIntDesc(argv
,"+excludecore",&excludecore
, "avoid core when setting cpuaffinity"));
1075 CmiGetArgStringDesc(argv
, "+pemap", &pemap
, "define pe to core mapping");
1076 CmiGetArgStringDesc(argv
, "+pemapfile", &pemapfile
, "define pe to core mapping file");
1077 CmiGetArgStringDesc(argv
, "+commap", &commap
, "define comm threads to core mapping");
1078 CmiGetArgFlagDesc(argv
,"+showcpuaffinity", "print cpu affinity");
1079 if (affinity_flag
&& CmiMyPe()==0)
1080 CmiPrintf("sched_setaffinity() is not supported, +setcpuaffinity disabled.\n");
1081 if (excludecore
!= -1 && CmiMyPe()==0)
1082 CmiPrintf("sched_setaffinity() is not supported, +excludecore disabled.\n");
1083 if (pemap
&& CmiMyPe()==0)
1084 CmiPrintf("sched_setaffinity() is not supported, +pemap disabled.\n");
1085 if (pemapfile
&& CmiMyPe()==0)
1086 CmiPrintf("sched_setaffinity() is not supported, +pemapfile disabled.\n");
1087 if (commap
&& CmiMyPe()==0)
1088 CmiPrintf("sched_setaffinity() is not supported, +commap disabled.\n");
1091 /* called in ConverseCommonInit to initialize basic variables */
1092 void CmiInitCPUAffinityUtil(void){
1093 CpvInitialize(int, myCPUAffToCore
);
1094 CpvAccess(myCPUAffToCore
) = -1;
1096 CpvInitialize(void *, myProcStatFP
);
1097 CpvAccess(myProcStatFP
) = NULL
;
1101 int CmiOnCore(void){
1102 printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");