3 This scheme relies on using IP address to identify nodes and assigning
6 when CMK_NO_SOCKETS, which is typically on cray xt3 and bluegene/L.
7 There is no hostname for the compute nodes.
9 * last updated 3/20/2010 Gengbin Zheng
10 * new options +pemap +commmap takes complex pattern of a list of cores
17 #include "sockRoutines.h"
20 #include <infiniband/verbs.h>
21 #include <hwloc/openfabrics-verbs.h>
24 #define DEBUGP(x) /* CmiPrintf x; */
25 CpvDeclare(int, myCPUAffToCore
);
28 * /proc/<PID>/[task/<TID>]/stat file descriptor
29 * Used to retrieve the info about which physical
30 core this process or thread is on.
32 CpvDeclare(void *, myProcStatFP
);
35 #if CMK_HAS_SETAFFINITY || defined (_WIN32) || CMK_HAS_BINDPROCESSOR
48 //long sched_setaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr);
49 //long sched_getaffinity(pid_t pid, unsigned int len, unsigned long *user_mask_ptr);
53 #include <sys/syscall.h>
56 #if defined(__APPLE__)
57 #include <Carbon/Carbon.h> /* Carbon APIs for Multiprocessing */
60 #define MAX_EXCLUDE 64
61 static int excludecore
[MAX_EXCLUDE
] = {-1};
62 static int excludecount
= 0;
64 static int affinity_doneflag
= 0;
67 static int affMsgsRecvd
= 1; // number of affinity messages received at PE0
68 static cpu_set_t core_usage
; // used to record union of CPUs used by every PE in physical node
69 static int aff_is_set
= 0;
72 static int in_exclude(int core
)
75 for (i
=0; i
<excludecount
; i
++) if (core
== excludecore
[i
]) return 1;
79 static void add_exclude(int core
)
81 if (in_exclude(core
)) return;
82 CmiAssert(excludecount
< MAX_EXCLUDE
);
83 excludecore
[excludecount
++] = core
;
86 #if CMK_HAS_BINDPROCESSOR
87 #include <sys/processor.h>
91 static void show_topology(hwloc_topology_t topology
) {
96 // Optionally, get some additional topology information
97 // in case we need the topology depth later.
98 int topodepth
= cmi_hwloc_topology_get_depth(topology
);
100 // Walk the topology with an array style, from level 0 (always
101 // the system level) to the lowest level (always the proc level).
102 for (depth
= 0; depth
< topodepth
; depth
++) {
103 CmiPrintf("*** Objects at level %d\n", depth
);
104 for (i
= 0; i
< cmi_hwloc_get_nbobjs_by_depth(topology
, depth
); i
++) {
105 cmi_hwloc_obj_snprintf(string
, sizeof(string
), topology
,
106 cmi_hwloc_get_obj_by_depth(topology
, depth
, i
),
108 CmiPrintf("Index %u: %s\n", i
, string
);
113 int set_cpu_affinity(unsigned int cpuid
) {
114 hwloc_topology_t topology
;
115 // Allocate and initialize topology object.
116 cmi_hwloc_topology_init(&topology
);
117 // Perform the topology detection.
118 cmi_hwloc_topology_load(topology
);
120 if (CmiMyPe() == 0) {
121 show_topology(topology
);
125 hwloc_cpuset_t cpuset
= cmi_hwloc_bitmap_alloc();
126 cmi_hwloc_bitmap_set(cpuset
, cpuid
);
128 // And try to bind ourself there. */
129 if (cmi_hwloc_set_cpubind(topology
, cpuset
, 0)) {
132 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
133 CmiPrintf("HWLOC> Couldn't bind to cpuset %s: %s\n", str
, strerror(error
));
135 cmi_hwloc_bitmap_free(cpuset
);
136 cmi_hwloc_topology_destroy(topology
);
142 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
143 CmiPrintf("HWLOC> [%d] Bound to cpu: %d cpuset: %s\n", CmiMyPe(), cpuid
, str
);
147 cmi_hwloc_bitmap_free(cpuset
);
148 cmi_hwloc_topology_destroy(topology
);
153 int set_thread_affinity(int cpuid
) {
155 hwloc_topology_t topology
;
156 // Allocate and initialize topology object.
157 cmi_hwloc_topology_init(&topology
);
158 // Perform the topology detection.
159 cmi_hwloc_topology_load(topology
);
161 if (CmiMyPe() == 0) {
162 show_topology(topology
);
166 hwloc_cpuset_t cpuset
= cmi_hwloc_bitmap_alloc();
167 cmi_hwloc_bitmap_set(cpuset
, cpuid
);
170 HANDLE thread
= GetCurrentThread();
172 pthread_t thread
= pthread_self();
175 // And try to bind ourself there. */
176 if (cmi_hwloc_set_thread_cpubind(topology
, thread
, cpuset
, 0)) {
179 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
180 CmiPrintf("HWLOC> Couldn't bind to cpuset %s: %s\n", str
, strerror(error
));
182 cmi_hwloc_bitmap_free(cpuset
);
183 cmi_hwloc_topology_destroy(topology
);
189 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
190 CmiPrintf("HWLOC> [%d] Bound to cpu: %d cpuset: %s\n", CmiMyPe(), cpuid
, str
);
194 cmi_hwloc_bitmap_free(cpuset
);
195 cmi_hwloc_topology_destroy(topology
);
201 int CmiSetCPUAffinity(int mycore
)
205 core
= CmiNumCores() + core
;
208 CmiError("Error: Invalid cpu affinity core number: %d\n", mycore
);
209 CmiAbort("CmiSetCPUAffinity failed");
212 CpvAccess(myCPUAffToCore
) = core
;
214 /* set cpu affinity */
216 return set_thread_affinity(core
);
218 return set_cpu_affinity(core
);
219 /* print_cpu_affinity(); */
223 extern int CmiMyLocalRank
;
225 int CmiMapHosts(int mylocalrank
, int proc_per_host
)
227 hwloc_topology_t topology
;
228 hwloc_cpuset_t cpuset
;
230 int depth
, npus
, nsockets
, index
, nthreads
;
232 if (mylocalrank
== -1) {
233 CmiAbort("Error: Default affinity with +processPerHost is not compatible with this launching scheme.");
236 cmi_hwloc_topology_init(&topology
);
237 cmi_hwloc_topology_load(topology
);
238 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_PU
);
239 npus
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
241 nthreads
= CmiMyNodeSize();
245 index
= mylocalrank
% proc_per_host
;
246 /* now divide the cpuset to proc_per_socket partitions */
247 if (proc_per_host
* nthreads
<= npus
) {
249 int range
= npus
/ proc_per_host
;
250 int pos
= index
* range
+ range
/ nthreads
* CmiMyRank();
251 obj
= cmi_hwloc_get_obj_by_depth(topology
, depth
, pos
);
252 idx
= cmi_hwloc_bitmap_next(obj
->cpuset
, -1);
253 // CmiPrintf("[%d] bind to idx: %d\n", CmiMyPe(), idx);
254 CmiSetCPUAffinity(idx
);
257 CmiPrintf("Warning: not implemented for cpu affinity under oversubscription.\n");
259 cmi_hwloc_topology_destroy(topology
);
262 static void CmiMapHostsBySocket(int mylocalrank
, int proc_per_host
)
264 hwloc_topology_t topology
;
265 unsigned long loading_flags
;
266 hwloc_cpuset_t cpuset
;
268 int depth
, npus_per_socket
, npus
, nsockets
, ncores
, index
, nthreads
;
271 if (mylocalrank
== -1) {
272 CmiAbort("Error: Default affinity with +processPerHost is not compatible with this launching scheme.");
275 cmi_hwloc_topology_init(&topology
);
276 loading_flags
= HWLOC_TOPOLOGY_FLAG_IO_BRIDGES
| HWLOC_TOPOLOGY_FLAG_IO_DEVICES
;
277 err
= cmi_hwloc_topology_set_flags(topology
, loading_flags
);
279 CmiAbort("hwloc_topology_set_flags() failed, PCI devices will not be loaded in the topology \n");
281 cmi_hwloc_topology_load(topology
);
282 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_PACKAGE
);
283 nsockets
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
284 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_CORE
);
285 ncores
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
286 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_PU
);
287 npus
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
289 if (CmiMyRank() < CmiMyNodeSize()) {
290 nthreads
= CmiMyNodeSize() + 1;
291 int nthsocket
= mylocalrank
* CmiMyNodeSize() + CmiMyRank();
292 int pos
= nthsocket
* (npus
/nsockets
);
293 obj
= cmi_hwloc_get_obj_by_depth(topology
, depth
, pos
);
294 idx
= cmi_hwloc_bitmap_next(obj
->cpuset
, -1);
295 //printf("[%d] bind to idx: %d\n", CmiMyPe(), idx);
298 // this is comm thread
299 // TODO: find one close to NIC
301 struct ibv_device
**dev_list
;
305 struct ibv_device
*dev
;
308 // printf("ibv_get_device_list found %d devices\n", count);
309 CmiAssert(npus
/nsockets
> 1);
310 dev_list
= ibv_get_device_list(&count
);
312 CmiAbort("ibv_get_device_list failed\n");
315 set
= cmi_hwloc_bitmap_alloc();
316 err
= cmi_hwloc_ibv_get_device_cpuset(topology
, dev
, set
);
318 cmi_hwloc_bitmap_asprintf(&string
, set
);
319 printf("found cpuset %s for %dth device\n", string
, i
);
324 while ((obj
= hwloc_get_next_obj_covering_cpuset_by_type(topology
, set
, HWLOC_OBJ_NODE
, obj
)) == NULL
);
326 obj
= cmi_hwloc_get_obj_by_depth(topology
, depth
, obj
->os_index
);
327 idx
= cmi_hwloc_bitmap_last(obj
->cpuset
, -1);
329 idx
= hwloc_bitmap_last(set
);
332 /* try openFabrics */
334 hwloc_bitmap_free(set
);
336 obj
= cmi_hwloc_get_obj_by_depth(topology
, depth
, npus
-1);
337 idx
= cmi_hwloc_bitmap_next(obj
->cpuset
, -1);
340 CmiSetCPUAffinity(idx
);
341 cmi_hwloc_topology_destroy(topology
);
344 int CmiMapHostsByCore(int mylocalrank
, int proc_per_host
)
346 hwloc_topology_t topology
;
347 hwloc_cpuset_t cpuset
;
349 int depth
, npus
, nsockets
, ncores
, index
, nthreads
;
351 if (mylocalrank
== -1) {
352 CmiAbort("Error: Default affinity with +processPerHost is not compatible with this launching scheme.");
355 cmi_hwloc_topology_init(&topology
);
356 cmi_hwloc_topology_load(topology
);
357 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_CORE
);
358 ncores
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
359 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_PU
);
360 npus
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
362 nthreads
= CmiMyNodeSize() + 1;
363 /* now divide the cpuset to proc_per_socket partitions */
364 if (proc_per_host
* nthreads
<= npus
) {
366 int nthcore
= mylocalrank
* nthreads
+ CmiMyRank();
367 int pos
= nthcore
* (npus
/ncores
);
368 obj
= cmi_hwloc_get_obj_by_depth(topology
, depth
, pos
);
369 idx
= cmi_hwloc_bitmap_next(obj
->cpuset
, -1);
370 //printf("[%d] bind to idx: %d\n", CmiMyPe(), idx);
371 CmiSetCPUAffinity(idx
);
374 CmiPrintf("Warning: not implemented for cpu affinity under oversubscription.\n");
376 cmi_hwloc_topology_destroy(topology
);
379 int CmiMapSockets(int mylocalrank
, int proc_per_socket
)
381 hwloc_topology_t topology
;
382 hwloc_cpuset_t cpuset
;
384 int depth
, npus_per_socket_per_pe
, npus
, nsockets
, index
, whichsocket
, nthreads
;
387 if (mylocalrank
== -1) {
388 CmiAbort("Error: Default affinity with +processPerSocket is not compatible with the launching scheme.");
391 cmi_hwloc_topology_init(&topology
);
392 cmi_hwloc_topology_load(topology
);
393 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_PU
);
394 npus
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
395 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_PACKAGE
);
396 nsockets
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
398 nthreads
= CmiMyNodeSize();
402 whichsocket
= mylocalrank
/ proc_per_socket
;
403 index
= mylocalrank
% proc_per_socket
;
404 obj
= cmi_hwloc_get_obj_by_depth(topology
, depth
, whichsocket
);
408 cmi_hwloc_bitmap_asprintf(&str
, obj
->cpuset
);
409 CmiPrintf("HWLOC> [%d] %d %d %d %d cpuset %s\n", CmiMyPe(), mylocalrank
, nlocalranks
, proc_per_socket
, nthreads
, str
);
412 /* now divide the cpuset to proc_per_socket partitions */
413 npus_per_socket_per_pe
= npus
/ nsockets
/ proc_per_socket
;
414 m
= npus_per_socket_per_pe
/ nthreads
;
418 int pos
= index
* npus_per_socket_per_pe
+ m
* CmiMyRank();
419 for (i
=0; i
<=pos
; i
++)
421 idx
= cmi_hwloc_bitmap_next(obj
->cpuset
, idx
);
423 // CmiPrintf("[%d:%d] bind to socket: %d pos: %d idx: %d\n", CmiMyPe(), CmiMyRank(), whichsocket, pos, idx);
424 CmiSetCPUAffinity(idx
);
427 CmiPrintf("Warning: not implemented for cpu affinity under oversubscription.\n");
429 cmi_hwloc_topology_destroy(topology
);
433 int CmiMapCores(int mylocalrank
, int proc_per_core
)
435 hwloc_topology_t topology
;
436 hwloc_cpuset_t cpuset
;
438 int depth
, ncores
, npus
, npus_per_core
, index
, whichcore
, nthreads
;
440 if (mylocalrank
== -1) {
441 CmiAbort("Error: Default affinity with +processPerCore is not compatible with the launching scheme.");
444 cmi_hwloc_topology_init(&topology
);
445 cmi_hwloc_topology_load(topology
);
446 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_PU
);
447 npus
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
448 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_CORE
);
449 ncores
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
451 nthreads
= CmiMyNodeSize();
455 whichcore
= mylocalrank
/ proc_per_core
;
456 index
= mylocalrank
% proc_per_core
;
457 obj
= cmi_hwloc_get_obj_by_depth(topology
, depth
, whichcore
);
461 cmi_hwloc_bitmap_asprintf(&str
, obj
->cpuset
);
462 CmiPrintf("HWLOC> [%d] %d %d %d cpuset %s\n", CmiMyPe(), mylocalrank
, nlocalranks
, proc_per_core
, str
);
465 /* now divide the cpuset to proc_per_socket partitions */
466 npus_per_core
= npus
/ ncores
;
467 if (npus_per_core
/ proc_per_core
/ nthreads
>= 1) {
470 /* pos is relative to the core */
471 int pos
= npus_per_core
/ proc_per_core
* index
+ npus_per_core
/ proc_per_core
/ nthreads
* CmiMyRank();
472 for (i
=0; i
<=pos
; i
++)
474 idx
= cmi_hwloc_bitmap_next(obj
->cpuset
, idx
);
476 // CmiPrintf("[%d] bind to idx: %d\n", CmiMyPe(), idx);
477 CmiSetCPUAffinity(idx
);
480 CmiPrintf("Warning: not implemented for cpu affinity under oversubscription.\n");
484 int CmiMapPUs(int mylocalrank
, int proc_per_pu
)
486 hwloc_topology_t topology
;
487 hwloc_cpuset_t cpuset
;
489 int depth
, npus_per_socket
, npus
, nsockets
, index
, nthreads
;
491 if (mylocalrank
== -1) {
492 CmiAbort("Error: Default affinity with +processPerPU is not compatible with the launching scheme.");
495 cmi_hwloc_topology_init(&topology
);
496 cmi_hwloc_topology_load(topology
);
497 depth
= cmi_hwloc_get_type_depth(topology
, HWLOC_OBJ_PU
);
498 npus
= cmi_hwloc_get_nbobjs_by_depth(topology
, depth
);
500 nthreads
= CmiMyNodeSize();
505 obj
= cmi_hwloc_get_obj_by_depth(topology
, depth
, index
);
509 cmi_hwloc_bitmap_asprintf(&str
, obj
->cpuset
);
510 CmiPrintf("HWLOC> [%d] %d %d %d cpuset %s\n", CmiMyPe(), mylocalrank
, nlocalranks
, proc_per_pu
, str
);
513 /* now divide the cpuset to proc_per_socket partitions */
514 if (proc_per_pu
* nthreads
== 1) {
515 cpuset
= cmi_hwloc_bitmap_alloc();
516 int idx
= cmi_hwloc_bitmap_next(obj
->cpuset
, -1);
517 CmiSetCPUAffinity(idx
);
520 CmiPrintf("Warning: not implemented for cpu affinity under oversubscription.\n");
522 cmi_hwloc_topology_destroy(topology
);
525 /* This implementation assumes the default x86 CPU mask size used by Linux */
526 /* For a large SMP machine, this code should be changed to use a variable sized */
527 /* CPU affinity mask buffer instead, as the present code will fail beyond 32 CPUs */
528 int print_cpu_affinity(void) {
529 hwloc_topology_t topology
;
530 // Allocate and initialize topology object.
531 cmi_hwloc_topology_init(&topology
);
532 // Perform the topology detection.
533 cmi_hwloc_topology_load(topology
);
535 hwloc_cpuset_t cpuset
= cmi_hwloc_bitmap_alloc();
536 // And try to bind ourself there. */
537 if (cmi_hwloc_get_cpubind(topology
, cpuset
, 0)) {
539 CmiPrintf("[%d] CPU affinity mask is unknown %s\n", CmiMyPe(), strerror(error
));
540 cmi_hwloc_bitmap_free(cpuset
);
541 cmi_hwloc_topology_destroy(topology
);
546 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
547 CmiPrintf("[%d] CPU affinity mask is %s\n", CmiMyPe(), str
);
549 cmi_hwloc_bitmap_free(cpuset
);
550 cmi_hwloc_topology_destroy(topology
);
555 int print_thread_affinity(void) {
556 hwloc_topology_t topology
;
557 // Allocate and initialize topology object.
558 cmi_hwloc_topology_init(&topology
);
559 // Perform the topology detection.
560 cmi_hwloc_topology_load(topology
);
563 HANDLE thread
= GetCurrentThread();
565 pthread_t thread
= pthread_self();
568 hwloc_cpuset_t cpuset
= cmi_hwloc_bitmap_alloc();
569 // And try to bind ourself there. */
570 if (cmi_hwloc_get_thread_cpubind(topology
, thread
, cpuset
, 0)) {
572 CmiPrintf("[%d] CPU affinity mask is unknown %s\n", CmiMyPe(), strerror(error
));
573 cmi_hwloc_bitmap_free(cpuset
);
574 cmi_hwloc_topology_destroy(topology
);
579 cmi_hwloc_bitmap_asprintf(&str
, cpuset
);
580 CmiPrintf("[%d] CPU affinity mask is %s\n", CmiMyPe(), str
);
582 cmi_hwloc_bitmap_free(cpuset
);
583 cmi_hwloc_topology_destroy(topology
);
/* Print this PE's affinity mask: per-thread in SMP builds, per-process
 * otherwise.  Returns the underlying print routine's result. */
int CmiPrintCPUAffinity(void)
{
#if CMK_SMP
  return print_thread_affinity();
#else
  return print_cpu_affinity();
#endif
}
/* Retrieve the calling process's CPU affinity mask into *cpuset.
 * Returns 0 on success, -1 on failure (with a perror diagnostic).
 * BUG FIX: the original passed sizeof(cpuset) -- the size of the
 * POINTER (8 bytes), not of the mask -- to sched_getaffinity, which
 * truncates the mask on machines with more than 64 CPUs.  Use
 * sizeof(cpu_set_t) instead. */
int get_cpu_affinity(cpu_set_t *cpuset) {
  CPU_ZERO(cpuset);
  if (sched_getaffinity(0, sizeof(cpu_set_t), cpuset) < 0) {
    perror("sched_getaffinity");
    return -1;
  }
  return 0;
}
/* Retrieve the calling thread's CPU affinity mask into *cpuset.
 * Returns 0 on success, -1 on failure or when the platform lacks
 * pthread_getaffinity_np.  pthread_getaffinity_np returns the error
 * code directly (it does not set errno), so the code is stored into
 * errno explicitly for perror's benefit. */
int get_thread_affinity(cpu_set_t *cpuset) {
#if CMK_HAS_PTHREAD_SETAFFINITY
  CPU_ZERO(cpuset);
  if ((errno = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset)) != 0) {
    perror("pthread_getaffinity");
    return -1;
  }
  return 0;
#else
  return -1;
#endif
}
623 int get_affinity(cpu_set_t
*cpuset
) {
625 return get_thread_affinity(cpuset
);
627 return get_cpu_affinity(cpuset
);
632 int CmiOnCore(void) {
635 * The info (task_cpu) is read from the Linux /proc virtual file system.
636 * The /proc/<PID>/[task/<TID>]/stat is explained in the Linux
637 * kernel documentation. The online one could be found in:
638 * http://www.mjmwired.net/kernel/Documentation/filesystems/proc.txt
639 * Based on the documentation, task_cpu is found at the 39th field in
642 #define TASK_CPU_POS (39)
645 FILE *fp
= (FILE *)CpvAccess(myProcStatFP
);
647 printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");
650 fseek(fp
, 0, SEEK_SET
);
651 for (n
=0; n
<TASK_CPU_POS
; n
++) {
652 fscanf(fp
, "%127s", str
);
656 printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");
662 static int cpuAffinityHandlerIdx
;
663 static int cpuAffinityRecvHandlerIdx
;
664 static int cpuPhyNodeAffinityRecvHandlerIdx
;
666 typedef struct _hostnameMsg
{
667 char core
[CmiMsgHeaderSizeBytes
];
675 typedef struct _rankMsg
{
676 char core
[CmiMsgHeaderSizeBytes
];
677 int *ranks
; /* PE => core rank mapping */
678 int *nodes
; /* PE => node number mapping */
681 typedef struct _affMsg
{
682 char core
[CmiMsgHeaderSizeBytes
];
688 static rankMsg
*rankmsg
= NULL
;
689 static CmmTable hostTable
;
690 static CmiNodeLock affLock
= 0;
693 static void cpuAffinityHandler(void *m
)
695 static int count
= 0;
696 static int nodecount
= 0;
698 hostnameMsg
*msg
= (hostnameMsg
*)m
;
700 int tag
, tag1
, pe
, myrank
;
701 int npes
= CmiNumPes();
705 skt_print_ip(str, msg->ip);
706 printf("hostname: %d %s\n", msg->pe, str);
708 CmiAssert(CmiMyPe()==0 && rankmsg
!= NULL
);
709 tag
= *(int*)&msg
->ip
;
711 if ((rec
= (hostnameMsg
*)CmmProbe(hostTable
, 1, &tag
, &tag1
)) != NULL
) {
716 rec
->seq
= nodecount
;
717 nodecount
++; /* a new node record */
718 CmmPut(hostTable
, 1, &tag
, msg
);
720 myrank
= rec
->rank
%rec
->ncores
;
721 while (in_exclude(myrank
)) { /* skip excluded core */
722 myrank
= (myrank
+1)%rec
->ncores
;
725 rankmsg
->ranks
[pe
] = myrank
; /* core rank */
726 rankmsg
->nodes
[pe
] = rec
->seq
; /* on which node */
729 if (count
== CmiNumPes()) {
730 DEBUGP(("Cpuaffinity> %d unique compute nodes detected! \n", CmmEntries(hostTable
)));
732 while ((tmpm
= CmmGet(hostTable
, 1, &tag
, &tag1
))) CmiFree(tmpm
);
735 /* bubble sort ranks on each node according to the PE number */
738 for (i
=0; i
<npes
-1; i
++)
739 for(j
=i
+1; j
<npes
; j
++) {
740 if (rankmsg
->nodes
[i
] == rankmsg
->nodes
[j
] &&
741 rankmsg
->ranks
[i
] > rankmsg
->ranks
[j
])
743 int tmp
= rankmsg
->ranks
[i
];
744 rankmsg
->ranks
[i
] = rankmsg
->ranks
[j
];
745 rankmsg
->ranks
[j
] = tmp
;
750 CmiSyncBroadcastAllAndFree(sizeof(rankMsg
)+CmiNumPes()*sizeof(int)*2, (void *)rankmsg
);
754 /* called on each processor */
755 static void cpuAffinityRecvHandler(void *msg
)
758 rankMsg
*m
= (rankMsg
*)msg
;
759 m
->ranks
= (int *)((char*)m
+ sizeof(rankMsg
));
760 m
->nodes
= (int *)((char*)m
+ sizeof(rankMsg
) + CmiNumPes()*sizeof(int));
761 myrank
= m
->ranks
[CmiMyPe()];
762 mynode
= m
->nodes
[CmiMyPe()];
764 DEBUGP(("[%d %d] set to core #: %d\n", CmiMyNode(), CmiMyPe(), myrank
));
766 if (-1 != CmiSetCPUAffinity(myrank
)) {
767 DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank
, mynode
));
770 CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
771 CmiAbort("set cpu affinity abort!\n");
776 /* called on first PE in physical node, receive affinity set from other PEs in phy node */
777 static void cpuPhyNodeAffinityRecvHandler(void *msg
)
779 affMsg
*m
= (affMsg
*)msg
;
780 #if !defined(_WIN32) && defined(CPU_OR)
781 CPU_OR(&core_usage
, &core_usage
, &m
->affinity
);
788 /* strtok is thread safe in VC++ */
789 #define strtok_r(x,y,z) strtok(x,y)
792 static int search_pemap(char *pecoremap
, int pe
)
794 int *map
= (int *)malloc(CmiNumPesGlobal()*sizeof(int));
796 int h
, i
, j
, k
, count
;
800 char *mapstr
= (char*)malloc(strlen(pecoremap
)+1);
801 strcpy(mapstr
, pecoremap
);
803 str
= strtok_r(mapstr
, ",", &ptr
);
805 while (str
&& count
< CmiNumPesGlobal())
807 int hasdash
=0, hascolon
=0, hasdot
=0, hasstar1
=0, hasstar2
=0, numplus
=0;
808 int start
, end
, stride
=1, block
=1;
811 for (i
=0; i
<strlen(str
); i
++) {
812 if (str
[i
] == '-' && i
!=0) hasdash
=1;
813 else if (str
[i
] == ':') hascolon
=1;
814 else if (str
[i
] == '.') hasdot
=1;
815 else if (str
[i
] == 'x') hasstar1
=1;
816 else if (str
[i
] == 'X') hasstar2
=1;
817 else if (str
[i
] == '+') {
818 if (str
[i
+1] == '+' || str
[i
+1] == '-') {
819 printf("Warning: Check the format of \"%s\".\n", str
);
820 } else if (sscanf(&str
[i
], "+%d", &plusarr
[++numplus
]) != 1) {
821 printf("Warning: Check the format of \"%s\".\n", str
);
826 if (hasstar1
|| hasstar2
) {
827 if (hasstar1
) sscanf(str
, "%dx", &iter
);
828 if (hasstar2
) sscanf(str
, "%dX", &iter
);
829 while (*str
!='x' && *str
!='X') str
++;
835 if (sscanf(str
, "%d-%d:%d.%d", &start
, &end
, &stride
, &block
) != 4)
836 printf("Warning: Check the format of \"%s\".\n", str
);
839 if (sscanf(str
, "%d-%d:%d", &start
, &end
, &stride
) != 3)
840 printf("Warning: Check the format of \"%s\".\n", str
);
844 if (sscanf(str
, "%d-%d", &start
, &end
) != 2)
845 printf("Warning: Check the format of \"%s\".\n", str
);
849 sscanf(str
, "%d", &start
);
852 if (block
> stride
) {
853 printf("Warning: invalid block size in \"%s\" ignored.\n", str
);
856 //if (CmiMyPe() == 0) printf("iter: %d start: %d end: %d stride: %d, block: %d. plus %d \n", iter, start, end, stride, block, numplus);
857 for (k
= 0; k
<iter
; k
++) {
858 for (i
= start
; i
<=end
; i
+=stride
) {
859 for (j
=0; j
<block
; j
++) {
861 for (h
=0; h
<=numplus
; h
++) {
862 map
[count
++] = i
+j
+plusarr
[h
];
863 if (count
== CmiNumPesGlobal()) break;
865 if (count
== CmiNumPesGlobal()) break;
867 if (count
== CmiNumPesGlobal()) break;
869 if (count
== CmiNumPesGlobal()) break;
871 str
= strtok_r(NULL
, ",", &ptr
);
880 #if CMK_CRAYXE || CMK_CRAYXC
882 int getXTNodeID(int mpirank
, int nummpiranks
);
886 * Check that there are not multiple PEs assigned to the same core.
887 * If a pemap has been computed by this module (or passed by the user) this
888 * function will print a warning if oversubscription detected. If no affinity
889 * has been set explicitly by this module, it will print error and abort if
890 * oversubscription detected.
893 void CmiCheckAffinity(void)
895 #if !defined(_WIN32) && CMK_SMP && CMK_HAS_PTHREAD_SETAFFINITY && defined(CPU_OR)
897 if (!CmiCpuTopologyEnabled()) return; // only works if cpu topology enabled
899 if (CmiMyPe() == 0) {
900 // wait for every PE affinity from my physical node (for now only done on phy node 0)
903 if (get_affinity(&my_aff
) == -1) CmiAbort("get_affinity failed\n");
904 CPU_OR(&core_usage
, &core_usage
, &my_aff
); // add my affinity (pe0)
905 int N
= CmiNumPesOnPhysicalNode(0);
906 while (affMsgsRecvd
< N
)
907 CmiDeliverSpecificMsg(cpuPhyNodeAffinityRecvHandlerIdx
);
909 // NOTE this test is simple and may not detect every possible case of
911 if (CPU_COUNT(&core_usage
) < N
) {
912 // TODO suggest command line arguments?
914 CmiAbort("Multiple PEs assigned to same core. Set affinity "
915 "options to correct or lower the number of threads.\n");
917 CmiPrintf("WARNING: Multiple PEs assigned to same core, recommend "
918 "adjusting processor affinity or passing +CmiSleepOnIdle to reduce "
922 } else if ((CmiMyPe() < CmiNumPes()) && (CmiPhysicalNodeID(CmiMyPe()) == 0)) {
923 // send my affinity to first PE on physical node (only done on phy node 0 for now)
924 affMsg
*m
= (affMsg
*)CmiAlloc(sizeof(affMsg
));
925 CmiSetHandler((char *)m
, cpuPhyNodeAffinityRecvHandlerIdx
);
926 if (get_affinity(&m
->affinity
) == -1) { // put my affinity in msg
928 CmiAbort("get_affinity failed\n");
930 CmiSyncSendAndFree(0, sizeof(affMsg
), (void *)m
);
936 void CmiInitCPUAffinity(char **argv
)
938 static skt_ip_t myip
;
943 char *pemapfile
= NULL
;
945 int show_affinity_flag
;
947 int affinity_flag
= CmiGetArgFlagDesc(argv
,"+setcpuaffinity",
950 while (CmiGetArgIntDesc(argv
,"+excludecore", &exclude
, "avoid core when setting cpuaffinity")) {
951 if (CmiMyRank() == 0) add_exclude(exclude
);
955 if (CmiGetArgStringDesc(argv
, "+pemapfile", &pemapfile
, "define pe to core mapping file")) {
958 pemap
= (char*)malloc(1024);
959 fp
= fopen(pemapfile
, "r");
960 if (fp
== NULL
) CmiAbort("pemapfile does not exist");
962 if (fgets(buf
, 128, fp
)) {
963 if (buf
[strlen(buf
)-1] == '\n') buf
[strlen(buf
)-1] = 0;
968 if (CmiMyPe()==0) CmiPrintf("Charm++> read from pemap file '%s': %s\n", pemapfile
, pemap
);
971 CmiGetArgStringDesc(argv
, "+pemap", &pemap
, "define pe to core mapping");
972 if (pemap
!=NULL
&& excludecount
>0)
973 CmiAbort("Charm++> +pemap can not be used with +excludecore.\n");
975 CmiGetArgStringDesc(argv
, "+commap", &commap
, "define comm threads to core mapping");
977 if (pemap
!=NULL
|| commap
!=NULL
) affinity_flag
= 1;
979 #if CMK_PAMI_LINUX_PPC8
983 show_affinity_flag
= CmiGetArgFlagDesc(argv
,"+showcpuaffinity",
984 "print cpu affinity");
990 if (s
= getenv("CmiProcessPerHost")) {
992 if (getenv("CmiOneWthPerCore"))
993 CmiMapHostsByCore(CmiMyLocalRank
, n
);
994 else if (getenv("CmiOneWthPerSocket"))
995 CmiMapHostsBySocket(CmiMyLocalRank
, n
);
997 CmiMapHosts(CmiMyLocalRank
, n
); // scatter
999 else if (s
= getenv("CmiProcessPerSocket")) {
1001 CmiMapSockets(CmiMyLocalRank
, n
);
1003 else if (s
= getenv("CmiProcessPerCore")) {
1005 CmiMapCores(CmiMyLocalRank
, n
);
1007 else if (s
= getenv("CmiProcessPerPU")) {
1009 CmiMapPUs(CmiMyLocalRank
, n
);
1012 if (show_affinity_flag
) CmiPrintCPUAffinity();
1017 CmiAssignOnce(&cpuAffinityHandlerIdx
, CmiRegisterHandler((CmiHandler
)cpuAffinityHandler
));
1018 CmiAssignOnce(&cpuAffinityRecvHandlerIdx
, CmiRegisterHandler((CmiHandler
)cpuAffinityRecvHandler
));
1019 CmiAssignOnce(&cpuPhyNodeAffinityRecvHandlerIdx
, CmiRegisterHandler((CmiHandler
)cpuPhyNodeAffinityRecvHandler
));
1021 if (CmiMyRank() ==0) {
1022 affLock
= CmiCreateLock();
1024 aff_is_set
= affinity_flag
;
1025 CPU_ZERO(&core_usage
);
1032 if(CmiMyPe()==0) CmiPrintf("Charm++> cpu affinity setting is not needed on Blue Gene/Q, thus ignored.\n");
1034 if(show_affinity_flag
){
1035 show_affinity_flag
= 0;
1036 if(CmiMyPe()==0) CmiPrintf("Charm++> printing cpu affinity is not supported on Blue Gene/Q.\n");
1040 if (!affinity_flag
) {
1041 if (show_affinity_flag
) {
1042 CmiPrintCPUAffinity();
1043 CmiPrintf("Charm++> cpu affinity NOT enabled.\n");
1048 if (CmiMyPe() == 0) {
1049 CmiPrintf("Charm++> cpu affinity enabled. \n");
1050 if (excludecount
> 0) {
1051 CmiPrintf("Charm++> cpuaffinity excludes core: %d", excludecore
[0]);
1052 for (i
=1; i
<excludecount
; i
++) CmiPrintf(" %d", excludecore
[i
]);
1056 CmiPrintf("Charm++> cpuaffinity PE-core map : %s\n", pemap
);
1059 if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */
1060 /* comm thread either can float around, or pin down to the last rank.
1061 however it seems to be reportedly slower if it is floating */
1062 CmiNodeAllBarrier();
1063 if (commap
!= NULL
) {
1064 int mycore
= search_pemap(commap
, CmiMyPeGlobal()-CmiNumPesGlobal());
1065 if(CmiMyPe()-CmiNumPes()==0) CmiPrintf("Charm++> set comm %d on node %d to core #%d\n", CmiMyPe()-CmiNumPes(), CmiMyNode(), mycore
);
1066 if (-1 == CmiSetCPUAffinity(mycore
))
1067 CmiAbort("set_cpu_affinity abort!");
1068 CmiNodeAllBarrier();
1069 if (show_affinity_flag
) CmiPrintCPUAffinity();
1070 return; /* comm thread return */
1073 /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */
1074 #if !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ && !CMK_PAMI_LINUX_PPC8
1075 if (pemap
== NULL
) {
1076 #if CMK_MACHINE_PROGRESS_DEFINED
1077 while (affinity_doneflag
< CmiMyNodeSize()) CmiNetworkProgress();
1080 #error "Machine progress call needs to be implemented for cpu affinity!"
1085 #if CMK_CRAYXE || CMK_CRAYXC
1086 /* if both pemap and commmap are NULL, will compute one */
1090 CmiNodeAllBarrier();
1091 if (show_affinity_flag
) CmiPrintCPUAffinity();
1092 return; /* comm thread return */
1097 if (pemap
!= NULL
&& CmiMyPe()<CmiNumPes()) { /* work thread */
1098 int mycore
= search_pemap(pemap
, CmiMyPeGlobal());
1099 if(show_affinity_flag
) CmiPrintf("Charm++> set PE %d on node %d to core #%d\n", CmiMyPe(), CmiMyNode(), mycore
);
1100 if (mycore
>= CmiNumCores()) {
1101 CmiPrintf("Error> Invalid core number %d, only have %d cores (0-%d) on the node. \n", mycore
, CmiNumCores(), CmiNumCores()-1);
1102 CmiAbort("Invalid core number");
1104 if (CmiSetCPUAffinity(mycore
) == -1) CmiAbort("set_cpu_affinity abort!");
1105 CmiNodeAllBarrier();
1106 CmiNodeAllBarrier();
1107 /* if (show_affinity_flag) CmiPrintCPUAffinity(); */
1111 #if CMK_CRAYXE || CMK_CRAYXC
1113 int numCores
= CmiNumCores();
1115 int myid
= getXTNodeID(CmiMyNodeGlobal(), CmiNumNodesGlobal());
1117 int pe
, mype
= CmiMyPeGlobal();
1118 int node
= CmiMyNodeGlobal();
1121 if (CmiMyPe() >= CmiNumPes()) { /* this is comm thread */
1122 int node
= CmiMyPe() - CmiNumPes();
1123 mype
= CmiGetPeGlobal(CmiNodeFirst(node
) + CmiMyNodeSize() - 1, CmiMyPartition()); /* last pe on SMP node */
1124 node
= CmiGetNodeGlobal(node
, CmiMyPartition());
1129 int n
= CmiNodeOf(pe
);
1130 if (n
!= node
) { nnodes
++; node
= n
; }
1131 if (getXTNodeID(n
, CmiNumNodesGlobal()) != myid
) break;
1134 CmiAssert(numCores
> 0);
1135 myrank
= (mype
- pe
- 1 + nnodes
)%numCores
;
1137 if (CmiMyPe() >= CmiNumPes())
1138 myrank
= (myrank
+ 1)%numCores
;
1141 if (-1 != CmiSetCPUAffinity(myrank
)) {
1142 DEBUGP(("Processor %d is bound to core #%d on node #%d\n", CmiMyPe(), myrank
, mynode
));
1145 CmiPrintf("Processor %d set affinity failed!\n", CmiMyPe());
1146 CmiAbort("set cpu affinity abort!\n");
1149 if (CmiMyPe() < CmiNumPes())
1150 CmiNodeAllBarrier();
1151 CmiNodeAllBarrier();
1152 #elif CMK_SMP && CMK_PAMI_LINUX_PPC8
1153 #define CMK_PAMI_LINUX_PPC8_CORES_PER_NODE 20
1154 #define CMK_PAMI_LINUX_PPC8_THREADS_PER_CORE 8
1155 #define CMK_PAMI_LINUX_PPC8_SKIP_CORE_0 0
1156 int cores_per_node
= CMK_PAMI_LINUX_PPC8_CORES_PER_NODE
;
1157 int threads_per_core
= CMK_PAMI_LINUX_PPC8_THREADS_PER_CORE
;
1159 CmiGetArgInt(argv
,"+cores_per_node", &cores_per_node
);
1160 CmiGetArgInt(argv
,"+threads_per_core", &threads_per_core
);
1162 int my_core
= CmiMyPe() % cores_per_node
;
1163 int my_core_2
= CmiMyPe() % (cores_per_node
/2);
1164 #if CMK_PAMI_LINUX_PPC8_SKIP_CORE_0
1165 my_core_2
= (my_core_2
+ 1) % (CMK_PAMI_LINUX_PPC8_CORES_PER_NODE
/2);
1169 if (my_core
< (cores_per_node
/2))
1170 cpu
= my_core_2
* threads_per_core
;
1172 cpu
= (my_core_2
+ CMK_PAMI_LINUX_PPC8_CORES_PER_NODE
/2) * threads_per_core
;
1176 CPU_SET(cpu
, &cset
);
1177 CPU_SET(cpu
+1, &cset
);
1178 if(sched_setaffinity(0, sizeof(cpu_set_t
), &cset
) < 0)
1179 perror("sched_setaffinity");
1182 if (sched_getaffinity(0, sizeof(cset
), &cset
) < 0)
1183 perror("sched_getaffinity");
1187 CmiPrintf("Setting default affinity\n");
1190 /* get my ip address */
1191 if (CmiMyRank() == 0)
1193 #if CMK_HAS_GETHOSTNAME
1194 myip
= skt_my_ip(); /* not thread safe, so only calls on rank 0 */
1196 CmiAbort("Can not get unique name for the compute nodes. \n");
1199 CmiNodeAllBarrier();
1201 /* prepare a msg to send */
1202 msg
= (hostnameMsg
*)CmiAlloc(sizeof(hostnameMsg
));
1203 CmiSetHandler((char *)msg
, cpuAffinityHandlerIdx
);
1204 msg
->pe
= CmiMyPe();
1206 msg
->ncores
= CmiNumCores();
1207 DEBUGP(("PE %d's node has %d number of cores. \n", CmiMyPe(), msg
->ncores
));
1209 CmiSyncSendAndFree(0, sizeof(hostnameMsg
), (void *)msg
);
1211 if (CmiMyPe() == 0) {
1213 hostTable
= CmmNew();
1214 rankmsg
= (rankMsg
*)CmiAlloc(sizeof(rankMsg
)+CmiNumPes()*sizeof(int)*2);
1215 CmiSetHandler((char *)rankmsg
, cpuAffinityRecvHandlerIdx
);
1216 rankmsg
->ranks
= (int *)((char*)rankmsg
+ sizeof(rankMsg
));
1217 rankmsg
->nodes
= (int *)((char*)rankmsg
+ sizeof(rankMsg
) + CmiNumPes()*sizeof(int));
1218 for (i
=0; i
<CmiNumPes(); i
++) {
1219 rankmsg
->ranks
[i
] = 0;
1220 rankmsg
->nodes
[i
] = -1;
1223 for (i
=0; i
<CmiNumPes(); i
++) CmiDeliverSpecificMsg(cpuAffinityHandlerIdx
);
1226 /* receive broadcast from PE 0 */
1227 CmiDeliverSpecificMsg(cpuAffinityRecvHandlerIdx
);
1229 affinity_doneflag
++;
1231 CmiNodeAllBarrier();
1234 if (show_affinity_flag
) CmiPrintCPUAffinity();
1237 /* called in ConverseCommonInit to initialize basic variables */
1238 void CmiInitCPUAffinityUtil(void){
1240 CpvInitialize(int, myCPUAffToCore
);
1241 CpvAccess(myCPUAffToCore
) = -1;
1243 CpvInitialize(void *, myProcStatFP
);
1244 CmiLock(_smp_mutex
);
1246 sprintf(fname
, "/proc/%d/task/%d/stat", getpid(), syscall(SYS_gettid
));
1248 sprintf(fname
, "/proc/%d/stat", getpid());
1250 CpvAccess(myProcStatFP
) = (void *)fopen(fname
, "r");
1251 CmiUnlock(_smp_mutex
);
1253 if(CmiMyPe()==0 && CpvAccess(myProcStatFP) == NULL){
1254 CmiPrintf("WARNING: ERROR IN OPENING FILE %s on PROC %d, CmiOnCore() SHOULDN'T BE CALLED\n", fname, CmiMyPe());
1260 #else /* not supporting affinity */
/* Stub for platforms without affinity support: always fails. */
int CmiSetCPUAffinity(int mycore)
{
  return -1;
}
/* Stub for platforms without affinity support: warn and fail. */
int CmiPrintCPUAffinity(void)
{
  CmiPrintf("Warning: CmiPrintCPUAffinity not supported.\n");
  return -1;
}
/* Stub for platforms without affinity support: nothing to check. */
void CmiCheckAffinity(void) {
}
1278 void CmiInitCPUAffinity(char **argv
)
1281 char *pemapfile
= NULL
;
1282 char *commap
= NULL
;
1283 int excludecore
= -1;
1284 int affinity_flag
= CmiGetArgFlagDesc(argv
,"+setcpuaffinity",
1285 "set cpu affinity");
1286 while (CmiGetArgIntDesc(argv
,"+excludecore",&excludecore
, "avoid core when setting cpuaffinity"));
1287 CmiGetArgStringDesc(argv
, "+pemap", &pemap
, "define pe to core mapping");
1288 CmiGetArgStringDesc(argv
, "+pemapfile", &pemapfile
, "define pe to core mapping file");
1289 CmiGetArgStringDesc(argv
, "+commap", &commap
, "define comm threads to core mapping");
1290 if (affinity_flag
&& CmiMyPe()==0)
1291 CmiPrintf("sched_setaffinity() is not supported, +setcpuaffinity disabled.\n");
1292 if (excludecore
!= -1 && CmiMyPe()==0)
1293 CmiPrintf("sched_setaffinity() is not supported, +excludecore disabled.\n");
1294 if (pemap
&& CmiMyPe()==0)
1295 CmiPrintf("sched_setaffinity() is not supported, +pemap disabled.\n");
1296 if (pemapfile
&& CmiMyPe()==0)
1297 CmiPrintf("sched_setaffinity() is not supported, +pemapfile disabled.\n");
1298 if (commap
&& CmiMyPe()==0)
1299 CmiPrintf("sched_setaffinity() is not supported, +commap disabled.\n");
1302 /* called in ConverseCommonInit to initialize basic variables */
1303 void CmiInitCPUAffinityUtil(void){
1304 CpvInitialize(int, myCPUAffToCore
);
1305 CpvAccess(myCPUAffToCore
) = -1;
1307 CpvInitialize(void *, myProcStatFP
);
1308 CpvAccess(myProcStatFP
) = NULL
;
1312 int CmiOnCore(void){
1313 printf("WARNING: CmiOnCore IS NOT SUPPORTED ON THIS PLATFORM\n");