/*
 * Copyright © 2009 CNRS
 * Copyright © 2009-2018 Inria.  All rights reserved.
 * Copyright © 2009-2013, 2015 Université Bordeaux
 * Copyright © 2009-2014 Cisco Systems, Inc.  All rights reserved.
 * Copyright © 2015 Intel, Inc.  All rights reserved.
 * See COPYING in top-level directory.
 */
#include <private/autogen/config.h>
#include <hwloc.h>
#include <hwloc/linux.h>
#include <private/misc.h>
#include <private/private.h>
#include <private/debug.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <unistd.h>
#include <fcntl.h>
#include <dirent.h>
#include <mntent.h>
#include <sched.h>
#include <pthread.h>
#include <sys/stat.h>
#include <sys/mman.h>

#ifdef HWLOC_HAVE_LIBUDEV
#include <libudev.h>
#endif

#include <sys/types.h>
#include <sys/syscall.h>

#if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND || defined HWLOC_HAVE_MOVE_PAGES
#define migratepages migrate_pages /* workaround broken migratepages prototype in numaif.h before libnuma 2.0.2 */
#include <numaif.h>
#endif
struct hwloc_linux_backend_data_s {
  char *root_path; /* NULL if unused */
  int root_fd; /* The file descriptor for the file system root, used when browsing, e.g., Linux' sysfs and procfs. */
  int is_real_fsroot; /* Boolean saying whether root_fd points to the real filesystem root of the system */
#ifdef HWLOC_HAVE_LIBUDEV
  struct udev *udev; /* Global udev context */
#endif
  char *dumped_hwdata_dirname;
  enum {
    HWLOC_LINUX_ARCH_X86, /* x86 32 or 64bits, including k1om (KNC) */
    HWLOC_LINUX_ARCH_IA64,
    HWLOC_LINUX_ARCH_ARM,
    HWLOC_LINUX_ARCH_POWER,
    HWLOC_LINUX_ARCH_UNKNOWN
  } arch;
  struct utsname utsname; /* fields contain \0 when unknown */
  unsigned fallback_nbprocessors;
  unsigned pagesize;

  int deprecated_classlinks_model; /* -2 if never tried, -1 if unknown, 0 if new (device contains class/name), 1 if old (device contains class:name) */
  int mic_need_directlookup; /* -1 if not tried yet, 0 if not needed, 1 if needed */
  unsigned mic_directlookup_id_max; /* -1 if not tried yet, 0 if none to lookup, maxid+1 otherwise */
};
/***************************
 * Misc Abstraction layers *
 ***************************/
#if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE_SYSCALL)
/* libc doesn't have support for sched_setaffinity, make the system call
 * ourselves: */
#    include <linux/unistd.h>
#    ifndef __NR_sched_setaffinity
#       ifdef __i386__
#         define __NR_sched_setaffinity 241
#       elif defined(__x86_64__)
#         define __NR_sched_setaffinity 203
#       elif defined(__ia64__)
#         define __NR_sched_setaffinity 1231
#       elif defined(__hppa__)
#         define __NR_sched_setaffinity 211
#       elif defined(__alpha__)
#         define __NR_sched_setaffinity 395
#       elif defined(__s390__)
#         define __NR_sched_setaffinity 239
#       elif defined(__sparc__)
#         define __NR_sched_setaffinity 261
#       elif defined(__m68k__)
#         define __NR_sched_setaffinity 311
#       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
#         define __NR_sched_setaffinity 222
#       elif defined(__arm__)
#         define __NR_sched_setaffinity 241
#       elif defined(__cris__)
#         define __NR_sched_setaffinity 241
/*#       elif defined(__mips__)
#         define __NR_sched_setaffinity TODO (32/64/nabi) */
#       else
#         warning "don't know the syscall number for sched_setaffinity on this architecture, will not support binding"
#         define sched_setaffinity(pid, lg, mask) (errno = ENOSYS, -1)
#       endif
#    endif
#    ifndef sched_setaffinity
#      define sched_setaffinity(pid, lg, mask) syscall(__NR_sched_setaffinity, pid, lg, mask)
#    endif
#    ifndef __NR_sched_getaffinity
#       ifdef __i386__
#         define __NR_sched_getaffinity 242
#       elif defined(__x86_64__)
#         define __NR_sched_getaffinity 204
#       elif defined(__ia64__)
#         define __NR_sched_getaffinity 1232
#       elif defined(__hppa__)
#         define __NR_sched_getaffinity 212
#       elif defined(__alpha__)
#         define __NR_sched_getaffinity 396
#       elif defined(__s390__)
#         define __NR_sched_getaffinity 240
#       elif defined(__sparc__)
#         define __NR_sched_getaffinity 260
#       elif defined(__m68k__)
#         define __NR_sched_getaffinity 312
#       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
#         define __NR_sched_getaffinity 223
#       elif defined(__arm__)
#         define __NR_sched_getaffinity 242
#       elif defined(__cris__)
#         define __NR_sched_getaffinity 242
/*#       elif defined(__mips__)
#         define __NR_sched_getaffinity TODO (32/64/nabi) */
#       else
#         warning "don't know the syscall number for sched_getaffinity on this architecture, will not support getting binding"
#         define sched_getaffinity(pid, lg, mask) (errno = ENOSYS, -1)
#       endif
#    endif
#    ifndef sched_getaffinity
#      define sched_getaffinity(pid, lg, mask) (syscall(__NR_sched_getaffinity, pid, lg, mask) < 0 ? -1 : 0)
#    endif
#endif
/* Added for ntohl() */
#include <arpa/inet.h>
#ifdef HAVE_OPENAT
/* Use our own filesystem functions if we have openat */

static const char *
hwloc_checkat(const char *path, int fsroot_fd)
{
  const char *relative_path;
  if (fsroot_fd < 0) {
    errno = EBADF;
    return NULL;
  }

  /* Skip leading slashes.  */
  for (relative_path = path; *relative_path == '/'; relative_path++);

  return relative_path;
}

static int
hwloc_openat(const char *path, int fsroot_fd)
{
  const char *relative_path;

  relative_path = hwloc_checkat(path, fsroot_fd);
  if (!relative_path)
    return -1;

  return openat (fsroot_fd, relative_path, O_RDONLY);
}

static FILE *
hwloc_fopenat(const char *path, const char *mode, int fsroot_fd)
{
  int fd;

  if (strcmp(mode, "r")) {
    errno = ENOTSUP;
    return NULL;
  }

  fd = hwloc_openat (path, fsroot_fd);
  if (fd == -1)
    return NULL;

  return fdopen(fd, mode);
}

static int
hwloc_accessat(const char *path, int mode, int fsroot_fd)
{
  const char *relative_path;

  relative_path = hwloc_checkat(path, fsroot_fd);
  if (!relative_path)
    return -1;

  return faccessat(fsroot_fd, relative_path, mode, 0);
}

static int
hwloc_fstatat(const char *path, struct stat *st, int flags, int fsroot_fd)
{
  const char *relative_path;

  relative_path = hwloc_checkat(path, fsroot_fd);
  if (!relative_path)
    return -1;

  return fstatat(fsroot_fd, relative_path, st, flags);
}

static DIR *
hwloc_opendirat(const char *path, int fsroot_fd)
{
  int dir_fd;
  const char *relative_path;

  relative_path = hwloc_checkat(path, fsroot_fd);
  if (!relative_path)
    return NULL;

  dir_fd = openat(fsroot_fd, relative_path, O_RDONLY | O_DIRECTORY);
  if (dir_fd < 0)
    return NULL;

  return fdopendir(dir_fd);
}

#endif /* HAVE_OPENAT */
/* Static inline version of open so that we can use openat if we have
   it, but still preserve compiler parameter checking */
static __hwloc_inline int
hwloc_open(const char *p, int d __hwloc_attribute_unused)
{
#ifdef HAVE_OPENAT
  return hwloc_openat(p, d);
#else
  return open(p, O_RDONLY);
#endif
}

static __hwloc_inline FILE *
hwloc_fopen(const char *p, const char *m, int d __hwloc_attribute_unused)
{
#ifdef HAVE_OPENAT
  return hwloc_fopenat(p, m, d);
#else
  return fopen(p, m);
#endif
}

/* Static inline version of access so that we can use openat if we have
   it, but still preserve compiler parameter checking */
static __hwloc_inline int
hwloc_access(const char *p, int m, int d __hwloc_attribute_unused)
{
#ifdef HAVE_OPENAT
  return hwloc_accessat(p, m, d);
#else
  return access(p, m);
#endif
}

static __hwloc_inline int
hwloc_stat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
{
#ifdef HAVE_OPENAT
  return hwloc_fstatat(p, st, 0, d);
#else
  return stat(p, st);
#endif
}

static __hwloc_inline int
hwloc_lstat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
{
#ifdef HAVE_OPENAT
  return hwloc_fstatat(p, st, AT_SYMLINK_NOFOLLOW, d);
#else
  return lstat(p, st);
#endif
}

/* Static inline version of opendir so that we can use openat if we have
   it, but still preserve compiler parameter checking */
static __hwloc_inline DIR *
hwloc_opendir(const char *p, int d __hwloc_attribute_unused)
{
#ifdef HAVE_OPENAT
  return hwloc_opendirat(p, d);
#else
  return opendir(p);
#endif
}
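
/*
 * Illustration (hypothetical caller, not part of this file): all sysfs/procfs
 * reads below go through these wrappers so that a topology can be loaded from
 * a dumped filesystem tree instead of the real "/", e.g.:
 *
 *   int root_fd = open("/path/to/fsroot-dump", O_RDONLY | O_DIRECTORY);
 *   FILE *f = hwloc_fopen("/proc/cpuinfo", "r", root_fd);
 *
 * With HAVE_OPENAT the leading '/' is stripped and the path is resolved
 * relative to root_fd; without it, only the real root is supported and the
 * path is opened directly.
 */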
/*****************************************
 ******* Helpers for reading files *******
 *****************************************/
static __hwloc_inline int
hwloc_read_path_by_length(const char *path, char *string, size_t length, int fsroot_fd)
{
  int fd, ret;

  fd = hwloc_open(path, fsroot_fd);
  if (fd < 0)
    return -1;

  ret = read(fd, string, length-1); /* read length-1 so that we can append the ending \0 */
  close(fd);

  if (ret <= 0)
    return -1;

  string[ret] = 0;
  return 0;
}
static __hwloc_inline int
hwloc_read_path_as_int(const char *path, int *value, int fsroot_fd)
{
  char string[11];
  if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
    return -1;
  *value = atoi(string);
  return 0;
}
static __hwloc_inline int
hwloc_read_path_as_uint(const char *path, unsigned *value, int fsroot_fd)
{
  char string[11];
  if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
    return -1;
  *value = (unsigned) strtoul(string, NULL, 10);
  return 0;
}
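
/* Illustration (hypothetical sysfs attribute): reading
 * "/sys/devices/system/cpu/cpu0/topology/physical_package_id" with
 * hwloc_read_path_as_uint() stores the decimal value it contains in *value
 * and returns 0, or returns -1 if the file cannot be opened or read. */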
/* Read everything from fd and save it into a newly allocated buffer
 * returned in bufferp. Use sizep as a default buffer size, and return
 * the actually needed size in sizep.
 */
static __hwloc_inline int
hwloc__read_fd(int fd, char **bufferp, size_t *sizep)
{
  char *buffer;
  size_t toread, filesize, totalread;
  ssize_t ret;

  toread = filesize = *sizep;

  /* Alloc and read +1 so that we get EOF on 2^n without reading once more */
  buffer = malloc(filesize+1);
  if (!buffer)
    return -1;

  ret = read(fd, buffer, toread+1);
  if (ret < 0) {
    free(buffer);
    return -1;
  }

  totalread = (size_t) ret;

  if (totalread < toread + 1)
    /* Normal case, a single read got EOF */
    goto done;

  /* Unexpected case, must extend the buffer and read again.
   * Only occurs on first invocation and if the kernel ever uses multiple pages for a single mask.
   */
  do {
    char *tmp;

    toread = filesize;
    filesize *= 2;

    tmp = realloc(buffer, filesize+1);
    if (!tmp) {
      free(buffer);
      return -1;
    }
    buffer = tmp;

    ret = read(fd, buffer+toread+1, toread);
    if (ret < 0) {
      free(buffer);
      return -1;
    }

    totalread += ret;
  } while ((size_t) ret == toread);

 done:
  buffer[totalread] = '\0';
  *bufferp = buffer;
  *sizep = filesize;
  return 0;
}
/* kernel cpumaps are composed of an array of 32bits cpumasks */
#define KERNEL_CPU_MASK_BITS 32
#define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
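
/* Illustration of the cpumap format parsed below (assumed file contents):
 * a 16-PU machine exposes "0000ffff\n", while a larger machine exposes
 * comma-separated 32-bit chunks, most-significant chunk first, e.g.
 * "00000001,ffffffff\n" for PUs 0-32. */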
static __hwloc_inline int
hwloc__read_fd_as_cpumask(int fd, hwloc_bitmap_t set)
{
  static size_t _filesize = 0; /* will be dynamically initialized to hwloc_getpagesize(), and increased later if needed */
  size_t filesize;
  unsigned long *maps;
  unsigned long map;
  int nr_maps = 0;
  static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
				      * Actually, it may increase multiple times if first read cpumaps start with zeroes.
				      */
  int nr_maps_allocated = _nr_maps_allocated;
  char *buffer, *tmpbuf;
  int i;

  /* Kernel sysfs files are usually at most one page. 4kB may contain 455 32-bit
   * masks (followed by comma), enough for 14k PUs. So allocate a page by default for now.
   *
   * If we ever need a larger buffer, we'll realloc() the buffer during the first
   * invocation of this function so that others directly allocate the right size
   * (all cpumask files have the exact same size).
   */
  filesize = _filesize;
  if (!filesize)
    filesize = hwloc_getpagesize();
  if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
    return -1;
  /* Only update the static value with the final one,
   * to avoid sharing intermediate values that we modify,
   * in case there's ever multiple concurrent calls.
   */
  _filesize = filesize;

  maps = malloc(nr_maps_allocated * sizeof(*maps));
  if (!maps) {
    free(buffer);
    return -1;
  }

  /* reset to zero first */
  hwloc_bitmap_zero(set);

  /* parse the whole mask */
  tmpbuf = buffer;
  while (sscanf(tmpbuf, "%lx", &map) == 1) {
    /* read one kernel cpu mask and the ending comma */
    if (nr_maps == nr_maps_allocated) {
      unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
      if (!tmp) {
	free(buffer);
	free(maps);
	return -1;
      }
      maps = tmp;
      nr_maps_allocated *= 2;
    }

    tmpbuf = strchr(tmpbuf, ',');
    if (!tmpbuf) {
      maps[nr_maps++] = map;
      break;
    } else
      tmpbuf++;

    if (!map && !nr_maps)
      /* ignore the first map if it's empty */
      continue;

    maps[nr_maps++] = map;
  }

  free(buffer);

  /* convert into a set */
#if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
  for(i=0; i<nr_maps; i++)
    hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]);
#else
  for(i=0; i<(nr_maps+1)/2; i++) {
    unsigned long mask;
    mask = maps[nr_maps-2*i-1];
    if (2*i+1<nr_maps)
      mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS;
    hwloc_bitmap_set_ith_ulong(set, i, mask);
  }
#endif

  free(maps);

  /* Only update the static value with the final one,
   * to avoid sharing intermediate values that we modify,
   * in case there's ever multiple concurrent calls.
   */
  if (nr_maps_allocated > _nr_maps_allocated)
    _nr_maps_allocated = nr_maps_allocated;
  return 0;
}
static __hwloc_inline int
hwloc__read_path_as_cpumask(const char *maskpath, hwloc_bitmap_t set, int fsroot_fd)
{
  int fd, err;

  fd = hwloc_open(maskpath, fsroot_fd);
  if (fd < 0)
    return -1;

  err = hwloc__read_fd_as_cpumask(fd, set);
  close(fd);
  return err;
}
static __hwloc_inline hwloc_bitmap_t
hwloc__alloc_read_path_as_cpumask(const char *maskpath, int fsroot_fd)
{
  hwloc_bitmap_t set;
  int err;

  set = hwloc_bitmap_alloc();
  if (!set)
    return NULL;

  err = hwloc__read_path_as_cpumask(maskpath, set, fsroot_fd);
  if (err < 0) {
    hwloc_bitmap_free(set);
    return NULL;
  }
  return set;
}
/* set must be full on input */
static __hwloc_inline int
hwloc__read_fd_as_cpulist(int fd, hwloc_bitmap_t set)
{
  /* Kernel sysfs files are usually at most one page.
   * But cpulists can be of very different sizes depending on the fragmentation,
   * so don't bother remembering the actual read size between invocations.
   * We don't have many invocations anyway.
   */
  size_t filesize = hwloc_getpagesize();
  char *buffer, *current, *comma, *tmp;
  int prevlast, nextfirst, nextlast; /* beginning/end of enabled-segments */

  if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
    return -1;

  current = buffer;
  prevlast = -1;

  while (1) {
    /* save a pointer to the next comma and erase it to simplify things */
    comma = strchr(current, ',');
    if (comma)
      *comma = '\0';

    /* find current enabled-segment bounds */
    nextfirst = strtoul(current, &tmp, 0);
    if (*tmp == '-')
      nextlast = strtoul(tmp+1, NULL, 0);
    else
      nextlast = nextfirst;
    if (prevlast+1 <= nextfirst-1)
      hwloc_bitmap_clr_range(set, prevlast+1, nextfirst-1);

    /* switch to next enabled-segment */
    prevlast = nextlast;
    if (!comma)
      break;
    current = comma+1;
  }

  hwloc_bitmap_clr_range(set, prevlast+1, -1);
  free(buffer);
  return 0;
}
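
/* Illustration (assumed input): a cpulist file containing "0-3,8-11\n"
 * leaves PUs 0-3 and 8-11 set in the initially-full bitmap and clears
 * everything else, including everything above 11. */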
/*****************************
 ******* CpuBind Hooks *******
 *****************************/
static int
hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
{
  /* TODO Kerrighed: Use
   * int migrate (pid_t pid, int destination_node);
   * int migrate_self (int destination_node);
   * int thread_migrate (int thread_id, int destination_node);
   */

  /* The resulting binding is always strict */

#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
  cpu_set_t *plinux_set;
  unsigned cpu;
  int last;
  size_t setsize;
  int err;

  last = hwloc_bitmap_last(hwloc_set);
  if (last == -1) {
    errno = EINVAL;
    return -1;
  }

  setsize = CPU_ALLOC_SIZE(last+1);
  plinux_set = CPU_ALLOC(last+1);

  CPU_ZERO_S(setsize, plinux_set);
  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
    CPU_SET_S(cpu, setsize, plinux_set);
  hwloc_bitmap_foreach_end();

  err = sched_setaffinity(tid, setsize, plinux_set);

  CPU_FREE(plinux_set);
  return err;
#elif defined(HWLOC_HAVE_CPU_SET)
  cpu_set_t linux_set;
  unsigned cpu;

  CPU_ZERO(&linux_set);
  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
    CPU_SET(cpu, &linux_set);
  hwloc_bitmap_foreach_end();

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  return sched_setaffinity(tid, &linux_set);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
#elif defined(HWLOC_HAVE_SYSCALL)
  unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  return sched_setaffinity(tid, (void*) &mask);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
#else /* !SYSCALL */
  errno = ENOSYS;
  return -1;
#endif /* !SYSCALL */
}
#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
/*
 * On some kernels, sched_getaffinity requires the output size to be larger
 * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
 * Try sched_getaffinity on ourself until we find a nr_cpus value that makes
 * the kernel happy.
 */
static int
hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
{
  static int _nr_cpus = -1;
  int nr_cpus = _nr_cpus;
  int fd;

  if (nr_cpus != -1)
    /* already computed */
    return nr_cpus;

  if (topology->levels[0][0]->complete_cpuset)
    /* start with a nr_cpus that may contain the whole topology */
    nr_cpus = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
  if (nr_cpus <= 0)
    /* start from scratch, the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0))*/
    nr_cpus = 1;

  fd = open("/sys/devices/system/cpu/possible", O_RDONLY); /* binding only supported in real fsroot, no need for data->root_fd */
  if (fd >= 0) {
    hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc_full();
    if (hwloc__read_fd_as_cpulist(fd, possible_bitmap) == 0) {
      int max_possible = hwloc_bitmap_last(possible_bitmap);
      hwloc_debug_bitmap("possible CPUs are %s\n", possible_bitmap);

      if (nr_cpus < max_possible + 1)
        nr_cpus = max_possible + 1;
    }
    close(fd);
    hwloc_bitmap_free(possible_bitmap);
  }

  while (1) {
    cpu_set_t *set = CPU_ALLOC(nr_cpus);
    size_t setsize = CPU_ALLOC_SIZE(nr_cpus);
    int err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
    CPU_FREE(set);
    nr_cpus = setsize * 8; /* that's the value that was actually tested */
    if (!err)
      /* Found it. Only update the static value with the final one,
       * to avoid sharing intermediate values that we modify,
       * in case there's ever multiple concurrent calls.
       */
      return _nr_cpus = nr_cpus;
    /* loop with a doubled size */
    nr_cpus = 2 * nr_cpus;
  }
}
#endif
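
/* Illustration (hypothetical kernel configuration): if the kernel cpumask
 * needs a 32-byte cpu_set_t, the doubling loop above stops once
 * sched_getaffinity() accepts setsize=32 and caches 32*8 = 256 as the
 * kernel nr_cpus, even if far fewer PUs are actually online. */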
static int
hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
{
  int err __hwloc_attribute_unused;

#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
  cpu_set_t *plinux_set;
  unsigned cpu;
  int last;
  size_t setsize;
  int kernel_nr_cpus;

  /* find the kernel nr_cpus so as to use a large enough cpu_set size */
  kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
  setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
  plinux_set = CPU_ALLOC(kernel_nr_cpus);

  err = sched_getaffinity(tid, setsize, plinux_set);

  if (err < 0) {
    CPU_FREE(plinux_set);
    return -1;
  }

  last = -1;
  if (topology->levels[0][0]->complete_cpuset)
    last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
  if (last == -1)
    /* fall back to the maximal supported number, the topology isn't ready yet (complete_cpuset is missing or empty) */
    last = kernel_nr_cpus-1;

  hwloc_bitmap_zero(hwloc_set);
  for(cpu=0; cpu<=(unsigned) last; cpu++)
    if (CPU_ISSET_S(cpu, setsize, plinux_set))
      hwloc_bitmap_set(hwloc_set, cpu);

  CPU_FREE(plinux_set);
#elif defined(HWLOC_HAVE_CPU_SET)
  cpu_set_t linux_set;
  unsigned cpu;

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  err = sched_getaffinity(tid, &linux_set);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  err = sched_getaffinity(tid, sizeof(linux_set), &linux_set);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  if (err < 0)
    return -1;

  hwloc_bitmap_zero(hwloc_set);
  for(cpu=0; cpu<CPU_SETSIZE; cpu++)
    if (CPU_ISSET(cpu, &linux_set))
      hwloc_bitmap_set(hwloc_set, cpu);
#elif defined(HWLOC_HAVE_SYSCALL)
  unsigned long mask;

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
  err = sched_getaffinity(tid, (void*) &mask);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  err = sched_getaffinity(tid, sizeof(mask), (void*) &mask);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  if (err < 0)
    return -1;

  hwloc_bitmap_from_ulong(hwloc_set, mask);
#else /* !SYSCALL */
  errno = ENOSYS;
  return -1;
#endif /* !SYSCALL */

  return 0;
}
/* Get the array of tids of a process from the task directory in /proc */
static int
hwloc_linux_get_proc_tids(DIR *taskdir, unsigned *nr_tidsp, pid_t ** tidsp)
{
  struct dirent *dirent;
  unsigned nr_tids = 0;
  unsigned max_tids = 32;
  pid_t *tids, *newtids;
  struct stat sb;

  /* take the number of links as a good estimate for the number of tids */
  if (fstat(dirfd(taskdir), &sb) == 0)
    max_tids = sb.st_nlink;

  tids = malloc(max_tids*sizeof(pid_t));
  if (!tids) {
    errno = ENOMEM;
    return -1;
  }

  rewinddir(taskdir);

  while ((dirent = readdir(taskdir)) != NULL) {
    if (nr_tids == max_tids) {
      max_tids += 8;
      newtids = realloc(tids, max_tids*sizeof(pid_t));
      if (!newtids) {
        free(tids);
        errno = ENOMEM;
        return -1;
      }
      tids = newtids;
    }
    if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
      continue;
    tids[nr_tids++] = atoi(dirent->d_name);
  }

  *nr_tidsp = nr_tids;
  *tidsp = tids;
  return 0;
}
/* Per-tid callbacks */
typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t topology, pid_t tid, void *data, int idx);

static int
hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
			     pid_t pid, hwloc_linux_foreach_proc_tid_cb_t cb,
			     void *data)
{
  char taskdir_path[128];
  DIR *taskdir;
  pid_t *tids, *newtids;
  unsigned i, nr, newnr, failed = 0, failed_errno = 0;
  unsigned retrynr = 0;
  int err;

  if (pid)
    snprintf(taskdir_path, sizeof(taskdir_path), "/proc/%u/task", (unsigned) pid);
  else
    snprintf(taskdir_path, sizeof(taskdir_path), "/proc/self/task");

  taskdir = opendir(taskdir_path);
  if (!taskdir) {
    if (errno == ENOENT)
      errno = EINVAL;
    err = -1;
    goto out;
  }

  /* read the current list of threads */
  err = hwloc_linux_get_proc_tids(taskdir, &nr, &tids);
  if (err < 0)
    goto out_with_dir;

 retry:
  /* apply the callback to all threads */
  failed = 0;
  for(i=0; i<nr; i++) {
    err = cb(topology, tids[i], data, i);
    if (err < 0) {
      failed++;
      failed_errno = errno;
    }
  }

  /* re-read the list of threads */
  err = hwloc_linux_get_proc_tids(taskdir, &newnr, &newtids);
  if (err < 0)
    goto out_with_tids;
  /* retry if the list changed in the meantime, or we failed for *some* threads only.
   * if we're really unlucky, all threads changed but we got the same set of tids. no way to support this.
   */
  if (newnr != nr || memcmp(newtids, tids, nr*sizeof(pid_t)) || (failed && failed != nr)) {
    free(tids);
    tids = newtids;
    nr = newnr;
    if (++retrynr > 10) {
      /* we tried 10 times, it didn't work, the application is probably creating/destroying many threads, stop trying */
      errno = EAGAIN;
      err = -1;
      goto out_with_tids;
    }
    goto retry;
  }
  free(newtids);

  /* if all threads failed, return the last errno. */
  if (failed) {
    err = -1;
    errno = failed_errno;
    goto out_with_tids;
  }

  err = 0;
 out_with_tids:
  free(tids);
 out_with_dir:
  closedir(taskdir);
 out:
  return err;
}
/* Per-tid proc_set_cpubind callback and caller.
 * Callback data is a hwloc_bitmap_t. */
static int
hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *data, int idx __hwloc_attribute_unused)
{
  return hwloc_linux_set_tid_cpubind(topology, tid, (hwloc_bitmap_t) data);
}

static int
hwloc_linux_set_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  return hwloc_linux_foreach_proc_tid(topology, pid,
				      hwloc_linux_foreach_proc_tid_set_cpubind_cb,
				      (void*) hwloc_set);
}
/* Per-tid proc_get_cpubind callback data, callback function and caller */
struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s {
  hwloc_bitmap_t cpuset;
  hwloc_bitmap_t tidset;
  int flags;
};

static int
hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
{
  struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s *data = _data;
  hwloc_bitmap_t cpuset = data->cpuset;
  hwloc_bitmap_t tidset = data->tidset;
  int flags = data->flags;

  if (hwloc_linux_get_tid_cpubind(topology, tid, tidset))
    return -1;

  /* reset the cpuset on first iteration */
  if (!idx)
    hwloc_bitmap_zero(cpuset);

  if (flags & HWLOC_CPUBIND_STRICT) {
    /* if STRICT, we want all threads to have the same binding */
    if (!idx) {
      /* this is the first thread, copy its binding */
      hwloc_bitmap_copy(cpuset, tidset);
    } else if (!hwloc_bitmap_isequal(cpuset, tidset)) {
      /* this is not the first thread, and its binding is different */
      errno = EXDEV;
      return -1;
    }
  } else {
    /* if not STRICT, just OR all thread bindings */
    hwloc_bitmap_or(cpuset, cpuset, tidset);
  }
  return 0;
}
static int
hwloc_linux_get_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
{
  struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s data;
  hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
  int ret;

  data.cpuset = hwloc_set;
  data.tidset = tidset;
  data.flags = flags;

  ret = hwloc_linux_foreach_proc_tid(topology, pid,
				     hwloc_linux_foreach_proc_tid_get_cpubind_cb,
				     &data);
  hwloc_bitmap_free(tidset);
  return ret;
}
static int
hwloc_linux_set_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
{
  if (pid == 0)
    pid = topology->pid;
  if (flags & HWLOC_CPUBIND_THREAD)
    return hwloc_linux_set_tid_cpubind(topology, pid, hwloc_set);
  else
    return hwloc_linux_set_pid_cpubind(topology, pid, hwloc_set, flags);
}

static int
hwloc_linux_get_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
{
  if (pid == 0)
    pid = topology->pid;
  if (flags & HWLOC_CPUBIND_THREAD)
    return hwloc_linux_get_tid_cpubind(topology, pid, hwloc_set);
  else
    return hwloc_linux_get_pid_cpubind(topology, pid, hwloc_set, flags);
}
static int
hwloc_linux_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
{
  return hwloc_linux_set_pid_cpubind(topology, topology->pid, hwloc_set, flags);
}

static int
hwloc_linux_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
{
  return hwloc_linux_get_pid_cpubind(topology, topology->pid, hwloc_set, flags);
}

static int
hwloc_linux_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  if (topology->pid) {
    errno = ENOSYS;
    return -1;
  }
  return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
}

static int
hwloc_linux_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  if (topology->pid) {
    errno = ENOSYS;
    return -1;
  }
  return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
}
#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
#pragma weak pthread_setaffinity_np
#pragma weak pthread_self

static int
hwloc_linux_set_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  int err;

  if (topology->pid) {
    errno = ENOSYS;
    return -1;
  }

  if (!pthread_self) {
    /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
    errno = ENOSYS;
    return -1;
  }
  if (tid == pthread_self())
    return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);

  if (!pthread_setaffinity_np) {
    errno = ENOSYS;
    return -1;
  }

  /* TODO Kerrighed: Use
   * int migrate (pid_t pid, int destination_node);
   * int migrate_self (int destination_node);
   * int thread_migrate (int thread_id, int destination_node);
   */

#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
  /* Use a separate block so that we can define specific variable
     types here */
  {
     cpu_set_t *plinux_set;
     unsigned cpu;
     int last;
     size_t setsize;

     last = hwloc_bitmap_last(hwloc_set);
     if (last == -1) {
       errno = EINVAL;
       return -1;
     }

     setsize = CPU_ALLOC_SIZE(last+1);
     plinux_set = CPU_ALLOC(last+1);

     CPU_ZERO_S(setsize, plinux_set);
     hwloc_bitmap_foreach_begin(cpu, hwloc_set)
         CPU_SET_S(cpu, setsize, plinux_set);
     hwloc_bitmap_foreach_end();

     err = pthread_setaffinity_np(tid, setsize, plinux_set);

     CPU_FREE(plinux_set);
  }
#elif defined(HWLOC_HAVE_CPU_SET)
  /* Use a separate block so that we can define specific variable
     types here */
  {
     cpu_set_t linux_set;
     unsigned cpu;

     CPU_ZERO(&linux_set);
     hwloc_bitmap_foreach_begin(cpu, hwloc_set)
         CPU_SET(cpu, &linux_set);
     hwloc_bitmap_foreach_end();

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
     err = pthread_setaffinity_np(tid, &linux_set);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
     err = pthread_setaffinity_np(tid, sizeof(linux_set), &linux_set);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  }
#else /* CPU_SET */
  /* Use a separate block so that we can define specific variable
     types here */
  {
      unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
      err = pthread_setaffinity_np(tid, (void*) &mask);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
      err = pthread_setaffinity_np(tid, sizeof(mask), (void*) &mask);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
  }
#endif /* CPU_SET */

  if (err) {
    errno = err;
    return -1;
  }
  return 0;
}
#endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
#pragma weak pthread_getaffinity_np
#pragma weak pthread_self

static int
hwloc_linux_get_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  int err;

  if (topology->pid) {
    errno = ENOSYS;
    return -1;
  }

  if (!pthread_self) {
    /* ?! Application uses get_thread_cpubind, but doesn't link against libpthread ?! */
    errno = ENOSYS;
    return -1;
  }
  if (tid == pthread_self())
    return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);

  if (!pthread_getaffinity_np) {
    errno = ENOSYS;
    return -1;
  }

  /* TODO Kerrighed */

#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
  /* Use a separate block so that we can define specific variable
     types here */
  {
     cpu_set_t *plinux_set;
     unsigned cpu;
     int last;
     size_t setsize;

     last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
     assert (last != -1);

     setsize = CPU_ALLOC_SIZE(last+1);
     plinux_set = CPU_ALLOC(last+1);

     err = pthread_getaffinity_np(tid, setsize, plinux_set);
     if (err) {
        CPU_FREE(plinux_set);
        errno = err;
        return -1;
     }

     hwloc_bitmap_zero(hwloc_set);
     for(cpu=0; cpu<=(unsigned) last; cpu++)
       if (CPU_ISSET_S(cpu, setsize, plinux_set))
	 hwloc_bitmap_set(hwloc_set, cpu);

     CPU_FREE(plinux_set);
  }
#elif defined(HWLOC_HAVE_CPU_SET)
  /* Use a separate block so that we can define specific variable
     types here */
  {
     cpu_set_t linux_set;
     unsigned cpu;

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
     err = pthread_getaffinity_np(tid, &linux_set);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
     err = pthread_getaffinity_np(tid, sizeof(linux_set), &linux_set);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
     if (err) {
        errno = err;
        return -1;
     }

     hwloc_bitmap_zero(hwloc_set);
     for(cpu=0; cpu<CPU_SETSIZE; cpu++)
       if (CPU_ISSET(cpu, &linux_set))
	 hwloc_bitmap_set(hwloc_set, cpu);
  }
#else /* CPU_SET */
  /* Use a separate block so that we can define specific variable
     types here */
  {
      unsigned long mask;

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
      err = pthread_getaffinity_np(tid, (void*) &mask);
#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
      err = pthread_getaffinity_np(tid, sizeof(mask), (void*) &mask);
#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
      if (err) {
        errno = err;
        return -1;
      }

      hwloc_bitmap_from_ulong(hwloc_set, mask);
  }
#endif /* CPU_SET */

  return 0;
}
#endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
static int
hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid, hwloc_bitmap_t set)
{
  /* read /proc/pid/stat.
   * its second field contains the command name between parentheses,
   * and the command itself may contain parentheses,
   * so read the whole line and find the last closing parenthesis to find the third field.
   */
  char buf[1024] = "";
  char name[64];
  char *tmp;
  int fd, i, err;

  /* TODO: find a way to use sched_getcpu().
   * either compare tid with gettid() in all callbacks,
   * or pass gettid() in the callback data.
   */

  if (!tid) {
#ifdef SYS_gettid
    tid = syscall(SYS_gettid);
#else
    errno = ENOSYS;
    return -1;
#endif
  }

  snprintf(name, sizeof(name), "/proc/%lu/stat", (unsigned long) tid);
  fd = open(name, O_RDONLY); /* no fsroot for real /proc */
  if (fd < 0) {
    errno = ENOSYS;
    return -1;
  }
  err = read(fd, buf, sizeof(buf)-1); /* read sizeof-1 so that we can append the ending \0 */
  close(fd);
  if (err <= 0) {
    errno = ENOSYS;
    return -1;
  }
  buf[err-1] = '\0';

  tmp = strrchr(buf, ')');
  if (!tmp) {
    errno = ENOSYS;
    return -1;
  }
  /* skip ') ' to find the actual third argument */
  tmp += 2;

  /* skip 36 fields */
  for(i=0; i<36; i++) {
    tmp = strchr(tmp, ' ');
    if (!tmp) {
      errno = ENOSYS;
      return -1;
    }
    /* skip the ' ' itself */
    tmp++;
  }

  /* read the last cpu in the 38th field now */
  if (sscanf(tmp, "%d ", &i) != 1) {
    errno = ENOSYS;
    return -1;
  }

  hwloc_bitmap_only(set, i);
  return 0;
}
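
/* Illustration (hypothetical /proc/<tid>/stat content): in a line such as
 *   "1234 (my (weird) prog) R 1 1234 ... 5"
 * the last ')' is located first so that parentheses inside the command name
 * do not break parsing, then fields are skipped up to the processor field,
 * and the function returns a bitmap containing only PU #5. */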
/* Per-tid proc_get_last_cpu_location callback data, callback function and caller */
struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s {
  hwloc_bitmap_t cpuset;
  hwloc_bitmap_t tidset;
};

static int
hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
{
  struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s *data = _data;
  hwloc_bitmap_t cpuset = data->cpuset;
  hwloc_bitmap_t tidset = data->tidset;

  if (hwloc_linux_get_tid_last_cpu_location(topology, tid, tidset))
    return -1;

  /* reset the cpuset on first iteration */
  if (!idx)
    hwloc_bitmap_zero(cpuset);

  hwloc_bitmap_or(cpuset, cpuset, tidset);
  return 0;
}

static int
hwloc_linux_get_pid_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s data;
  hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
  int ret;

  data.cpuset = hwloc_set;
  data.tidset = tidset;

  ret = hwloc_linux_foreach_proc_tid(topology, pid,
				     hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb,
				     &data);
  hwloc_bitmap_free(tidset);
  return ret;
}

static int
hwloc_linux_get_proc_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
{
  if (pid == 0)
    pid = topology->pid;
  if (flags & HWLOC_CPUBIND_THREAD)
    return hwloc_linux_get_tid_last_cpu_location(topology, pid, hwloc_set);
  else
    return hwloc_linux_get_pid_last_cpu_location(topology, pid, hwloc_set, flags);
}
static int
hwloc_linux_get_thisproc_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
{
  return hwloc_linux_get_pid_last_cpu_location(topology, topology->pid, hwloc_set, flags);
}

static int
hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  if (topology->pid) {
    errno = ENOSYS;
    return -1;
  }

#if HAVE_DECL_SCHED_GETCPU
  {
    int pu = sched_getcpu();
    if (pu >= 0) {
      hwloc_bitmap_only(hwloc_set, pu);
      return 0;
    }
  }
#endif

  return hwloc_linux_get_tid_last_cpu_location(topology, 0, hwloc_set);
}
/***************************
 ****** Membind hooks ******
 ***************************/
#if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND

/* MPOL_LOCAL is not in numaif.h, and it's an enum in linux/mempolicy.h, define ours to avoid conflicts */
#define HWLOC_MPOL_LOCAL 4

static int
hwloc_linux_membind_policy_from_hwloc(int *linuxpolicy, hwloc_membind_policy_t policy, int flags)
{
  switch (policy) {
  case HWLOC_MEMBIND_DEFAULT:
    *linuxpolicy = MPOL_DEFAULT;
    break;
  case HWLOC_MEMBIND_FIRSTTOUCH:
    *linuxpolicy = HWLOC_MPOL_LOCAL;
    break;
  case HWLOC_MEMBIND_BIND:
    if (flags & HWLOC_MEMBIND_STRICT)
      *linuxpolicy = MPOL_BIND;
    else
      *linuxpolicy = MPOL_PREFERRED;
    break;
  case HWLOC_MEMBIND_INTERLEAVE:
    *linuxpolicy = MPOL_INTERLEAVE;
    break;
  /* TODO: next-touch when (if?) patch applied upstream */
  default:
    errno = ENOSYS;
    return -1;
  }
  return 0;
}
static int
hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
				      hwloc_const_nodeset_t nodeset,
				      unsigned *max_os_index_p, unsigned long **linuxmaskp)
{
  unsigned max_os_index = 0; /* highest os_index + 1 */
  unsigned long *linuxmask;
  unsigned i;
  hwloc_nodeset_t linux_nodeset = NULL;

  if (hwloc_bitmap_isfull(nodeset)) {
    linux_nodeset = hwloc_bitmap_alloc();
    hwloc_bitmap_only(linux_nodeset, 0);
    nodeset = linux_nodeset;
  }

  max_os_index = hwloc_bitmap_last(nodeset);
  if (max_os_index == (unsigned) -1)
    max_os_index = 0;
  /* add 1 to convert the last os_index into a max_os_index,
   * and round up to the nearest multiple of BITS_PER_LONG */
  max_os_index = (max_os_index + 1 + HWLOC_BITS_PER_LONG - 1) & ~(HWLOC_BITS_PER_LONG - 1);

  linuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
  if (!linuxmask) {
    hwloc_bitmap_free(linux_nodeset);
    errno = ENOMEM;
    return -1;
  }

  for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
    linuxmask[i] = hwloc_bitmap_to_ith_ulong(nodeset, i);

  if (linux_nodeset)
    hwloc_bitmap_free(linux_nodeset);

  *max_os_index_p = max_os_index;
  *linuxmaskp = linuxmask;
  return 0;
}
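
/* Illustration of the rounding in hwloc_linux_membind_mask_from_nodeset()
 * above (assumed 64-bit unsigned long): if the highest node os_index in the
 * nodeset is 5, max_os_index becomes 64, i.e. exactly one unsigned long of
 * node mask is passed to the kernel. */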
static void
hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
				    hwloc_nodeset_t nodeset,
				    unsigned max_os_index, const unsigned long *linuxmask)
{
  unsigned i;

  /* max_os_index comes from hwloc_linux_find_kernel_max_numnodes() so it's a multiple of HWLOC_BITS_PER_LONG */
  assert(!(max_os_index%HWLOC_BITS_PER_LONG));

  hwloc_bitmap_zero(nodeset);
  for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
    hwloc_bitmap_set_ith_ulong(nodeset, i, linuxmask[i]);
}
#endif /* HWLOC_HAVE_SET_MEMPOLICY || HWLOC_HAVE_MBIND */
#ifdef HWLOC_HAVE_MBIND
static int
hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
{
  unsigned max_os_index; /* highest os_index + 1 */
  unsigned long *linuxmask;
  size_t remainder;
  int linuxpolicy;
  unsigned linuxflags = 0;
  int err;

  remainder = (uintptr_t) addr & (hwloc_getpagesize()-1);
  addr = (char*) addr - remainder;
  len += remainder;

  err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
  if (err < 0)
    return err;

  if (linuxpolicy == MPOL_DEFAULT) {
    /* Some Linux kernels don't like being passed a set */
    return mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);

  } else if (linuxpolicy == HWLOC_MPOL_LOCAL) {
    /* MPOL_LOCAL isn't supported before 3.8, and it's identical to PREFERRED with no nodeset, which was supported way before */
    return mbind((void *) addr, len, MPOL_PREFERRED, NULL, 0, 0);
  }

  err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
  if (err < 0)
    goto out;

  if (flags & HWLOC_MEMBIND_MIGRATE) {
#ifdef MPOL_MF_MOVE
    linuxflags = MPOL_MF_MOVE;
    if (flags & HWLOC_MEMBIND_STRICT)
      linuxflags |= MPOL_MF_STRICT;
#else
    if (flags & HWLOC_MEMBIND_STRICT) {
      errno = ENOSYS;
      goto out_with_mask;
    }
#endif
  }

  err = mbind((void *) addr, len, linuxpolicy, linuxmask, max_os_index+1, linuxflags);
  if (err < 0)
    goto out_with_mask;

  free(linuxmask);
  return 0;

 out_with_mask:
  free(linuxmask);
 out:
  return -1;
}
static void *
hwloc_linux_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
{
  void *buffer;
  int err;

  buffer = hwloc_alloc_mmap(topology, len);
  if (!buffer)
    return NULL;

  err = hwloc_linux_set_area_membind(topology, buffer, len, nodeset, policy, flags);
  if (err < 0 && (flags & HWLOC_MEMBIND_STRICT)) {
    munmap(buffer, len);
    return NULL;
  }

  return buffer;
}
#endif /* HWLOC_HAVE_MBIND */
#ifdef HWLOC_HAVE_SET_MEMPOLICY
static int
hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
{
  unsigned max_os_index; /* highest os_index + 1 */
  unsigned long *linuxmask;
  int linuxpolicy;
  int err;

  err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
  if (err < 0)
    return err;

  if (linuxpolicy == MPOL_DEFAULT) {
    /* Some Linux kernels don't like being passed a set */
    return set_mempolicy(linuxpolicy, NULL, 0);

  } else if (linuxpolicy == HWLOC_MPOL_LOCAL) {
    /* MPOL_LOCAL isn't supported before 3.8, and it's identical to PREFERRED with no nodeset, which was supported way before */
    return set_mempolicy(MPOL_PREFERRED, NULL, 0);
  }

  err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
  if (err < 0)
    goto out;

  if (flags & HWLOC_MEMBIND_MIGRATE) {
#ifdef HWLOC_HAVE_MIGRATE_PAGES
    unsigned long *fullmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
    if (fullmask) {
      memset(fullmask, 0xf, max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
      err = migrate_pages(0, max_os_index+1, fullmask, linuxmask);
      free(fullmask);
    } else
      err = -1;
    if (err < 0 && (flags & HWLOC_MEMBIND_STRICT))
      goto out_with_mask;
#else
    errno = ENOSYS;
    goto out_with_mask;
#endif
  }

  err = set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
  if (err < 0)
    goto out_with_mask;

  free(linuxmask);
  return 0;

 out_with_mask:
  free(linuxmask);
 out:
  return -1;
}
/*
 * On some kernels, get_mempolicy requires the output size to be larger
 * than the kernel MAX_NUMNODES (defined by CONFIG_NODES_SHIFT).
 * Try get_mempolicy on ourself until we find a max_os_index value that
 * makes the kernel happy.
 */
static int
hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)
{
  static int _max_numnodes = -1, max_numnodes;
  int linuxpolicy;

  if (_max_numnodes != -1)
    /* already computed */
    return _max_numnodes;

  /* start with a single ulong, it's the minimal and it's enough for most machines */
  max_numnodes = HWLOC_BITS_PER_LONG;
  while (1) {
    unsigned long *mask = malloc(max_numnodes / HWLOC_BITS_PER_LONG * sizeof(long));
    int err = get_mempolicy(&linuxpolicy, mask, max_numnodes, 0, 0);
    free(mask);
    if (!err || errno != EINVAL)
      /* Found it. Only update the static value with the final one,
       * to avoid sharing intermediate values that we modify,
       * in case there's ever multiple concurrent calls.
       */
      return _max_numnodes = max_numnodes;
    max_numnodes *= 2;
  }
}
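
/* Illustration (hypothetical kernel configuration, 64-bit machine): with
 * CONFIG_NODES_SHIFT=10 the kernel MAX_NUMNODES is 1024, so the loop above
 * doubles 64 -> 128 -> 256 -> 512 -> 1024 before get_mempolicy() stops
 * failing with EINVAL, and 1024 is cached in _max_numnodes. */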
static int
hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *policy)
{
  switch (linuxpolicy) {
  case MPOL_DEFAULT:
  case HWLOC_MPOL_LOCAL: /* converted from MPOL_PREFERRED + empty nodeset by the caller */
    *policy = HWLOC_MEMBIND_FIRSTTOUCH;
    return 0;
  case MPOL_PREFERRED:
  case MPOL_BIND:
    *policy = HWLOC_MEMBIND_BIND;
    return 0;
  case MPOL_INTERLEAVE:
    *policy = HWLOC_MEMBIND_INTERLEAVE;
    return 0;
  default:
    errno = EINVAL;
    return -1;
  }
}

static int hwloc_linux_mask_is_empty(unsigned max_os_index, unsigned long *linuxmask)
{
  unsigned i;
  for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
    if (linuxmask[i])
      return 0;
  return 1;
}
static int
hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
{
  unsigned max_os_index;
  unsigned long *linuxmask;
  int linuxpolicy;
  int err;

  max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);

  linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
  if (!linuxmask) {
    errno = ENOMEM;
    goto out;
  }

  err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, 0, 0);
  if (err < 0)
    goto out_with_mask;

  /* MPOL_PREFERRED with empty mask is MPOL_LOCAL */
  if (linuxpolicy == MPOL_PREFERRED && hwloc_linux_mask_is_empty(max_os_index, linuxmask))
    linuxpolicy = HWLOC_MPOL_LOCAL;

  if (linuxpolicy == MPOL_DEFAULT || linuxpolicy == HWLOC_MPOL_LOCAL) {
    hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
  } else {
    hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, linuxmask);
  }

  err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
  if (err < 0)
    goto out_with_mask;

  free(linuxmask);
  return 0;

 out_with_mask:
  free(linuxmask);
 out:
  return -1;
}
static int
hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
{
  unsigned max_os_index;
  unsigned long *linuxmask, *globallinuxmask;
  int linuxpolicy, globallinuxpolicy = 0;
  int mixed = 0;
  int full = 0;
  int first = 1;
  int pagesize = hwloc_getpagesize();
  char *tmpaddr;
  int err;
  unsigned i;

  max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);

  linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
  if (!linuxmask) {
    errno = ENOMEM;
    goto out;
  }
  globallinuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
  if (!globallinuxmask) {
    errno = ENOMEM;
    goto out_with_masks;
  }

  for(tmpaddr = (char *)((unsigned long)addr & ~(pagesize-1));
      tmpaddr < (char *)addr + len;
      tmpaddr += pagesize) {
    err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, tmpaddr, MPOL_F_ADDR);
    if (err < 0)
      goto out_with_masks;

    /* MPOL_PREFERRED with empty mask is MPOL_LOCAL */
    if (linuxpolicy == MPOL_PREFERRED && hwloc_linux_mask_is_empty(max_os_index, linuxmask))
      linuxpolicy = HWLOC_MPOL_LOCAL;

    /* use the first found policy. if we find a different one later, set mixed to 1 */
    if (first)
      globallinuxpolicy = linuxpolicy;
    else if (globallinuxpolicy != linuxpolicy)
      mixed = 1;

    /* aggregate masks, and set full to 1 if we ever find DEFAULT or LOCAL */
    if (full || linuxpolicy == MPOL_DEFAULT || linuxpolicy == HWLOC_MPOL_LOCAL) {
      full = 1;
    } else {
      for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
	globallinuxmask[i] |= linuxmask[i];
    }

    first = 0;
  }

  if (mixed) {
    *policy = HWLOC_MEMBIND_MIXED;
  } else {
    err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
    if (err < 0)
      goto out_with_masks;
  }

  if (full) {
    hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
  } else {
    hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, globallinuxmask);
  }

  free(linuxmask);
  free(globallinuxmask);
  return 0;

 out_with_masks:
  free(linuxmask);
  free(globallinuxmask);
 out:
  return -1;
}
#endif /* HWLOC_HAVE_SET_MEMPOLICY */
#ifdef HWLOC_HAVE_MOVE_PAGES
static int
hwloc_linux_get_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags __hwloc_attribute_unused)
{
  unsigned offset;
  unsigned long count;
  void **pages;
  int *status;
  int pagesize = hwloc_getpagesize();
  int ret;
  unsigned i;

  offset = ((unsigned long) addr) & (pagesize-1);
  addr = ((char*) addr) - offset;
  len += offset;
  count = (len + pagesize-1)/pagesize;
  pages = malloc(count*sizeof(*pages));
  status = malloc(count*sizeof(*status));
  if (!pages || !status) {
    ret = -1;
    goto out_with_pages;
  }

  for(i=0; i<count; i++)
    pages[i] = ((char*)addr) + i*pagesize;

  ret = move_pages(0, count, pages, NULL, status, 0);
  if (ret < 0)
    goto out_with_pages;

  hwloc_bitmap_zero(nodeset);
  for(i=0; i<count; i++)
    if (status[i] >= 0)
      hwloc_bitmap_set(nodeset, status[i]);
  ret = 0;

 out_with_pages:
  free(pages);
  free(status);
  return ret;
}
#endif /* HWLOC_HAVE_MOVE_PAGES */
static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep);

static int hwloc_linux_get_allowed_resources_hook(hwloc_topology_t topology)
{
  const char *fsroot_path;
  char *cpuset_name = NULL;
  int root_fd = -1;

  fsroot_path = getenv("HWLOC_FSROOT");
  if (!fsroot_path)
    fsroot_path = "/";

#ifdef HAVE_OPENAT
  root_fd = open(fsroot_path, O_RDONLY | O_DIRECTORY);
  if (root_fd < 0)
    goto out;
#else
  if (strcmp(fsroot_path, "/")) {
    errno = ENOSYS;
    goto out;
  }
#endif

  /* we could also error-out if the current topology doesn't actually match the system,
   * at least for PUs and NUMA nodes. But it would increase the overhead of loading XMLs.
   *
   * Just trust the user when he sets THISSYSTEM=1. It enables hacky
   * tests such as restricting random XML or synthetic to the current
   * machine (uses the default cgroup).
   */

  hwloc_linux__get_allowed_resources(topology, fsroot_path, root_fd, &cpuset_name);
  if (cpuset_name) {
    hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
    free(cpuset_name);
  }
  if (root_fd != -1)
    close(root_fd);
  return 0;

 out:
  return -1;
}
void
hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
			struct hwloc_topology_support *support __hwloc_attribute_unused)
{
  hooks->set_thisthread_cpubind = hwloc_linux_set_thisthread_cpubind;
  hooks->get_thisthread_cpubind = hwloc_linux_get_thisthread_cpubind;
  hooks->set_thisproc_cpubind = hwloc_linux_set_thisproc_cpubind;
  hooks->get_thisproc_cpubind = hwloc_linux_get_thisproc_cpubind;
  hooks->set_proc_cpubind = hwloc_linux_set_proc_cpubind;
  hooks->get_proc_cpubind = hwloc_linux_get_proc_cpubind;
#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
  hooks->set_thread_cpubind = hwloc_linux_set_thread_cpubind;
#endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
  hooks->get_thread_cpubind = hwloc_linux_get_thread_cpubind;
#endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
  hooks->get_thisthread_last_cpu_location = hwloc_linux_get_thisthread_last_cpu_location;
  hooks->get_thisproc_last_cpu_location = hwloc_linux_get_thisproc_last_cpu_location;
  hooks->get_proc_last_cpu_location = hwloc_linux_get_proc_last_cpu_location;
#ifdef HWLOC_HAVE_SET_MEMPOLICY
  hooks->set_thisthread_membind = hwloc_linux_set_thisthread_membind;
  hooks->get_thisthread_membind = hwloc_linux_get_thisthread_membind;
  hooks->get_area_membind = hwloc_linux_get_area_membind;
#endif /* HWLOC_HAVE_SET_MEMPOLICY */
#ifdef HWLOC_HAVE_MBIND
  hooks->set_area_membind = hwloc_linux_set_area_membind;
#ifdef HWLOC_HAVE_MOVE_PAGES
  hooks->get_area_memlocation = hwloc_linux_get_area_memlocation;
#endif /* HWLOC_HAVE_MOVE_PAGES */
  hooks->alloc_membind = hwloc_linux_alloc_membind;
  hooks->alloc = hwloc_alloc_mmap;
  hooks->free_membind = hwloc_free_mmap;
  support->membind->firsttouch_membind = 1;
  support->membind->bind_membind = 1;
  support->membind->interleave_membind = 1;
#endif /* HWLOC_HAVE_MBIND */
#if (defined HWLOC_HAVE_MIGRATE_PAGES) || ((defined HWLOC_HAVE_MBIND) && (defined MPOL_MF_MOVE))
  support->membind->migrate_membind = 1;
#endif
  hooks->get_allowed_resources = hwloc_linux_get_allowed_resources_hook;
}
/*******************************************
 *** Misc Helpers for Topology Discovery ***
 *******************************************/
struct hwloc_linux_cpuinfo_proc {
  /* set during hwloc_linux_parse_cpuinfo */
  unsigned long Pproc;
  /* set during hwloc_linux_parse_cpuinfo or -1 if unknown*/
  long Pcore, Ppkg;
  /* set later, or -1 if unknown */
  long Lcore, Lpkg;

  /* custom info, set during hwloc_linux_parse_cpuinfo */
  struct hwloc_obj_info_s *infos;
  unsigned infos_count;
};
/* deprecated but still needed in hwloc/linux.h for backward compat */
int
hwloc_linux_parse_cpumap_file(FILE *file, hwloc_bitmap_t set)
{
  unsigned long *maps;
  unsigned long map;
  int nr_maps = 0;
  static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
				      * Actually, it may increase multiple times if first read cpumaps start with zeroes.
				      */
  int nr_maps_allocated = _nr_maps_allocated;
  int i;

  maps = malloc(nr_maps_allocated * sizeof(*maps));
  if (!maps)
    return -1;

  /* reset to zero first */
  hwloc_bitmap_zero(set);

  /* parse the whole mask */
  while (fscanf(file, "%lx,", &map) == 1) /* read one kernel cpu mask and the ending comma */
    {
      if (nr_maps == nr_maps_allocated) {
	unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
	if (!tmp) {
	  free(maps);
	  return -1;
	}
	maps = tmp;
	nr_maps_allocated *= 2;
      }

      if (!map && !nr_maps)
	/* ignore the first map if it's empty */
	continue;

      maps[nr_maps++] = map;
    }

  /* convert into a set */
#if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
  for(i=0; i<nr_maps; i++)
    hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]);
#else
  for(i=0; i<(nr_maps+1)/2; i++) {
    unsigned long mask;
    mask = maps[nr_maps-2*i-1];
    if (2*i+1<nr_maps)
      mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS;
    hwloc_bitmap_set_ith_ulong(set, i, mask);
  }
#endif

  free(maps);

  /* Only update the static value with the final one,
   * to avoid sharing intermediate values that we modify,
   * in case there's ever multiple concurrent calls.
   */
  if (nr_maps_allocated > _nr_maps_allocated)
    _nr_maps_allocated = nr_maps_allocated;
  return 0;
}
static void
hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, const char *root_path)
{
  char *mount_path;
  struct mntent mntent;
  FILE *fd;
  int err;
  size_t bufsize;
  char *buf;

  *cgroup_mntpnt = NULL;
  *cpuset_mntpnt = NULL;

  if (root_path) {
    /* setmntent() doesn't support openat(), so use the root_path directly */
    err = asprintf(&mount_path, "%s/proc/mounts", root_path);
    if (err < 0)
      return;
    fd = setmntent(mount_path, "r");
    free(mount_path);
  } else {
    fd = setmntent("/proc/mounts", "r");
  }
  if (!fd)
    return;

  /* getmntent_r() doesn't actually report an error when the buffer
   * is too small. It just silently truncates things. So we can't
   * dynamically resize things.
   *
   * Linux limits mount type, string, and options to one page each.
   * getmntent() limits the line size to 4kB.
   * so use 4*pagesize to be far above both.
   */
  bufsize = hwloc_getpagesize()*4;
  buf = malloc(bufsize);
  if (!buf) {
    endmntent(fd);
    return;
  }

  while (getmntent_r(fd, &mntent, buf, bufsize)) {
    if (!strcmp(mntent.mnt_type, "cpuset")) {
      hwloc_debug("Found cpuset mount point on %s\n", mntent.mnt_dir);
      *cpuset_mntpnt = strdup(mntent.mnt_dir);
      break;
    } else if (!strcmp(mntent.mnt_type, "cgroup")) {
      /* found a cgroup mntpnt */
      char *opt, *opts = mntent.mnt_opts;
      int cpuset_opt = 0;
      int noprefix_opt = 0;
      /* look at options */
      while ((opt = strsep(&opts, ",")) != NULL) {
	if (!strcmp(opt, "cpuset"))
	  cpuset_opt = 1;
	else if (!strcmp(opt, "noprefix"))
	  noprefix_opt = 1;
      }
      if (!cpuset_opt)
	continue;
      if (noprefix_opt) {
	hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", mntent.mnt_dir);
	*cpuset_mntpnt = strdup(mntent.mnt_dir);
      } else {
	hwloc_debug("Found cgroup/cpuset mount point on %s\n", mntent.mnt_dir);
	*cgroup_mntpnt = strdup(mntent.mnt_dir);
      }
      break;
    }
  }

  free(buf);
  endmntent(fd);
}
/*
 * Linux cpusets may be managed directly or through cgroup.
 * If cgroup is used, tasks get a /proc/pid/cgroup which may contain a
 * single line %d:cpuset:<name>. If cpusets are used, they get /proc/pid/cpuset
 * containing <name>.
 */
static char *
hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
{
#define CPUSET_NAME_LEN 128
  char cpuset_name[CPUSET_NAME_LEN];
  FILE *file;
  int err;
  char *tmp;

  /* check whether a cgroup-cpuset is enabled */
  if (!pid)
    file = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
  else {
    char path[] = "/proc/XXXXXXXXXX/cgroup";
    snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
    file = hwloc_fopen(path, "r", fsroot_fd);
  }
  if (file) {
    /* find a cpuset line */
#define CGROUP_LINE_LEN 256
    char line[CGROUP_LINE_LEN];
    while (fgets(line, sizeof(line), file)) {
      char *end, *colon = strchr(line, ':');
      if (!colon)
	continue;
      if (strncmp(colon, ":cpuset:", 8))
	continue;

      /* found a cgroup-cpuset line, return the name */
      fclose(file);
      end = strchr(colon, '\n');
      if (end)
	*end = '\0';
      hwloc_debug("Found cgroup-cpuset %s\n", colon+8);
      return strdup(colon+8);
    }
    fclose(file);
  }
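
  /* Illustration (hypothetical cgroup v1 line): the loop above matches a
   * /proc/<pid>/cgroup line such as "6:cpuset:/my_cgroup" and returns
   * "/my_cgroup". */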
  /* check whether a cpuset is enabled */
  if (!pid)
    err = hwloc_read_path_by_length("/proc/self/cpuset", cpuset_name, sizeof(cpuset_name), fsroot_fd);
  else {
    char path[] = "/proc/XXXXXXXXXX/cpuset";
    snprintf(path, sizeof(path), "/proc/%d/cpuset", pid);
    err = hwloc_read_path_by_length(path, cpuset_name, sizeof(cpuset_name), fsroot_fd);
  }
  if (err < 0) {
    /* found nothing */
    hwloc_debug("%s", "No cgroup or cpuset found\n");
    return NULL;
  }

  /* found a cpuset, return the name */
  tmp = strchr(cpuset_name, '\n');
  if (tmp)
    *tmp = '\0';
  hwloc_debug("Found cpuset %s\n", cpuset_name);
  return strdup(cpuset_name);
}
/*
 * Then, the cpuset description is available from either the cgroup or
 * the cpuset filesystem (usually mounted in / or /dev) where there
 * are cgroup<name>/cpuset.{cpus,mems} or cpuset<name>/{cpus,mems} files.
 */
static void
hwloc_admin_disable_set_from_cpuset(int root_fd,
				    const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
				    const char *attr_name,
				    hwloc_bitmap_t admin_enabled_cpus_set)
{
#define CPUSET_FILENAME_LEN 256
  char cpuset_filename[CPUSET_FILENAME_LEN];
  int fd;
  int err;

  if (cgroup_mntpnt) {
    /* try to read the cpuset from cgroup */
    snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/cpuset.%s", cgroup_mntpnt, cpuset_name, attr_name);
    hwloc_debug("Trying to read cgroup file <%s>\n", cpuset_filename);
  } else if (cpuset_mntpnt) {
    /* try to read the cpuset directly */
    snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/%s", cpuset_mntpnt, cpuset_name, attr_name);
    hwloc_debug("Trying to read cpuset file <%s>\n", cpuset_filename);
  }

  fd = hwloc_open(cpuset_filename, root_fd);
  if (fd < 0) {
    /* found no cpuset description, ignore it */
    hwloc_debug("Couldn't find cpuset <%s> description, ignoring\n", cpuset_name);
    return;
  }

  err = hwloc__read_fd_as_cpulist(fd, admin_enabled_cpus_set);
  close(fd);

  if (err < 0)
    hwloc_bitmap_fill(admin_enabled_cpus_set);
  else
    hwloc_debug_bitmap("cpuset includes %s\n", admin_enabled_cpus_set);
}
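
/* Illustration (hypothetical mount point and cgroup name): with
 * cgroup_mntpnt="/sys/fs/cgroup/cpuset", cpuset_name="/my_cgroup" and
 * attr_name="cpus", the function above reads
 * "/sys/fs/cgroup/cpuset/my_cgroup/cpuset.cpus", whose cpulist content
 * (e.g. "0-7,16-23") becomes the administratively enabled set. */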
static void
hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
			 const char *path,
			 uint64_t *local_memory,
			 uint64_t *meminfo_hugepages_count,
			 uint64_t *meminfo_hugepages_size,
			 int onlytotal)
{
  char *tmp;
  char buffer[4096];
  unsigned long long number;

  if (hwloc_read_path_by_length(path, buffer, sizeof(buffer), data->root_fd) < 0)
    return;

  tmp = strstr(buffer, "MemTotal: "); /* MemTotal: %llu kB */
  if (tmp) {
    number = strtoull(tmp+10, NULL, 10);
    *local_memory = number << 10;

    if (onlytotal)
      return;

    tmp = strstr(tmp, "Hugepagesize: "); /* Hugepagesize: %llu */
    if (tmp) {
      number = strtoull(tmp+14, NULL, 10);
      *meminfo_hugepages_size = number << 10;

      tmp = strstr(tmp, "HugePages_Free: "); /* HugePages_Free: %llu */
      if (tmp) {
	number = strtoull(tmp+16, NULL, 10);
	*meminfo_hugepages_count = number;
      }
    }
  }
}
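
/* Illustration of the /proc/meminfo lines of interest above (memory values
 * are reported in kB, hence the "<< 10" conversions):
 *   MemTotal:       16315228 kB
 *   Hugepagesize:       2048 kB
 *   HugePages_Free:       12
 */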
#define SYSFS_NUMA_NODE_PATH_LEN 128

static void
hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
			   const char *dirpath,
			   struct hwloc_obj_memory_s *memory,
			   uint64_t *remaining_local_memory)
{
  DIR *dir;
  struct dirent *dirent;
  unsigned long index_ = 1;
  int err;
  char line[64];
  char path[SYSFS_NUMA_NODE_PATH_LEN];

  dir = hwloc_opendir(dirpath, data->root_fd);
  if (dir) {
    while ((dirent = readdir(dir)) != NULL) {
      if (strncmp(dirent->d_name, "hugepages-", 10))
        continue;
      memory->page_types[index_].size = strtoul(dirent->d_name+10, NULL, 0) * 1024ULL;
      err = snprintf(path, sizeof(path), "%s/%s/nr_hugepages", dirpath, dirent->d_name);
      if ((size_t) err < sizeof(path)
	  && !hwloc_read_path_by_length(path, line, sizeof(line), data->root_fd)) {
	/* these are the actual total amount of huge pages */
	memory->page_types[index_].count = strtoull(line, NULL, 0);
	*remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
	index_++;
      }
    }
    closedir(dir);
    memory->page_types_len = index_;
  }
}
static void
hwloc_get_kerrighed_node_meminfo_info(struct hwloc_topology *topology,
				      struct hwloc_linux_backend_data_s *data,
				      unsigned long node, struct hwloc_obj_memory_s *memory)
{
  char path[128];
  uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
  int err;

  if (topology->is_thissystem) {
    memory->page_types_len = 2;
    memory->page_types = malloc(2*sizeof(*memory->page_types));
    memset(memory->page_types, 0, 2*sizeof(*memory->page_types));
    /* Try to get the hugepage size from sysconf in case we fail to get it from /proc/meminfo later */
#ifdef HAVE__SC_LARGE_PAGESIZE
    memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
#endif
    memory->page_types[0].size = data->pagesize;
  }

  err = snprintf(path, sizeof(path), "/proc/nodes/node%lu/meminfo", node);
  if ((size_t) err < sizeof(path))
    hwloc_parse_meminfo_info(data, path,
			     &memory->local_memory,
			     &meminfo_hugepages_count, &meminfo_hugepages_size,
			     memory->page_types == NULL);

  if (memory->page_types) {
    uint64_t remaining_local_memory = memory->local_memory;
    if (meminfo_hugepages_size) {
      memory->page_types[1].size = meminfo_hugepages_size;
      memory->page_types[1].count = meminfo_hugepages_count;
      remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
    } else {
      memory->page_types_len = 1;
    }
    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
  }
}
static void
hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
                              struct hwloc_linux_backend_data_s *data,
                              struct hwloc_obj_memory_s *memory)
{
  uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
  struct stat st;
  int has_sysfs_hugepages = 0;
  const char *pagesize_env = getenv("HWLOC_DEBUG_PAGESIZE");
  int types = 2;
  int err;

  err = hwloc_stat("/sys/kernel/mm/hugepages", &st, data->root_fd);
  if (!err) {
    types = 1 + st.st_nlink-2;
    has_sysfs_hugepages = 1;
  }

  if (topology->is_thissystem || pagesize_env) {
    /* we cannot report any page_type info unless we have the page size.
     * we'll take it either from the system if local, or from the debug env variable
     */
    memory->page_types_len = types;
    memory->page_types = calloc(types, sizeof(*memory->page_types));
  }

  if (topology->is_thissystem) {
    /* Get the page and hugepage sizes from sysconf */
#if HAVE_DECL__SC_LARGE_PAGESIZE
    memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
#endif
    memory->page_types[0].size = data->pagesize; /* might be overwritten later by /proc/meminfo or sysfs */
  }

  hwloc_parse_meminfo_info(data, "/proc/meminfo",
                           &memory->local_memory,
                           &meminfo_hugepages_count, &meminfo_hugepages_size,
                           memory->page_types == NULL);

  if (memory->page_types) {
    uint64_t remaining_local_memory = memory->local_memory;
    if (has_sysfs_hugepages) {
      /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
      hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
    } else {
      /* use what we found in meminfo */
      if (meminfo_hugepages_size) {
        memory->page_types[1].size = meminfo_hugepages_size;
        memory->page_types[1].count = meminfo_hugepages_count;
        remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
      } else {
        memory->page_types_len = 1;
      }
    }

    if (pagesize_env) {
      /* We cannot get the pagesize if not thissystem, use the env-given one to experience the code during make check */
      memory->page_types[0].size = strtoull(pagesize_env, NULL, 10);
      /* If failed, use 4kB */
      if (!memory->page_types[0].size)
        memory->page_types[0].size = 4096;
    }
    assert(memory->page_types[0].size); /* from sysconf if local or from the env */
    /* memory->page_types[1].size from sysconf if local, or from /proc/meminfo, or from sysfs,
     * may be 0 if no hugepage support in the kernel */

    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
  }
}
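/* After the function above, page_types[] ends up laid out as (illustration):
 *   page_types[0]: normal pages, size = data->pagesize, count = remaining memory / size
 *   page_types[1..]: huge pages, filled from sysfs nr_hugepages or from /proc/meminfo
 * with the hugepage bytes subtracted from the normal-page remainder. */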
static void
hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
                              struct hwloc_linux_backend_data_s *data,
                              const char *syspath, int node,
                              struct hwloc_obj_memory_s *memory)
{
  char path[SYSFS_NUMA_NODE_PATH_LEN];
  char meminfopath[SYSFS_NUMA_NODE_PATH_LEN];
  uint64_t meminfo_hugepages_count = 0;
  uint64_t meminfo_hugepages_size = 0;
  struct stat st;
  int has_sysfs_hugepages = 0;
  int types = 2;
  int err;

  sprintf(path, "%s/node%d/hugepages", syspath, node);
  err = hwloc_stat(path, &st, data->root_fd);
  if (!err) {
    types = 1 + st.st_nlink-2;
    has_sysfs_hugepages = 1;
  }

  if (topology->is_thissystem) {
    memory->page_types_len = types;
    memory->page_types = malloc(types*sizeof(*memory->page_types));
    memset(memory->page_types, 0, types*sizeof(*memory->page_types));
  }

  sprintf(meminfopath, "%s/node%d/meminfo", syspath, node);
  hwloc_parse_meminfo_info(data, meminfopath,
                           &memory->local_memory,
                           &meminfo_hugepages_count, NULL /* no hugepage size in node-specific meminfo */,
                           memory->page_types == NULL);

  if (memory->page_types) {
    uint64_t remaining_local_memory = memory->local_memory;
    if (has_sysfs_hugepages) {
      /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
      hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
    } else {
      /* get hugepage size from machine-specific meminfo since there is no size in node-specific meminfo,
       * hwloc_get_procfs_meminfo_info must have been called earlier */
      meminfo_hugepages_size = topology->levels[0][0]->memory.page_types[1].size;
      /* use what we found in meminfo */
      if (meminfo_hugepages_size) {
        memory->page_types[1].count = meminfo_hugepages_count;
        memory->page_types[1].size = meminfo_hugepages_size;
        remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
      } else {
        memory->page_types_len = 1;
      }
    }
    /* update what's remaining as normal pages */
    memory->page_types[0].size = data->pagesize;
    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
  }
}
static int
hwloc_parse_nodes_distances(const char *path, unsigned nbnodes, unsigned *indexes, float *distances, int fsroot_fd)
{
  size_t len = (10+1)*nbnodes;
  float *curdist = distances;
  char *string;
  unsigned i;

  string = malloc(len); /* space-separated %d */
  if (!string)
    goto out;

  for(i=0; i<nbnodes; i++) {
    unsigned osnode = indexes[i];
    char distancepath[SYSFS_NUMA_NODE_PATH_LEN];
    char *tmp, *next;
    unsigned found;

    /* Linux nodeX/distance file contains distance from X to other localities (from ACPI SLIT table or so),
     * store them in slots X*N...X*N+N-1 */
    sprintf(distancepath, "%s/node%u/distance", path, osnode);
    if (hwloc_read_path_by_length(distancepath, string, len, fsroot_fd) < 0)
      goto out_with_string;

    tmp = string;
    found = 0;
    while (tmp) {
      unsigned distance = strtoul(tmp, &next, 0); /* stored as a %d */
      if (next == tmp)
        break;
      *curdist = (float) distance;
      curdist++;
      found++;
      if (found == nbnodes)
        break;
      tmp = next+1;
    }
    if (found != nbnodes)
      goto out_with_string;
  }

  free(string);
  return 0;

 out_with_string:
  free(string);
 out:
  return -1;
}
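/* Example (illustrative SLIT values): with nbnodes=2 and indexes={0,1},
 * node0/distance may contain "10 21" and node1/distance "21 10", producing the
 * flat matrix distances[] = { 10, 21, 21, 10 }, where distances[i*nbnodes+j] is
 * the distance from indexes[i] to indexes[j]. */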
static void
hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s *data,
                           hwloc_obj_t obj,
                           char *path, unsigned pathlen,
                           const char *dmi_name, const char *hwloc_name)
{
  char dmi_line[64];

  strcpy(path+pathlen, dmi_name);
  if (hwloc_read_path_by_length(path, dmi_line, sizeof(dmi_line), data->root_fd) < 0)
    return;

  if (dmi_line[0] != '\0') {
    char *tmp = strchr(dmi_line, '\n');
    if (tmp)
      *tmp = '\0';
    hwloc_debug("found %s '%s'\n", hwloc_name, dmi_line);
    hwloc_obj_add_info(obj, hwloc_name, dmi_line);
  }
}

static void
hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s *data, hwloc_obj_t obj)
{
  char path[128];
  unsigned pathlen;
  DIR *dir;

  strcpy(path, "/sys/devices/virtual/dmi/id");
  dir = hwloc_opendir(path, data->root_fd);
  if (dir) {
    pathlen = 27;
  } else {
    strcpy(path, "/sys/class/dmi/id");
    dir = hwloc_opendir(path, data->root_fd);
    if (!dir)
      return;
    pathlen = 17;
  }
  closedir(dir);

  path[pathlen++] = '/';

  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_name", "DMIProductName");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_version", "DMIProductVersion");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_serial", "DMIProductSerial");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_uuid", "DMIProductUUID");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_vendor", "DMIBoardVendor");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_name", "DMIBoardName");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_version", "DMIBoardVersion");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_serial", "DMIBoardSerial");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_asset_tag", "DMIBoardAssetTag");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_vendor", "DMIChassisVendor");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_type", "DMIChassisType");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_version", "DMIChassisVersion");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_serial", "DMIChassisSerial");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_asset_tag", "DMIChassisAssetTag");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_vendor", "DMIBIOSVendor");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_version", "DMIBIOSVersion");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_date", "DMIBIOSDate");
  hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "sys_vendor", "DMISysVendor");
}
struct hwloc_firmware_dmi_mem_device_header {
  unsigned char type;
  unsigned char length;
  unsigned char handle[2];
  unsigned char phy_mem_handle[2];
  unsigned char mem_err_handle[2];
  unsigned char tot_width[2];
  unsigned char dat_width[2];
  unsigned char size[2];
  unsigned char ff;
  unsigned char dev_set;
  unsigned char dev_loc_str_num;
  unsigned char bank_loc_str_num;
  unsigned char mem_type;
  unsigned char type_detail[2];
  unsigned char speed[2];
  unsigned char manuf_str_num;
  unsigned char serial_str_num;
  unsigned char asset_tag_str_num;
  unsigned char part_num_str_num;
  /* don't include the following fields since we don't need them,
   * some old implementations may miss them.
   */
};

static int check_dmi_entry(const char *buffer)
{
  /* reject empty strings */
  if (!*buffer)
    return 0;
  /* reject strings of spaces (at least Dell use this for empty memory slots) */
  if (strspn(buffer, " ") == strlen(buffer))
    return 0;
  return 1;
}
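/* The header above mirrors the formatted part of an SMBIOS type-17 (Memory Device)
 * record; the raw entries are read from /sys/firmware/dmi/entries/17-%u/raw below,
 * and the *_str_num fields are 1-based indexes into the strings that follow the
 * formatted part of the record. */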
static int
hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
                                        unsigned idx, const char *path, FILE *fd,
                                        struct hwloc_firmware_dmi_mem_device_header *header)
{
  unsigned slen;
  char buffer[256]; /* enough for memory device strings, or at least for each of them */
  unsigned foff; /* offset in raw file */
  unsigned boff; /* offset in buffer read from raw file */
  unsigned i;
  struct hwloc_obj_info_s *infos = NULL;
  unsigned infos_count = 0;
  hwloc_obj_t misc;
  int foundinfo = 0;

  hwloc__add_info(&infos, &infos_count, "Type", "MemoryModule");

  /* start after the header */
  foff = header->length;
  i = 1;
  while (1) {
    /* read one buffer */
    if (fseek(fd, foff, SEEK_SET) < 0)
      break;
    if (!fgets(buffer, sizeof(buffer), fd))
      break;
    /* read string at the beginning of the buffer */
    boff = 0;
    while (1) {
      /* stop on empty string */
      if (!buffer[boff])
        goto done;
      /* stop if this string goes to the end of the buffer */
      slen = strlen(buffer+boff);
      if (boff + slen+1 == sizeof(buffer))
        break;
      /* string didn't get truncated, should be OK */
      if (i == header->manuf_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
          foundinfo = 1;
        }
      } else if (i == header->serial_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
          foundinfo = 1;
        }
      } else if (i == header->asset_tag_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
          foundinfo = 1;
        }
      } else if (i == header->part_num_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
          foundinfo = 1;
        }
      } else if (i == header->dev_loc_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
          /* only a location, not an actual info about the device */
        }
      } else if (i == header->bank_loc_str_num) {
        if (check_dmi_entry(buffer+boff)) {
          hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
          /* only a location, not an actual info about the device */
        }
      }
      /* next string in buffer */
      i++;
      boff += slen+1;
    }
    /* couldn't read a single full string from that buffer, we're screwed */
    if (!boff) {
      fprintf(stderr, "hwloc could read a DMI firmware entry #%u in %s\n",
              i, path);
      break;
    }
    /* reread buffer after previous string */
    foff += boff;
  }

 done:
  if (!foundinfo) {
    /* found no actual info about the device. if there's only location info, the slot may be empty */
    goto out_with_infos;
  }

  misc = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, idx);
  if (!misc)
    goto out_with_infos;

  hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
  /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
   * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
   * with the vendor, and it's hard to be 100% sure 'B' is second socket.
   * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
   * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
   */
  hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
  return 1;

 out_with_infos:
  hwloc__free_infos(infos, infos_count);
  return 0;
}
static void
hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
                                    struct hwloc_linux_backend_data_s *data)
{
  char path[128];
  unsigned i;

  for(i=0; ; i++) {
    FILE *fd;
    struct hwloc_firmware_dmi_mem_device_header header;
    int err;

    snprintf(path, sizeof(path), "/sys/firmware/dmi/entries/17-%u/raw", i);
    fd = hwloc_fopen(path, "r", data->root_fd);
    if (!fd)
      break;

    err = fread(&header, sizeof(header), 1, fd);
    if (err != 1) {
      fclose(fd);
      break;
    }
    if (header.length < sizeof(header)) {
      /* invalid, or too old entry/spec that doesn't contain what we need */
      fclose(fd);
      break;
    }

    hwloc__get_firmware_dmi_memory_info_one(topology, i, path, fd, &header);

    fclose(fd);
  }
}
/***********************************
 ****** Device tree Discovery ******
 ***********************************/
/* Reads the entire file and returns bytes read if bytes_read != NULL
 * Returned pointer can be freed by using free(). */
static void *
hwloc_read_raw(const char *p, const char *p1, size_t *bytes_read, int root_fd)
{
  char fname[256];
  char *ret = NULL;
  struct stat fs;
  int file = -1;

  snprintf(fname, sizeof(fname), "%s/%s", p, p1);

  file = hwloc_open(fname, root_fd);
  if (-1 == file)
    goto out_no_close;
  if (fstat(file, &fs)) {
    goto out;
  }

  ret = (char *) malloc(fs.st_size);
  if (NULL != ret) {
    ssize_t cb = read(file, ret, fs.st_size);
    if (cb == -1) {
      free(ret);
      ret = NULL;
    } else {
      if (NULL != bytes_read)
        *bytes_read = cb;
    }
  }

 out:
  close(file);
 out_no_close:
  return ret;
}
/* Reads the entire file and returns it as a 0-terminated string
 * Returned pointer can be freed by using free(). */
static char *
hwloc_read_str(const char *p, const char *p1, int root_fd)
{
  size_t cb = 0;
  char *ret = hwloc_read_raw(p, p1, &cb, root_fd);
  if ((NULL != ret) && (0 < cb) && (0 != ret[cb-1])) {
    char *tmp = realloc(ret, cb + 1);
    if (!tmp) {
      free(ret);
      return NULL;
    }
    ret = tmp;
    ret[cb] = 0;
  }
  return ret;
}
/* Reads first 32bit bigendian value */
static ssize_t
hwloc_read_unit32be(const char *p, const char *p1, uint32_t *buf, int root_fd)
{
  size_t cb = 0;
  uint32_t *tmp = hwloc_read_raw(p, p1, &cb, root_fd);
  if (sizeof(*buf) != cb) {
    errno = EINVAL;
    free(tmp); /* tmp is either NULL or contains useless things */
    return -1;
  }
  *buf = htonl(*tmp);
  free(tmp);
  return sizeof(*buf);
}

typedef struct {
  unsigned int n, allocated;
  struct {
    hwloc_bitmap_t cpuset;
    uint32_t phandle;
    uint32_t l2_cache;
    char *name;
  } *p;
} device_tree_cpus_t;
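/* Device-tree property values are stored big-endian; hwloc_read_unit32be() hands
 * back the first 32-bit word, while callers that read whole arrays through
 * hwloc_read_raw() (e.g. "ibm,ppc-interrupt-server#s" below) convert each element
 * themselves with ntohl(). */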
static void
add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_bitmap_t cpuset,
                          uint32_t l2_cache, uint32_t phandle, const char *name)
{
  if (cpus->n == cpus->allocated) {
    void *tmp;
    unsigned allocated;
    if (!cpus->allocated)
      allocated = 64;
    else
      allocated = 2 * cpus->allocated;
    tmp = realloc(cpus->p, allocated * sizeof(cpus->p[0]));
    if (!tmp)
      return; /* failed to realloc, ignore this entry */
    cpus->p = tmp;
    cpus->allocated = allocated;
  }
  cpus->p[cpus->n].phandle = phandle;
  cpus->p[cpus->n].cpuset = (NULL == cpuset)?NULL:hwloc_bitmap_dup(cpuset);
  cpus->p[cpus->n].l2_cache = l2_cache;
  cpus->p[cpus->n].name = strdup(name);
  ++cpus->n;
}
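/* The cpus->p array grows geometrically: it starts from a small initial capacity
 * and doubles whenever it is full, so repeated add_device_tree_cpus_node() calls
 * stay amortized O(1) per recorded cpu or cache node. */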
/* Walks over the cache list in order to detect nested caches and CPU mask for each */
static int
look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
                                        uint32_t phandle, unsigned int *level, hwloc_bitmap_t cpuset)
{
  unsigned int i;
  int ret = -1;

  if ((NULL == level) || (NULL == cpuset) || phandle == (uint32_t) -1)
    return ret;
  for (i = 0; i < cpus->n; ++i) {
    if (phandle != cpus->p[i].l2_cache)
      continue;
    if (NULL != cpus->p[i].cpuset) {
      hwloc_bitmap_or(cpuset, cpuset, cpus->p[i].cpuset);
      ret = 0;
    } else {
      ++(*level);
      if (0 == look_powerpc_device_tree_discover_cache(cpus,
                                                       cpus->p[i].phandle, level, cpuset))
        ret = 0;
    }
  }
  return ret;
}
static void
try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
                                    unsigned int level, hwloc_obj_cache_type_t type,
                                    uint32_t cache_line_size, uint32_t cache_size, uint32_t cache_sets,
                                    hwloc_bitmap_t cpuset)
{
  struct hwloc_obj *c = NULL;

  if (0 == cache_size)
    return;

  c = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
  c->attr->cache.depth = level;
  c->attr->cache.linesize = cache_line_size;
  c->attr->cache.size = cache_size;
  c->attr->cache.type = type;
  if (cache_sets == 1)
    /* likely wrong, make it unknown */
    cache_sets = 0;
  if (cache_sets && cache_line_size)
    c->attr->cache.associativity = cache_size / (cache_sets * cache_line_size);
  else
    c->attr->cache.associativity = 0;
  c->cpuset = hwloc_bitmap_dup(cpuset);
  hwloc_debug_2args_bitmap("cache (%s) depth %u has cpuset %s\n",
                           type == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (type == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
                           level, c->cpuset);
  hwloc_insert_object_by_cpuset(topology, c);
}
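/* Associativity is derived as size / (sets * linesize): e.g. a 4 MiB cache with
 * 8192 sets and 128-byte lines (illustrative values) gives 4194304 / (8192 * 128) = 4 ways.
 * A reported sets==1 ("fully associative") is treated as unknown instead. */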
static void
try_add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
                                   struct hwloc_linux_backend_data_s *data,
                                   const char *cpu, unsigned int level, hwloc_bitmap_t cpuset)
{
  /* d-cache-block-size - ignore */
  /* d-cache-line-size - to read, in bytes */
  /* d-cache-sets - ignore */
  /* d-cache-size - to read, in bytes */
  /* i-cache, same for instruction */
  /* cache-unified only exist if data and instruction caches are unified */
  /* d-tlb-sets - ignore */
  /* d-tlb-size - ignore, always 0 on power6 */
  uint32_t d_cache_line_size = 0, d_cache_size = 0, d_cache_sets = 0;
  uint32_t i_cache_line_size = 0, i_cache_size = 0, i_cache_sets = 0;
  char unified_path[1024];
  struct stat statbuf;
  int unified;

  snprintf(unified_path, sizeof(unified_path), "%s/cache-unified", cpu);
  unified = (hwloc_stat(unified_path, &statbuf, data->root_fd) == 0);

  hwloc_read_unit32be(cpu, "d-cache-line-size", &d_cache_line_size,
                      data->root_fd);
  hwloc_read_unit32be(cpu, "d-cache-size", &d_cache_size,
                      data->root_fd);
  hwloc_read_unit32be(cpu, "d-cache-sets", &d_cache_sets,
                      data->root_fd);
  hwloc_read_unit32be(cpu, "i-cache-line-size", &i_cache_line_size,
                      data->root_fd);
  hwloc_read_unit32be(cpu, "i-cache-size", &i_cache_size,
                      data->root_fd);
  hwloc_read_unit32be(cpu, "i-cache-sets", &i_cache_sets,
                      data->root_fd);

  if (!unified)
    try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_INSTRUCTION,
                                        i_cache_line_size, i_cache_size, i_cache_sets, cpuset);
  try__add_cache_from_device_tree_cpu(topology, level, unified ? HWLOC_OBJ_CACHE_UNIFIED : HWLOC_OBJ_CACHE_DATA,
                                      d_cache_line_size, d_cache_size, d_cache_sets, cpuset);
}
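/* The properties above are read from the per-cpu device-tree directory, e.g.
 * /proc/device-tree/cpus/<cpu-node>/d-cache-size (path shown for illustration);
 * when the "cache-unified" property exists, the d-cache numbers describe a unified
 * cache and no separate instruction cache is added. */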
2945 * Discovers L1/L2/L3 cache information on IBM PowerPC systems for old kernels (RHEL5.*)
2946 * which provide NUMA nodes information without any details
2949 look_powerpc_device_tree(struct hwloc_topology
*topology
,
2950 struct hwloc_linux_backend_data_s
*data
)
2952 device_tree_cpus_t cpus
;
2953 const char ofroot
[] = "/proc/device-tree/cpus";
2955 int root_fd
= data
->root_fd
;
2956 DIR *dt
= hwloc_opendir(ofroot
, root_fd
);
2957 struct dirent
*dirent
;
2962 /* only works for Power so far, and not useful on ARM */
2963 if (data
->arch
!= HWLOC_LINUX_ARCH_POWER
) {
2972 while (NULL
!= (dirent
= readdir(dt
))) {
2975 uint32_t reg
= -1, l2_cache
= -1, phandle
= -1;
2978 if ('.' == dirent
->d_name
[0])
2981 err
= snprintf(cpu
, sizeof(cpu
), "%s/%s", ofroot
, dirent
->d_name
);
2982 if ((size_t) err
>= sizeof(cpu
))
2985 device_type
= hwloc_read_str(cpu
, "device_type", root_fd
);
2986 if (NULL
== device_type
)
2989 hwloc_read_unit32be(cpu
, "reg", ®
, root_fd
);
2990 if (hwloc_read_unit32be(cpu
, "next-level-cache", &l2_cache
, root_fd
) == -1)
2991 hwloc_read_unit32be(cpu
, "l2-cache", &l2_cache
, root_fd
);
2992 if (hwloc_read_unit32be(cpu
, "phandle", &phandle
, root_fd
) == -1)
2993 if (hwloc_read_unit32be(cpu
, "ibm,phandle", &phandle
, root_fd
) == -1)
2994 hwloc_read_unit32be(cpu
, "linux,phandle", &phandle
, root_fd
);
2996 if (0 == strcmp(device_type
, "cache")) {
2997 add_device_tree_cpus_node(&cpus
, NULL
, l2_cache
, phandle
, dirent
->d_name
);
2999 else if (0 == strcmp(device_type
, "cpu")) {
3001 hwloc_bitmap_t cpuset
= NULL
;
3003 uint32_t *threads
= hwloc_read_raw(cpu
, "ibm,ppc-interrupt-server#s", &cb
, root_fd
);
3004 uint32_t nthreads
= cb
/ sizeof(threads
[0]);
3006 if (NULL
!= threads
) {
3007 cpuset
= hwloc_bitmap_alloc();
3008 for (i
= 0; i
< nthreads
; ++i
) {
3009 if (hwloc_bitmap_isset(topology
->levels
[0][0]->complete_cpuset
, ntohl(threads
[i
])))
3010 hwloc_bitmap_set(cpuset
, ntohl(threads
[i
]));
3013 } else if ((unsigned int)-1 != reg
) {
3014 /* Doesn't work on ARM because cpu "reg" do not start at 0.
3015 * We know the first cpu "reg" is the lowest. The others are likely
3016 * in order assuming the device-tree shows objects in order.
3018 cpuset
= hwloc_bitmap_alloc();
3019 hwloc_bitmap_set(cpuset
, reg
);
3022 if (NULL
== cpuset
) {
3023 hwloc_debug("%s has no \"reg\" property, skipping\n", cpu
);
3025 struct hwloc_obj
*core
= NULL
;
3026 add_device_tree_cpus_node(&cpus
, cpuset
, l2_cache
, phandle
, dirent
->d_name
);
3029 core
= hwloc_alloc_setup_object(HWLOC_OBJ_CORE
, reg
);
3030 core
->cpuset
= hwloc_bitmap_dup(cpuset
);
3031 hwloc_insert_object_by_cpuset(topology
, core
);
3034 try_add_cache_from_device_tree_cpu(topology
, data
, cpu
, 1, cpuset
);
3036 hwloc_bitmap_free(cpuset
);
3043 /* No cores and L2 cache were found, exiting */
3045 hwloc_debug("No cores and L2 cache were found in %s, exiting\n", ofroot
);
3050 for (i
= 0; i
< cpus
.n
; ++i
) {
3051 hwloc_debug("%u: %s ibm,phandle=%08X l2_cache=%08X ",
3052 i
, cpus
.p
[i
].name
, cpus
.p
[i
].phandle
, cpus
.p
[i
].l2_cache
);
3053 if (NULL
== cpus
.p
[i
].cpuset
) {
3054 hwloc_debug("%s\n", "no cpuset");
3056 hwloc_debug_bitmap("cpuset %s\n", cpus
.p
[i
].cpuset
);
3061 /* Scan L2/L3/... caches */
3062 for (i
= 0; i
< cpus
.n
; ++i
) {
3063 unsigned int level
= 2;
3064 hwloc_bitmap_t cpuset
;
3065 /* Skip real CPUs */
3066 if (NULL
!= cpus
.p
[i
].cpuset
)
3069 /* Calculate cache level and CPU mask */
3070 cpuset
= hwloc_bitmap_alloc();
3071 if (0 == look_powerpc_device_tree_discover_cache(&cpus
,
3072 cpus
.p
[i
].phandle
, &level
, cpuset
)) {
3074 snprintf(cpu
, sizeof(cpu
), "%s/%s", ofroot
, cpus
.p
[i
].name
);
3075 try_add_cache_from_device_tree_cpu(topology
, data
, cpu
, level
, cpuset
);
3077 hwloc_bitmap_free(cpuset
);
3081 for (i
= 0; i
< cpus
.n
; ++i
) {
3082 hwloc_bitmap_free(cpus
.p
[i
].cpuset
);
3083 free(cpus
.p
[i
].name
);
3088 /* Try to handle knl hwdata properties
3089 * Returns 0 on success and -1 otherwise */
3090 static int hwloc_linux_try_handle_knl_hwdata_properties(hwloc_topology_t topology
, struct hwloc_linux_backend_data_s
*data
, hwloc_obj_t
*nodes
, unsigned nbnodes
)
3092 char *knl_cache_file
;
3093 long long int cache_size
= -1;
3094 int associativity
= -1;
3095 int inclusiveness
= -1;
3099 char buffer
[512] = {0};
3100 char *data_beg
= NULL
;
3101 char memory_mode_str
[32] = {0};
3102 char cluster_mode_str
[32] = {0};
3103 unsigned long MCDRAM_numa_size
, DDR_numa_size
;
3104 unsigned MCDRAM_nbnodes
, DDR_nbnodes
;
3105 unsigned long total_cache_size
;
3106 char * fallback_env
= getenv("HWLOC_KNL_HDH_FALLBACK");
3107 int fallback
= fallback_env
? atoi(fallback_env
) : -1; /* by default, only fallback if needed */
3109 if (fallback
== 1) {
3110 hwloc_debug("KNL dumped hwdata ignored, forcing fallback\n");
3114 if (asprintf(&knl_cache_file
, "%s/knl_memoryside_cache", data
->dumped_hwdata_dirname
) < 0)
3117 hwloc_debug("Reading knl cache data from: %s\n", knl_cache_file
);
3118 if (hwloc_read_path_by_length(knl_cache_file
, buffer
, sizeof(buffer
), data
->root_fd
) < 0) {
3119 hwloc_debug("Unable to open KNL data file `%s' (%s)\n", knl_cache_file
, strerror(errno
));
3120 free(knl_cache_file
);
3123 free(knl_cache_file
);
3125 data_beg
= &buffer
[0];
3127 /* file must start with version information */
3128 if (sscanf(data_beg
, "version: %d", &version
) != 1) {
3129 fprintf(stderr
, "Invalid knl_memoryside_cache header, expected \"version: <int>\".\n");
3134 char *line_end
= strstr(data_beg
, "\n");
3138 if (!strncmp("cache_size:", data_beg
, strlen("cache_size"))) {
3139 sscanf(data_beg
, "cache_size: %lld", &cache_size
);
3140 hwloc_debug("read cache_size=%lld\n", cache_size
);
3141 } else if (!strncmp("line_size:", data_beg
, strlen("line_size:"))) {
3142 sscanf(data_beg
, "line_size: %d", &line_size
);
3143 hwloc_debug("read line_size=%d\n", line_size
);
3144 } else if (!strncmp("inclusiveness:", data_beg
, strlen("inclusiveness:"))) {
3145 sscanf(data_beg
, "inclusiveness: %d", &inclusiveness
);
3146 hwloc_debug("read inclusiveness=%d\n", inclusiveness
);
3147 } else if (!strncmp("associativity:", data_beg
, strlen("associativity:"))) {
3148 sscanf(data_beg
, "associativity: %d\n", &associativity
);
3149 hwloc_debug("read associativity=%d\n", associativity
);
3153 if (!strncmp("cluster_mode: ", data_beg
, strlen("cluster_mode: "))) {
3155 data_beg
+= strlen("cluster_mode: ");
3156 length
= line_end
-data_beg
;
3157 if (length
> sizeof(cluster_mode_str
)-1)
3158 length
= sizeof(cluster_mode_str
)-1;
3159 memcpy(cluster_mode_str
, data_beg
, length
);
3160 cluster_mode_str
[length
] = '\0';
3161 hwloc_debug("read cluster_mode=%s\n", cluster_mode_str
);
3162 } else if (!strncmp("memory_mode: ", data_beg
, strlen("memory_mode: "))) {
3164 data_beg
+= strlen("memory_mode: ");
3165 length
= line_end
-data_beg
;
3166 if (length
> sizeof(memory_mode_str
)-1)
3167 length
= sizeof(memory_mode_str
)-1;
3168 memcpy(memory_mode_str
, data_beg
, length
);
3169 memory_mode_str
[length
] = '\0';
3170 hwloc_debug("read memory_mode=%s\n", memory_mode_str
);
3174 data_beg
= line_end
+ 1;
3177 if (line_size
== -1 || cache_size
== -1 || associativity
== -1 || inclusiveness
== -1) {
3178 hwloc_debug("Incorrect file format line_size=%d cache_size=%lld associativity=%d inclusiveness=%d\n",
3179 line_size
, cache_size
, associativity
, inclusiveness
);
3184 /* In file version 1 mcdram_cache is always non-zero.
3185 * In file version 2 mcdram cache can be zero in flat mode. We need to check and do not expose cache in flat mode. */
3186 if (cache_size
> 0) {
3187 for(i
=0; i
<nbnodes
; i
++) {
3190 if (hwloc_bitmap_iszero(nodes
[i
]->cpuset
))
3191 /* one L3 per DDR, none for MCDRAM nodes */
3194 cache
= hwloc_alloc_setup_object(HWLOC_OBJ_CACHE
, -1);
3198 cache
->attr
->cache
.depth
= 3;
3199 cache
->attr
->cache
.type
= HWLOC_OBJ_CACHE_UNIFIED
;
3200 cache
->attr
->cache
.associativity
= associativity
;
3201 hwloc_obj_add_info(cache
, "Inclusive", inclusiveness
? "1" : "0");
3202 cache
->attr
->cache
.size
= cache_size
;
3203 cache
->attr
->cache
.linesize
= line_size
;
3204 cache
->cpuset
= hwloc_bitmap_dup(nodes
[i
]->cpuset
);
3205 hwloc_obj_add_info(cache
, "Type", "MemorySideCache");
3206 hwloc_insert_object_by_cpuset(topology
, cache
);
3209 /* adding cluster and memory mode as properties of the machine */
3211 if (*cluster_mode_str
) /* in case the fallback below couldn't guess */
3212 hwloc_obj_add_info(topology
->levels
[0][0], "ClusterMode", cluster_mode_str
);
3213 hwloc_obj_add_info(topology
->levels
[0][0], "MemoryMode", memory_mode_str
);
3219 if (fallback
== 0) {
3220 hwloc_debug("KNL hwdata fallback disabled\n");
3224 hwloc_debug("Falling back to a heuristic\n");
3228 MCDRAM_numa_size
= 0;
3230 for(i
=0; i
<nbnodes
; i
++)
3231 if (hwloc_bitmap_iszero(nodes
[i
]->cpuset
)) {
3232 MCDRAM_numa_size
+= nodes
[i
]->memory
.local_memory
;
3235 DDR_numa_size
+= nodes
[i
]->memory
.local_memory
;
3238 assert(DDR_nbnodes
+ MCDRAM_nbnodes
== nbnodes
);
3240 /* there can be 0 MCDRAM_nbnodes, but we must have at least one DDR node (not cpuless) */
3241 assert(DDR_nbnodes
);
3242 /* there are either no MCDRAM nodes, or as many as DDR nodes */
3243 assert(!MCDRAM_nbnodes
|| MCDRAM_nbnodes
== DDR_nbnodes
);
3245 if (!MCDRAM_nbnodes
&& DDR_numa_size
<= 16UL*1024*1024*1024) {
3246 /* We only found DDR numa nodes, but they are <=16GB.
3247 * It could be a DDR-less KNL where numa nodes are actually MCDRAM, we can't know for sure.
3248 * Both cases are unlikely, disable the heuristic for now.
3250 * In theory we could check if DDR_numa_size == 8/12/16GB exactly (amount of MCDRAM numa size in H50/H25/Flat modes),
3251 * but that's never the case since some kilobytes are always stolen by the system.
3253 hwloc_debug("Cannot guess if MCDRAM is in Cache or if the node is DDR-less (total NUMA node size %lu)\n",
3258 /* all commercial KNL/KNM have 16GB of MCDRAM */
3259 total_cache_size
= 16UL*1024*1024*1024 - MCDRAM_numa_size
;
3261 if (!MCDRAM_nbnodes
) {
3262 strcpy(memory_mode_str
, "Cache");
3264 if (!total_cache_size
)
3265 strcpy(memory_mode_str
, "Flat");
3266 else if (total_cache_size
== 8UL*1024*1024*1024)
3267 strcpy(memory_mode_str
, "Hybrid50");
3268 else if (total_cache_size
== 4UL*1024*1024*1024)
3269 strcpy(memory_mode_str
, "Hybrid25");
3271 fprintf(stderr
, "Unexpected KNL MCDRAM cache size %lu\n", total_cache_size
);
3273 if (DDR_nbnodes
== 4) {
3274 strcpy(cluster_mode_str
, "SNC4");
3275 } else if (DDR_nbnodes
== 2) {
3276 strcpy(cluster_mode_str
, "SNC2");
3277 } else if (DDR_nbnodes
== 1) {
3278 /* either Quadrant, All2ALL or Hemisphere */
3280 fprintf(stderr
, "Unexpected number of KNL non-MCDRAM NUMA nodes %u\n", DDR_nbnodes
);
3283 cache_size
= total_cache_size
/DDR_nbnodes
;
/**************************************
 ****** Sysfs Topology Discovery ******
 **************************************/
3299 look_sysfsnode(struct hwloc_topology
*topology
,
3300 struct hwloc_linux_backend_data_s
*data
,
3301 const char *path
, unsigned *found
)
3304 unsigned nbnodes
= 0;
3306 struct dirent
*dirent
;
3307 hwloc_bitmap_t nodeset
;
3311 /* Get the list of nodes first */
3312 dir
= hwloc_opendir(path
, data
->root_fd
);
3315 nodeset
= hwloc_bitmap_alloc();
3316 while ((dirent
= readdir(dir
)) != NULL
)
3318 if (strncmp(dirent
->d_name
, "node", 4))
3320 osnode
= strtoul(dirent
->d_name
+4, NULL
, 0);
3321 hwloc_bitmap_set(nodeset
, osnode
);
3329 if (!nbnodes
|| (nbnodes
== 1 && !data
->is_knl
)) { /* always keep NUMA for KNL, or configs might look too different */
3330 hwloc_bitmap_free(nodeset
);
3334 /* For convenience, put these declarations inside a block. */
3337 hwloc_obj_t
* nodes
= calloc(nbnodes
, sizeof(hwloc_obj_t
));
3338 unsigned *indexes
= calloc(nbnodes
, sizeof(unsigned));
3339 float * distances
= NULL
;
3340 int failednodes
= 0;
3343 if (NULL
== nodes
|| NULL
== indexes
) {
3346 hwloc_bitmap_free(nodeset
);
3351 /* Unsparsify node indexes.
3352 * We'll need them later because Linux groups sparse distances
3353 * and keeps them in order in the sysfs distance files.
3354 * It'll simplify things in the meantime.
3357 hwloc_bitmap_foreach_begin (osnode
, nodeset
) {
3358 indexes
[index_
] = osnode
;
3360 } hwloc_bitmap_foreach_end();
3361 hwloc_bitmap_free(nodeset
);
3364 hwloc_debug("%s", "NUMA indexes: ");
3365 for (index_
= 0; index_
< nbnodes
; index_
++) {
3366 hwloc_debug(" %u", indexes
[index_
]);
3368 hwloc_debug("%s", "\n");
3371 /* Create NUMA objects */
3372 for (index_
= 0; index_
< nbnodes
; index_
++) {
3373 hwloc_obj_t node
, res_obj
;
3376 osnode
= indexes
[index_
];
3378 node
= hwloc_get_numanode_obj_by_os_index(topology
, osnode
);
3379 annotate
= (node
!= NULL
);
3381 /* create a new node */
3382 char nodepath
[SYSFS_NUMA_NODE_PATH_LEN
];
3383 hwloc_bitmap_t cpuset
;
3384 sprintf(nodepath
, "%s/node%u/cpumap", path
, osnode
);
3385 cpuset
= hwloc__alloc_read_path_as_cpumask(nodepath
, data
->root_fd
);
3387 /* This NUMA object won't be inserted, we'll ignore distances */
3392 node
= hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE
, osnode
);
3393 node
->cpuset
= cpuset
;
3394 node
->nodeset
= hwloc_bitmap_alloc();
3395 hwloc_bitmap_set(node
->nodeset
, osnode
);
3397 hwloc_sysfs_node_meminfo_info(topology
, data
, path
, osnode
, &node
->memory
);
3399 hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
3400 osnode
, node
->cpuset
);
3403 nodes
[index_
] = node
;
3405 res_obj
= hwloc_insert_object_by_cpuset(topology
, node
);
3406 if (node
== res_obj
) {
3407 nodes
[index_
] = node
;
3409 /* We got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
3410 * This object disappeared, we'll ignore distances */
3416 if (!failednodes
&& data
->is_knl
)
3417 hwloc_linux_try_handle_knl_hwdata_properties(topology
, data
, nodes
, nbnodes
);
3420 /* failed to read/create some nodes, don't bother reading/fixing
3421 * a distance matrix that would likely be wrong anyway.
3423 nbnodes
-= failednodes
;
3424 } else if (nbnodes
> 1) {
3425 distances
= malloc(nbnodes
*nbnodes
*sizeof(*distances
));
3428 if (NULL
== distances
) {
3434 if (hwloc_parse_nodes_distances(path
, nbnodes
, indexes
, distances
, data
->root_fd
) < 0) {
3441 if (data
->is_knl
&& distances
) {
3442 char *env
= getenv("HWLOC_KNL_NUMA_QUIRK");
3443 if (!(env
&& !atoi(env
)) && nbnodes
>=2) { /* SNC2 or SNC4, with 0 or 2/4 MCDRAM, and 0-4 DDR nodes */
3444 unsigned i
, j
, closest
;
3445 for(i
=0; i
<nbnodes
; i
++) {
3446 if (!hwloc_bitmap_iszero(nodes
[i
]->cpuset
))
3447 /* nodes with CPU, that's DDR, skip it */
3449 hwloc_obj_add_info(nodes
[i
], "Type", "MCDRAM");
3451 /* DDR is the closest node with CPUs */
3452 closest
= (unsigned)-1;
3453 for(j
=0; j
<nbnodes
; j
++) {
3456 if (hwloc_bitmap_iszero(nodes
[j
]->cpuset
))
3457 /* nodes without CPU, that's another MCDRAM, skip it */
3459 if (closest
== (unsigned)-1 || distances
[i
*nbnodes
+j
]<distances
[i
*nbnodes
+closest
])
3462 if (closest
!= (unsigned) -1) {
3463 /* Add a Group for Cluster containing this MCDRAM + DDR */
3464 hwloc_obj_t cluster
= hwloc_alloc_setup_object(HWLOC_OBJ_GROUP
, -1);
3465 cluster
->cpuset
= hwloc_bitmap_dup(nodes
[i
]->cpuset
);
3466 cluster
->nodeset
= hwloc_bitmap_dup(nodes
[i
]->nodeset
);
3467 hwloc_bitmap_or(cluster
->cpuset
, cluster
->cpuset
, nodes
[closest
]->cpuset
);
3468 hwloc_bitmap_or(cluster
->nodeset
, cluster
->nodeset
, nodes
[closest
]->nodeset
);
3469 hwloc_obj_add_info(cluster
, "Type", "Cluster");
3470 hwloc_insert_object_by_cpuset(topology
, cluster
);
3473 /* drop the distance matrix, it contradicts the above NUMA layout groups */
3481 hwloc_distances_set(topology
, HWLOC_OBJ_NUMANODE
, nbnodes
, indexes
, nodes
, distances
, 0 /* OS cannot force */);
3489 /* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
3491 look_sysfscpu(struct hwloc_topology
*topology
,
3492 struct hwloc_linux_backend_data_s
*data
,
3494 struct hwloc_linux_cpuinfo_proc
* cpuinfo_Lprocs
, unsigned cpuinfo_numprocs
)
3496 hwloc_bitmap_t cpuset
; /* Set of cpus for which we have topology information */
3497 #define CPU_TOPOLOGY_STR_LEN 128
3498 char str
[CPU_TOPOLOGY_STR_LEN
];
3501 unsigned caches_added
, merge_buggy_core_siblings
;
3502 hwloc_obj_t packages
= NULL
; /* temporary list of packages before actual insert in the tree */
3503 int threadwithcoreid
= data
->is_amd_with_CU
? -1 : 0; /* -1 means we don't know yet if threads have their own coreids within thread_siblings */
3505 /* fill the cpuset of interesting cpus */
3506 dir
= hwloc_opendir(path
, data
->root_fd
);
3510 struct dirent
*dirent
;
3511 cpuset
= hwloc_bitmap_alloc();
3513 while ((dirent
= readdir(dir
)) != NULL
) {
3517 if (strncmp(dirent
->d_name
, "cpu", 3))
3519 cpu
= strtoul(dirent
->d_name
+3, NULL
, 0);
3521 /* Maybe we don't have topology information but at least it exists */
3522 hwloc_bitmap_set(topology
->levels
[0][0]->complete_cpuset
, cpu
);
3524 /* check whether this processor is online */
3525 sprintf(str
, "%s/cpu%lu/online", path
, cpu
);
3526 if (hwloc_read_path_by_length(str
, online
, sizeof(online
), data
->root_fd
) == 0) {
3528 hwloc_debug("os proc %lu is online\n", cpu
);
3530 hwloc_debug("os proc %lu is offline\n", cpu
);
3531 hwloc_bitmap_clr(topology
->levels
[0][0]->online_cpuset
, cpu
);
3535 /* check whether the kernel exports topology information for this cpu */
3536 sprintf(str
, "%s/cpu%lu/topology", path
, cpu
);
3537 if (hwloc_access(str
, X_OK
, data
->root_fd
) < 0 && errno
== ENOENT
) {
3538 hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
3543 hwloc_bitmap_set(cpuset
, cpu
);
3548 topology
->support
.discovery
->pu
= 1;
3549 hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
3550 hwloc_bitmap_weight(cpuset
), cpuset
);
3552 merge_buggy_core_siblings
= (data
->arch
== HWLOC_LINUX_ARCH_X86
);
3554 hwloc_bitmap_foreach_begin(i
, cpuset
) {
3555 hwloc_bitmap_t packageset
, coreset
, bookset
, threadset
;
3556 unsigned mypackageid
, mycoreid
, mybookid
;
3559 /* look at the package */
3560 sprintf(str
, "%s/cpu%d/topology/core_siblings", path
, i
);
3561 packageset
= hwloc__alloc_read_path_as_cpumask(str
, data
->root_fd
);
3562 if (packageset
&& hwloc_bitmap_first(packageset
) == i
) {
3563 /* first cpu in this package, add the package */
3564 struct hwloc_obj
*package
;
3566 mypackageid
= (unsigned) -1;
3567 sprintf(str
, "%s/cpu%d/topology/physical_package_id", path
, i
); /* contains %d at least up to 4.9 */
3568 if (hwloc_read_path_as_int(str
, &tmpint
, data
->root_fd
) == 0)
3569 mypackageid
= (unsigned) tmpint
;
3571 if (merge_buggy_core_siblings
) {
3572 /* check for another package with same physical_package_id */
3573 hwloc_obj_t curpackage
= packages
;
3574 while (curpackage
) {
3575 if (curpackage
->os_index
== mypackageid
) {
3576 /* found another package with same physical_package_id but different core_siblings.
3577 * looks like a buggy kernel on Intel Xeon E5 v3 processor with two rings.
3578 * merge these core_siblings to extend the existing first package object.
3580 static int reported
= 0;
3581 if (!reported
&& !hwloc_hide_errors()) {
3583 hwloc_bitmap_asprintf(&a
, curpackage
->cpuset
);
3584 hwloc_bitmap_asprintf(&b
, packageset
);
3585 fprintf(stderr
, "****************************************************************************\n");
3586 fprintf(stderr
, "* hwloc %s has detected buggy sysfs package information: Two packages have\n", HWLOC_VERSION
);
3587 fprintf(stderr
, "* the same physical package id %u but different core_siblings %s and %s\n",
3589 fprintf(stderr
, "* hwloc is merging these packages into a single one assuming your Linux kernel\n");
3590 fprintf(stderr
, "* does not support this processor correctly.\n");
3591 fprintf(stderr
, "* You may hide this warning by setting HWLOC_HIDE_ERRORS=1 in the environment.\n");
3592 fprintf(stderr
, "*\n");
3593 fprintf(stderr
, "* If hwloc does not report the right number of packages,\n");
3594 fprintf(stderr
, "* please report this error message to the hwloc user's mailing list,\n");
3595 fprintf(stderr
, "* along with the files generated by the hwloc-gather-topology script.\n");
3596 fprintf(stderr
, "****************************************************************************\n");
3601 hwloc_bitmap_or(curpackage
->cpuset
, curpackage
->cpuset
, packageset
);
3604 curpackage
= curpackage
->next_cousin
;
3608 /* no package with same physical_package_id, create a new one */
3609 package
= hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE
, mypackageid
);
3610 package
->cpuset
= packageset
;
3611 hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
3612 mypackageid
, packageset
);
3614 if (cpuinfo_Lprocs
) {
3615 for(j
=0; j
<(int) cpuinfo_numprocs
; j
++)
3616 if ((int) cpuinfo_Lprocs
[j
].Pproc
== i
) {
3617 hwloc__move_infos(&package
->infos
, &package
->infos_count
,
3618 &cpuinfo_Lprocs
[j
].infos
, &cpuinfo_Lprocs
[j
].infos_count
);
3621 /* insert in a temporary list in case we have to modify the cpuset by merging other core_siblings later.
3622 * we'll actually insert the tree at the end of the entire sysfs cpu loop.
3624 package
->next_cousin
= packages
;
3627 packageset
= NULL
; /* don't free it */
3630 hwloc_bitmap_free(packageset
);
3632 /* look at the core */
3633 sprintf(str
, "%s/cpu%d/topology/thread_siblings", path
, i
);
3634 coreset
= hwloc__alloc_read_path_as_cpumask(str
, data
->root_fd
);
3637 int gotcoreid
= 0; /* to avoid reading the coreid twice */
3638 if (hwloc_bitmap_weight(coreset
) > 1 && threadwithcoreid
== -1) {
3639 /* check if this is hyper-threading or different coreids */
3640 unsigned siblingid
, siblingcoreid
;
3642 mycoreid
= (unsigned) -1;
3643 sprintf(str
, "%s/cpu%d/topology/core_id", path
, i
); /* contains %d at least up to 4.9 */
3644 if (hwloc_read_path_as_int(str
, &tmpint
, data
->root_fd
) == 0)
3645 mycoreid
= (unsigned) tmpint
;
3648 siblingid
= hwloc_bitmap_first(coreset
);
3649 if (siblingid
== (unsigned) i
)
3650 siblingid
= hwloc_bitmap_next(coreset
, i
);
3651 siblingcoreid
= (unsigned) -1;
3652 sprintf(str
, "%s/cpu%u/topology/core_id", path
, siblingid
); /* contains %d at least up to 4.9 */
3653 if (hwloc_read_path_as_int(str
, &tmpint
, data
->root_fd
) == 0)
3654 siblingcoreid
= (unsigned) tmpint
;
3655 threadwithcoreid
= (siblingcoreid
!= mycoreid
);
3657 if (hwloc_bitmap_first(coreset
) == i
|| threadwithcoreid
) {
3659 struct hwloc_obj
*core
;
3662 mycoreid
= (unsigned) -1;
3663 sprintf(str
, "%s/cpu%d/topology/core_id", path
, i
); /* contains %d at least up to 4.9 */
3664 if (hwloc_read_path_as_int(str
, &tmpint
, data
->root_fd
) == 0)
3665 mycoreid
= (unsigned) tmpint
;
3668 core
= hwloc_alloc_setup_object(HWLOC_OBJ_CORE
, mycoreid
);
3669 if (threadwithcoreid
)
3670 /* amd multicore compute-unit, create one core per thread */
3671 hwloc_bitmap_only(coreset
, i
);
3672 core
->cpuset
= coreset
;
3673 hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
3674 mycoreid
, core
->cpuset
);
3675 hwloc_insert_object_by_cpuset(topology
, core
);
3676 coreset
= NULL
; /* don't free it */
3678 hwloc_bitmap_free(coreset
);
3681 /* look at the books */
3682 sprintf(str
, "%s/cpu%d/topology/book_siblings", path
, i
);
3683 bookset
= hwloc__alloc_read_path_as_cpumask(str
, data
->root_fd
);
3685 if (hwloc_bitmap_first(bookset
) == i
) {
3686 struct hwloc_obj
*book
;
3688 mybookid
= (unsigned) -1;
3689 sprintf(str
, "%s/cpu%d/topology/book_id", path
, i
); /* contains %d at least up to 4.9 */
3690 if (hwloc_read_path_as_int(str
, &tmpint
, data
->root_fd
) == 0) {
3691 mybookid
= (unsigned) tmpint
;
3693 book
= hwloc_alloc_setup_object(HWLOC_OBJ_GROUP
, mybookid
);
3694 book
->cpuset
= bookset
;
3695 hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
3697 hwloc_obj_add_info(book
, "Type", "Book");
3698 hwloc_insert_object_by_cpuset(topology
, book
);
3699 bookset
= NULL
; /* don't free it */
3702 hwloc_bitmap_free(bookset
);
3706 /* look at the thread */
3707 struct hwloc_obj
*thread
= hwloc_alloc_setup_object(HWLOC_OBJ_PU
, i
);
3708 threadset
= hwloc_bitmap_alloc();
3709 hwloc_bitmap_only(threadset
, i
);
3710 thread
->cpuset
= threadset
;
3711 hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
3713 hwloc_insert_object_by_cpuset(topology
, thread
);
3716 /* look at the caches */
3717 for(j
=0; j
<10; j
++) {
3718 char str2
[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
3719 hwloc_bitmap_t cacheset
;
3721 sprintf(str
, "%s/cpu%d/cache/index%d/shared_cpu_map", path
, i
, j
);
3722 cacheset
= hwloc__alloc_read_path_as_cpumask(str
, data
->root_fd
);
3724 if (hwloc_bitmap_iszero(cacheset
)) {
3725 hwloc_bitmap_t tmpset
;
3726 /* ia64 returning empty L3 and L2i? use the core set instead */
3727 sprintf(str
, "%s/cpu%d/topology/thread_siblings", path
, i
);
3728 tmpset
= hwloc__alloc_read_path_as_cpumask(str
, data
->root_fd
);
3729 /* only use it if we actually got something */
3731 hwloc_bitmap_free(cacheset
);
3736 if (hwloc_bitmap_first(cacheset
) == i
) {
3739 unsigned sets
, lines_per_tag
;
3740 unsigned depth
; /* 1 for L1, .... */
3741 hwloc_obj_cache_type_t type
= HWLOC_OBJ_CACHE_UNIFIED
; /* default */
3742 struct hwloc_obj
*cache
;
3744 /* get the cache level depth */
3745 sprintf(str
, "%s/cpu%d/cache/index%d/level", path
, i
, j
); /* contains %u at least up to 4.9 */
3746 if (hwloc_read_path_as_uint(str
, &depth
, data
->root_fd
) < 0) {
3747 hwloc_bitmap_free(cacheset
);
3752 sprintf(str
, "%s/cpu%d/cache/index%d/type", path
, i
, j
);
3753 if (hwloc_read_path_by_length(str
, str2
, sizeof(str2
), data
->root_fd
) == 0) {
3754 if (!strncmp(str2
, "Data", 4))
3755 type
= HWLOC_OBJ_CACHE_DATA
;
3756 else if (!strncmp(str2
, "Unified", 7))
3757 type
= HWLOC_OBJ_CACHE_UNIFIED
;
3758 else if (!strncmp(str2
, "Instruction", 11))
3759 type
= HWLOC_OBJ_CACHE_INSTRUCTION
;
3761 hwloc_bitmap_free(cacheset
);
3765 hwloc_bitmap_free(cacheset
);
3769 /* get the cache size */
3771 sprintf(str
, "%s/cpu%d/cache/index%d/size", path
, i
, j
); /* contains %uK at least up to 4.9 */
3772 hwloc_read_path_as_uint(str
, &kB
, data
->root_fd
);
3773 /* KNL reports L3 with size=0 and full cpuset in cpuid.
3774 * Let hwloc_linux_try_add_knl_mcdram_cache() detect it better.
3776 if (!kB
&& depth
== 3 && data
->is_knl
) {
3777 hwloc_bitmap_free(cacheset
);
3781 /* get the line size */
3783 sprintf(str
, "%s/cpu%d/cache/index%d/coherency_line_size", path
, i
, j
); /* contains %u at least up to 4.9 */
3784 hwloc_read_path_as_uint(str
, &linesize
, data
->root_fd
);
3786 /* get the number of sets and lines per tag.
3787 * don't take the associativity directly in "ways_of_associativity" because
3788 * some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
3791 sprintf(str
, "%s/cpu%d/cache/index%d/number_of_sets", path
, i
, j
); /* contains %u at least up to 4.9 */
3792 hwloc_read_path_as_uint(str
, &sets
, data
->root_fd
);
3795 sprintf(str
, "%s/cpu%d/cache/index%d/physical_line_partition", path
, i
, j
); /* contains %u at least up to 4.9 */
3796 hwloc_read_path_as_uint(str
, &lines_per_tag
, data
->root_fd
);
3798 /* first cpu in this cache, add the cache */
3799 cache
= hwloc_alloc_setup_object(HWLOC_OBJ_CACHE
, -1);
3800 cache
->attr
->cache
.size
= ((uint64_t)kB
) << 10;
3801 cache
->attr
->cache
.depth
= depth
;
3802 cache
->attr
->cache
.linesize
= linesize
;
3803 cache
->attr
->cache
.type
= type
;
3804 if (!linesize
|| !lines_per_tag
|| !sets
)
3805 cache
->attr
->cache
.associativity
= 0; /* unknown */
3807 cache
->attr
->cache
.associativity
= 0; /* likely wrong, make it unknown */
3809 cache
->attr
->cache
.associativity
= (kB
<< 10) / linesize
/ lines_per_tag
/ sets
;
3810 cache
->cpuset
= cacheset
;
3811 hwloc_debug_1arg_bitmap("cache depth %u has cpuset %s\n",
3813 hwloc_insert_object_by_cpuset(topology
, cache
);
3814 cacheset
= NULL
; /* don't free it */
3818 hwloc_bitmap_free(cacheset
);
3820 } hwloc_bitmap_foreach_end();
3822 /* actually insert in the tree now that package cpusets have been fixed-up */
3824 hwloc_obj_t next
= packages
->next_cousin
;
3825 packages
->next_cousin
= NULL
;
3826 hwloc_insert_object_by_cpuset(topology
, packages
);
3830 if (0 == caches_added
)
3831 look_powerpc_device_tree(topology
, data
);
3833 hwloc_bitmap_free(cpuset
);
/****************************************
 ****** cpuinfo Topology Discovery ******
 ****************************************/
static int
hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
                              struct hwloc_obj_info_s **infos, unsigned *infos_count,
                              int is_global __hwloc_attribute_unused)
{
  if (!strcmp("vendor_id", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUVendor", value);
  } else if (!strcmp("model name", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUModel", value);
  } else if (!strcmp("model", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
  } else if (!strcmp("cpu family", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
  } else if (!strcmp("stepping", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUStepping", value);
  }
  return 0;
}
static int
hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
                               struct hwloc_obj_info_s **infos, unsigned *infos_count,
                               int is_global __hwloc_attribute_unused)
{
  if (!strcmp("vendor", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUVendor", value);
  } else if (!strcmp("model name", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUModel", value);
  } else if (!strcmp("model", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
  } else if (!strcmp("family", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
  }
  return 0;
}
static int
hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
                              struct hwloc_obj_info_s **infos, unsigned *infos_count,
                              int is_global __hwloc_attribute_unused)
{
  if (!strcmp("Processor", prefix) /* old kernels with one Processor header */
      || !strcmp("model name", prefix) /* new kernels with one model name per core */) {
    hwloc__add_info(infos, infos_count, "CPUModel", value);
  } else if (!strcmp("CPU implementer", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUImplementer", value);
  } else if (!strcmp("CPU architecture", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUArchitecture", value);
  } else if (!strcmp("CPU variant", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUVariant", value);
  } else if (!strcmp("CPU part", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUPart", value);
  } else if (!strcmp("CPU revision", prefix)) {
    hwloc__add_info(infos, infos_count, "CPURevision", value);
  } else if (!strcmp("Hardware", prefix)) {
    hwloc__add_info(infos, infos_count, "HardwareName", value);
  } else if (!strcmp("Revision", prefix)) {
    hwloc__add_info(infos, infos_count, "HardwareRevision", value);
  } else if (!strcmp("Serial", prefix)) {
    hwloc__add_info(infos, infos_count, "HardwareSerial", value);
  }
  return 0;
}
static int
hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
                              struct hwloc_obj_info_s **infos, unsigned *infos_count,
                              int is_global)
{
  if (!strcmp("cpu", prefix)) {
    hwloc__add_info(infos, infos_count, "CPUModel", value);
  } else if (!strcmp("platform", prefix)) {
    hwloc__add_info(infos, infos_count, "PlatformName", value);
  } else if (!strcmp("model", prefix)) {
    hwloc__add_info(infos, infos_count, "PlatformModel", value);
  }
  /* platform-specific fields */
  else if (!strcasecmp("vendor", prefix)) {
    hwloc__add_info(infos, infos_count, "PlatformVendor", value);
  } else if (!strcmp("Board ID", prefix)) {
    hwloc__add_info(infos, infos_count, "PlatformBoardID", value);
  } else if (!strcmp("Board", prefix)
             || !strcasecmp("Machine", prefix)) {
    /* machine and board are similar (and often more precise) than model above */
    char **valuep = hwloc__find_info_slot(infos, infos_count, "PlatformModel");
    if (*valuep)
      free(*valuep);
    *valuep = strdup(value);
  } else if (!strcasecmp("Revision", prefix)
             || !strcmp("Hardware rev", prefix)) {
    hwloc__add_info(infos, infos_count, is_global ? "PlatformRevision" : "CPURevision", value);
  } else if (!strcmp("SVR", prefix)) {
    hwloc__add_info(infos, infos_count, "SystemVersionRegister", value);
  } else if (!strcmp("PVR", prefix)) {
    hwloc__add_info(infos, infos_count, "ProcessorVersionRegister", value);
  }
  /* don't match 'board*' because there's also "board l2" on some platforms */
  return 0;
}

/*
 * avr32: "chip type\t:" => OK
 * blackfin: "model name\t:" => OK
 * h8300: "CPU:" => OK
 * m68k: "CPU:" => OK
 * mips: "cpu model\t\t:" => OK
 * openrisc: "CPU:" => OK
 * sparc: "cpu\t\t:" => OK
 * tile: "model name\t:" => OK
 * unicore32: "Processor\t:" => OK
 * alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:" => "cpu" overwritten by "cpu model", no processor indexes
 * cris: "cpu\t\t:" + "cpu model\t:" => only "cpu"
 * frv: "CPU-Core:" + "CPU:" => only "CPU"
 * mn10300: "cpu core :" + "model name :" => only "model name"
 * parisc: "cpu family\t:" + "cpu\t\t:" => only "cpu"
 *
 * not supported because of conflicts with other arch minor lines:
 * m32r: "cpu family\t:" => KO (adding "cpu family" would break "blackfin")
 * microblaze: "CPU-Family:" => KO
 * sh: "cpu family\t:" + "cpu type\t:" => KO
 * xtensa: "model\t\t:" => KO
 */
static int
hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
                                  struct hwloc_obj_info_s **infos, unsigned *infos_count,
                                  int is_global __hwloc_attribute_unused)
{
  if (!strcmp("model name", prefix)
      || !strcmp("Processor", prefix)
      || !strcmp("chip type", prefix)
      || !strcmp("cpu model", prefix)
      || !strcasecmp("cpu", prefix)) {
    /* keep the last one, assume it's more precise than the first one.
     * we should have the Architecture keypair for basic information anyway.
     */
    char **valuep = hwloc__find_info_slot(infos, infos_count, "CPUModel");
    if (*valuep)
      free(*valuep);
    *valuep = strdup(value);
  }
  return 0;
}
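/* Example: a cpuinfo line such as "model name : Some CPU @ 2.00GHz" (illustrative)
 * ends up as an info pair ("CPUModel", "Some CPU @ 2.00GHz") attached either to the
 * current processor entry or to the global machine infos, depending on which
 * infos/infos_count pointers the caller passed in. */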
3988 /* Lprocs_p set to NULL unless returns > 0 */
3990 hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s
*data
,
3992 struct hwloc_linux_cpuinfo_proc
** Lprocs_p
,
3993 struct hwloc_obj_info_s
**global_infos
, unsigned *global_infos_count
)
3999 unsigned allocated_Lprocs
= 0;
4000 struct hwloc_linux_cpuinfo_proc
* Lprocs
= NULL
;
4001 unsigned numprocs
= 0;
4003 int (*parse_cpuinfo_func
)(const char *, const char *, struct hwloc_obj_info_s
**, unsigned *, int) = NULL
;
4005 if (!(fd
=hwloc_fopen(path
,"r", data
->root_fd
)))
4007 hwloc_debug("could not open %s\n", path
);
4011 # define PROCESSOR "processor"
4012 # define PACKAGEID "physical id" /* the longest one */
4013 # define COREID "core id"
4014 len
= 128; /* vendor/model can be very long */
4016 hwloc_debug("\n\n * Topology extraction from %s *\n\n", path
);
4017 while (fgets(str
,len
,fd
)!=NULL
) {
4018 unsigned long Ppkg
, Pcore
, Pproc
;
4019 char *end
, *dot
, *prefix
, *value
;
4022 /* remove the ending \n */
4023 end
= strchr(str
, '\n');
4028 /* if empty line, skip and reset curproc */
4033 /* skip lines with no dot */
4034 dot
= strchr(str
, ':');
4037 /* skip lines not starting with a letter */
4038 if ((*str
> 'z' || *str
< 'a')
4039 && (*str
> 'Z' || *str
< 'A'))
4042 /* mark the end of the prefix */
4045 while (end
[-1] == ' ' || end
[-1] == '\t') end
--; /* need a strrspn() */
4047 /* find beginning of value, its end is already marked */
4048 value
= dot
+1 + strspn(dot
+1, " \t");
4050 /* defines for parsing numbers */
4051 # define getprocnb_begin(field, var) \
4052 if (!strcmp(field,prefix)) { \
4053 var = strtoul(value,&endptr,0); \
4054 if (endptr==value) { \
4055 hwloc_debug("no number in "field" field of %s\n", path); \
4057 } else if (var==ULONG_MAX) { \
4058 hwloc_debug("too big "field" number in %s\n", path); \
4061 hwloc_debug(field " %lu\n", var)
4062 # define getprocnb_end() \
4064 /* actually parse numbers */
4065 getprocnb_begin(PROCESSOR
, Pproc
);
4066 curproc
= numprocs
++;
4067 if (numprocs
> allocated_Lprocs
) {
4068 struct hwloc_linux_cpuinfo_proc
* tmp
;
4069 if (!allocated_Lprocs
)
4070 allocated_Lprocs
= 8;
4072 allocated_Lprocs
*= 2;
4073 tmp
= realloc(Lprocs
, allocated_Lprocs
* sizeof(*Lprocs
));
4078 Lprocs
[curproc
].Pproc
= Pproc
;
4079 Lprocs
[curproc
].Pcore
= -1;
4080 Lprocs
[curproc
].Ppkg
= -1;
4081 Lprocs
[curproc
].Lcore
= -1;
4082 Lprocs
[curproc
].Lpkg
= -1;
4083 Lprocs
[curproc
].infos
= NULL
;
4084 Lprocs
[curproc
].infos_count
= 0;
4085 getprocnb_end() else
4086 getprocnb_begin(PACKAGEID
, Ppkg
);
4087 Lprocs
[curproc
].Ppkg
= Ppkg
;
4088 getprocnb_end() else
4089 getprocnb_begin(COREID
, Pcore
);
4090 Lprocs
[curproc
].Pcore
= Pcore
;
4091 getprocnb_end() else {
4093 /* architecture specific or default routine for parsing cpumodel */
4094 switch (data
->arch
) {
4095 case HWLOC_LINUX_ARCH_X86
:
4096 parse_cpuinfo_func
= hwloc_linux_parse_cpuinfo_x86
;
4098 case HWLOC_LINUX_ARCH_ARM
:
4099 parse_cpuinfo_func
= hwloc_linux_parse_cpuinfo_arm
;
4101 case HWLOC_LINUX_ARCH_POWER
:
4102 parse_cpuinfo_func
= hwloc_linux_parse_cpuinfo_ppc
;
4104 case HWLOC_LINUX_ARCH_IA64
:
4105 parse_cpuinfo_func
= hwloc_linux_parse_cpuinfo_ia64
;
4108 parse_cpuinfo_func
= hwloc_linux_parse_cpuinfo_generic
;
4111 /* we can't assume that we already got a processor index line:
4112 * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
4113 * tile has a global section with model name before the list of processor lines.
4115 parse_cpuinfo_func(prefix
, value
,
4116 curproc
>= 0 ? &Lprocs
[curproc
].infos
: global_infos
,
4117 curproc
>= 0 ? &Lprocs
[curproc
].infos_count
: global_infos_count
,
4122 /* ignore end of line */
4123 if (fscanf(fd
,"%*[^\n]") == EOF
)
static void
hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
                         struct hwloc_obj_info_s *global_infos, unsigned global_infos_count)
{
  if (Lprocs) {
    unsigned i;
    for(i=0; i<numprocs; i++) {
      hwloc__free_infos(Lprocs[i].infos, Lprocs[i].infos_count);
    }
    free(Lprocs);
  }
  hwloc__free_infos(global_infos, global_infos_count);
}
4157 look_cpuinfo(struct hwloc_topology
*topology
,
4158 struct hwloc_linux_cpuinfo_proc
* Lprocs
,
4159 unsigned numprocs
, hwloc_bitmap_t online_cpuset
)
4161 /* P for physical/OS index, L for logical (e.g. in we order we get them, not in the final hwloc logical order) */
4162 unsigned *Lcore_to_Pcore
;
4163 unsigned *Lcore_to_Ppkg
; /* needed because Lcore is equivalent to Pcore+Ppkg, not to Pcore alone */
4164 unsigned *Lpkg_to_Ppkg
;
4166 unsigned numcores
=0;
4167 unsigned long Lproc
;
4168 unsigned missingpkg
;
4169 unsigned missingcore
;
4171 hwloc_bitmap_t cpuset
;
  /* initialize misc arrays, there can be at most numprocs entries */
  Lcore_to_Pcore = malloc(numprocs * sizeof(*Lcore_to_Pcore));
  Lcore_to_Ppkg = malloc(numprocs * sizeof(*Lcore_to_Ppkg));
  Lpkg_to_Ppkg = malloc(numprocs * sizeof(*Lpkg_to_Ppkg));
  for (i = 0; i < numprocs; i++) {
    Lcore_to_Pcore[i] = -1;
    Lcore_to_Ppkg[i] = -1;
    Lpkg_to_Ppkg[i] = -1;
  }

  cpuset = hwloc_bitmap_alloc();

  /* create PU objects */
  for(Lproc=0; Lproc<numprocs; Lproc++) {
    unsigned long Pproc = Lprocs[Lproc].Pproc;
    hwloc_obj_t obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, Pproc);
    hwloc_bitmap_set(cpuset, Pproc);
    obj->cpuset = hwloc_bitmap_alloc();
    hwloc_bitmap_only(obj->cpuset, Pproc);
    hwloc_debug_2args_bitmap("cpu %lu (os %lu) has cpuset %s\n",
                             Lproc, Pproc, obj->cpuset);
    hwloc_insert_object_by_cpuset(topology, obj);
  }

  topology->support.discovery->pu = 1;
  hwloc_bitmap_copy(online_cpuset, cpuset);
  hwloc_bitmap_free(cpuset);

  hwloc_debug("%u online processors found\n", numprocs);
  hwloc_debug_bitmap("online processor cpuset: %s\n", online_cpuset);

  hwloc_debug("%s", "\n * Topology summary *\n");
  hwloc_debug("%u processors\n", numprocs);
  /* fill Lprocs[].Lpkg and Lpkg_to_Ppkg */
  for(Lproc=0; Lproc<numprocs; Lproc++) {
    long Ppkg = Lprocs[Lproc].Ppkg;
    if (Ppkg != -1) {
      unsigned long Pproc = Lprocs[Lproc].Pproc;
      for (i=0; i<numpkgs; i++)
        if ((unsigned) Ppkg == Lpkg_to_Ppkg[i])
          break;
      Lprocs[Lproc].Lpkg = i;
      hwloc_debug("%lu on package %u (%lx)\n", Pproc, i, (unsigned long) Ppkg);
      if (i==numpkgs) {
        Lpkg_to_Ppkg[numpkgs] = Ppkg;
        numpkgs++;
      }
    }
  }

  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
   * provide bogus information. We should rather drop it. */
  missingpkg=0;
  for(j=0; j<numprocs; j++)
    if (Lprocs[j].Ppkg == -1) {
      missingpkg=1;
      break;
    }
  /* create package objects */
  hwloc_debug("%u pkgs%s\n", numpkgs, missingpkg ? ", but some missing package" : "");
  if (!missingpkg && numpkgs>0) {
    for (i = 0; i < numpkgs; i++) {
      struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, Lpkg_to_Ppkg[i]);
      obj->cpuset = hwloc_bitmap_alloc();
      for(j=0; j<numprocs; j++)
        if ((unsigned) Lprocs[j].Lpkg == i) {
          hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
          hwloc__move_infos(&obj->infos, &obj->infos_count,
                            &Lprocs[j].infos, &Lprocs[j].infos_count);
        }
      hwloc_debug_1arg_bitmap("Package %u has cpuset %s\n", i, obj->cpuset);
      hwloc_insert_object_by_cpuset(topology, obj);
    }
    hwloc_debug("%s", "\n");
  }
  /* fill Lprocs[].Lcore, Lcore_to_Ppkg and Lcore_to_Pcore */
  for(Lproc=0; Lproc<numprocs; Lproc++) {
    long Pcore = Lprocs[Lproc].Pcore;
    if (Pcore != -1) {
      for (i=0; i<numcores; i++)
        if ((unsigned) Pcore == Lcore_to_Pcore[i]
            && (unsigned) Lprocs[Lproc].Ppkg == Lcore_to_Ppkg[i])
          break;
      Lprocs[Lproc].Lcore = i;
      if (i==numcores) {
        Lcore_to_Ppkg[numcores] = Lprocs[Lproc].Ppkg;
        Lcore_to_Pcore[numcores] = Pcore;
        numcores++;
      }
    }
  }

  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
   * provide bogus information. We should rather drop it. */
  missingcore=0;
  for(j=0; j<numprocs; j++)
    if (Lprocs[j].Pcore == -1) {
      missingcore=1;
      break;
    }
  /* create Core objects */
  hwloc_debug("%u cores%s\n", numcores, missingcore ? ", but some missing core" : "");
  if (!missingcore && numcores>0) {
    for (i = 0; i < numcores; i++) {
      struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, Lcore_to_Pcore[i]);
      obj->cpuset = hwloc_bitmap_alloc();
      for(j=0; j<numprocs; j++)
        if ((unsigned) Lprocs[j].Lcore == i)
          hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
      hwloc_debug_1arg_bitmap("Core %u has cpuset %s\n", i, obj->cpuset);
      hwloc_insert_object_by_cpuset(topology, obj);
    }
    hwloc_debug("%s", "\n");
  }

  free(Lcore_to_Pcore);
  free(Lcore_to_Ppkg);
  free(Lpkg_to_Ppkg);
  return 0;
}
/*************************************
 ****** Main Topology Discovery ******
 *************************************/

static void
hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
{
  char line[64], *tmp, *end;
  if (hwloc_read_path_by_length("/proc/elog", line, sizeof(line), data->root_fd) < 0)
    return;
  if (strncmp(line, "Card ", 5))
    return;
  /* the serial number sits between "Card " and the following ':' */
  tmp = line + 5;
  end = strchr(tmp, ':');
  if (!end)
    return;
  *end = '\0';
  hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", tmp);
}
static void
hwloc_gather_system_info(struct hwloc_topology *topology,
                         struct hwloc_linux_backend_data_s *data)
{
  FILE *file;
  char line[128]; /* enough for utsname fields */
  const char *env;

  /* initialize to something sane, in case !is_thissystem and we can't find things in /proc/hwloc-nofile-info */
  memset(&data->utsname, 0, sizeof(data->utsname));
  data->fallback_nbprocessors = 1;
  data->pagesize = 4096;

  /* read thissystem info */
  if (topology->is_thissystem) {
    uname(&data->utsname);
    data->fallback_nbprocessors = hwloc_fallback_nbprocessors(topology);
    data->pagesize = hwloc_getpagesize();
  }

  /* overwrite with optional /proc/hwloc-nofile-info */
  file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
  if (file) {
    while (fgets(line, sizeof(line), file)) {
      char *tmp = strchr(line, '\n');
      if (tmp)
        *tmp = '\0';
      if (!strncmp("OSName: ", line, 8)) {
        strncpy(data->utsname.sysname, line+8, sizeof(data->utsname.sysname));
        data->utsname.sysname[sizeof(data->utsname.sysname)-1] = '\0';
      } else if (!strncmp("OSRelease: ", line, 11)) {
        strncpy(data->utsname.release, line+11, sizeof(data->utsname.release));
        data->utsname.release[sizeof(data->utsname.release)-1] = '\0';
      } else if (!strncmp("OSVersion: ", line, 11)) {
        strncpy(data->utsname.version, line+11, sizeof(data->utsname.version));
        data->utsname.version[sizeof(data->utsname.version)-1] = '\0';
      } else if (!strncmp("HostName: ", line, 10)) {
        strncpy(data->utsname.nodename, line+10, sizeof(data->utsname.nodename));
        data->utsname.nodename[sizeof(data->utsname.nodename)-1] = '\0';
      } else if (!strncmp("Architecture: ", line, 14)) {
        strncpy(data->utsname.machine, line+14, sizeof(data->utsname.machine));
        data->utsname.machine[sizeof(data->utsname.machine)-1] = '\0';
      } else if (!strncmp("FallbackNbProcessors: ", line, 22)) {
        data->fallback_nbprocessors = atoi(line+22);
      } else if (!strncmp("PageSize: ", line, 10)) {
        data->pagesize = strtoull(line+10, NULL, 10);
      } else {
        hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
      }
    }
    fclose(file);
  }
  /* optionally dump the gathered info so another host can reuse it */
  env = getenv("HWLOC_DUMP_NOFILE_INFO");
  if (env && *env) {
    file = fopen(env, "w");
    if (file) {
      if (*data->utsname.sysname)
        fprintf(file, "OSName: %s\n", data->utsname.sysname);
      if (*data->utsname.release)
        fprintf(file, "OSRelease: %s\n", data->utsname.release);
      if (*data->utsname.version)
        fprintf(file, "OSVersion: %s\n", data->utsname.version);
      if (*data->utsname.nodename)
        fprintf(file, "HostName: %s\n", data->utsname.nodename);
      if (*data->utsname.machine)
        fprintf(file, "Architecture: %s\n", data->utsname.machine);
      fprintf(file, "FallbackNbProcessors: %u\n", data->fallback_nbprocessors);
      fprintf(file, "PageSize: %llu\n", (unsigned long long) data->pagesize);
      fclose(file);
    }
  }
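  /* For reference, the dump written above (and re-read as /proc/hwloc-nofile-info) is a plain
   * "Key: value" text file. An illustrative, made-up example:
   *   OSName: Linux
   *   OSRelease: 4.4.0
   *   OSVersion: #1 SMP ...
   *   HostName: node042
   *   Architecture: x86_64
   *   FallbackNbProcessors: 16
   *   PageSize: 4096
   */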
  /* detect arch for quirks, using configure #defines if possible, or uname */
#if (defined HWLOC_X86_32_ARCH) || (defined HWLOC_X86_64_ARCH) /* does not cover KNC */
  if (topology->is_thissystem)
    data->arch = HWLOC_LINUX_ARCH_X86;
#endif
  if (data->arch == HWLOC_LINUX_ARCH_UNKNOWN && *data->utsname.machine) {
    if (!strcmp(data->utsname.machine, "x86_64")
        || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
        || !strcmp(data->utsname.machine, "k1om"))
      data->arch = HWLOC_LINUX_ARCH_X86;
    else if (!strncmp(data->utsname.machine, "arm", 3))
      data->arch = HWLOC_LINUX_ARCH_ARM;
    else if (!strncmp(data->utsname.machine, "ppc", 3)
             || !strncmp(data->utsname.machine, "power", 5))
      data->arch = HWLOC_LINUX_ARCH_POWER;
    else if (!strcmp(data->utsname.machine, "ia64"))
      data->arch = HWLOC_LINUX_ARCH_IA64;
  }
}
/* returns 0 on success, -1 on non-match or error during hardwired load */
static int
hwloc_linux_try_hardwired_cpuinfo(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  struct hwloc_linux_backend_data_s *data = backend->private_data;

  if (getenv("HWLOC_NO_HARDWIRED_TOPOLOGY"))
    return -1;

  if (!strcmp(data->utsname.machine, "s64fx")) {
    char line[128];
    /* Fujitsu K-computer, FX10, and FX100 use specific processors
     * whose Linux topology support is broken until 4.1 (acc455cffa75070d55e74fc7802b49edbc080e92)
     * and existing machines will likely never be fixed by kernel upgrade.
     */

    /* /proc/cpuinfo starts with one of these lines:
     * "cpu : Fujitsu SPARC64 VIIIfx"
     * "cpu : Fujitsu SPARC64 XIfx"
     * "cpu : Fujitsu SPARC64 IXfx"
     */
    if (hwloc_read_path_by_length("/proc/cpuinfo", line, sizeof(line), data->root_fd) < 0)
      return -1;

    if (strncmp(line, "cpu\t", 4))
      return -1;

    if (strstr(line, "Fujitsu SPARC64 VIIIfx"))
      return hwloc_look_hardwired_fujitsu_k(topology);
    else if (strstr(line, "Fujitsu SPARC64 IXfx"))
      return hwloc_look_hardwired_fujitsu_fx10(topology);
    else if (strstr(line, "FUJITSU SPARC64 XIfx"))
      return hwloc_look_hardwired_fujitsu_fx100(topology);
  }
  return -1;
}
static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep)
{
  char *cpuset_mntpnt, *cgroup_mntpnt, *cpuset_name = NULL;
  hwloc_find_linux_cpuset_mntpnt(&cgroup_mntpnt, &cpuset_mntpnt, root_path);
  if (cgroup_mntpnt || cpuset_mntpnt) {
    cpuset_name = hwloc_read_linux_cpuset_name(root_fd, topology->pid);
    if (cpuset_name) {
      hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "cpus", topology->levels[0][0]->allowed_cpuset);
      hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "mems", topology->levels[0][0]->allowed_nodeset);
    }
    free(cgroup_mntpnt);
    free(cpuset_mntpnt);
  }
  *cpuset_namep = cpuset_name;
}
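/* Note: hwloc_admin_disable_set_from_cpuset() above combines the mount point, the cpuset name of
 * the target process and the "cpus"/"mems" attribute into a path such as
 * <cgroup mntpnt>/<cpuset name>/cpuset.cpus or <cpuset mntpnt>/<cpuset name>/cpus, then drops
 * anything not listed there from the allowed sets. (Path layout described here for illustration;
 * the exact construction lives in that helper, not in this excerpt.)
 */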
static int
hwloc_look_linuxfs(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  DIR *nodes_dir;
  unsigned nbnodes;
  char *cpuset_name;
  int already_pus;
  int numprocs;
  int err;
  unsigned i;
  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
  struct hwloc_obj_info_s *global_infos = NULL;
  unsigned global_infos_count = 0;

  already_pus = (topology->levels[0][0]->complete_cpuset != NULL
                 && !hwloc_bitmap_iszero(topology->levels[0][0]->complete_cpuset));
  /* if there are PUs, still look at memory information
   * since x86 misses NUMA node information (unless the processor supports topoext)
   */

  /* allocate root sets in case not done yet */
  hwloc_alloc_obj_cpusets(topology->levels[0][0]);

  /*********************************
   * Platform information for later
   *********************************/
  hwloc_gather_system_info(topology, data);
  /**********************
   * /proc/cpuinfo
   **********************/
  numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
  if (numprocs < 0)
    numprocs = 0;

  /**************************
   * detect model for quirks
   **************************/
  if (data->arch == HWLOC_LINUX_ARCH_X86 && numprocs > 0) {
    const char *cpuvendor = NULL, *cpufamilynumber = NULL, *cpumodelnumber = NULL;
    for(i=0; i<Lprocs[0].infos_count; i++) {
      if (!strcmp(Lprocs[0].infos[i].name, "CPUVendor")) {
        cpuvendor = Lprocs[0].infos[i].value;
      } else if (!strcmp(Lprocs[0].infos[i].name, "CPUFamilyNumber")) {
        cpufamilynumber = Lprocs[0].infos[i].value;
      } else if (!strcmp(Lprocs[0].infos[i].name, "CPUModelNumber")) {
        cpumodelnumber = Lprocs[0].infos[i].value;
      }
    }
    if (cpuvendor && !strcmp(cpuvendor, "GenuineIntel")
        && cpufamilynumber && !strcmp(cpufamilynumber, "6")
        && cpumodelnumber && (!strcmp(cpumodelnumber, "87")
        || !strcmp(cpumodelnumber, "133")))
      data->is_knl = 1; /* Knights Landing/Mill quirk flag (field name assumed, declared in the elided part of the struct) */
    if (cpuvendor && !strcmp(cpuvendor, "AuthenticAMD")
        && cpufamilynumber
        && (!strcmp(cpufamilynumber, "21")
            || !strcmp(cpufamilynumber, "22")))
      data->is_amd_with_CU = 1;
  }

  /**********************
   * Gather the list of admin-disabled cpus and mems
   **********************/
  hwloc_linux__get_allowed_resources(topology, data->root_path, data->root_fd, &cpuset_name);
  nodes_dir = hwloc_opendir("/proc/nodes", data->root_fd);
  if (nodes_dir) {
    /* Kerrighed */
    struct dirent *dirent;
    char path[128];
    hwloc_obj_t machine;
    hwloc_bitmap_t machine_online_set;

    /* we don't support extending kerrighed topologies */
    hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);

    /* replace top-level object type with SYSTEM and add some MACHINE underneath */
    topology->levels[0][0]->type = HWLOC_OBJ_SYSTEM;
    topology->levels[0][0]->name = strdup("Kerrighed");

    /* No cpuset support for now. */
    /* No sys support for now. */
    while ((dirent = readdir(nodes_dir)) != NULL) {
      struct hwloc_linux_cpuinfo_proc * machine_Lprocs = NULL;
      struct hwloc_obj_info_s *machine_global_infos = NULL;
      unsigned machine_global_infos_count = 0;
      int machine_numprocs = 0;
      unsigned long node;
      if (strncmp(dirent->d_name, "node", 4))
        continue;
      machine_online_set = hwloc_bitmap_alloc();
      node = strtoul(dirent->d_name+4, NULL, 0);
      snprintf(path, sizeof(path), "/proc/nodes/node%lu/cpuinfo", node);
      machine_numprocs = hwloc_linux_parse_cpuinfo(data, path, &machine_Lprocs, &machine_global_infos, &machine_global_infos_count);
      if (machine_numprocs < 0) {
        err = -1;
        machine_numprocs = 0;
      } else
        err = look_cpuinfo(topology, machine_Lprocs, machine_numprocs, machine_online_set);

      hwloc_linux_free_cpuinfo(machine_Lprocs, machine_numprocs, machine_global_infos, machine_global_infos_count);
      if (err < 0) {
        hwloc_bitmap_free(machine_online_set);
        continue;
      }
      hwloc_bitmap_or(topology->levels[0][0]->online_cpuset, topology->levels[0][0]->online_cpuset, machine_online_set);
      machine = hwloc_alloc_setup_object(HWLOC_OBJ_MACHINE, node);
      machine->cpuset = machine_online_set;
      hwloc_debug_1arg_bitmap("machine number %lu has cpuset %s\n",
                              node, machine_online_set);

      /* Get the machine memory attributes */
      hwloc_get_kerrighed_node_meminfo_info(topology, data, node, &machine->memory);

      /* Gather DMI info */
      /* FIXME: get the right DMI info of each machine */
      hwloc__get_dmi_id_info(data, machine);

      hwloc_insert_object_by_cpuset(topology, machine);
    }
    closedir(nodes_dir);
  } else {
    /*********************
     * Memory information
     *********************/

    /* Get the machine memory attributes */
    hwloc_get_procfs_meminfo_info(topology, data, &topology->levels[0][0]->memory);

    /* Gather NUMA information. Must be after hwloc_get_procfs_meminfo_info so that the hugepage size is known */
    if (look_sysfsnode(topology, data, "/sys/bus/node/devices", &nbnodes) < 0)
      look_sysfsnode(topology, data, "/sys/devices/system/node", &nbnodes);

    /* if we found some numa nodes, the machine object has no local memory */
    if (nbnodes) {
      topology->levels[0][0]->memory.local_memory = 0;
      if (topology->levels[0][0]->memory.page_types)
        for(i=0; i<topology->levels[0][0]->memory.page_types_len; i++)
          topology->levels[0][0]->memory.page_types[i].count = 0;
    }
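    /* Rationale (added note): once NUMA node objects carry their own local memory, keeping the same
     * bytes on the root object would double-count them, so the root only keeps the page_types array
     * (with counts reset) as a description of the available page sizes. */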
    /**********************
     * CPU information
     **********************/

    /* Don't rediscover CPU resources if already done */
    if (!already_pus) {

      /* Gather the list of cpus now */
      err = hwloc_linux_try_hardwired_cpuinfo(backend);
      if (err < 0) {
        /* setup root info */
        hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
                          &global_infos, &global_infos_count);

        if (getenv("HWLOC_LINUX_USE_CPUINFO")
            || (hwloc_access("/sys/devices/system/cpu/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0
                && hwloc_access("/sys/devices/system/cpu/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
                && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
                && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0)) {
          /* revert to reading cpuinfo only if /sys/.../topology unavailable (before 2.6.16)
           * or not containing anything interesting */
          if (numprocs > 0)
            err = look_cpuinfo(topology, Lprocs, numprocs, topology->levels[0][0]->online_cpuset);
          else
            err = -1;
          if (err < 0)
            hwloc_setup_pu_level(topology, data->fallback_nbprocessors);
          look_powerpc_device_tree(topology, data);

        } else {
          /* sysfs */
          if (look_sysfscpu(topology, data, "/sys/bus/cpu/devices", Lprocs, numprocs) < 0)
            if (look_sysfscpu(topology, data, "/sys/devices/system/cpu", Lprocs, numprocs) < 0)
              /* sysfs but we failed to read cpu topology, fallback */
              hwloc_setup_pu_level(topology, data->fallback_nbprocessors);
        }
      }
    }
  }
  /**********************
   * Misc
   **********************/

  /* Gather DMI info */
  hwloc__get_dmi_id_info(data, topology->levels[0][0]);
  if (hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))
    hwloc__get_firmware_dmi_memory_info(topology, data);

  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
  if (cpuset_name) {
    hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
    free(cpuset_name);
  }

  hwloc__linux_get_mic_sn(topology, data);

  /* data->utsname was filled with real uname or \0, we can safely pass it */
  hwloc_add_uname_info(topology, &data->utsname);

  hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
  return 1;
}
/****************************************
 ***** Linux PCI backend callbacks ******
 ****************************************
 * Do not support changing the fsroot (use sysfs)
 */

static hwloc_obj_t
hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
{
  struct hwloc_topology *topology = backend->topology;
  struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
  obj->name = strdup(name);
  obj->logical_index = -1;
  obj->attr->osdev.type = type;

  hwloc_insert_object_by_parent(topology, pcidev, obj);
  /* insert_object_by_parent() doesn't merge during insert, so obj is still valid */
  return obj;
}
typedef void (*hwloc_linux_class_fillinfos_t)(struct hwloc_backend *backend, struct hwloc_obj *osdev, const char *osdevpath);

/* cannot be used in fsroot-aware code, would have to move to a per-topology variable */

static void
hwloc_linux_check_deprecated_classlinks_model(struct hwloc_linux_backend_data_s *data)
{
  int root_fd = data->root_fd;
  DIR *dir;
  struct dirent *dirent;
  char path[128];
  struct stat st;

  data->deprecated_classlinks_model = -1;

  dir = hwloc_opendir("/sys/class/net", root_fd);
  if (!dir)
    return;
  while ((dirent = readdir(dir)) != NULL) {
    int err;
    if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, "..") || !strcmp(dirent->d_name, "lo"))
      continue;
    err = snprintf(path, sizeof(path), "/sys/class/net/%s/device/net/%s", dirent->d_name, dirent->d_name);
    if ((size_t) err < sizeof(path)
        && hwloc_stat(path, &st, root_fd) == 0) {
      data->deprecated_classlinks_model = 0;
      break;
    }
    err = snprintf(path, sizeof(path), "/sys/class/net/%s/device/net:%s", dirent->d_name, dirent->d_name);
    if ((size_t) err < sizeof(path)
        && hwloc_stat(path, &st, root_fd) == 0) {
      data->deprecated_classlinks_model = 1;
      break;
    }
  }
  closedir(dir);
}
/* class objects that are immediately below pci devices:
 * look for objects of the given classname below a sysfs (pcidev) directory
 */
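/* Two sysfs layouts are handled below (hypothetical eth0 example):
 *   modern:     /sys/bus/pci/devices/0000:02:00.0/net/eth0
 *   deprecated: /sys/bus/pci/devices/0000:02:00.0/net:eth0
 * The deprecated_classlinks_model flag cached in the backend data selects which one to scan.
 */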
static int
hwloc_linux_class_readdir(struct hwloc_backend *backend,
                          struct hwloc_obj *pcidev, const char *devicepath,
                          hwloc_obj_osdev_type_t type, const char *classname,
                          hwloc_linux_class_fillinfos_t fillinfo)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  int root_fd = data->root_fd;
  size_t classnamelen = strlen(classname);
  char path[256];
  DIR *dir;
  struct dirent *dirent;
  hwloc_obj_t obj;
  int res = 0, err;

  if (data->deprecated_classlinks_model == -2)
    hwloc_linux_check_deprecated_classlinks_model(data);

  if (data->deprecated_classlinks_model != 1) {
    /* modern sysfs: <device>/<class>/<name> */
    struct stat st;
    err = snprintf(path, sizeof(path), "%s/%s", devicepath, classname);
    if ((size_t) err >= sizeof(path))
      goto trydeprecated;

    /* some very old kernels (2.6.9/RHEL4) have a <device>/<class> symlink without any way to find <name>.
     * make sure <device>/<class> is a directory to avoid this case.
     */
    err = hwloc_lstat(path, &st, root_fd);
    if (err < 0 || !S_ISDIR(st.st_mode))
      goto trydeprecated;

    dir = hwloc_opendir(path, root_fd);
    if (dir) {
      data->deprecated_classlinks_model = 0;
      while ((dirent = readdir(dir)) != NULL) {
        if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
          continue;
        obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name);
        if (fillinfo) {
          err = snprintf(path, sizeof(path), "%s/%s/%s", devicepath, classname, dirent->d_name);
          if ((size_t) err < sizeof(path))
            fillinfo(backend, obj, path);
        }
        res++;
      }
      closedir(dir);
      return res;
    }
  }

trydeprecated:
  if (data->deprecated_classlinks_model != 0) {
    /* deprecated sysfs: <device>/<class>:<name> */
    dir = hwloc_opendir(devicepath, root_fd);
    if (dir) {
      while ((dirent = readdir(dir)) != NULL) {
        if (strncmp(dirent->d_name, classname, classnamelen) || dirent->d_name[classnamelen] != ':')
          continue;
        data->deprecated_classlinks_model = 1;
        obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name + classnamelen+1);
        if (fillinfo) {
          err = snprintf(path, sizeof(path), "%s/%s", devicepath, dirent->d_name);
          if ((size_t) err < sizeof(path))
            fillinfo(backend, obj, path);
        }
        res++;
      }
      closedir(dir);
    }
  }

  return res;
}
/*
 * look for net objects below a pcidev in sysfs
 */
static void
hwloc_linux_net_class_fillinfos(struct hwloc_backend *backend,
                                struct hwloc_obj *obj, const char *osdevpath)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  int root_fd = data->root_fd;
  struct stat st;
  char path[256];
  char address[128];

  snprintf(path, sizeof(path), "%s/address", osdevpath);
  if (!hwloc_read_path_by_length(path, address, sizeof(address), root_fd)) {
    char *eol = strchr(address, '\n');
    if (eol)
      *eol = 0;
    hwloc_obj_add_info(obj, "Address", address);
  }
  snprintf(path, sizeof(path), "%s/device/infiniband", osdevpath);
  if (!hwloc_stat(path, &st, root_fd)) {
    char hexid[16];
    snprintf(path, sizeof(path), "%s/dev_id", osdevpath);
    if (!hwloc_read_path_by_length(path, hexid, sizeof(hexid), root_fd)) {
      char *eoid;
      unsigned long port;
      port = strtoul(hexid, &eoid, 0);
      if (eoid != hexid) {
        char portstr[16];
        snprintf(portstr, sizeof(portstr), "%lu", port+1);
        hwloc_obj_add_info(obj, "Port", portstr);
      }
    }
  }
}
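/* Note on the "Port" info above: the sysfs dev_id file holds a 0-based index while InfiniBand
 * ports are conventionally numbered from 1, which is presumably why the code stores port+1
 * (interpretation added here, not stated in the original). */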
static int
hwloc_linux_lookup_net_class(struct hwloc_backend *backend,
                             struct hwloc_obj *pcidev, const char *pcidevpath)
{
  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_NETWORK, "net", hwloc_linux_net_class_fillinfos);
}
/*
 * look for infiniband objects below a pcidev in sysfs
 */
static void
hwloc_linux_infiniband_class_fillinfos(struct hwloc_backend *backend,
                                       struct hwloc_obj *obj, const char *osdevpath)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  int root_fd = data->root_fd;
  char path[256];
  char guidvalue[20];
  char statevalue[2];
  char lidvalue[11];
  char gidvalue[40];
  size_t len;
  unsigned i, j;

  snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
  if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
    len = strspn(guidvalue, "0123456789abcdefx:");
    guidvalue[len] = '\0';
    hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
  }

  snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
  if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
    len = strspn(guidvalue, "0123456789abcdefx:");
    guidvalue[len] = '\0';
    hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
  }

  for(i=1; ; i++) {
    snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
    if (!hwloc_read_path_by_length(path, statevalue, sizeof(statevalue), root_fd)) {
      char statename[32];
      statevalue[1] = '\0'; /* only keep the first byte/digit */
      snprintf(statename, sizeof(statename), "Port%uState", i);
      hwloc_obj_add_info(obj, statename, statevalue);
    } else {
      /* no such port, stop */
      break;
    }

    snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
    if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
      char lidname[32];
      len = strspn(lidvalue, "0123456789abcdefx");
      lidvalue[len] = '\0';
      snprintf(lidname, sizeof(lidname), "Port%uLID", i);
      hwloc_obj_add_info(obj, lidname, lidvalue);
    }

    snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
    if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
      char lidname[32];
      len = strspn(lidvalue, "0123456789");
      lidvalue[len] = '\0';
      snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
      hwloc_obj_add_info(obj, lidname, lidvalue);
    }

    for(j=0; ; j++) {
      snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
      if (!hwloc_read_path_by_length(path, gidvalue, sizeof(gidvalue), root_fd)) {
        char gidname[32];
        len = strspn(gidvalue, "0123456789abcdefx:");
        gidvalue[len] = '\0';
        if (strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) {
          /* only keep initialized GIDs */
          snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
          hwloc_obj_add_info(obj, gidname, gidvalue);
        }
      } else {
        /* no such GID, stop */
        break;
      }
    }
  }
}

static int
hwloc_linux_lookup_openfabrics_class(struct hwloc_backend *backend,
                                     struct hwloc_obj *pcidev, const char *pcidevpath)
{
  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_OPENFABRICS, "infiniband", hwloc_linux_infiniband_class_fillinfos);
}
/* look for dma objects below a pcidev in sysfs */
static int
hwloc_linux_lookup_dma_class(struct hwloc_backend *backend,
                             struct hwloc_obj *pcidev, const char *pcidevpath)
{
  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_DMA, "dma", NULL);
}

/* look for drm objects below a pcidev in sysfs */
static int
hwloc_linux_lookup_drm_class(struct hwloc_backend *backend,
                             struct hwloc_obj *pcidev, const char *pcidevpath)
{
  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_GPU, "drm", NULL);
}
/* we could look at the "graphics" class too, but it doesn't help for proprietary drivers either */

/* GPU devices (even with a proprietary driver) seem to have a boot_vga field in their PCI device directory (since 2.6.30),
 * so we could create an OS device for each PCI device with such a field.
 * boot_vga is actually created when class >> 8 == VGA (it contains 1 for the boot vga device), so it's trivial anyway.
 */
/*
 * look for block objects below a pcidev in sysfs
 */
static void
hwloc_linux_block_class_fillinfos(struct hwloc_backend *backend,
                                  struct hwloc_obj *obj, const char *osdevpath)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  int root_fd = data->root_fd;
  FILE *file;
  char path[256];
  char line[128];
  char vendor[64] = "";
  char model[64] = "";
  char serial[64] = "";
  char revision[64] = "";
  char blocktype[64] = "";
  unsigned major_id, minor_id;
  char *tmp;

  snprintf(path, sizeof(path), "%s/dev", osdevpath);
  if (hwloc_read_path_by_length(path, line, sizeof(line), root_fd) < 0)
    return;

  if (sscanf(line, "%u:%u", &major_id, &minor_id) != 2)
    return;

  tmp = strchr(line, '\n');
  if (tmp)
    *tmp = '\0';
  hwloc_obj_add_info(obj, "LinuxDeviceID", line);

#ifdef HWLOC_HAVE_LIBUDEV
  if (data->udev) {
    struct udev_device *dev;
    const char *prop;
    dev = udev_device_new_from_subsystem_sysname(data->udev, "block", obj->name);
    if (!dev)
      return;
    prop = udev_device_get_property_value(dev, "ID_VENDOR");
    if (prop) {
      strncpy(vendor, prop, sizeof(vendor));
      vendor[sizeof(vendor)-1] = '\0';
    }
    prop = udev_device_get_property_value(dev, "ID_MODEL");
    if (prop) {
      strncpy(model, prop, sizeof(model));
      model[sizeof(model)-1] = '\0';
    }
    prop = udev_device_get_property_value(dev, "ID_REVISION");
    if (prop) {
      strncpy(revision, prop, sizeof(revision));
      revision[sizeof(revision)-1] = '\0';
    }
    prop = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
    if (prop) {
      strncpy(serial, prop, sizeof(serial));
      serial[sizeof(serial)-1] = '\0';
    }
    prop = udev_device_get_property_value(dev, "ID_TYPE");
    if (prop) {
      strncpy(blocktype, prop, sizeof(blocktype));
      blocktype[sizeof(blocktype)-1] = '\0';
    }

    udev_device_unref(dev);
  } else
    /* fallback to reading files, works with any fsroot */
#endif
  {
    snprintf(path, sizeof(path), "/run/udev/data/b%u:%u", major_id, minor_id);
    file = hwloc_fopen(path, "r", root_fd);
    if (!file)
      return;

    while (NULL != fgets(line, sizeof(line), file)) {
      tmp = strchr(line, '\n');
      if (tmp)
        *tmp = '\0';
      if (!strncmp(line, "E:ID_VENDOR=", strlen("E:ID_VENDOR="))) {
        strncpy(vendor, line+strlen("E:ID_VENDOR="), sizeof(vendor));
        vendor[sizeof(vendor)-1] = '\0';
      } else if (!strncmp(line, "E:ID_MODEL=", strlen("E:ID_MODEL="))) {
        strncpy(model, line+strlen("E:ID_MODEL="), sizeof(model));
        model[sizeof(model)-1] = '\0';
      } else if (!strncmp(line, "E:ID_REVISION=", strlen("E:ID_REVISION="))) {
        strncpy(revision, line+strlen("E:ID_REVISION="), sizeof(revision));
        revision[sizeof(revision)-1] = '\0';
      } else if (!strncmp(line, "E:ID_SERIAL_SHORT=", strlen("E:ID_SERIAL_SHORT="))) {
        strncpy(serial, line+strlen("E:ID_SERIAL_SHORT="), sizeof(serial));
        serial[sizeof(serial)-1] = '\0';
      } else if (!strncmp(line, "E:ID_TYPE=", strlen("E:ID_TYPE="))) {
        strncpy(blocktype, line+strlen("E:ID_TYPE="), sizeof(blocktype));
        blocktype[sizeof(blocktype)-1] = '\0';
      }
    }
    fclose(file);
  }

  /* clear fake "ATA" vendor name */
  if (!strcasecmp(vendor, "ATA"))
    *vendor = '\0';
  /* overwrite vendor name from model when possible */
  if (!*vendor) {
    if (!strncasecmp(model, "wd", 2))
      strcpy(vendor, "Western Digital");
    else if (!strncasecmp(model, "st", 2))
      strcpy(vendor, "Seagate");
    else if (!strncasecmp(model, "samsung", 7))
      strcpy(vendor, "Samsung");
    else if (!strncasecmp(model, "sandisk", 7))
      strcpy(vendor, "SanDisk");
    else if (!strncasecmp(model, "toshiba", 7))
      strcpy(vendor, "Toshiba");
  }

  if (*vendor)
    hwloc_obj_add_info(obj, "Vendor", vendor);
  if (*model)
    hwloc_obj_add_info(obj, "Model", model);
  if (*revision)
    hwloc_obj_add_info(obj, "Revision", revision);
  if (*serial)
    hwloc_obj_add_info(obj, "SerialNumber", serial);

  if (!strcmp(blocktype, "disk") || !strncmp(obj->name, "nvme", 4))
    hwloc_obj_add_info(obj, "Type", "Disk");
  else if (!strcmp(blocktype, "tape"))
    hwloc_obj_add_info(obj, "Type", "Tape");
  else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
    hwloc_obj_add_info(obj, "Type", "Removable Media Device");
  else /* generic, usb mass storage/rbc, usb mass storage/scsi */
    hwloc_obj_add_info(obj, "Type", "Other");
}
/* block class objects are in
 * host%d/target%d:%d:%d/%d:%d:%d:%d/
 * or
 * host%d/port-%d:%d/end_device-%d:%d/target%d:%d:%d/%d:%d:%d:%d/
 * below pci devices */
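/* A full hypothetical example of the second form:
 *   host3/port-3:0/end_device-3:0/target3:0:0/3:0:0:0/block/sdb
 * relative to /sys/bus/pci/devices/<busid>/ (made-up names, for illustration only). */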
static int
hwloc_linux_lookup_host_block_class(struct hwloc_backend *backend,
                                    struct hwloc_obj *pcidev, char *path, size_t pathlen)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  int root_fd = data->root_fd;
  DIR *hostdir, *portdir, *targetdir;
  struct dirent *hostdirent, *portdirent, *targetdirent;
  size_t hostdlen, portdlen, targetdlen;
  int dummy;
  int res = 0;

  hostdir = hwloc_opendir(path, root_fd);
  if (!hostdir)
    return 0;

  while ((hostdirent = readdir(hostdir)) != NULL) {
    if (sscanf(hostdirent->d_name, "port-%d:%d", &dummy, &dummy) == 2)
    {
      /* found host%d/port-%d:%d */
      path[pathlen] = '/';
      strcpy(&path[pathlen+1], hostdirent->d_name);
      pathlen += hostdlen = 1+strlen(hostdirent->d_name);
      portdir = hwloc_opendir(path, root_fd);
      if (!portdir)
        continue;
      while ((portdirent = readdir(portdir)) != NULL) {
        if (sscanf(portdirent->d_name, "end_device-%d:%d", &dummy, &dummy) == 2) {
          /* found host%d/port-%d:%d/end_device-%d:%d */
          path[pathlen] = '/';
          strcpy(&path[pathlen+1], portdirent->d_name);
          pathlen += portdlen = 1+strlen(portdirent->d_name);
          res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
          /* restore parent path */
          pathlen -= portdlen;
          path[pathlen] = '\0';
        }
      }
      closedir(portdir);
      /* restore parent path */
      pathlen -= hostdlen;
      path[pathlen] = '\0';
    } else if (sscanf(hostdirent->d_name, "target%d:%d:%d", &dummy, &dummy, &dummy) == 3) {
      /* found host%d/target%d:%d:%d */
      path[pathlen] = '/';
      strcpy(&path[pathlen+1], hostdirent->d_name);
      pathlen += hostdlen = 1+strlen(hostdirent->d_name);
      targetdir = hwloc_opendir(path, root_fd);
      if (!targetdir)
        continue;
      while ((targetdirent = readdir(targetdir)) != NULL) {
        if (sscanf(targetdirent->d_name, "%d:%d:%d:%d", &dummy, &dummy, &dummy, &dummy) != 4)
          continue;
        /* found host%d/target%d:%d:%d/%d:%d:%d:%d */
        path[pathlen] = '/';
        strcpy(&path[pathlen+1], targetdirent->d_name);
        pathlen += targetdlen = 1+strlen(targetdirent->d_name);
        /* lookup block class for real */
        res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", hwloc_linux_block_class_fillinfos);
        /* restore parent path */
        pathlen -= targetdlen;
        path[pathlen] = '\0';
      }
      closedir(targetdir);
      /* restore parent path */
      pathlen -= hostdlen;
      path[pathlen] = '\0';
    }
  }
  closedir(hostdir);
  return res;
}
static int
hwloc_linux_lookup_block_class(struct hwloc_backend *backend,
                               struct hwloc_obj *pcidev, const char *pcidevpath)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  int root_fd = data->root_fd;
  size_t pathlen;
  DIR *devicedir, *hostdir, *nvmedir;
  struct dirent *devicedirent, *hostdirent;
  size_t devicedlen, hostdlen;
  char path[256];
  int dummy;
  int res = 0;

  strcpy(path, pcidevpath);
  pathlen = strlen(path);

  /* look for a NVMe class (Linux 4.0+) under nvme/nvme%d/nvme%dn%d/ */
  strcpy(&path[pathlen], "/nvme");
  nvmedir = hwloc_opendir(path, root_fd);
  if (nvmedir) {
    struct dirent *nvmedirent;
    while ((nvmedirent = readdir(nvmedir)) != NULL) {
      DIR *nvmesubdir;
      if (strncmp(nvmedirent->d_name, "nvme", 4))
        continue;
      path[pathlen+5] = '/';
      strcpy(&path[pathlen+6], nvmedirent->d_name);
      nvmesubdir = hwloc_opendir(path, root_fd);
      if (nvmesubdir) {
        struct dirent *nvmesubdirent;
        while ((nvmesubdirent = readdir(nvmesubdir)) != NULL) {
          hwloc_obj_t obj;
          size_t nvmednamelen = strlen(nvmedirent->d_name);
          if (strncmp(nvmedirent->d_name, nvmesubdirent->d_name, nvmednamelen))
            continue;
          obj = hwloc_linux_add_os_device(backend, pcidev, HWLOC_OBJ_OSDEV_BLOCK, nvmesubdirent->d_name);
          res++;
          path[pathlen+6+nvmednamelen] = '/';
          strcpy(&path[pathlen+6+nvmednamelen+1], nvmesubdirent->d_name);
          hwloc_linux_block_class_fillinfos(backend, obj, path);
        }
        closedir(nvmesubdir);
      }
    }
    closedir(nvmedir);
  }
  path[pathlen] = '\0';

  /* look for a direct block device here (such as NVMe before Linux 4.0,
   * or something without controller subdirs in the middle)
   */
  res += hwloc_linux_class_readdir(backend, pcidev, path,
                                   HWLOC_OBJ_OSDEV_BLOCK, "block",
                                   hwloc_linux_block_class_fillinfos);
  if (res)
    return res;

  /* otherwise try to find controller subdirectories */
  devicedir = hwloc_opendir(pcidevpath, root_fd);
  if (!devicedir)
    return 0;

  while ((devicedirent = readdir(devicedir)) != NULL) {
    if (sscanf(devicedirent->d_name, "ide%d", &dummy) == 1) {
      /* found ide%d */
      path[pathlen] = '/';
      strcpy(&path[pathlen+1], devicedirent->d_name);
      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
      hostdir = hwloc_opendir(path, root_fd);
      if (!hostdir)
        continue;
      while ((hostdirent = readdir(hostdir)) != NULL) {
        if (sscanf(hostdirent->d_name, "%d.%d", &dummy, &dummy) == 2) {
          /* found ide%d/%d.%d */
          path[pathlen] = '/';
          strcpy(&path[pathlen+1], hostdirent->d_name);
          pathlen += hostdlen = 1+strlen(hostdirent->d_name);
          /* lookup block class for real */
          res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", NULL);
          /* restore parent path */
          pathlen -= hostdlen;
          path[pathlen] = '\0';
        }
      }
      closedir(hostdir);
      /* restore parent path */
      pathlen -= devicedlen;
      path[pathlen] = '\0';
    } else if (sscanf(devicedirent->d_name, "host%d", &dummy) == 1) {
      /* found host%d */
      path[pathlen] = '/';
      strcpy(&path[pathlen+1], devicedirent->d_name);
      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
      res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
      /* restore parent path */
      pathlen -= devicedlen;
      path[pathlen] = '\0';
    } else if (sscanf(devicedirent->d_name, "ata%d", &dummy) == 1) {
      /* found ata%d */
      path[pathlen] = '/';
      strcpy(&path[pathlen+1], devicedirent->d_name);
      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
      hostdir = hwloc_opendir(path, root_fd);
      if (!hostdir)
        continue;
      while ((hostdirent = readdir(hostdir)) != NULL) {
        if (sscanf(hostdirent->d_name, "host%d", &dummy) == 1) {
          /* found ata%d/host%d */
          path[pathlen] = '/';
          strcpy(&path[pathlen+1], hostdirent->d_name);
          pathlen += hostdlen = 1+strlen(hostdirent->d_name);
          /* lookup block class for real */
          res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
          /* restore parent path */
          pathlen -= hostdlen;
          path[pathlen] = '\0';
        }
      }
      closedir(hostdir);
      /* restore parent path */
      pathlen -= devicedlen;
      path[pathlen] = '\0';
    }
  }
  closedir(devicedir);
  return res;
}
static void
hwloc_linux_mic_class_fillinfos(struct hwloc_backend *backend,
                                struct hwloc_obj *obj, const char *osdevpath)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  int root_fd = data->root_fd;
  char path[256];
  char family[64];
  char sku[64];
  char sn[64];
  char string[20];

  hwloc_obj_add_info(obj, "CoProcType", "MIC");

  snprintf(path, sizeof(path), "%s/family", osdevpath);
  if (!hwloc_read_path_by_length(path, family, sizeof(family), root_fd)) {
    char *eol = strchr(family, '\n');
    if (eol)
      *eol = 0;
    hwloc_obj_add_info(obj, "MICFamily", family);
  }

  snprintf(path, sizeof(path), "%s/sku", osdevpath);
  if (!hwloc_read_path_by_length(path, sku, sizeof(sku), root_fd)) {
    char *eol = strchr(sku, '\n');
    if (eol)
      *eol = 0;
    hwloc_obj_add_info(obj, "MICSKU", sku);
  }

  snprintf(path, sizeof(path), "%s/serialnumber", osdevpath);
  if (!hwloc_read_path_by_length(path, sn, sizeof(sn), root_fd)) {
    char *eol;
    eol = strchr(sn, '\n');
    if (eol)
      *eol = 0;
    hwloc_obj_add_info(obj, "MICSerialNumber", sn);
  }

  snprintf(path, sizeof(path), "%s/active_cores", osdevpath);
  if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
    unsigned long count = strtoul(string, NULL, 16);
    snprintf(string, sizeof(string), "%lu", count);
    hwloc_obj_add_info(obj, "MICActiveCores", string);
  }

  snprintf(path, sizeof(path), "%s/memsize", osdevpath);
  if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
    unsigned long count = strtoul(string, NULL, 16);
    snprintf(string, sizeof(string), "%lu", count);
    hwloc_obj_add_info(obj, "MICMemorySize", string);
  }
}

static int
hwloc_linux_lookup_mic_class(struct hwloc_backend *backend,
                             struct hwloc_obj *pcidev, const char *pcidevpath)
{
  return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_COPROC, "mic", hwloc_linux_mic_class_fillinfos);
}
static int
hwloc_linux_directlookup_mic_class(struct hwloc_backend *backend,
                                   struct hwloc_obj *pcidev)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  int root_fd = data->root_fd;
  char path[256];
  struct stat st;
  hwloc_obj_t obj;
  unsigned idx;
  int res = 0;

  if (!data->mic_directlookup_id_max)
    /* already tried, nothing to do */
    return 0;

  if (data->mic_directlookup_id_max == (unsigned) -1) {
    /* never tried, find out the max id */
    DIR *dir;
    struct dirent *dirent;

    /* make sure we never do this lookup again */
    data->mic_directlookup_id_max = 0;

    /* read the entire class and find the max id of mic%u dirents */
    dir = hwloc_opendir("/sys/devices/virtual/mic", root_fd);
    if (!dir)
      dir = hwloc_opendir("/sys/class/mic", root_fd);
    if (!dir)
      return 0;
    while ((dirent = readdir(dir)) != NULL) {
      if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
        continue;
      if (sscanf(dirent->d_name, "mic%u", &idx) != 1)
        continue;
      if (idx >= data->mic_directlookup_id_max)
        data->mic_directlookup_id_max = idx+1;
    }
    closedir(dir);
  }

  /* now iterate over the mic ids and see if one matches our pcidev */
  for(idx=0; idx<data->mic_directlookup_id_max; idx++) {
    snprintf(path, sizeof(path), "/sys/class/mic/mic%u/pci_%02x:%02x.%02x",
             idx, pcidev->attr->pcidev.bus, pcidev->attr->pcidev.dev, pcidev->attr->pcidev.func);
    if (hwloc_stat(path, &st, root_fd) < 0)
      continue;
    snprintf(path, sizeof(path), "mic%u", idx);
    obj = hwloc_linux_add_os_device(backend, pcidev, HWLOC_OBJ_OSDEV_COPROC, path);
    snprintf(path, sizeof(path), "/sys/class/mic/mic%u", idx);
    hwloc_linux_mic_class_fillinfos(backend, obj, path);
    res++;
  }
  return res;
}
/*
 * backend callback for inserting objects inside a pci device
 */
static int
hwloc_linux_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
                                      struct hwloc_obj *obj)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  char pcidevpath[256];
  int res = 0;

  /* this callback is only used in the libpci backend for now */
  assert(obj->type == HWLOC_OBJ_PCI_DEVICE);

  snprintf(pcidevpath, sizeof(pcidevpath), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
           obj->attr->pcidev.domain, obj->attr->pcidev.bus,
           obj->attr->pcidev.dev, obj->attr->pcidev.func);

  res += hwloc_linux_lookup_net_class(backend, obj, pcidevpath);
  res += hwloc_linux_lookup_openfabrics_class(backend, obj, pcidevpath);
  res += hwloc_linux_lookup_dma_class(backend, obj, pcidevpath);
  res += hwloc_linux_lookup_drm_class(backend, obj, pcidevpath);
  res += hwloc_linux_lookup_block_class(backend, obj, pcidevpath);

  if (data->mic_need_directlookup == -1) {
    struct stat st;
    if (hwloc_stat("/sys/class/mic/mic0", &st, data->root_fd) == 0
        && hwloc_stat("/sys/class/mic/mic0/device/mic/mic0", &st, data->root_fd) == -1)
      /* hwloc_linux_lookup_mic_class will fail because pcidev sysfs directories
       * do not have mic/mic%u symlinks to mic devices (old mic driver).
       * if so, try from the mic class.
       */
      data->mic_need_directlookup = 1;
    else
      data->mic_need_directlookup = 0;
  }
  if (data->mic_need_directlookup)
    res += hwloc_linux_directlookup_mic_class(backend, obj);
  else
    res += hwloc_linux_lookup_mic_class(backend, obj, pcidevpath);

  return res;
}
/*
 * backend callback for retrieving the location of a pci device
 */
static int
hwloc_linux_backend_get_obj_cpuset(struct hwloc_backend *backend,
                                   struct hwloc_backend *caller __hwloc_attribute_unused,
                                   struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;
  char path[256];

  /* this callback is only used in the libpci backend for now */
  assert(obj->type == HWLOC_OBJ_PCI_DEVICE
         || (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI));

  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
           obj->attr->pcidev.domain, obj->attr->pcidev.bus,
           obj->attr->pcidev.dev, obj->attr->pcidev.func);
  if (!hwloc__read_path_as_cpumask(path, cpuset, data->root_fd)
      && !hwloc_bitmap_iszero(cpuset))
    return 0;
  return -1;
}
/*******************************
 ******* Linux component *******
 *******************************/

static void
hwloc_linux_backend_disable(struct hwloc_backend *backend)
{
  struct hwloc_linux_backend_data_s *data = backend->private_data;

  if (data->root_path)
    free(data->root_path);
  close(data->root_fd);

#ifdef HWLOC_HAVE_LIBUDEV
  if (data->udev)
    udev_unref(data->udev);
#endif

  free(data);
}
static struct hwloc_backend *
hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
                                  const void *_data1,
                                  const void *_data2 __hwloc_attribute_unused,
                                  const void *_data3 __hwloc_attribute_unused)
{
  struct hwloc_backend *backend;
  struct hwloc_linux_backend_data_s *data;
  const char * fsroot_path = _data1;
  int flags, root = -1;

  backend = hwloc_backend_alloc(component);
  if (!backend)
    goto out;

  data = malloc(sizeof(*data));
  if (!data) {
    errno = ENOMEM;
    goto out_with_backend;
  }

  backend->private_data = data;
  backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
  backend->discover = hwloc_look_linuxfs;
  backend->get_obj_cpuset = hwloc_linux_backend_get_obj_cpuset;
  backend->notify_new_object = hwloc_linux_backend_notify_new_object;
  backend->disable = hwloc_linux_backend_disable;

  /* default values */
  data->arch = HWLOC_LINUX_ARCH_UNKNOWN;
  data->is_amd_with_CU = 0;
  data->is_real_fsroot = 1;
  data->root_path = NULL;

  if (!fsroot_path)
    fsroot_path = "/";

  root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
  if (root < 0)
    goto out_with_data;

  if (strcmp(fsroot_path, "/")) {
    backend->is_thissystem = 0;
    data->is_real_fsroot = 0;
    data->root_path = strdup(fsroot_path);
  }

  /* Since this fd stays open after hwloc returns, mark it as
     close-on-exec so that children don't inherit it. Stevens says
     that we should GETFD before we SETFD, so we do. */
  flags = fcntl(root, F_GETFD, 0);
  if (-1 == flags ||
      -1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
    close(root);
    root = -1;
    goto out_with_data;
  }

  data->root_fd = root;

#ifdef HWLOC_HAVE_LIBUDEV
  data->udev = NULL;
  if (data->is_real_fsroot) {
    data->udev = udev_new();
  }
#endif

  data->dumped_hwdata_dirname = getenv("HWLOC_DUMPED_HWDATA_DIR");
  if (!data->dumped_hwdata_dirname) {
    if (_data1)
      data->dumped_hwdata_dirname = (char *) "/var/run/hwloc";
    else
      data->dumped_hwdata_dirname = (char *) RUNSTATEDIR "/hwloc";
  }

  data->deprecated_classlinks_model = -2; /* never tried */
  data->mic_need_directlookup = -1; /* not initialized */
  data->mic_directlookup_id_max = -1; /* not initialized */

  return backend;

 out_with_data:
  if (data->root_path)
    free(data->root_path);
  free(data);
 out_with_backend:
  free(backend);
 out:
  return NULL;
}
static struct hwloc_disc_component hwloc_linux_disc_component = {
  HWLOC_DISC_COMPONENT_TYPE_CPU,
  "linux",
  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
  hwloc_linux_component_instantiate,
  50, /* priority (value assumed in this excerpt) */
  NULL
};

const struct hwloc_component hwloc_linux_component = {
  HWLOC_COMPONENT_ABI,
  NULL, NULL,
  HWLOC_COMPONENT_TYPE_DISC,
  0,
  &hwloc_linux_disc_component
};
#ifdef HWLOC_HAVE_LINUXPCI

/***********************************
 ******* Linux PCI component *******
 ***********************************/

#define HWLOC_PCI_REVISION_ID 0x08
#define HWLOC_PCI_CAP_ID_EXP 0x10
#define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000
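/* These values come from the standard PCI specification: 0x08 is the offset of the Revision ID
 * byte in configuration space, 0x10 is the capability ID of the PCI Express capability, and
 * class 0x0000 means "device class not defined". */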
static int
hwloc_look_linuxfs_pci(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  struct hwloc_backend *tmpbackend;
  hwloc_obj_t first_obj = NULL, last_obj = NULL;
  int root_fd = -1;
  DIR *dir;
  struct dirent *dirent;
  int res = 0;

  if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
    return 0;

  if (hwloc_get_next_pcidev(topology, NULL)) {
    hwloc_debug("%s", "PCI objects already added, ignoring linuxpci backend.\n");
    return 0;
  }

  /* hackily find the linux backend to steal its fsroot */
  tmpbackend = topology->backends;
  while (tmpbackend) {
    if (tmpbackend->component == &hwloc_linux_disc_component) {
      root_fd = ((struct hwloc_linux_backend_data_s *) tmpbackend->private_data)->root_fd;
      hwloc_debug("linuxpci backend stole linux backend root_fd %d\n", root_fd);
      break;
    }
    tmpbackend = tmpbackend->next;
  }
  /* take our own descriptor, either pointing to linux fsroot, or to / if not found */
  if (root_fd >= 0)
    root_fd = dup(root_fd);
  else
    root_fd = open("/", O_RDONLY | O_DIRECTORY);

  dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
  if (!dir)
    goto out_with_rootfd;

  while ((dirent = readdir(dir)) != NULL) {
    unsigned domain, bus, dev, func;
    hwloc_obj_t obj;
    struct hwloc_pcidev_attr_s *attr;
    unsigned os_index;
    char path[64];
    char value[16];
    int fd, err;

    if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
      continue;

    os_index = (domain << 20) + (bus << 12) + (dev << 4) + func;
    obj = hwloc_alloc_setup_object(HWLOC_OBJ_PCI_DEVICE, os_index);
    attr = &obj->attr->pcidev;

    attr->domain = domain;
    attr->bus = bus;
    attr->dev = dev;
    attr->func = func;

    /* default (unknown) values */
    attr->vendor_id = 0;
    attr->device_id = 0;
    attr->class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
    attr->revision = 0;
    attr->subvendor_id = 0;
    attr->subdevice_id = 0;
    attr->linkspeed = 0;

    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vendor", dirent->d_name);
    if ((size_t) err < sizeof(path)
        && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
      attr->vendor_id = strtoul(value, NULL, 16);

    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/device", dirent->d_name);
    if ((size_t) err < sizeof(path)
        && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
      attr->device_id = strtoul(value, NULL, 16);

    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", dirent->d_name);
    if ((size_t) err < sizeof(path)
        && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
      attr->class_id = strtoul(value, NULL, 16) >> 8;

    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_vendor", dirent->d_name);
    if ((size_t) err < sizeof(path)
        && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
      attr->subvendor_id = strtoul(value, NULL, 16);

    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_device", dirent->d_name);
    if ((size_t) err < sizeof(path)
        && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
      attr->subdevice_id = strtoul(value, NULL, 16);

    err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
    if ((size_t) err < sizeof(path)) {
      /* don't use hwloc_read_path_by_length() because we don't want the ending \0 */
      fd = hwloc_open(path, root_fd);
      if (fd >= 0) {
#define CONFIG_SPACE_CACHESIZE 256
        unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
        unsigned offset;
        ssize_t ret;

        /* initialize the config space in case we fail to read it (missing permissions, etc). */
        memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
        ret = read(fd, config_space_cache, CONFIG_SPACE_CACHESIZE);
        (void) ret; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
        close(fd);

        /* is this a bridge? */
        if (hwloc_pci_prepare_bridge(obj, config_space_cache) < 0)
          continue;

        /* get the revision */
        attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];

        /* try to get the link speed */
        offset = hwloc_pci_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
        if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE) {
          hwloc_pci_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
        } else {
          /* if not available from config-space (extended part is root-only), look in sysfs files added in 4.13 */
          float speed = 0.f;
          unsigned width = 0;
          err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/current_link_speed", dirent->d_name);
          if ((size_t) err < sizeof(path)
              && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
            speed = hwloc_linux_pci_link_speed_from_string(value);
          err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/current_link_width", dirent->d_name);
          if ((size_t) err < sizeof(path)
              && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
            width = atoi(value);
          attr->linkspeed = speed*width/8;
        }
      }
    }

    if (first_obj)
      last_obj->next_sibling = obj;
    else
      first_obj = obj;
    last_obj = obj;
  }
  closedir(dir);

  dir = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
  if (dir) {
    while ((dirent = readdir(dir)) != NULL) {
      char path[64];
      char buf[64];
      unsigned domain, bus, dev;
      int err;

      if (dirent->d_name[0] == '.')
        continue;
      err = snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", dirent->d_name);
      if ((size_t) err < sizeof(path)
          && !hwloc_read_path_by_length(path, buf, sizeof(buf), root_fd)
          && sscanf(buf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
        hwloc_obj_t obj = first_obj;
        while (obj) {
          if (obj->attr->pcidev.domain == domain
              && obj->attr->pcidev.bus == bus
              && obj->attr->pcidev.dev == dev) {
            hwloc_obj_add_info(obj, "PCISlot", dirent->d_name);
          }
          obj = obj->next_sibling;
        }
      }
    }
    closedir(dir);
  }

  res = hwloc_insert_pci_device_list(backend, first_obj);

 out_with_rootfd:
  close(root_fd);
  return res;
}

static struct hwloc_backend *
hwloc_linuxpci_component_instantiate(struct hwloc_disc_component *component,
                                     const void *_data1 __hwloc_attribute_unused,
                                     const void *_data2 __hwloc_attribute_unused,
                                     const void *_data3 __hwloc_attribute_unused)
{
  struct hwloc_backend *backend;

  /* thissystem may not be fully initialized yet, we'll check flags in discover() */
  backend = hwloc_backend_alloc(component);
  if (!backend)
    return NULL;
  backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
  backend->discover = hwloc_look_linuxfs_pci;
  return backend;
}

static struct hwloc_disc_component hwloc_linuxpci_disc_component = {
  HWLOC_DISC_COMPONENT_TYPE_MISC,
  "linuxpci",
  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
  hwloc_linuxpci_component_instantiate,
  19, /* priority (value assumed in this excerpt) */
  NULL
};

const struct hwloc_component hwloc_linuxpci_component = {
  HWLOC_COMPONENT_ABI,
  NULL, NULL,
  HWLOC_COMPONENT_TYPE_DISC,
  0,
  &hwloc_linuxpci_disc_component
};

#endif /* HWLOC_HAVE_LINUXPCI */