contrib/hwloc/src/topology-linux.c
1 /*
2 * Copyright © 2009 CNRS
3 * Copyright © 2009-2018 Inria. All rights reserved.
4 * Copyright © 2009-2013, 2015 Université Bordeaux
5 * Copyright © 2009-2014 Cisco Systems, Inc. All rights reserved.
6 * Copyright © 2015 Intel, Inc. All rights reserved.
7 * Copyright © 2010 IBM
8 * See COPYING in top-level directory.
9 */
11 #include <private/autogen/config.h>
12 #include <hwloc.h>
13 #include <hwloc/linux.h>
14 #include <private/misc.h>
15 #include <private/private.h>
16 #include <private/misc.h>
17 #include <private/debug.h>
19 #include <limits.h>
20 #include <stdio.h>
21 #include <fcntl.h>
22 #include <errno.h>
23 #include <assert.h>
24 #ifdef HAVE_DIRENT_H
25 #include <dirent.h>
26 #endif
27 #ifdef HAVE_UNISTD_H
28 #include <unistd.h>
29 #endif
30 #ifdef HWLOC_HAVE_LIBUDEV
31 #include <libudev.h>
32 #endif
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <sched.h>
36 #include <pthread.h>
37 #include <sys/mman.h>
38 #include <sys/syscall.h>
39 #include <mntent.h>
40 #if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND || defined HWLOC_HAVE_MOVE_PAGES
41 #define migratepages migrate_pages /* workaround broken migratepages prototype in numaif.h before libnuma 2.0.2 */
42 #include <numaif.h>
43 #endif
45 struct hwloc_linux_backend_data_s {
46 char *root_path; /* NULL if unused */
47 int root_fd; /* The file descriptor for the file system root, used when browsing, e.g., Linux' sysfs and procfs. */
48 int is_real_fsroot; /* Boolean saying whether root_fd points to the real filesystem root of the system */
49 #ifdef HWLOC_HAVE_LIBUDEV
50 struct udev *udev; /* Global udev context */
51 #endif
52 char *dumped_hwdata_dirname;
53 enum {
54 HWLOC_LINUX_ARCH_X86, /* x86 32 or 64bits, including k1om (KNC) */
55 HWLOC_LINUX_ARCH_IA64,
56 HWLOC_LINUX_ARCH_ARM,
57 HWLOC_LINUX_ARCH_POWER,
58 HWLOC_LINUX_ARCH_UNKNOWN
59 } arch;
60 int is_knl;
61 int is_amd_with_CU;
62 struct utsname utsname; /* fields contain \0 when unknown */
63 unsigned fallback_nbprocessors;
64 unsigned pagesize;
66 int deprecated_classlinks_model; /* -2 if never tried, -1 if unknown, 0 if new (device contains class/name), 1 if old (device contains class:name) */
67 int mic_need_directlookup; /* -1 if not tried yet, 0 if not needed, 1 if needed */
68 unsigned mic_directlookup_id_max; /* -1 if not tried yet, 0 if none to lookup, maxid+1 otherwise */
73 /***************************
74 * Misc Abstraction layers *
75 ***************************/
77 #if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE_SYSCALL)
78 /* libc doesn't support sched_setaffinity, so make the system call
79 * ourselves: */
80 # include <linux/unistd.h>
81 # ifndef __NR_sched_setaffinity
82 # ifdef __i386__
83 # define __NR_sched_setaffinity 241
84 # elif defined(__x86_64__)
85 # define __NR_sched_setaffinity 203
86 # elif defined(__ia64__)
87 # define __NR_sched_setaffinity 1231
88 # elif defined(__hppa__)
89 # define __NR_sched_setaffinity 211
90 # elif defined(__alpha__)
91 # define __NR_sched_setaffinity 395
92 # elif defined(__s390__)
93 # define __NR_sched_setaffinity 239
94 # elif defined(__sparc__)
95 # define __NR_sched_setaffinity 261
96 # elif defined(__m68k__)
97 # define __NR_sched_setaffinity 311
98 # elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
99 # define __NR_sched_setaffinity 222
100 # elif defined(__arm__)
101 # define __NR_sched_setaffinity 241
102 # elif defined(__cris__)
103 # define __NR_sched_setaffinity 241
104 /*# elif defined(__mips__)
105 # define __NR_sched_setaffinity TODO (32/64/nabi) */
106 # else
107 # warning "don't know the syscall number for sched_setaffinity on this architecture, will not support binding"
108 # define sched_setaffinity(pid, lg, mask) (errno = ENOSYS, -1)
109 # endif
110 # endif
111 # ifndef sched_setaffinity
112 # define sched_setaffinity(pid, lg, mask) syscall(__NR_sched_setaffinity, pid, lg, mask)
113 # endif
114 # ifndef __NR_sched_getaffinity
115 # ifdef __i386__
116 # define __NR_sched_getaffinity 242
117 # elif defined(__x86_64__)
118 # define __NR_sched_getaffinity 204
119 # elif defined(__ia64__)
120 # define __NR_sched_getaffinity 1232
121 # elif defined(__hppa__)
122 # define __NR_sched_getaffinity 212
123 # elif defined(__alpha__)
124 # define __NR_sched_getaffinity 396
125 # elif defined(__s390__)
126 # define __NR_sched_getaffinity 240
127 # elif defined(__sparc__)
128 # define __NR_sched_getaffinity 260
129 # elif defined(__m68k__)
130 # define __NR_sched_getaffinity 312
131 # elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
132 # define __NR_sched_getaffinity 223
133 # elif defined(__arm__)
134 # define __NR_sched_getaffinity 242
135 # elif defined(__cris__)
136 # define __NR_sched_getaffinity 242
137 /*# elif defined(__mips__)
138 # define __NR_sched_getaffinity TODO (32/64/nabi) */
139 # else
140 # warning "don't know the syscall number for sched_getaffinity on this architecture, will not support getting binding"
141 # define sched_getaffinity(pid, lg, mask) (errno = ENOSYS, -1)
142 # endif
143 # endif
144 # ifndef sched_getaffinity
145 # define sched_getaffinity(pid, lg, mask) (syscall(__NR_sched_getaffinity, pid, lg, mask) < 0 ? -1 : 0)
146 # endif
147 #endif
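/* With the fallback above, sched_setaffinity()/sched_getaffinity() keep the
 * usual glibc calling convention (pid, size in bytes, pointer to mask).
 * Minimal usage sketch (hypothetical caller, mirroring the syscall branch
 * used later in this file) binding the current thread to CPU #0:
 *
 *   unsigned long mask = 1UL;                        // CPU #0 only
 *   if (sched_setaffinity(0, sizeof(mask), (void *) &mask) < 0)
 *     perror("sched_setaffinity");                   // ENOSYS if unsupported
 */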
149 /* Added for ntohl() */
150 #include <arpa/inet.h>
152 #ifdef HAVE_OPENAT
153 /* Use our own filesystem functions if we have openat */
155 static const char *
156 hwloc_checkat(const char *path, int fsroot_fd)
158 const char *relative_path;
159 if (fsroot_fd < 0) {
160 errno = EBADF;
161 return NULL;
164 /* Skip leading slashes. */
165 for (relative_path = path; *relative_path == '/'; relative_path++);
167 return relative_path;
170 static int
171 hwloc_openat(const char *path, int fsroot_fd)
173 const char *relative_path;
175 relative_path = hwloc_checkat(path, fsroot_fd);
176 if (!relative_path)
177 return -1;
179 return openat (fsroot_fd, relative_path, O_RDONLY);
182 static FILE *
183 hwloc_fopenat(const char *path, const char *mode, int fsroot_fd)
185 int fd;
187 if (strcmp(mode, "r")) {
188 errno = ENOTSUP;
189 return NULL;
192 fd = hwloc_openat (path, fsroot_fd);
193 if (fd == -1)
194 return NULL;
196 return fdopen(fd, mode);
199 static int
200 hwloc_accessat(const char *path, int mode, int fsroot_fd)
202 const char *relative_path;
204 relative_path = hwloc_checkat(path, fsroot_fd);
205 if (!relative_path)
206 return -1;
208 return faccessat(fsroot_fd, relative_path, mode, 0);
211 static int
212 hwloc_fstatat(const char *path, struct stat *st, int flags, int fsroot_fd)
214 const char *relative_path;
216 relative_path = hwloc_checkat(path, fsroot_fd);
217 if (!relative_path)
218 return -1;
220 return fstatat(fsroot_fd, relative_path, st, flags);
223 static DIR*
224 hwloc_opendirat(const char *path, int fsroot_fd)
226 int dir_fd;
227 const char *relative_path;
229 relative_path = hwloc_checkat(path, fsroot_fd);
230 if (!relative_path)
231 return NULL;
233 dir_fd = openat(fsroot_fd, relative_path, O_RDONLY | O_DIRECTORY);
234 if (dir_fd < 0)
235 return NULL;
237 return fdopendir(dir_fd);
240 #endif /* HAVE_OPENAT */
242 /* Static inline versions of open/fopen so that we can use openat if we have
243 it, but still preserve compiler parameter checking */
244 static __hwloc_inline int
245 hwloc_open(const char *p, int d __hwloc_attribute_unused)
247 #ifdef HAVE_OPENAT
248 return hwloc_openat(p, d);
249 #else
250 return open(p, O_RDONLY);
251 #endif
254 static __hwloc_inline FILE *
255 hwloc_fopen(const char *p, const char *m, int d __hwloc_attribute_unused)
257 #ifdef HAVE_OPENAT
258 return hwloc_fopenat(p, m, d);
259 #else
260 return fopen(p, m);
261 #endif
264 /* Static inline version of access so that we can use openat if we have
265 it, but still preserve compiler parameter checking */
266 static __hwloc_inline int
267 hwloc_access(const char *p, int m, int d __hwloc_attribute_unused)
269 #ifdef HAVE_OPENAT
270 return hwloc_accessat(p, m, d);
271 #else
272 return access(p, m);
273 #endif
276 static __hwloc_inline int
277 hwloc_stat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
279 #ifdef HAVE_OPENAT
280 return hwloc_fstatat(p, st, 0, d);
281 #else
282 return stat(p, st);
283 #endif
286 static __hwloc_inline int
287 hwloc_lstat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
289 #ifdef HAVE_OPENAT
290 return hwloc_fstatat(p, st, AT_SYMLINK_NOFOLLOW, d);
291 #else
292 return lstat(p, st);
293 #endif
296 /* Static inline version of opendir so that we can use openat if we have
297 it, but still preserve compiler parameter checking */
298 static __hwloc_inline DIR *
299 hwloc_opendir(const char *p, int d __hwloc_attribute_unused)
301 #ifdef HAVE_OPENAT
302 return hwloc_opendirat(p, d);
303 #else
304 return opendir(p);
305 #endif
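/* Usage sketch for the wrappers above (hypothetical caller; the Linux backend
 * normally passes data->root_fd from struct hwloc_linux_backend_data_s).
 * With HAVE_OPENAT the path is resolved relative to that descriptor,
 * otherwise it must name the real filesystem root:
 *
 *   FILE *f = hwloc_fopen("/proc/cpuinfo", "r", data->root_fd);
 *   if (f) {
 *     ... parse the file ...
 *     fclose(f);
 *   }
 */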
309 /*****************************************
310 ******* Helpers for reading files *******
311 *****************************************/
313 static __hwloc_inline int
314 hwloc_read_path_by_length(const char *path, char *string, size_t length, int fsroot_fd)
316 int fd, ret;
318 fd = hwloc_open(path, fsroot_fd);
319 if (fd < 0)
320 return -1;
322 ret = read(fd, string, length-1); /* read at most length-1 bytes to leave room for the ending \0 */
323 close(fd);
325 if (ret <= 0)
326 return -1;
328 string[ret] = 0;
330 return 0;
333 static __hwloc_inline int
334 hwloc_read_path_as_int(const char *path, int *value, int fsroot_fd)
336 char string[11];
337 if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
338 return -1;
339 *value = atoi(string);
340 return 0;
343 static __hwloc_inline int
344 hwloc_read_path_as_uint(const char *path, unsigned *value, int fsroot_fd)
346 char string[11];
347 if (hwloc_read_path_by_length(path, string, sizeof(string), fsroot_fd) < 0)
348 return -1;
349 *value = (unsigned) strtoul(string, NULL, 10);
350 return 0;
353 /* Read everything from fd and save it into a newly allocated buffer
354 * returned in bufferp. Use sizep as a default buffer size, and return
355 * the actually needed size in sizep.
357 static __hwloc_inline int
358 hwloc__read_fd(int fd, char **bufferp, size_t *sizep)
360 char *buffer;
361 size_t toread, filesize, totalread;
362 ssize_t ret;
364 toread = filesize = *sizep;
366 /* Allocate and read one extra byte so that a file of exactly 2^n bytes hits EOF without an extra read */
367 buffer = malloc(filesize+1);
368 if (!buffer)
369 return -1;
371 ret = read(fd, buffer, toread+1);
372 if (ret < 0) {
373 free(buffer);
374 return -1;
377 totalread = (size_t) ret;
379 if (totalread < toread + 1)
380 /* Normal case, a single read got EOF */
381 goto done;
383 /* Unexpected case, must extend the buffer and read again.
384 * Only occurs on the first invocation, and only if the kernel ever uses multiple pages for a single mask.
386 do {
387 char *tmp;
389 toread = filesize;
390 filesize *= 2;
392 tmp = realloc(buffer, filesize+1);
393 if (!tmp) {
394 free(buffer);
395 return -1;
397 buffer = tmp;
399 ret = read(fd, buffer+toread+1, toread);
400 if (ret < 0) {
401 free(buffer);
402 return -1;
405 totalread += ret;
406 } while ((size_t) ret == toread);
408 done:
409 buffer[totalread] = '\0';
410 *bufferp = buffer;
411 *sizep = filesize;
412 return 0;
415 /* kernel cpumaps are composed of an array of 32-bit cpumasks */
416 #define KERNEL_CPU_MASK_BITS 32
417 #define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
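/* For example, /sys/devices/system/cpu/cpu0/topology/core_siblings on a
 * 128-PU machine typically reads "00000000,00000000,00000000,00000003\n":
 * comma-separated 32-bit hexadecimal words, most-significant word first
 * (the value shown is only an illustration, it depends on the machine).
 */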
419 static __hwloc_inline int
420 hwloc__read_fd_as_cpumask(int fd, hwloc_bitmap_t set)
422 static size_t _filesize = 0; /* will be dynamically initialized to hwloc_get_pagesize(), and increased later if needed */
423 size_t filesize;
424 unsigned long *maps;
425 unsigned long map;
426 int nr_maps = 0;
427 static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
428 * Actually, it may increase multiple times if the first cpumaps we read start with zeroes.
430 int nr_maps_allocated = _nr_maps_allocated;
431 char *buffer, *tmpbuf;
432 int i;
434 /* Kernel sysfs files are usually at most one page. 4kB may contain 455 32-bit
435 * masks (followed by comma), enough for 14k PUs. So allocate a page by default for now.
437 * If we ever need a larger buffer, we'll realloc() the buffer during the first
438 * invocation of this function so that others directly allocate the right size
439 * (all cpumask files have the exact same size).
441 filesize = _filesize;
442 if (!filesize)
443 filesize = hwloc_getpagesize();
444 if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
445 return -1;
446 /* Only update the static value with the final one,
447 * to avoid sharing intermediate values that we modify,
448 * in case there are ever multiple concurrent calls.
450 _filesize = filesize;
452 maps = malloc(nr_maps_allocated * sizeof(*maps));
453 if (!maps) {
454 free(buffer);
455 return -1;
458 /* reset to zero first */
459 hwloc_bitmap_zero(set);
461 /* parse the whole mask */
462 tmpbuf = buffer;
463 while (sscanf(tmpbuf, "%lx", &map) == 1) {
464 /* read one kernel cpu mask and the ending comma */
465 if (nr_maps == nr_maps_allocated) {
466 unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
467 if (!tmp) {
468 free(buffer);
469 free(maps);
470 return -1;
472 maps = tmp;
473 nr_maps_allocated *= 2;
476 tmpbuf = strchr(tmpbuf, ',');
477 if (!tmpbuf) {
478 maps[nr_maps++] = map;
479 break;
480 } else
481 tmpbuf++;
483 if (!map && !nr_maps)
484 /* ignore the first map if it's empty */
485 continue;
487 maps[nr_maps++] = map;
490 free(buffer);
492 /* convert into a set */
493 #if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
494 for(i=0; i<nr_maps; i++)
495 hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]);
496 #else
497 for(i=0; i<(nr_maps+1)/2; i++) {
498 unsigned long mask;
499 mask = maps[nr_maps-2*i-1];
500 if (2*i+1<nr_maps)
501 mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS;
502 hwloc_bitmap_set_ith_ulong(set, i, mask);
504 #endif
506 free(maps);
508 /* Only update the static value with the final one,
509 * to avoid sharing intermediate values that we modify,
510 * in case there are ever multiple concurrent calls.
512 if (nr_maps_allocated > _nr_maps_allocated)
513 _nr_maps_allocated = nr_maps_allocated;
514 return 0;
517 static __hwloc_inline int
518 hwloc__read_path_as_cpumask(const char *maskpath, hwloc_bitmap_t set, int fsroot_fd)
520 int fd, err;
521 fd = hwloc_open(maskpath, fsroot_fd);
522 if (fd < 0)
523 return -1;
524 err = hwloc__read_fd_as_cpumask(fd, set);
525 close(fd);
526 return err;
529 static __hwloc_inline hwloc_bitmap_t
530 hwloc__alloc_read_path_as_cpumask(const char *maskpath, int fsroot_fd)
532 hwloc_bitmap_t set;
533 int err;
534 set = hwloc_bitmap_alloc();
535 if (!set)
536 return NULL;
537 err = hwloc__read_path_as_cpumask(maskpath, set, fsroot_fd);
538 if (err < 0) {
539 hwloc_bitmap_free(set);
540 return NULL;
541 } else
542 return set;
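/* Usage sketch (hypothetical caller, sysfs path given as an example):
 *
 *   hwloc_bitmap_t siblings =
 *     hwloc__alloc_read_path_as_cpumask("/sys/devices/system/cpu/cpu0/topology/thread_siblings",
 *                                       data->root_fd);
 *   if (siblings) {
 *     ... use the bitmap ...
 *     hwloc_bitmap_free(siblings);
 *   }
 */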
545 /* set must be full on input */
546 static __hwloc_inline int
547 hwloc__read_fd_as_cpulist(int fd, hwloc_bitmap_t set)
549 /* Kernel sysfs files are usually at most one page.
550 * But cpulists can be of very different sizes depending on the fragmentation,
551 * so don't bother remembering the actual read size between invocations.
552 * We don't have many invocations anyway.
554 size_t filesize = hwloc_getpagesize();
555 char *buffer, *current, *comma, *tmp;
556 int prevlast, nextfirst, nextlast; /* beginning/end of enabled-segments */
558 if (hwloc__read_fd(fd, &buffer, &filesize) < 0)
559 return -1;
561 current = buffer;
562 prevlast = -1;
564 while (1) {
565 /* save a pointer to the next comma and erase it to simplify things */
566 comma = strchr(current, ',');
567 if (comma)
568 *comma = '\0';
570 /* find current enabled-segment bounds */
571 nextfirst = strtoul(current, &tmp, 0);
572 if (*tmp == '-')
573 nextlast = strtoul(tmp+1, NULL, 0);
574 else
575 nextlast = nextfirst;
576 if (prevlast+1 <= nextfirst-1)
577 hwloc_bitmap_clr_range(set, prevlast+1, nextfirst-1);
579 /* switch to next enabled-segment */
580 prevlast = nextlast;
581 if (!comma)
582 break;
583 current = comma+1;
586 hwloc_bitmap_clr_range(set, prevlast+1, -1);
587 free(buffer);
588 return 0;
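/* Example: for a cpulist file containing "0-3,8-11\n" and a set that is full
 * on input, the loop above clears nothing before the first segment, clears
 * 4-7 between the two segments, and finally clears everything from 12 up,
 * leaving exactly bits 0-3 and 8-11 set.
 */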
592 /*****************************
593 ******* CpuBind Hooks *******
594 *****************************/
597 hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
599 /* TODO Kerrighed: Use
600 * int migrate (pid_t pid, int destination_node);
601 * int migrate_self (int destination_node);
602 * int thread_migrate (int thread_id, int destination_node);
605 /* The resulting binding is always strict */
607 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
608 cpu_set_t *plinux_set;
609 unsigned cpu;
610 int last;
611 size_t setsize;
612 int err;
614 last = hwloc_bitmap_last(hwloc_set);
615 if (last == -1) {
616 errno = EINVAL;
617 return -1;
620 setsize = CPU_ALLOC_SIZE(last+1);
621 plinux_set = CPU_ALLOC(last+1);
623 CPU_ZERO_S(setsize, plinux_set);
624 hwloc_bitmap_foreach_begin(cpu, hwloc_set)
625 CPU_SET_S(cpu, setsize, plinux_set);
626 hwloc_bitmap_foreach_end();
628 err = sched_setaffinity(tid, setsize, plinux_set);
630 CPU_FREE(plinux_set);
631 return err;
632 #elif defined(HWLOC_HAVE_CPU_SET)
633 cpu_set_t linux_set;
634 unsigned cpu;
636 CPU_ZERO(&linux_set);
637 hwloc_bitmap_foreach_begin(cpu, hwloc_set)
638 CPU_SET(cpu, &linux_set);
639 hwloc_bitmap_foreach_end();
641 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
642 return sched_setaffinity(tid, &linux_set);
643 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
644 return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
645 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
646 #elif defined(HWLOC_HAVE_SYSCALL)
647 unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
649 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
650 return sched_setaffinity(tid, (void*) &mask);
651 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
652 return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
653 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
654 #else /* !SYSCALL */
655 errno = ENOSYS;
656 return -1;
657 #endif /* !SYSCALL */
660 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
662 * On some kernels, sched_getaffinity requires the output size to be larger
663 * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
664 * Try sched_getaffinity on ourselves until we find a nr_cpus value that makes
665 * the kernel happy.
667 static int
668 hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
670 static int _nr_cpus = -1;
671 int nr_cpus = _nr_cpus;
672 int fd;
674 if (nr_cpus != -1)
675 /* already computed */
676 return nr_cpus;
678 if (topology->levels[0][0]->complete_cpuset)
679 /* start with a nr_cpus that may contain the whole topology */
680 nr_cpus = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
681 if (nr_cpus <= 0)
682 /* start from scratch, the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0))*/
683 nr_cpus = 1;
685 fd = open("/sys/devices/system/cpu/possible", O_RDONLY); /* binding only supported in real fsroot, no need for data->root_fd */
686 if (fd >= 0) {
687 hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc_full();
688 if (hwloc__read_fd_as_cpulist(fd, possible_bitmap) == 0) {
689 int max_possible = hwloc_bitmap_last(possible_bitmap);
690 hwloc_debug_bitmap("possible CPUs are %s\n", possible_bitmap);
692 if (nr_cpus < max_possible + 1)
693 nr_cpus = max_possible + 1;
695 close(fd);
696 hwloc_bitmap_free(possible_bitmap);
699 while (1) {
700 cpu_set_t *set = CPU_ALLOC(nr_cpus);
701 size_t setsize = CPU_ALLOC_SIZE(nr_cpus);
702 int err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
703 CPU_FREE(set);
704 nr_cpus = setsize * 8; /* that's the value that was actually tested */
705 if (!err)
706 /* Found it. Only update the static value with the final one,
707 * to avoid sharing intermediate values that we modify,
708 * in case there are ever multiple concurrent calls.
710 return _nr_cpus = nr_cpus;
711 nr_cpus *= 2;
714 #endif
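/* Worked example (assumed numbers): if the topology suggests nr_cpus = 64
 * but the kernel was built with CONFIG_NR_CPUS = 2048, the first call uses
 * setsize = CPU_ALLOC_SIZE(64) = 8 bytes, which the kernel rejects with
 * EINVAL; nr_cpus then doubles (128, 256, ...) until a 256-byte cpu_set_t
 * covering 2048 CPUs succeeds, and 2048 is cached in _nr_cpus for later calls.
 */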
717 hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
719 int err __hwloc_attribute_unused;
720 /* TODO Kerrighed */
722 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
723 cpu_set_t *plinux_set;
724 unsigned cpu;
725 int last;
726 size_t setsize;
727 int kernel_nr_cpus;
729 /* find the kernel nr_cpus so as to use a large enough cpu_set size */
730 kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
731 setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
732 plinux_set = CPU_ALLOC(kernel_nr_cpus);
734 err = sched_getaffinity(tid, setsize, plinux_set);
736 if (err < 0) {
737 CPU_FREE(plinux_set);
738 return -1;
741 last = -1;
742 if (topology->levels[0][0]->complete_cpuset)
743 last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
744 if (last == -1)
745 /* fall back to the maximal supported number; the topology isn't ready yet (complete_cpuset is missing or empty) */
746 last = kernel_nr_cpus-1;
748 hwloc_bitmap_zero(hwloc_set);
749 for(cpu=0; cpu<=(unsigned) last; cpu++)
750 if (CPU_ISSET_S(cpu, setsize, plinux_set))
751 hwloc_bitmap_set(hwloc_set, cpu);
753 CPU_FREE(plinux_set);
754 #elif defined(HWLOC_HAVE_CPU_SET)
755 cpu_set_t linux_set;
756 unsigned cpu;
758 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
759 err = sched_getaffinity(tid, &linux_set);
760 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
761 err = sched_getaffinity(tid, sizeof(linux_set), &linux_set);
762 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
763 if (err < 0)
764 return -1;
766 hwloc_bitmap_zero(hwloc_set);
767 for(cpu=0; cpu<CPU_SETSIZE; cpu++)
768 if (CPU_ISSET(cpu, &linux_set))
769 hwloc_bitmap_set(hwloc_set, cpu);
770 #elif defined(HWLOC_HAVE_SYSCALL)
771 unsigned long mask;
773 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
774 err = sched_getaffinity(tid, (void*) &mask);
775 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
776 err = sched_getaffinity(tid, sizeof(mask), (void*) &mask);
777 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
778 if (err < 0)
779 return -1;
781 hwloc_bitmap_from_ulong(hwloc_set, mask);
782 #else /* !SYSCALL */
783 errno = ENOSYS;
784 return -1;
785 #endif /* !SYSCALL */
787 return 0;
790 /* Get the array of tids of a process from the task directory in /proc */
791 static int
792 hwloc_linux_get_proc_tids(DIR *taskdir, unsigned *nr_tidsp, pid_t ** tidsp)
794 struct dirent *dirent;
795 unsigned nr_tids = 0;
796 unsigned max_tids = 32;
797 pid_t *tids;
798 struct stat sb;
800 /* take the number of links as a good estimate for the number of tids */
801 if (fstat(dirfd(taskdir), &sb) == 0)
802 max_tids = sb.st_nlink;
804 tids = malloc(max_tids*sizeof(pid_t));
805 if (!tids) {
806 errno = ENOMEM;
807 return -1;
810 rewinddir(taskdir);
812 while ((dirent = readdir(taskdir)) != NULL) {
813 if (nr_tids == max_tids) {
814 pid_t *newtids;
815 max_tids += 8;
816 newtids = realloc(tids, max_tids*sizeof(pid_t));
817 if (!newtids) {
818 free(tids);
819 errno = ENOMEM;
820 return -1;
822 tids = newtids;
824 if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
825 continue;
826 tids[nr_tids++] = atoi(dirent->d_name);
829 *nr_tidsp = nr_tids;
830 *tidsp = tids;
831 return 0;
834 /* Per-tid callbacks */
835 typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t topology, pid_t tid, void *data, int idx);
837 static int
838 hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
839 pid_t pid, hwloc_linux_foreach_proc_tid_cb_t cb,
840 void *data)
842 char taskdir_path[128];
843 DIR *taskdir;
844 pid_t *tids, *newtids;
845 unsigned i, nr, newnr, failed = 0, failed_errno = 0;
846 unsigned retrynr = 0;
847 int err;
849 if (pid)
850 snprintf(taskdir_path, sizeof(taskdir_path), "/proc/%u/task", (unsigned) pid);
851 else
852 snprintf(taskdir_path, sizeof(taskdir_path), "/proc/self/task");
854 taskdir = opendir(taskdir_path);
855 if (!taskdir) {
856 if (errno == ENOENT)
857 errno = EINVAL;
858 err = -1;
859 goto out;
862 /* read the current list of threads */
863 err = hwloc_linux_get_proc_tids(taskdir, &nr, &tids);
864 if (err < 0)
865 goto out_with_dir;
867 retry:
868 /* apply the callback to all threads */
869 failed=0;
870 for(i=0; i<nr; i++) {
871 err = cb(topology, tids[i], data, i);
872 if (err < 0) {
873 failed++;
874 failed_errno = errno;
878 /* re-read the list of threads */
879 err = hwloc_linux_get_proc_tids(taskdir, &newnr, &newtids);
880 if (err < 0)
881 goto out_with_tids;
882 /* retry if the list changed in the meantime, or we failed for *some* threads only.
883 * if we're really unlucky, all threads changed but we got the same set of tids. no way to support this.
885 if (newnr != nr || memcmp(newtids, tids, nr*sizeof(pid_t)) || (failed && failed != nr)) {
886 free(tids);
887 tids = newtids;
888 nr = newnr;
889 if (++retrynr > 10) {
890 /* we tried 10 times, it didn't work, the application is probably creating/destroying many threads, stop trying */
891 errno = EAGAIN;
892 err = -1;
893 goto out_with_tids;
895 goto retry;
896 } else {
897 free(newtids);
900 /* if all threads failed, return the last errno. */
901 if (failed) {
902 err = -1;
903 errno = failed_errno;
904 goto out_with_tids;
907 err = 0;
908 out_with_tids:
909 free(tids);
910 out_with_dir:
911 closedir(taskdir);
912 out:
913 return err;
916 /* Per-tid proc_set_cpubind callback and caller.
917 * Callback data is a hwloc_bitmap_t. */
918 static int
919 hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *data, int idx __hwloc_attribute_unused)
921 return hwloc_linux_set_tid_cpubind(topology, tid, (hwloc_bitmap_t) data);
924 static int
925 hwloc_linux_set_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
927 return hwloc_linux_foreach_proc_tid(topology, pid,
928 hwloc_linux_foreach_proc_tid_set_cpubind_cb,
929 (void*) hwloc_set);
932 /* Per-tid proc_get_cpubind callback data, callback function and caller */
933 struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s {
934 hwloc_bitmap_t cpuset;
935 hwloc_bitmap_t tidset;
936 int flags;
939 static int
940 hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
942 struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s *data = _data;
943 hwloc_bitmap_t cpuset = data->cpuset;
944 hwloc_bitmap_t tidset = data->tidset;
945 int flags = data->flags;
947 if (hwloc_linux_get_tid_cpubind(topology, tid, tidset))
948 return -1;
950 /* reset the cpuset on first iteration */
951 if (!idx)
952 hwloc_bitmap_zero(cpuset);
954 if (flags & HWLOC_CPUBIND_STRICT) {
955 /* if STRICT, we want all threads to have the same binding */
956 if (!idx) {
957 /* this is the first thread, copy its binding */
958 hwloc_bitmap_copy(cpuset, tidset);
959 } else if (!hwloc_bitmap_isequal(cpuset, tidset)) {
960 /* this is not the first thread, and its binding is different */
961 errno = EXDEV;
962 return -1;
964 } else {
965 /* if not STRICT, just OR all thread bindings */
966 hwloc_bitmap_or(cpuset, cpuset, tidset);
968 return 0;
971 static int
972 hwloc_linux_get_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
974 struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s data;
975 hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
976 int ret;
978 data.cpuset = hwloc_set;
979 data.tidset = tidset;
980 data.flags = flags;
981 ret = hwloc_linux_foreach_proc_tid(topology, pid,
982 hwloc_linux_foreach_proc_tid_get_cpubind_cb,
983 (void*) &data);
984 hwloc_bitmap_free(tidset);
985 return ret;
988 static int
989 hwloc_linux_set_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
991 if (pid == 0)
992 pid = topology->pid;
993 if (flags & HWLOC_CPUBIND_THREAD)
994 return hwloc_linux_set_tid_cpubind(topology, pid, hwloc_set);
995 else
996 return hwloc_linux_set_pid_cpubind(topology, pid, hwloc_set, flags);
999 static int
1000 hwloc_linux_get_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
1002 if (pid == 0)
1003 pid = topology->pid;
1004 if (flags & HWLOC_CPUBIND_THREAD)
1005 return hwloc_linux_get_tid_cpubind(topology, pid, hwloc_set);
1006 else
1007 return hwloc_linux_get_pid_cpubind(topology, pid, hwloc_set, flags);
1010 static int
1011 hwloc_linux_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
1013 return hwloc_linux_set_pid_cpubind(topology, topology->pid, hwloc_set, flags);
1016 static int
1017 hwloc_linux_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
1019 return hwloc_linux_get_pid_cpubind(topology, topology->pid, hwloc_set, flags);
1022 static int
1023 hwloc_linux_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1025 if (topology->pid) {
1026 errno = ENOSYS;
1027 return -1;
1029 return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
1032 static int
1033 hwloc_linux_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1035 if (topology->pid) {
1036 errno = ENOSYS;
1037 return -1;
1039 return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
1042 #if HAVE_DECL_PTHREAD_SETAFFINITY_NP
1043 #pragma weak pthread_setaffinity_np
1044 #pragma weak pthread_self
1046 static int
1047 hwloc_linux_set_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1049 int err;
1051 if (topology->pid) {
1052 errno = ENOSYS;
1053 return -1;
1056 if (!pthread_self) {
1057 /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
1058 errno = ENOSYS;
1059 return -1;
1061 if (tid == pthread_self())
1062 return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
1064 if (!pthread_setaffinity_np) {
1065 errno = ENOSYS;
1066 return -1;
1068 /* TODO Kerrighed: Use
1069 * int migrate (pid_t pid, int destination_node);
1070 * int migrate_self (int destination_node);
1071 * int thread_migrate (int thread_id, int destination_node);
1074 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
1075 /* Use a separate block so that we can define specific variable
1076 types here */
1078 cpu_set_t *plinux_set;
1079 unsigned cpu;
1080 int last;
1081 size_t setsize;
1083 last = hwloc_bitmap_last(hwloc_set);
1084 if (last == -1) {
1085 errno = EINVAL;
1086 return -1;
1089 setsize = CPU_ALLOC_SIZE(last+1);
1090 plinux_set = CPU_ALLOC(last+1);
1092 CPU_ZERO_S(setsize, plinux_set);
1093 hwloc_bitmap_foreach_begin(cpu, hwloc_set)
1094 CPU_SET_S(cpu, setsize, plinux_set);
1095 hwloc_bitmap_foreach_end();
1097 err = pthread_setaffinity_np(tid, setsize, plinux_set);
1099 CPU_FREE(plinux_set);
1101 #elif defined(HWLOC_HAVE_CPU_SET)
1102 /* Use a separate block so that we can define specific variable
1103 types here */
1105 cpu_set_t linux_set;
1106 unsigned cpu;
1108 CPU_ZERO(&linux_set);
1109 hwloc_bitmap_foreach_begin(cpu, hwloc_set)
1110 CPU_SET(cpu, &linux_set);
1111 hwloc_bitmap_foreach_end();
1113 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1114 err = pthread_setaffinity_np(tid, &linux_set);
1115 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1116 err = pthread_setaffinity_np(tid, sizeof(linux_set), &linux_set);
1117 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1119 #else /* CPU_SET */
1120 /* Use a separate block so that we can define specific variable
1121 types here */
1123 unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
1125 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1126 err = pthread_setaffinity_np(tid, (void*) &mask);
1127 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1128 err = pthread_setaffinity_np(tid, sizeof(mask), (void*) &mask);
1129 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1131 #endif /* CPU_SET */
1133 if (err) {
1134 errno = err;
1135 return -1;
1137 return 0;
1139 #endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
1141 #if HAVE_DECL_PTHREAD_GETAFFINITY_NP
1142 #pragma weak pthread_getaffinity_np
1143 #pragma weak pthread_self
1145 static int
1146 hwloc_linux_get_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1148 int err;
1150 if (topology->pid) {
1151 errno = ENOSYS;
1152 return -1;
1155 if (!pthread_self) {
1156 /* ?! Application uses get_thread_cpubind, but doesn't link against libpthread ?! */
1157 errno = ENOSYS;
1158 return -1;
1160 if (tid == pthread_self())
1161 return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
1163 if (!pthread_getaffinity_np) {
1164 errno = ENOSYS;
1165 return -1;
1167 /* TODO Kerrighed */
1169 #if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
1170 /* Use a separate block so that we can define specific variable
1171 types here */
1173 cpu_set_t *plinux_set;
1174 unsigned cpu;
1175 int last;
1176 size_t setsize;
1178 last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
1179 assert (last != -1);
1181 setsize = CPU_ALLOC_SIZE(last+1);
1182 plinux_set = CPU_ALLOC(last+1);
1184 err = pthread_getaffinity_np(tid, setsize, plinux_set);
1185 if (err) {
1186 CPU_FREE(plinux_set);
1187 errno = err;
1188 return -1;
1191 hwloc_bitmap_zero(hwloc_set);
1192 for(cpu=0; cpu<=(unsigned) last; cpu++)
1193 if (CPU_ISSET_S(cpu, setsize, plinux_set))
1194 hwloc_bitmap_set(hwloc_set, cpu);
1196 CPU_FREE(plinux_set);
1198 #elif defined(HWLOC_HAVE_CPU_SET)
1199 /* Use a separate block so that we can define specific variable
1200 types here */
1202 cpu_set_t linux_set;
1203 unsigned cpu;
1205 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1206 err = pthread_getaffinity_np(tid, &linux_set);
1207 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1208 err = pthread_getaffinity_np(tid, sizeof(linux_set), &linux_set);
1209 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1210 if (err) {
1211 errno = err;
1212 return -1;
1215 hwloc_bitmap_zero(hwloc_set);
1216 for(cpu=0; cpu<CPU_SETSIZE; cpu++)
1217 if (CPU_ISSET(cpu, &linux_set))
1218 hwloc_bitmap_set(hwloc_set, cpu);
1220 #else /* CPU_SET */
1221 /* Use a separate block so that we can define specific variable
1222 types here */
1224 unsigned long mask;
1226 #ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
1227 err = pthread_getaffinity_np(tid, (void*) &mask);
1228 #else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1229 err = pthread_getaffinity_np(tid, sizeof(mask), (void*) &mask);
1230 #endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
1231 if (err) {
1232 errno = err;
1233 return -1;
1236 hwloc_bitmap_from_ulong(hwloc_set, mask);
1238 #endif /* CPU_SET */
1240 return 0;
1242 #endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
1245 hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid, hwloc_bitmap_t set)
1247 /* read /proc/pid/stat.
1248 * its second field contains the command name between parentheses,
1249 * and the command itself may contain parentheses,
1250 * so read the whole line and find the last closing parenthesis to find the third field.
1252 char buf[1024] = "";
1253 char name[64];
1254 char *tmp;
1255 int fd, i, err;
1257 /* TODO: find a way to use sched_getcpu().
1258 * either compare tid with gettid() in all callbacks.
1259 * or pass gettid() in the callback data.
1262 if (!tid) {
1263 #ifdef SYS_gettid
1264 tid = syscall(SYS_gettid);
1265 #else
1266 errno = ENOSYS;
1267 return -1;
1268 #endif
1271 snprintf(name, sizeof(name), "/proc/%lu/stat", (unsigned long) tid);
1272 fd = open(name, O_RDONLY); /* no fsroot for real /proc */
1273 if (fd < 0) {
1274 errno = ENOSYS;
1275 return -1;
1277 err = read(fd, buf, sizeof(buf)-1); /* read at most sizeof(buf)-1 bytes to leave room for the ending \0 */
1278 close(fd);
1279 if (err <= 0) {
1280 errno = ENOSYS;
1281 return -1;
1283 buf[err-1] = '\0';
1285 tmp = strrchr(buf, ')');
1286 if (!tmp) {
1287 errno = ENOSYS;
1288 return -1;
1290 /* skip ') ' to find the actual third argument */
1291 tmp += 2;
1293 /* skip 36 fields (fields 3-38) to reach the 'processor' field */
1294 for(i=0; i<36; i++) {
1295 tmp = strchr(tmp, ' ');
1296 if (!tmp) {
1297 errno = ENOSYS;
1298 return -1;
1300 /* skip the ' ' itself */
1301 tmp++;
1304 /* read the last cpu from the 39th field ('processor') now */
1305 if (sscanf(tmp, "%d ", &i) != 1) {
1306 errno = ENOSYS;
1307 return -1;
1310 hwloc_bitmap_only(set, i);
1311 return 0;
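/* Example /proc/<tid>/stat content (abridged, hypothetical values): a thread
 * whose command name is "a) b" yields a line like
 *   "1234 (a) b) R 1 1234 ... 3 ..."
 * so only the *last* ')' reliably delimits the command name; the 'processor'
 * field near the end (3 in this example) is the CPU the thread last ran on.
 */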
1314 /* Per-tid proc_get_last_cpu_location callback data, callback function and caller */
1315 struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s {
1316 hwloc_bitmap_t cpuset;
1317 hwloc_bitmap_t tidset;
1320 static int
1321 hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
1323 struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s *data = _data;
1324 hwloc_bitmap_t cpuset = data->cpuset;
1325 hwloc_bitmap_t tidset = data->tidset;
1327 if (hwloc_linux_get_tid_last_cpu_location(topology, tid, tidset))
1328 return -1;
1330 /* reset the cpuset on first iteration */
1331 if (!idx)
1332 hwloc_bitmap_zero(cpuset);
1334 hwloc_bitmap_or(cpuset, cpuset, tidset);
1335 return 0;
1338 static int
1339 hwloc_linux_get_pid_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1341 struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s data;
1342 hwloc_bitmap_t tidset = hwloc_bitmap_alloc();
1343 int ret;
1345 data.cpuset = hwloc_set;
1346 data.tidset = tidset;
1347 ret = hwloc_linux_foreach_proc_tid(topology, pid,
1348 hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb,
1349 &data);
1350 hwloc_bitmap_free(tidset);
1351 return ret;
1354 static int
1355 hwloc_linux_get_proc_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
1357 if (pid == 0)
1358 pid = topology->pid;
1359 if (flags & HWLOC_CPUBIND_THREAD)
1360 return hwloc_linux_get_tid_last_cpu_location(topology, pid, hwloc_set);
1361 else
1362 return hwloc_linux_get_pid_last_cpu_location(topology, pid, hwloc_set, flags);
1365 static int
1366 hwloc_linux_get_thisproc_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
1368 return hwloc_linux_get_pid_last_cpu_location(topology, topology->pid, hwloc_set, flags);
1371 static int
1372 hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
1374 if (topology->pid) {
1375 errno = ENOSYS;
1376 return -1;
1379 #if HAVE_DECL_SCHED_GETCPU
1381 int pu = sched_getcpu();
1382 if (pu >= 0) {
1383 hwloc_bitmap_only(hwloc_set, pu);
1384 return 0;
1387 #endif
1389 return hwloc_linux_get_tid_last_cpu_location(topology, 0, hwloc_set);
1394 /***************************
1395 ****** Membind hooks ******
1396 ***************************/
1398 #if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
1400 /* MPOL_LOCAL is not in numaif.h, and it's an enum in linux/mempolicy.h, so define our own to avoid conflicts */
1401 #define HWLOC_MPOL_LOCAL 4
1403 static int
1404 hwloc_linux_membind_policy_from_hwloc(int *linuxpolicy, hwloc_membind_policy_t policy, int flags)
1406 switch (policy) {
1407 case HWLOC_MEMBIND_DEFAULT:
1408 *linuxpolicy = MPOL_DEFAULT;
1409 break;
1410 case HWLOC_MEMBIND_FIRSTTOUCH:
1411 *linuxpolicy = HWLOC_MPOL_LOCAL;
1412 break;
1413 case HWLOC_MEMBIND_BIND:
1414 if (flags & HWLOC_MEMBIND_STRICT)
1415 *linuxpolicy = MPOL_BIND;
1416 else
1417 *linuxpolicy = MPOL_PREFERRED;
1418 break;
1419 case HWLOC_MEMBIND_INTERLEAVE:
1420 *linuxpolicy = MPOL_INTERLEAVE;
1421 break;
1422 /* TODO: next-touch when (if?) patch applied upstream */
1423 default:
1424 errno = ENOSYS;
1425 return -1;
1427 return 0;
1430 static int
1431 hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
1432 hwloc_const_nodeset_t nodeset,
1433 unsigned *max_os_index_p, unsigned long **linuxmaskp)
1435 unsigned max_os_index = 0; /* highest os_index + 1 */
1436 unsigned long *linuxmask;
1437 unsigned i;
1438 hwloc_nodeset_t linux_nodeset = NULL;
1440 if (hwloc_bitmap_isfull(nodeset)) {
1441 linux_nodeset = hwloc_bitmap_alloc();
1442 hwloc_bitmap_only(linux_nodeset, 0);
1443 nodeset = linux_nodeset;
1446 max_os_index = hwloc_bitmap_last(nodeset);
1447 if (max_os_index == (unsigned) -1)
1448 max_os_index = 0;
1449 /* add 1 to convert the last os_index into a max_os_index,
1450 * and round up to the nearest multiple of BITS_PER_LONG */
1451 max_os_index = (max_os_index + 1 + HWLOC_BITS_PER_LONG - 1) & ~(HWLOC_BITS_PER_LONG - 1);
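/* e.g. with HWLOC_BITS_PER_LONG == 64 and a highest node os_index of 68:
 * (68 + 1 + 63) & ~63 = 128, so two unsigned longs are allocated below. */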
1453 linuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
1454 if (!linuxmask) {
1455 hwloc_bitmap_free(linux_nodeset);
1456 errno = ENOMEM;
1457 return -1;
1460 for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1461 linuxmask[i] = hwloc_bitmap_to_ith_ulong(nodeset, i);
1463 if (linux_nodeset)
1464 hwloc_bitmap_free(linux_nodeset);
1466 *max_os_index_p = max_os_index;
1467 *linuxmaskp = linuxmask;
1468 return 0;
1471 static void
1472 hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
1473 hwloc_nodeset_t nodeset,
1474 unsigned max_os_index, const unsigned long *linuxmask)
1476 unsigned i;
1478 #ifdef HWLOC_DEBUG
1479 /* max_os_index comes from hwloc_linux_find_kernel_max_numnodes() so it's a multiple of HWLOC_BITS_PER_LONG */
1480 assert(!(max_os_index%HWLOC_BITS_PER_LONG));
1481 #endif
1483 hwloc_bitmap_zero(nodeset);
1484 for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1485 hwloc_bitmap_set_ith_ulong(nodeset, i, linuxmask[i]);
1487 #endif /* HWLOC_HAVE_SET_MEMPOLICY || HWLOC_HAVE_MBIND */
1489 #ifdef HWLOC_HAVE_MBIND
1490 static int
1491 hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1493 unsigned max_os_index; /* highest os_index + 1 */
1494 unsigned long *linuxmask;
1495 size_t remainder;
1496 int linuxpolicy;
1497 unsigned linuxflags = 0;
1498 int err;
1500 remainder = (uintptr_t) addr & (hwloc_getpagesize()-1);
1501 addr = (char*) addr - remainder;
1502 len += remainder;
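/* e.g. with 4096-byte pages, addr 0x1234 and len 100 become addr 0x1000 and
 * len 664 (100 + 0x234), so mbind() below always sees a page-aligned range
 * covering the caller's buffer. */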
1504 err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
1505 if (err < 0)
1506 return err;
1508 if (linuxpolicy == MPOL_DEFAULT) {
1509 /* Some Linux kernels don't like being passed a set */
1510 return mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);
1512 } else if (linuxpolicy == HWLOC_MPOL_LOCAL) {
1513 /* MPOL_LOCAL isn't supported before 3.8, and it's identical to PREFERRED with no nodeset, which was supported way before */
1514 return mbind((void *) addr, len, MPOL_PREFERRED, NULL, 0, 0);
1517 err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
1518 if (err < 0)
1519 goto out;
1521 if (flags & HWLOC_MEMBIND_MIGRATE) {
1522 #ifdef MPOL_MF_MOVE
1523 linuxflags = MPOL_MF_MOVE;
1524 if (flags & HWLOC_MEMBIND_STRICT)
1525 linuxflags |= MPOL_MF_STRICT;
1526 #else
1527 if (flags & HWLOC_MEMBIND_STRICT) {
1528 errno = ENOSYS;
1529 goto out_with_mask;
1531 #endif
1534 err = mbind((void *) addr, len, linuxpolicy, linuxmask, max_os_index+1, linuxflags);
1535 if (err < 0)
1536 goto out_with_mask;
1538 free(linuxmask);
1539 return 0;
1541 out_with_mask:
1542 free(linuxmask);
1543 out:
1544 return -1;
1547 static void *
1548 hwloc_linux_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1550 void *buffer;
1551 int err;
1553 buffer = hwloc_alloc_mmap(topology, len);
1554 if (!buffer)
1555 return NULL;
1557 err = hwloc_linux_set_area_membind(topology, buffer, len, nodeset, policy, flags);
1558 if (err < 0 && (flags & HWLOC_MEMBIND_STRICT)) {
1559 munmap(buffer, len);
1560 return NULL;
1563 return buffer;
1565 #endif /* HWLOC_HAVE_MBIND */
1567 #ifdef HWLOC_HAVE_SET_MEMPOLICY
1568 static int
1569 hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
1571 unsigned max_os_index; /* highest os_index + 1 */
1572 unsigned long *linuxmask;
1573 int linuxpolicy;
1574 int err;
1576 err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
1577 if (err < 0)
1578 return err;
1580 if (linuxpolicy == MPOL_DEFAULT) {
1581 /* Some Linux kernels don't like being passed a set */
1582 return set_mempolicy(linuxpolicy, NULL, 0);
1584 } else if (linuxpolicy == HWLOC_MPOL_LOCAL) {
1585 /* MPOL_LOCAL isn't supported before 3.8, and it's identical to PREFERRED with no nodeset, which was supported way before */
1586 return set_mempolicy(MPOL_PREFERRED, NULL, 0);
1589 err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
1590 if (err < 0)
1591 goto out;
1593 if (flags & HWLOC_MEMBIND_MIGRATE) {
1594 #ifdef HWLOC_HAVE_MIGRATE_PAGES
1595 unsigned long *fullmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1596 if (fullmask) {
1597 memset(fullmask, 0xff, max_os_index/HWLOC_BITS_PER_LONG * sizeof(long)); /* all node bits set, so pages may be migrated from anywhere */
1598 err = migrate_pages(0, max_os_index+1, fullmask, linuxmask);
1599 free(fullmask);
1600 } else
1601 err = -1;
1602 if (err < 0 && (flags & HWLOC_MEMBIND_STRICT))
1603 goto out_with_mask;
1604 #else
1605 errno = ENOSYS;
1606 goto out_with_mask;
1607 #endif
1610 err = set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
1611 if (err < 0)
1612 goto out_with_mask;
1614 free(linuxmask);
1615 return 0;
1617 out_with_mask:
1618 free(linuxmask);
1619 out:
1620 return -1;
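/* Usage sketch (hypothetical caller): interleave this thread's future
 * allocations across NUMA nodes 0 and 1:
 *
 *   hwloc_nodeset_t ns = hwloc_bitmap_alloc();
 *   hwloc_bitmap_set(ns, 0);
 *   hwloc_bitmap_set(ns, 1);
 *   hwloc_linux_set_thisthread_membind(topology, ns, HWLOC_MEMBIND_INTERLEAVE, 0);
 *   hwloc_bitmap_free(ns);
 */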
1624 * On some kernels, get_mempolicy requires the output size to be larger
1625 * than the kernel MAX_NUMNODES (defined by CONFIG_NODES_SHIFT).
1626 * Try get_mempolicy on ourself until we find a max_os_index value that
1627 * makes the kernel happy.
1629 static int
1630 hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)
1632 static int _max_numnodes = -1, max_numnodes;
1633 int linuxpolicy;
1635 if (_max_numnodes != -1)
1636 /* already computed */
1637 return _max_numnodes;
1639 /* start with a single ulong, it's the minimal and it's enough for most machines */
1640 max_numnodes = HWLOC_BITS_PER_LONG;
1641 while (1) {
1642 unsigned long *mask = malloc(max_numnodes / HWLOC_BITS_PER_LONG * sizeof(long));
1643 int err = get_mempolicy(&linuxpolicy, mask, max_numnodes, 0, 0);
1644 free(mask);
1645 if (!err || errno != EINVAL)
1646 /* Found it. Only update the static value with the final one,
1647 * to avoid sharing intermediate values that we modify,
1648 * in case there are ever multiple concurrent calls.
1650 return _max_numnodes = max_numnodes;
1651 max_numnodes *= 2;
1655 static int
1656 hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *policy)
1658 switch (linuxpolicy) {
1659 case MPOL_DEFAULT:
1660 case HWLOC_MPOL_LOCAL: /* converted from MPOL_PREFERRED + empty nodeset by the caller */
1661 *policy = HWLOC_MEMBIND_FIRSTTOUCH;
1662 return 0;
1663 case MPOL_PREFERRED:
1664 case MPOL_BIND:
1665 *policy = HWLOC_MEMBIND_BIND;
1666 return 0;
1667 case MPOL_INTERLEAVE:
1668 *policy = HWLOC_MEMBIND_INTERLEAVE;
1669 return 0;
1670 default:
1671 errno = EINVAL;
1672 return -1;
1676 static int hwloc_linux_mask_is_empty(unsigned max_os_index, unsigned long *linuxmask)
1678 unsigned i;
1679 for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1680 if (linuxmask[i])
1681 return 0;
1682 return 1;
1685 static int
1686 hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
1688 unsigned max_os_index;
1689 unsigned long *linuxmask;
1690 int linuxpolicy;
1691 int err;
1693 max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
1695 linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1696 if (!linuxmask) {
1697 errno = ENOMEM;
1698 goto out;
1701 err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, 0, 0);
1702 if (err < 0)
1703 goto out_with_mask;
1705 /* MPOL_PREFERRED with empty mask is MPOL_LOCAL */
1706 if (linuxpolicy == MPOL_PREFERRED && hwloc_linux_mask_is_empty(max_os_index, linuxmask))
1707 linuxpolicy = HWLOC_MPOL_LOCAL;
1709 if (linuxpolicy == MPOL_DEFAULT || linuxpolicy == HWLOC_MPOL_LOCAL) {
1710 hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
1711 } else {
1712 hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, linuxmask);
1715 err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
1716 if (err < 0)
1717 goto out_with_mask;
1719 free(linuxmask);
1720 return 0;
1722 out_with_mask:
1723 free(linuxmask);
1724 out:
1725 return -1;
1728 static int
1729 hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
1731 unsigned max_os_index;
1732 unsigned long *linuxmask, *globallinuxmask;
1733 int linuxpolicy, globallinuxpolicy = 0;
1734 int mixed = 0;
1735 int full = 0;
1736 int first = 1;
1737 int pagesize = hwloc_getpagesize();
1738 char *tmpaddr;
1739 int err;
1740 unsigned i;
1742 max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
1744 linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
1745 if (!linuxmask) {
1746 errno = ENOMEM;
1747 goto out;
1749 globallinuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
1750 if (!globallinuxmask) {
1751 errno = ENOMEM;
1752 goto out_with_masks;
1755 for(tmpaddr = (char *)((unsigned long)addr & ~(pagesize-1));
1756 tmpaddr < (char *)addr + len;
1757 tmpaddr += pagesize) {
1758 err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, tmpaddr, MPOL_F_ADDR);
1759 if (err < 0)
1760 goto out_with_masks;
1762 /* MPOL_PREFERRED with empty mask is MPOL_LOCAL */
1763 if (linuxpolicy == MPOL_PREFERRED && hwloc_linux_mask_is_empty(max_os_index, linuxmask))
1764 linuxpolicy = HWLOC_MPOL_LOCAL;
1766 /* use the first found policy. if we find a different one later, set mixed to 1 */
1767 if (first)
1768 globallinuxpolicy = linuxpolicy;
1769 else if (globallinuxpolicy != linuxpolicy)
1770 mixed = 1;
1772 /* aggregate masks, and set full to 1 if we ever find DEFAULT or LOCAL */
1773 if (full || linuxpolicy == MPOL_DEFAULT || linuxpolicy == HWLOC_MPOL_LOCAL) {
1774 full = 1;
1775 } else {
1776 for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
1777 globallinuxmask[i] |= linuxmask[i];
1780 first = 0;
1783 if (mixed) {
1784 *policy = HWLOC_MEMBIND_MIXED;
1785 } else {
1786 err = hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy);
1787 if (err < 0)
1788 goto out_with_masks;
1791 if (full) {
1792 hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
1793 } else {
1794 hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, globallinuxmask);
1797 free(globallinuxmask);
1798 free(linuxmask);
1799 return 0;
1801 out_with_masks:
1802 free(globallinuxmask);
1803 free(linuxmask);
1804 out:
1805 return -1;
1808 #endif /* HWLOC_HAVE_SET_MEMPOLICY */
1810 #ifdef HWLOC_HAVE_MOVE_PAGES
1811 static int
1812 hwloc_linux_get_area_memlocation(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr, size_t len, hwloc_nodeset_t nodeset, int flags __hwloc_attribute_unused)
1814 unsigned offset;
1815 unsigned long count;
1816 void **pages;
1817 int *status;
1818 int pagesize = hwloc_getpagesize();
1819 int ret;
1820 unsigned i;
1822 offset = ((unsigned long) addr) & (pagesize-1);
1823 addr = ((char*) addr) - offset;
1824 len += offset;
1825 count = (len + pagesize-1)/pagesize;
1826 pages = malloc(count*sizeof(*pages));
1827 status = malloc(count*sizeof(*status));
1828 if (!pages || !status) {
1829 ret = -1;
1830 goto out_with_pages;
1833 for(i=0; i<count; i++)
1834 pages[i] = ((char*)addr) + i*pagesize;
1836 ret = move_pages(0, count, pages, NULL, status, 0);
1837 if (ret < 0)
1838 goto out_with_pages;
1840 hwloc_bitmap_zero(nodeset);
1841 for(i=0; i<count; i++)
1842 if (status[i] >= 0)
1843 hwloc_bitmap_set(nodeset, status[i]);
1844 ret = 0;
1846 out_with_pages:
1847 free(pages);
1848 free(status);
1849 return ret;
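/* Illustration (hypothetical numbers): for a 3-page buffer whose pages sit on
 * NUMA nodes 0, 0 and 2, move_pages() with a NULL target array fills
 * status[] = {0, 0, 2} and the loop above sets bits 0 and 2 in the nodeset;
 * pages that are not allocated yet get a negative status and are skipped.
 */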
1851 #endif /* HWLOC_HAVE_MOVE_PAGES */
1853 static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep);
1855 static int hwloc_linux_get_allowed_resources_hook(hwloc_topology_t topology)
1857 const char *fsroot_path;
1858 char *cpuset_name;
1859 int root_fd = -1;
1861 fsroot_path = getenv("HWLOC_FSROOT");
1862 if (!fsroot_path)
1863 fsroot_path = "/";
1865 #ifdef HAVE_OPENAT
1866 root_fd = open(fsroot_path, O_RDONLY | O_DIRECTORY);
1867 if (root_fd < 0)
1868 goto out;
1869 #else
1870 if (strcmp(fsroot_path, "/")) {
1871 errno = ENOSYS;
1872 goto out;
1874 #endif
1876 /* we could also error-out if the current topology doesn't actually match the system,
1877 * at least for PUs and NUMA nodes. But it would increase the overhead of loading XMLs.
1879 * Just trust the user when they set THISSYSTEM=1. It enables hacky
1880 * tests such as restricting random XML or synthetic to the current
1881 * machine (uses the default cgroup).
1884 hwloc_linux__get_allowed_resources(topology, fsroot_path, root_fd, &cpuset_name);
1885 if (cpuset_name) {
1886 hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
1887 free(cpuset_name);
1889 if (root_fd != -1)
1890 close(root_fd);
1892 out:
1893 return -1;
1896 void
1897 hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
1898 struct hwloc_topology_support *support __hwloc_attribute_unused)
1900 hooks->set_thisthread_cpubind = hwloc_linux_set_thisthread_cpubind;
1901 hooks->get_thisthread_cpubind = hwloc_linux_get_thisthread_cpubind;
1902 hooks->set_thisproc_cpubind = hwloc_linux_set_thisproc_cpubind;
1903 hooks->get_thisproc_cpubind = hwloc_linux_get_thisproc_cpubind;
1904 hooks->set_proc_cpubind = hwloc_linux_set_proc_cpubind;
1905 hooks->get_proc_cpubind = hwloc_linux_get_proc_cpubind;
1906 #if HAVE_DECL_PTHREAD_SETAFFINITY_NP
1907 hooks->set_thread_cpubind = hwloc_linux_set_thread_cpubind;
1908 #endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
1909 #if HAVE_DECL_PTHREAD_GETAFFINITY_NP
1910 hooks->get_thread_cpubind = hwloc_linux_get_thread_cpubind;
1911 #endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
1912 hooks->get_thisthread_last_cpu_location = hwloc_linux_get_thisthread_last_cpu_location;
1913 hooks->get_thisproc_last_cpu_location = hwloc_linux_get_thisproc_last_cpu_location;
1914 hooks->get_proc_last_cpu_location = hwloc_linux_get_proc_last_cpu_location;
1915 #ifdef HWLOC_HAVE_SET_MEMPOLICY
1916 hooks->set_thisthread_membind = hwloc_linux_set_thisthread_membind;
1917 hooks->get_thisthread_membind = hwloc_linux_get_thisthread_membind;
1918 hooks->get_area_membind = hwloc_linux_get_area_membind;
1919 #endif /* HWLOC_HAVE_SET_MEMPOLICY */
1920 #ifdef HWLOC_HAVE_MBIND
1921 hooks->set_area_membind = hwloc_linux_set_area_membind;
1922 #ifdef HWLOC_HAVE_MOVE_PAGES
1923 hooks->get_area_memlocation = hwloc_linux_get_area_memlocation;
1924 #endif /* HWLOC_HAVE_MOVE_PAGES */
1925 hooks->alloc_membind = hwloc_linux_alloc_membind;
1926 hooks->alloc = hwloc_alloc_mmap;
1927 hooks->free_membind = hwloc_free_mmap;
1928 support->membind->firsttouch_membind = 1;
1929 support->membind->bind_membind = 1;
1930 support->membind->interleave_membind = 1;
1931 #endif /* HWLOC_HAVE_MBIND */
1932 #if (defined HWLOC_HAVE_MIGRATE_PAGES) || ((defined HWLOC_HAVE_MBIND) && (defined MPOL_MF_MOVE))
1933 support->membind->migrate_membind = 1;
1934 #endif
1935 hooks->get_allowed_resources = hwloc_linux_get_allowed_resources_hook;
1939 /*******************************************
1940 *** Misc Helpers for Topology Discovery ***
1941 *******************************************/
1943 /* cpuinfo array */
1944 struct hwloc_linux_cpuinfo_proc {
1945 /* set during hwloc_linux_parse_cpuinfo */
1946 unsigned long Pproc;
1947 /* set during hwloc_linux_parse_cpuinfo or -1 if unknown*/
1948 long Pcore, Ppkg;
1949 /* set later, or -1 if unknown */
1950 long Lcore, Lpkg;
1952 /* custom info, set during hwloc_linux_parse_cpuinfo */
1953 struct hwloc_obj_info_s *infos;
1954 unsigned infos_count;
1957 /* deprecated but still needed in hwloc/linux.h for backward compat */
1959 hwloc_linux_parse_cpumap_file(FILE *file, hwloc_bitmap_t set)
1961 unsigned long *maps;
1962 unsigned long map;
1963 int nr_maps = 0;
1964 static int _nr_maps_allocated = 8; /* Only compute the power-of-two above the kernel cpumask size once.
1965 * Actually, it may increase multiple times if the first cpumaps we read start with zeroes.
1967 int nr_maps_allocated = _nr_maps_allocated;
1968 int i;
1970 maps = malloc(nr_maps_allocated * sizeof(*maps));
1971 if (!maps)
1972 return -1;
1974 /* reset to zero first */
1975 hwloc_bitmap_zero(set);
1977 /* parse the whole mask */
1978 while (fscanf(file, "%lx,", &map) == 1) /* read one kernel cpu mask and the ending comma */
1980 if (nr_maps == nr_maps_allocated) {
1981 unsigned long *tmp = realloc(maps, 2*nr_maps_allocated * sizeof(*maps));
1982 if (!tmp) {
1983 free(maps);
1984 return -1;
1986 maps = tmp;
1987 nr_maps_allocated *= 2;
1990 if (!map && !nr_maps)
1991 /* ignore the first map if it's empty */
1992 continue;
1994 maps[nr_maps++] = map;
1997 /* convert into a set */
1998 #if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
1999 for(i=0; i<nr_maps; i++)
2000 hwloc_bitmap_set_ith_ulong(set, i, maps[nr_maps-1-i]);
2001 #else
2002 for(i=0; i<(nr_maps+1)/2; i++) {
2003 unsigned long mask;
2004 mask = maps[nr_maps-2*i-1];
2005 if (2*i+1<nr_maps)
2006 mask |= maps[nr_maps-2*i-2] << KERNEL_CPU_MASK_BITS;
2007 hwloc_bitmap_set_ith_ulong(set, i, mask);
2009 #endif
2011 free(maps);
2013 /* Only update the static value with the final one,
2014 * to avoid sharing intermediate values that we modify,
2015 * in case there are ever multiple concurrent calls.
2017 if (nr_maps_allocated > _nr_maps_allocated)
2018 _nr_maps_allocated = nr_maps_allocated;
2019 return 0;
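/* Illustrative sketch only: the sysfs cpumap format parsed above is a
 * comma-separated list of 32-bit hexadecimal words with the most-significant
 * word first, e.g. "00000001,00000003" describes PUs 0, 1 and 32.
 * The hypothetical helper below shows how the parser might be exercised on
 * such a string; it assumes <string.h> for strlen(), the POSIX.1-2008
 * fmemopen(), and reuses the deprecated public entry point declared in
 * hwloc/linux.h.
 */
#if 0
static void example_parse_cpumap_string(const char *s)
{
  hwloc_bitmap_t set = hwloc_bitmap_alloc();
  FILE *f = fmemopen((void *) s, strlen(s), "r"); /* wrap the string in a FILE */
  if (f) {
    hwloc_linux_parse_cpumap_file(f, set); /* fills 'set' with PUs 0, 1 and 32 for the example above */
    fclose(f);
  }
  hwloc_bitmap_free(set);
}
#endif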
2022 static void
2023 hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, const char *root_path)
2025 char *mount_path;
2026 struct mntent mntent;
2027 FILE *fd;
2028 int err;
2029 size_t bufsize;
2030 char *buf;
2032 *cgroup_mntpnt = NULL;
2033 *cpuset_mntpnt = NULL;
2035 if (root_path) {
2036 /* setmntent() doesn't support openat(), so use the root_path directly */
2037 err = asprintf(&mount_path, "%s/proc/mounts", root_path);
2038 if (err < 0)
2039 return;
2040 fd = setmntent(mount_path, "r");
2041 free(mount_path);
2042 } else {
2043 fd = setmntent("/proc/mounts", "r");
2045 if (!fd)
2046 return;
2048 /* getmntent_r() doesn't actually report an error when the buffer
2049 * is too small. It just silently truncates things. So we can't
2050 * dynamically resize things.
2052 * Linux limits the mount type, path strings, and mount options to one page each.
2053 * getmntent() limits the whole line to 4kB.
2054 * So use 4*pagesize to be safely above both.
2056 bufsize = hwloc_getpagesize()*4;
2057 buf = malloc(bufsize);
2059 while (getmntent_r(fd, &mntent, buf, bufsize)) {
2060 if (!strcmp(mntent.mnt_type, "cpuset")) {
2061 hwloc_debug("Found cpuset mount point on %s\n", mntent.mnt_dir);
2062 *cpuset_mntpnt = strdup(mntent.mnt_dir);
2063 break;
2064 } else if (!strcmp(mntent.mnt_type, "cgroup")) {
2065 /* found a cgroup mntpnt */
2066 char *opt, *opts = mntent.mnt_opts;
2067 int cpuset_opt = 0;
2068 int noprefix_opt = 0;
2069 /* look at options */
2070 while ((opt = strsep(&opts, ",")) != NULL) {
2071 if (!strcmp(opt, "cpuset"))
2072 cpuset_opt = 1;
2073 else if (!strcmp(opt, "noprefix"))
2074 noprefix_opt = 1;
2076 if (!cpuset_opt)
2077 continue;
2078 if (noprefix_opt) {
2079 hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", mntent.mnt_dir);
2080 *cpuset_mntpnt = strdup(mntent.mnt_dir);
2081 } else {
2082 hwloc_debug("Found cgroup/cpuset mount point on %s\n", mntent.mnt_dir);
2083 *cgroup_mntpnt = strdup(mntent.mnt_dir);
2085 break;
2089 free(buf);
2090 endmntent(fd);
2094 * Linux cpusets may be managed directly or through cgroup.
2095 * If cgroup is used, tasks get a /proc/pid/cgroup file which may contain a
2096 * line of the form %d:cpuset:<name>. If plain cpusets are used, they get a
2097 * /proc/pid/cpuset file containing <name>.
2099 static char *
2100 hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
2102 #define CPUSET_NAME_LEN 128
2103 char cpuset_name[CPUSET_NAME_LEN];
2104 FILE *file;
2105 int err;
2106 char *tmp;
2108 /* check whether a cgroup-cpuset is enabled */
2109 if (!pid)
2110 file = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
2111 else {
2112 char path[] = "/proc/XXXXXXXXXX/cgroup";
2113 snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
2114 file = hwloc_fopen(path, "r", fsroot_fd);
2116 if (file) {
2117 /* find a cpuset line */
2118 #define CGROUP_LINE_LEN 256
2119 char line[CGROUP_LINE_LEN];
2120 while (fgets(line, sizeof(line), file)) {
2121 char *end, *colon = strchr(line, ':');
2122 if (!colon)
2123 continue;
2124 if (strncmp(colon, ":cpuset:", 8))
2125 continue;
2127 /* found a cgroup-cpuset line, return the name */
2128 fclose(file);
2129 end = strchr(colon, '\n');
2130 if (end)
2131 *end = '\0';
2132 hwloc_debug("Found cgroup-cpuset %s\n", colon+8);
2133 return strdup(colon+8);
2135 fclose(file);
2138 /* check whether a cpuset is enabled */
2139 if (!pid)
2140 err = hwloc_read_path_by_length("/proc/self/cpuset", cpuset_name, sizeof(cpuset_name), fsroot_fd);
2141 else {
2142 char path[] = "/proc/XXXXXXXXXX/cpuset";
2143 snprintf(path, sizeof(path), "/proc/%d/cpuset", pid);
2144 err = hwloc_read_path_by_length(path, cpuset_name, sizeof(cpuset_name), fsroot_fd);
2146 if (err < 0) {
2147 /* found nothing */
2148 hwloc_debug("%s", "No cgroup or cpuset found\n");
2149 return NULL;
2152 /* found a cpuset, return the name */
2153 tmp = strchr(cpuset_name, '\n');
2154 if (tmp)
2155 *tmp = '\0';
2156 hwloc_debug("Found cpuset %s\n", cpuset_name);
2157 return strdup(cpuset_name);
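/* For example (illustrative values): with a cgroup-v1 cpuset controller mounted,
 * /proc/self/cgroup may contain a line such as "4:cpuset:/mypartition", in which
 * case the function above returns "/mypartition"; with a plain cpuset filesystem,
 * /proc/self/cpuset directly contains "/mypartition".
 */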
2161 * Then, the cpuset description is available from either the cgroup or
2162 * the cpuset filesystem (usually mounted in / or /dev), where the
2163 * cgroup<name>/cpuset.{cpus,mems} or cpuset<name>/{cpus,mems} files can be read.
2165 static void
2166 hwloc_admin_disable_set_from_cpuset(int root_fd,
2167 const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
2168 const char *attr_name,
2169 hwloc_bitmap_t admin_enabled_cpus_set)
2171 #define CPUSET_FILENAME_LEN 256
2172 char cpuset_filename[CPUSET_FILENAME_LEN];
2173 int fd;
2174 int err;
2176 if (cgroup_mntpnt) {
2177 /* try to read the cpuset from cgroup */
2178 snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/cpuset.%s", cgroup_mntpnt, cpuset_name, attr_name);
2179 hwloc_debug("Trying to read cgroup file <%s>\n", cpuset_filename);
2180 } else if (cpuset_mntpnt) {
2181 /* try to read the cpuset directly */
2182 snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/%s", cpuset_mntpnt, cpuset_name, attr_name);
2183 hwloc_debug("Trying to read cpuset file <%s>\n", cpuset_filename);
2186 fd = hwloc_open(cpuset_filename, root_fd);
2187 if (fd < 0) {
2188 /* found no cpuset description, ignore it */
2189 hwloc_debug("Couldn't find cpuset <%s> description, ignoring\n", cpuset_name);
2190 return;
2193 err = hwloc__read_fd_as_cpulist(fd, admin_enabled_cpus_set);
2194 close(fd);
2196 if (err < 0)
2197 hwloc_bitmap_fill(admin_enabled_cpus_set);
2198 else
2199 hwloc_debug_bitmap("cpuset includes %s\n", admin_enabled_cpus_set);
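/* Worked example (illustrative values): with cgroup_mntpnt="/sys/fs/cgroup/cpuset",
 * cpuset_name="/mypartition" and attr_name="cpus", the file read above is
 * "/sys/fs/cgroup/cpuset/mypartition/cpuset.cpus"; with a plain cpuset mounted
 * on "/dev/cpuset", it would be "/dev/cpuset/mypartition/cpus" instead.
 */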
2202 static void
2203 hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
2204 const char *path,
2205 uint64_t *local_memory,
2206 uint64_t *meminfo_hugepages_count,
2207 uint64_t *meminfo_hugepages_size,
2208 int onlytotal)
2210 char *tmp;
2211 char buffer[4096];
2212 unsigned long long number;
2214 if (hwloc_read_path_by_length(path, buffer, sizeof(buffer), data->root_fd) < 0)
2215 return;
2217 tmp = strstr(buffer, "MemTotal: "); /* MemTotal: %llu kB */
2218 if (tmp) {
2219 number = strtoull(tmp+10, NULL, 10);
2220 *local_memory = number << 10;
2222 if (onlytotal)
2223 return;
2225 tmp = strstr(tmp, "Hugepagesize: "); /* Hugepagesize: %llu */
2226 if (tmp) {
2227 number = strtoull(tmp+14, NULL, 10);
2228 *meminfo_hugepages_size = number << 10;
2230 tmp = strstr(tmp, "HugePages_Free: "); /* HugePages_Free: %llu */
2231 if (tmp) {
2232 number = strtoull(tmp+16, NULL, 10);
2233 *meminfo_hugepages_count = number;
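/* Sample /proc/meminfo excerpt (illustrative values) matched by the code above:
 *   MemTotal:       65842180 kB      -> local_memory = 65842180 << 10 bytes
 *   Hugepagesize:       2048 kB      -> meminfo_hugepages_size = 2048 << 10 bytes
 *   HugePages_Free:       16         -> meminfo_hugepages_count = 16
 */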
2239 #define SYSFS_NUMA_NODE_PATH_LEN 128
2241 static void
2242 hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
2243 const char *dirpath,
2244 struct hwloc_obj_memory_s *memory,
2245 uint64_t *remaining_local_memory)
2247 DIR *dir;
2248 struct dirent *dirent;
2249 unsigned long index_ = 1;
2250 char line[64];
2251 char path[SYSFS_NUMA_NODE_PATH_LEN];
2253 dir = hwloc_opendir(dirpath, data->root_fd);
2254 if (dir) {
2255 while ((dirent = readdir(dir)) != NULL) {
2256 int err;
2257 if (strncmp(dirent->d_name, "hugepages-", 10))
2258 continue;
2259 memory->page_types[index_].size = strtoul(dirent->d_name+10, NULL, 0) * 1024ULL;
2260 err = snprintf(path, sizeof(path), "%s/%s/nr_hugepages", dirpath, dirent->d_name);
2261 if ((size_t) err < sizeof(path)
2262 && !hwloc_read_path_by_length(path, line, sizeof(line), data->root_fd)) {
2263 /* this is the actual total number of huge pages */
2264 memory->page_types[index_].count = strtoull(line, NULL, 0);
2265 *remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
2266 index_++;
2269 closedir(dir);
2270 memory->page_types_len = index_;
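/* Example sysfs layout (illustrative values) walked by the code above:
 *   /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages     -> 16
 *   /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages  -> 0
 * yielding page_types[] entries of size 2MiB (count 16) and 1GiB (count 0),
 * while page_types[0] remains reserved for normal pages.
 */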
2274 static void
2275 hwloc_get_kerrighed_node_meminfo_info(struct hwloc_topology *topology,
2276 struct hwloc_linux_backend_data_s *data,
2277 unsigned long node, struct hwloc_obj_memory_s *memory)
2279 char path[128];
2280 uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
2281 int err;
2283 if (topology->is_thissystem) {
2284 memory->page_types_len = 2;
2285 memory->page_types = malloc(2*sizeof(*memory->page_types));
2286 memset(memory->page_types, 0, 2*sizeof(*memory->page_types));
2287 /* Try to get the hugepage size from sysconf in case we fail to get it from /proc/meminfo later */
2288 #ifdef HAVE__SC_LARGE_PAGESIZE
2289 memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
2290 #endif
2291 memory->page_types[0].size = data->pagesize;
2294 err = snprintf(path, sizeof(path), "/proc/nodes/node%lu/meminfo", node);
2295 if ((size_t) err < sizeof(path))
2296 hwloc_parse_meminfo_info(data, path,
2297 &memory->local_memory,
2298 &meminfo_hugepages_count, &meminfo_hugepages_size,
2299 memory->page_types == NULL);
2301 if (memory->page_types) {
2302 uint64_t remaining_local_memory = memory->local_memory;
2303 if (meminfo_hugepages_size) {
2304 memory->page_types[1].size = meminfo_hugepages_size;
2305 memory->page_types[1].count = meminfo_hugepages_count;
2306 remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2307 } else {
2308 memory->page_types_len = 1;
2310 memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2314 static void
2315 hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
2316 struct hwloc_linux_backend_data_s *data,
2317 struct hwloc_obj_memory_s *memory)
2319 uint64_t meminfo_hugepages_count, meminfo_hugepages_size = 0;
2320 struct stat st;
2321 int has_sysfs_hugepages = 0;
2322 const char *pagesize_env = getenv("HWLOC_DEBUG_PAGESIZE");
2323 int types = 2;
2324 int err;
2326 err = hwloc_stat("/sys/kernel/mm/hugepages", &st, data->root_fd);
2327 if (!err) {
2328 types = 1 + st.st_nlink-2;
2329 has_sysfs_hugepages = 1;
2332 if (topology->is_thissystem || pagesize_env) {
2333 /* we cannot report any page_type info unless we have the page size.
2334 * we'll take it either from the system if local, or from the debug env variable
2336 memory->page_types_len = types;
2337 memory->page_types = calloc(types, sizeof(*memory->page_types));
2340 if (topology->is_thissystem) {
2341 /* Get the page and hugepage sizes from sysconf */
2342 #if HAVE_DECL__SC_LARGE_PAGESIZE
2343 memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
2344 #endif
2345 memory->page_types[0].size = data->pagesize; /* might be overwritten later by /proc/meminfo or sysfs */
2348 hwloc_parse_meminfo_info(data, "/proc/meminfo",
2349 &memory->local_memory,
2350 &meminfo_hugepages_count, &meminfo_hugepages_size,
2351 memory->page_types == NULL);
2353 if (memory->page_types) {
2354 uint64_t remaining_local_memory = memory->local_memory;
2355 if (has_sysfs_hugepages) {
2356 /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
2357 hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
2358 } else {
2359 /* use what we found in meminfo */
2360 if (meminfo_hugepages_size) {
2361 memory->page_types[1].size = meminfo_hugepages_size;
2362 memory->page_types[1].count = meminfo_hugepages_count;
2363 remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2364 } else {
2365 memory->page_types_len = 1;
2369 if (pagesize_env) {
2370 /* We cannot get the pagesize if not thissystem; use the env-given one to exercise this code during make check */
2371 memory->page_types[0].size = strtoull(pagesize_env, NULL, 10);
2372 /* If failed, use 4kB */
2373 if (!memory->page_types[0].size)
2374 memory->page_types[0].size = 4096;
2376 assert(memory->page_types[0].size); /* from sysconf if local or from the env */
2377 /* memory->page_types[1].size from sysconf if local, or from /proc/meminfo, or from sysfs,
2378 * may be 0 if no hugepage support in the kernel */
2380 memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2384 static void
2385 hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
2386 struct hwloc_linux_backend_data_s *data,
2387 const char *syspath, int node,
2388 struct hwloc_obj_memory_s *memory)
2390 char path[SYSFS_NUMA_NODE_PATH_LEN];
2391 char meminfopath[SYSFS_NUMA_NODE_PATH_LEN];
2392 uint64_t meminfo_hugepages_count = 0;
2393 uint64_t meminfo_hugepages_size = 0;
2394 struct stat st;
2395 int has_sysfs_hugepages = 0;
2396 int types = 2;
2397 int err;
2399 sprintf(path, "%s/node%d/hugepages", syspath, node);
2400 err = hwloc_stat(path, &st, data->root_fd);
2401 if (!err) {
2402 types = 1 + st.st_nlink-2;
2403 has_sysfs_hugepages = 1;
2406 if (topology->is_thissystem) {
2407 memory->page_types_len = types;
2408 memory->page_types = malloc(types*sizeof(*memory->page_types));
2409 memset(memory->page_types, 0, types*sizeof(*memory->page_types));
2412 sprintf(meminfopath, "%s/node%d/meminfo", syspath, node);
2413 hwloc_parse_meminfo_info(data, meminfopath,
2414 &memory->local_memory,
2415 &meminfo_hugepages_count, NULL /* no hugepage size in node-specific meminfo */,
2416 memory->page_types == NULL);
2418 if (memory->page_types) {
2419 uint64_t remaining_local_memory = memory->local_memory;
2420 if (has_sysfs_hugepages) {
2421 /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
2422 hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
2423 } else {
2424 /* get hugepage size from machine-specific meminfo since there is no size in node-specific meminfo,
2425 * hwloc_get_procfs_meminfo_info must have been called earlier */
2426 meminfo_hugepages_size = topology->levels[0][0]->memory.page_types[1].size;
2427 /* use what we found in meminfo */
2428 if (meminfo_hugepages_size) {
2429 memory->page_types[1].count = meminfo_hugepages_count;
2430 memory->page_types[1].size = meminfo_hugepages_size;
2431 remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
2432 } else {
2433 memory->page_types_len = 1;
2436 /* update what's remaining as normal pages */
2437 memory->page_types[0].size = data->pagesize;
2438 memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
2442 static int
2443 hwloc_parse_nodes_distances(const char *path, unsigned nbnodes, unsigned *indexes, float *distances, int fsroot_fd)
2445 size_t len = (10+1)*nbnodes;
2446 float *curdist = distances;
2447 char *string;
2448 unsigned i;
2450 string = malloc(len); /* space-separated %d */
2451 if (!string)
2452 goto out;
2454 for(i=0; i<nbnodes; i++) {
2455 unsigned osnode = indexes[i];
2456 char distancepath[SYSFS_NUMA_NODE_PATH_LEN];
2457 char *tmp, *next;
2458 unsigned found;
2460 /* The Linux nodeX/distance file contains the distances from X to the other localities (from the ACPI SLIT table or similar);
2461 * we store them in slots X*N...X*N+N-1 of the distances array */
2462 sprintf(distancepath, "%s/node%u/distance", path, osnode);
2463 if (hwloc_read_path_by_length(distancepath, string, len, fsroot_fd) < 0)
2464 goto out_with_string;
2466 tmp = string;
2467 found = 0;
2468 while (tmp) {
2469 unsigned distance = strtoul(tmp, &next, 0); /* stored as a %d */
2470 if (next == tmp)
2471 break;
2472 *curdist = (float) distance;
2473 curdist++;
2474 found++;
2475 if (found == nbnodes)
2476 break;
2477 tmp = next+1;
2479 if (found != nbnodes)
2480 goto out_with_string;
2483 free(string);
2484 return 0;
2486 out_with_string:
2487 free(string);
2488 out:
2489 return -1;
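/* Worked example (illustrative values): on a machine with two NUMA nodes,
 * node0/distance may contain "10 21" and node1/distance "21 10", so the
 * function above fills distances[] row by row as { 10, 21, 21, 10 }, i.e.
 * distances[i*nbnodes+j] is the relative latency from node indexes[i] to
 * node indexes[j].
 */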
2492 static void
2493 hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s *data,
2494 hwloc_obj_t obj,
2495 char *path, unsigned pathlen,
2496 const char *dmi_name, const char *hwloc_name)
2498 char dmi_line[64];
2500 strcpy(path+pathlen, dmi_name);
2501 if (hwloc_read_path_by_length(path, dmi_line, sizeof(dmi_line), data->root_fd) < 0)
2502 return;
2504 if (dmi_line[0] != '\0') {
2505 char *tmp = strchr(dmi_line, '\n');
2506 if (tmp)
2507 *tmp = '\0';
2508 hwloc_debug("found %s '%s'\n", hwloc_name, dmi_line);
2509 hwloc_obj_add_info(obj, hwloc_name, dmi_line);
2513 static void
2514 hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s *data, hwloc_obj_t obj)
2516 char path[128];
2517 unsigned pathlen;
2518 DIR *dir;
2520 strcpy(path, "/sys/devices/virtual/dmi/id");
2521 dir = hwloc_opendir(path, data->root_fd);
2522 if (dir) {
2523 pathlen = 27;
2524 } else {
2525 strcpy(path, "/sys/class/dmi/id");
2526 dir = hwloc_opendir(path, data->root_fd);
2527 if (dir)
2528 pathlen = 17;
2529 else
2530 return;
2532 closedir(dir);
2534 path[pathlen++] = '/';
2536 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_name", "DMIProductName");
2537 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_version", "DMIProductVersion");
2538 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_serial", "DMIProductSerial");
2539 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "product_uuid", "DMIProductUUID");
2540 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_vendor", "DMIBoardVendor");
2541 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_name", "DMIBoardName");
2542 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_version", "DMIBoardVersion");
2543 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_serial", "DMIBoardSerial");
2544 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "board_asset_tag", "DMIBoardAssetTag");
2545 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_vendor", "DMIChassisVendor");
2546 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_type", "DMIChassisType");
2547 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_version", "DMIChassisVersion");
2548 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_serial", "DMIChassisSerial");
2549 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "chassis_asset_tag", "DMIChassisAssetTag");
2550 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_vendor", "DMIBIOSVendor");
2551 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_version", "DMIBIOSVersion");
2552 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "bios_date", "DMIBIOSDate");
2553 hwloc__get_dmi_id_one_info(data, obj, path, pathlen, "sys_vendor", "DMISysVendor");
2556 struct hwloc_firmware_dmi_mem_device_header {
2557 unsigned char type;
2558 unsigned char length;
2559 unsigned char handle[2];
2560 unsigned char phy_mem_handle[2];
2561 unsigned char mem_err_handle[2];
2562 unsigned char tot_width[2];
2563 unsigned char dat_width[2];
2564 unsigned char size[2];
2565 unsigned char ff;
2566 unsigned char dev_set;
2567 unsigned char dev_loc_str_num;
2568 unsigned char bank_loc_str_num;
2569 unsigned char mem_type;
2570 unsigned char type_detail[2];
2571 unsigned char speed[2];
2572 unsigned char manuf_str_num;
2573 unsigned char serial_str_num;
2574 unsigned char asset_tag_str_num;
2575 unsigned char part_num_str_num;
2576 /* don't include the following fields since we don't need them;
2577 * besides, some old implementations may omit them.
2581 static int check_dmi_entry(const char *buffer)
2583 /* reject empty strings */
2584 if (!*buffer)
2585 return 0;
2586 /* reject strings made only of spaces (at least Dell uses this for empty memory slots) */
2587 if (strspn(buffer, " ") == strlen(buffer))
2588 return 0;
2589 return 1;
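/* Note on the raw entries parsed below (standard SMBIOS layout, summarized here
 * for convenience): each type-17 record is a formatted area of header->length
 * bytes followed by a set of NUL-terminated strings referenced by 1-based
 * indexes such as manuf_str_num, the set being terminated by an extra empty
 * string.
 */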
2592 static void
2593 hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
2594 unsigned idx, const char *path, FILE *fd,
2595 struct hwloc_firmware_dmi_mem_device_header *header)
2597 unsigned slen;
2598 char buffer[256]; /* enough for all strings of one memory device, or at least for each individual one */
2599 unsigned foff; /* offset in raw file */
2600 unsigned boff; /* offset in buffer read from raw file */
2601 unsigned i;
2602 struct hwloc_obj_info_s *infos = NULL;
2603 unsigned infos_count = 0;
2604 hwloc_obj_t misc;
2605 int foundinfo = 0;
2607 hwloc__add_info(&infos, &infos_count, "Type", "MemoryModule");
2609 /* start after the header */
2610 foff = header->length;
2611 i = 1;
2612 while (1) {
2613 /* read one buffer */
2614 if (fseek(fd, foff, SEEK_SET) < 0)
2615 break;
2616 if (!fgets(buffer, sizeof(buffer), fd))
2617 break;
2618 /* read string at the beginning of the buffer */
2619 boff = 0;
2620 while (1) {
2621 /* stop on empty string */
2622 if (!buffer[boff])
2623 goto done;
2624 /* stop if this string goes to the end of the buffer */
2625 slen = strlen(buffer+boff);
2626 if (boff + slen+1 == sizeof(buffer))
2627 break;
2628 /* string didn't get truncated, should be OK */
2629 if (i == header->manuf_str_num) {
2630 if (check_dmi_entry(buffer+boff)) {
2631 hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
2632 foundinfo = 1;
2634 } else if (i == header->serial_str_num) {
2635 if (check_dmi_entry(buffer+boff)) {
2636 hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
2637 foundinfo = 1;
2639 } else if (i == header->asset_tag_str_num) {
2640 if (check_dmi_entry(buffer+boff)) {
2641 hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
2642 foundinfo = 1;
2644 } else if (i == header->part_num_str_num) {
2645 if (check_dmi_entry(buffer+boff)) {
2646 hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
2647 foundinfo = 1;
2649 } else if (i == header->dev_loc_str_num) {
2650 if (check_dmi_entry(buffer+boff)) {
2651 hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
2652 /* only a location, not an actual info about the device */
2654 } else if (i == header->bank_loc_str_num) {
2655 if (check_dmi_entry(buffer+boff)) {
2656 hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
2657 /* only a location, not an actual info about the device */
2659 } else {
2660 goto done;
2662 /* next string in buffer */
2663 boff += slen+1;
2664 i++;
2666 /* couldn't read a single full string from that buffer, we're screwed */
2667 if (!boff) {
2668 fprintf(stderr, "hwloc could not read a DMI firmware entry #%u in %s\n",
2669 i, path);
2670 break;
2672 /* reread buffer after previous string */
2673 foff += boff;
2676 done:
2677 if (!foundinfo) {
2678 /* found no actual info about the device. if there's only location info, the slot may be empty */
2679 goto out_with_infos;
2682 misc = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, idx);
2683 if (!misc)
2684 goto out_with_infos;
2686 hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
2687 /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
2688 * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
2689 * with the vendor, and it's hard to be 100% sure 'B' is second socket.
2690 * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
2691 * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
2693 hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
2694 return;
2696 out_with_infos:
2697 hwloc__free_infos(infos, infos_count);
2700 static void
2701 hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
2702 struct hwloc_linux_backend_data_s *data)
2704 char path[128];
2705 unsigned i;
2707 for(i=0; ; i++) {
2708 FILE *fd;
2709 struct hwloc_firmware_dmi_mem_device_header header;
2710 int err;
2712 snprintf(path, sizeof(path), "/sys/firmware/dmi/entries/17-%u/raw", i);
2713 fd = hwloc_fopen(path, "r", data->root_fd);
2714 if (!fd)
2715 break;
2717 err = fread(&header, sizeof(header), 1, fd);
2718 if (err != 1) {
2719 fclose(fd);
2720 break;
2722 if (header.length < sizeof(header)) {
2723 /* invalid, or too old entry/spec that doesn't contain what we need */
2724 fclose(fd);
2725 break;
2728 hwloc__get_firmware_dmi_memory_info_one(topology, i, path, fd, &header);
2730 fclose(fd);
2735 /***********************************
2736 ****** Device tree Discovery ******
2737 ***********************************/
2739 /* Reads the entire file into a newly allocated buffer; stores the number of bytes read in *bytes_read if bytes_read != NULL.
2740 * The returned pointer can be freed with free(). */
2741 static void *
2742 hwloc_read_raw(const char *p, const char *p1, size_t *bytes_read, int root_fd)
2744 char fname[256];
2745 char *ret = NULL;
2746 struct stat fs;
2747 int file = -1;
2749 snprintf(fname, sizeof(fname), "%s/%s", p, p1);
2751 file = hwloc_open(fname, root_fd);
2752 if (-1 == file) {
2753 goto out_no_close;
2755 if (fstat(file, &fs)) {
2756 goto out;
2759 ret = (char *) malloc(fs.st_size);
2760 if (NULL != ret) {
2761 ssize_t cb = read(file, ret, fs.st_size);
2762 if (cb == -1) {
2763 free(ret);
2764 ret = NULL;
2765 } else {
2766 if (NULL != bytes_read)
2767 *bytes_read = cb;
2771 out:
2772 close(file);
2773 out_no_close:
2774 return ret;
2777 /* Reads the entire file and returns it as a 0-terminated string
2778 * Returned pointer can be freed by using free(). */
2779 static char *
2780 hwloc_read_str(const char *p, const char *p1, int root_fd)
2782 size_t cb = 0;
2783 char *ret = hwloc_read_raw(p, p1, &cb, root_fd);
2784 if ((NULL != ret) && (0 < cb) && (0 != ret[cb-1])) {
2785 char *tmp = realloc(ret, cb + 1);
2786 if (!tmp) {
2787 free(ret);
2788 return NULL;
2790 ret = tmp;
2791 ret[cb] = 0;
2793 return ret;
2796 /* Reads the first 32-bit big-endian value */
2797 static ssize_t
2798 hwloc_read_unit32be(const char *p, const char *p1, uint32_t *buf, int root_fd)
2800 size_t cb = 0;
2801 uint32_t *tmp = hwloc_read_raw(p, p1, &cb, root_fd);
2802 if (sizeof(*buf) != cb) {
2803 errno = EINVAL;
2804 free(tmp); /* tmp is either NULL or contains useless things */
2805 return -1;
2807 *buf = htonl(*tmp);
2808 free(tmp);
2809 return sizeof(*buf);
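/* For example (illustrative): a 4-byte device-tree property such as
 * /proc/device-tree/cpus/<cpu>/l2-cache containing the raw big-endian bytes
 * 00 00 00 07 is returned through *buf as the host-order value 7 (a phandle).
 */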
2812 typedef struct {
2813 unsigned int n, allocated;
2814 struct {
2815 hwloc_bitmap_t cpuset;
2816 uint32_t phandle;
2817 uint32_t l2_cache;
2818 char *name;
2819 } *p;
2820 } device_tree_cpus_t;
2822 static void
2823 add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_bitmap_t cpuset,
2824 uint32_t l2_cache, uint32_t phandle, const char *name)
2826 if (cpus->n == cpus->allocated) {
2827 void *tmp;
2828 unsigned allocated;
2829 if (!cpus->allocated)
2830 allocated = 64;
2831 else
2832 allocated = 2 * cpus->allocated;
2833 tmp = realloc(cpus->p, allocated * sizeof(cpus->p[0]));
2834 if (!tmp)
2835 return; /* failed to realloc, ignore this entry */
2836 cpus->p = tmp;
2837 cpus->allocated = allocated;
2839 cpus->p[cpus->n].phandle = phandle;
2840 cpus->p[cpus->n].cpuset = (NULL == cpuset)?NULL:hwloc_bitmap_dup(cpuset);
2841 cpus->p[cpus->n].l2_cache = l2_cache;
2842 cpus->p[cpus->n].name = strdup(name);
2843 ++cpus->n;
2846 /* Walks over the cache list in order to detect nested caches and the CPU mask of each */
2847 static int
2848 look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
2849 uint32_t phandle, unsigned int *level, hwloc_bitmap_t cpuset)
2851 unsigned int i;
2852 int ret = -1;
2853 if ((NULL == level) || (NULL == cpuset) || phandle == (uint32_t) -1)
2854 return ret;
2855 for (i = 0; i < cpus->n; ++i) {
2856 if (phandle != cpus->p[i].l2_cache)
2857 continue;
2858 if (NULL != cpus->p[i].cpuset) {
2859 hwloc_bitmap_or(cpuset, cpuset, cpus->p[i].cpuset);
2860 ret = 0;
2861 } else {
2862 ++(*level);
2863 if (0 == look_powerpc_device_tree_discover_cache(cpus,
2864 cpus->p[i].phandle, level, cpuset))
2865 ret = 0;
2868 return ret;
2871 static void
2872 try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
2873 unsigned int level, hwloc_obj_cache_type_t type,
2874 uint32_t cache_line_size, uint32_t cache_size, uint32_t cache_sets,
2875 hwloc_bitmap_t cpuset)
2877 struct hwloc_obj *c = NULL;
2879 if (0 == cache_size)
2880 return;
2882 c = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
2883 c->attr->cache.depth = level;
2884 c->attr->cache.linesize = cache_line_size;
2885 c->attr->cache.size = cache_size;
2886 c->attr->cache.type = type;
2887 if (cache_sets == 1)
2888 /* likely wrong, make it unknown */
2889 cache_sets = 0;
2890 if (cache_sets && cache_line_size)
2891 c->attr->cache.associativity = cache_size / (cache_sets * cache_line_size);
2892 else
2893 c->attr->cache.associativity = 0;
2894 c->cpuset = hwloc_bitmap_dup(cpuset);
2895 hwloc_debug_2args_bitmap("cache (%s) depth %u has cpuset %s\n",
2896 type == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (type == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
2897 level, c->cpuset);
2898 hwloc_insert_object_by_cpuset(topology, c);
2901 static void
2902 try_add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
2903 struct hwloc_linux_backend_data_s *data,
2904 const char *cpu, unsigned int level, hwloc_bitmap_t cpuset)
2906 /* d-cache-block-size - ignore */
2907 /* d-cache-line-size - to read, in bytes */
2908 /* d-cache-sets - ignore */
2909 /* d-cache-size - to read, in bytes */
2910 /* i-cache, same for instruction */
2911 /* cache-unified only exist if data and instruction caches are unified */
2912 /* d-tlb-sets - ignore */
2913 /* d-tlb-size - ignore, always 0 on power6 */
2914 /* i-tlb-*, same */
2915 uint32_t d_cache_line_size = 0, d_cache_size = 0, d_cache_sets = 0;
2916 uint32_t i_cache_line_size = 0, i_cache_size = 0, i_cache_sets = 0;
2917 char unified_path[1024];
2918 struct stat statbuf;
2919 int unified;
2921 snprintf(unified_path, sizeof(unified_path), "%s/cache-unified", cpu);
2922 unified = (hwloc_stat(unified_path, &statbuf, data->root_fd) == 0);
2924 hwloc_read_unit32be(cpu, "d-cache-line-size", &d_cache_line_size,
2925 data->root_fd);
2926 hwloc_read_unit32be(cpu, "d-cache-size", &d_cache_size,
2927 data->root_fd);
2928 hwloc_read_unit32be(cpu, "d-cache-sets", &d_cache_sets,
2929 data->root_fd);
2930 hwloc_read_unit32be(cpu, "i-cache-line-size", &i_cache_line_size,
2931 data->root_fd);
2932 hwloc_read_unit32be(cpu, "i-cache-size", &i_cache_size,
2933 data->root_fd);
2934 hwloc_read_unit32be(cpu, "i-cache-sets", &i_cache_sets,
2935 data->root_fd);
2937 if (!unified)
2938 try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_INSTRUCTION,
2939 i_cache_line_size, i_cache_size, i_cache_sets, cpuset);
2940 try__add_cache_from_device_tree_cpu(topology, level, unified ? HWLOC_OBJ_CACHE_UNIFIED : HWLOC_OBJ_CACHE_DATA,
2941 d_cache_line_size, d_cache_size, d_cache_sets, cpuset);
2945 * Discovers L1/L2/L3 cache information on IBM PowerPC systems for old kernels (RHEL5.*)
2946 * which provide NUMA node information without any further details
2948 static void
2949 look_powerpc_device_tree(struct hwloc_topology *topology,
2950 struct hwloc_linux_backend_data_s *data)
2952 device_tree_cpus_t cpus;
2953 const char ofroot[] = "/proc/device-tree/cpus";
2954 unsigned int i;
2955 int root_fd = data->root_fd;
2956 DIR *dt = hwloc_opendir(ofroot, root_fd);
2957 struct dirent *dirent;
2959 if (NULL == dt)
2960 return;
2962 /* only works for Power so far, and not useful on ARM */
2963 if (data->arch != HWLOC_LINUX_ARCH_POWER) {
2964 closedir(dt);
2965 return;
2968 cpus.n = 0;
2969 cpus.p = NULL;
2970 cpus.allocated = 0;
2972 while (NULL != (dirent = readdir(dt))) {
2973 char cpu[256];
2974 char *device_type;
2975 uint32_t reg = -1, l2_cache = -1, phandle = -1;
2976 int err;
2978 if ('.' == dirent->d_name[0])
2979 continue;
2981 err = snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, dirent->d_name);
2982 if ((size_t) err >= sizeof(cpu))
2983 continue;
2985 device_type = hwloc_read_str(cpu, "device_type", root_fd);
2986 if (NULL == device_type)
2987 continue;
2989 hwloc_read_unit32be(cpu, "reg", &reg, root_fd);
2990 if (hwloc_read_unit32be(cpu, "next-level-cache", &l2_cache, root_fd) == -1)
2991 hwloc_read_unit32be(cpu, "l2-cache", &l2_cache, root_fd);
2992 if (hwloc_read_unit32be(cpu, "phandle", &phandle, root_fd) == -1)
2993 if (hwloc_read_unit32be(cpu, "ibm,phandle", &phandle, root_fd) == -1)
2994 hwloc_read_unit32be(cpu, "linux,phandle", &phandle, root_fd);
2996 if (0 == strcmp(device_type, "cache")) {
2997 add_device_tree_cpus_node(&cpus, NULL, l2_cache, phandle, dirent->d_name);
2999 else if (0 == strcmp(device_type, "cpu")) {
3000 /* Found CPU */
3001 hwloc_bitmap_t cpuset = NULL;
3002 size_t cb = 0;
3003 uint32_t *threads = hwloc_read_raw(cpu, "ibm,ppc-interrupt-server#s", &cb, root_fd);
3004 uint32_t nthreads = cb / sizeof(threads[0]);
3006 if (NULL != threads) {
3007 cpuset = hwloc_bitmap_alloc();
3008 for (i = 0; i < nthreads; ++i) {
3009 if (hwloc_bitmap_isset(topology->levels[0][0]->complete_cpuset, ntohl(threads[i])))
3010 hwloc_bitmap_set(cpuset, ntohl(threads[i]));
3012 free(threads);
3013 } else if ((unsigned int)-1 != reg) {
3014 /* Doesn't work on ARM because cpu "reg" values do not start at 0.
3015 * We know the first cpu "reg" is the lowest. The others are likely
3016 * in order, assuming the device-tree lists objects in order.
3018 cpuset = hwloc_bitmap_alloc();
3019 hwloc_bitmap_set(cpuset, reg);
3022 if (NULL == cpuset) {
3023 hwloc_debug("%s has no \"reg\" property, skipping\n", cpu);
3024 } else {
3025 struct hwloc_obj *core = NULL;
3026 add_device_tree_cpus_node(&cpus, cpuset, l2_cache, phandle, dirent->d_name);
3028 /* Add core */
3029 core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, reg);
3030 core->cpuset = hwloc_bitmap_dup(cpuset);
3031 hwloc_insert_object_by_cpuset(topology, core);
3033 /* Add L1 cache */
3034 try_add_cache_from_device_tree_cpu(topology, data, cpu, 1, cpuset);
3036 hwloc_bitmap_free(cpuset);
3039 free(device_type);
3041 closedir(dt);
3043 /* No cores or L2 caches were found, exiting */
3044 if (0 == cpus.n) {
3045 hwloc_debug("No cores or L2 caches were found in %s, exiting\n", ofroot);
3046 return;
3049 #ifdef HWLOC_DEBUG
3050 for (i = 0; i < cpus.n; ++i) {
3051 hwloc_debug("%u: %s ibm,phandle=%08X l2_cache=%08X ",
3052 i, cpus.p[i].name, cpus.p[i].phandle, cpus.p[i].l2_cache);
3053 if (NULL == cpus.p[i].cpuset) {
3054 hwloc_debug("%s\n", "no cpuset");
3055 } else {
3056 hwloc_debug_bitmap("cpuset %s\n", cpus.p[i].cpuset);
3059 #endif
3061 /* Scan L2/L3/... caches */
3062 for (i = 0; i < cpus.n; ++i) {
3063 unsigned int level = 2;
3064 hwloc_bitmap_t cpuset;
3065 /* Skip real CPUs */
3066 if (NULL != cpus.p[i].cpuset)
3067 continue;
3069 /* Calculate cache level and CPU mask */
3070 cpuset = hwloc_bitmap_alloc();
3071 if (0 == look_powerpc_device_tree_discover_cache(&cpus,
3072 cpus.p[i].phandle, &level, cpuset)) {
3073 char cpu[256];
3074 snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, cpus.p[i].name);
3075 try_add_cache_from_device_tree_cpu(topology, data, cpu, level, cpuset);
3077 hwloc_bitmap_free(cpuset);
3080 /* Do cleanup */
3081 for (i = 0; i < cpus.n; ++i) {
3082 hwloc_bitmap_free(cpus.p[i].cpuset);
3083 free(cpus.p[i].name);
3085 free(cpus.p);
3088 /* Try to handle knl hwdata properties
3089 * Returns 0 on success and -1 otherwise */
3090 static int hwloc_linux_try_handle_knl_hwdata_properties(hwloc_topology_t topology, struct hwloc_linux_backend_data_s *data, hwloc_obj_t *nodes, unsigned nbnodes)
3092 char *knl_cache_file;
3093 long long int cache_size = -1;
3094 int associativity = -1;
3095 int inclusiveness = -1;
3096 int line_size = -1;
3097 int version = 0;
3098 unsigned i;
3099 char buffer[512] = {0};
3100 char *data_beg = NULL;
3101 char memory_mode_str[32] = {0};
3102 char cluster_mode_str[32] = {0};
3103 unsigned long MCDRAM_numa_size, DDR_numa_size;
3104 unsigned MCDRAM_nbnodes, DDR_nbnodes;
3105 unsigned long total_cache_size;
3106 char * fallback_env = getenv("HWLOC_KNL_HDH_FALLBACK");
3107 int fallback = fallback_env ? atoi(fallback_env) : -1; /* by default, only fallback if needed */
3109 if (fallback == 1) {
3110 hwloc_debug("KNL dumped hwdata ignored, forcing fallback\n");
3111 goto fallback;
3114 if (asprintf(&knl_cache_file, "%s/knl_memoryside_cache", data->dumped_hwdata_dirname) < 0)
3115 goto fallback;
3117 hwloc_debug("Reading knl cache data from: %s\n", knl_cache_file);
3118 if (hwloc_read_path_by_length(knl_cache_file, buffer, sizeof(buffer), data->root_fd) < 0) {
3119 hwloc_debug("Unable to open KNL data file `%s' (%s)\n", knl_cache_file, strerror(errno));
3120 free(knl_cache_file);
3121 goto fallback;
3123 free(knl_cache_file);
3125 data_beg = &buffer[0];
3127 /* file must start with version information */
3128 if (sscanf(data_beg, "version: %d", &version) != 1) {
3129 fprintf(stderr, "Invalid knl_memoryside_cache header, expected \"version: <int>\".\n");
3130 goto fallback;
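/* Sample knl_memoryside_cache contents (illustrative values, version-2 format)
 * accepted by the parser below:
 *   version: 2
 *   cache_size: 8589934592
 *   line_size: 64
 *   inclusiveness: 1
 *   associativity: 1
 *   cluster_mode: Quadrant
 *   memory_mode: Hybrid50
 */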
3133 while (1) {
3134 char *line_end = strstr(data_beg, "\n");
3135 if (!line_end)
3136 break;
3137 if (version >= 1) {
3138 if (!strncmp("cache_size:", data_beg, strlen("cache_size"))) {
3139 sscanf(data_beg, "cache_size: %lld", &cache_size);
3140 hwloc_debug("read cache_size=%lld\n", cache_size);
3141 } else if (!strncmp("line_size:", data_beg, strlen("line_size:"))) {
3142 sscanf(data_beg, "line_size: %d", &line_size);
3143 hwloc_debug("read line_size=%d\n", line_size);
3144 } else if (!strncmp("inclusiveness:", data_beg, strlen("inclusiveness:"))) {
3145 sscanf(data_beg, "inclusiveness: %d", &inclusiveness);
3146 hwloc_debug("read inclusiveness=%d\n", inclusiveness);
3147 } else if (!strncmp("associativity:", data_beg, strlen("associativity:"))) {
3148 sscanf(data_beg, "associativity: %d\n", &associativity);
3149 hwloc_debug("read associativity=%d\n", associativity);
3152 if (version >= 2) {
3153 if (!strncmp("cluster_mode: ", data_beg, strlen("cluster_mode: "))) {
3154 size_t length;
3155 data_beg += strlen("cluster_mode: ");
3156 length = line_end-data_beg;
3157 if (length > sizeof(cluster_mode_str)-1)
3158 length = sizeof(cluster_mode_str)-1;
3159 memcpy(cluster_mode_str, data_beg, length);
3160 cluster_mode_str[length] = '\0';
3161 hwloc_debug("read cluster_mode=%s\n", cluster_mode_str);
3162 } else if (!strncmp("memory_mode: ", data_beg, strlen("memory_mode: "))) {
3163 size_t length;
3164 data_beg += strlen("memory_mode: ");
3165 length = line_end-data_beg;
3166 if (length > sizeof(memory_mode_str)-1)
3167 length = sizeof(memory_mode_str)-1;
3168 memcpy(memory_mode_str, data_beg, length);
3169 memory_mode_str[length] = '\0';
3170 hwloc_debug("read memory_mode=%s\n", memory_mode_str);
3174 data_beg = line_end + 1;
3177 if (line_size == -1 || cache_size == -1 || associativity == -1 || inclusiveness == -1) {
3178 hwloc_debug("Incorrect file format line_size=%d cache_size=%lld associativity=%d inclusiveness=%d\n",
3179 line_size, cache_size, associativity, inclusiveness);
3180 goto fallback;
3183 doit:
3184 /* In file version 1 the MCDRAM cache size is always non-zero.
3185 * In file version 2 the MCDRAM cache size can be zero in Flat mode, so check it and do not expose a cache in that case. */
3186 if (cache_size > 0) {
3187 for(i=0; i<nbnodes; i++) {
3188 hwloc_obj_t cache;
3190 if (hwloc_bitmap_iszero(nodes[i]->cpuset))
3191 /* one L3 per DDR, none for MCDRAM nodes */
3192 continue;
3194 cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
3195 if (!cache)
3196 return -1;
3198 cache->attr->cache.depth = 3;
3199 cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
3200 cache->attr->cache.associativity = associativity;
3201 hwloc_obj_add_info(cache, "Inclusive", inclusiveness ? "1" : "0");
3202 cache->attr->cache.size = cache_size;
3203 cache->attr->cache.linesize = line_size;
3204 cache->cpuset = hwloc_bitmap_dup(nodes[i]->cpuset);
3205 hwloc_obj_add_info(cache, "Type", "MemorySideCache");
3206 hwloc_insert_object_by_cpuset(topology, cache);
3209 /* adding cluster and memory mode as properties of the machine */
3210 if (version >= 2) {
3211 if (*cluster_mode_str) /* in case the fallback below couldn't guess */
3212 hwloc_obj_add_info(topology->levels[0][0], "ClusterMode", cluster_mode_str);
3213 hwloc_obj_add_info(topology->levels[0][0], "MemoryMode", memory_mode_str);
3216 return 0;
3218 fallback:
3219 if (fallback == 0) {
3220 hwloc_debug("KNL hwdata fallback disabled\n");
3221 return -1;
3224 hwloc_debug("Falling back to a heuristic\n");
3226 DDR_numa_size = 0;
3227 DDR_nbnodes = 0;
3228 MCDRAM_numa_size = 0;
3229 MCDRAM_nbnodes = 0;
3230 for(i=0; i<nbnodes; i++)
3231 if (hwloc_bitmap_iszero(nodes[i]->cpuset)) {
3232 MCDRAM_numa_size += nodes[i]->memory.local_memory;
3233 MCDRAM_nbnodes++;
3234 } else {
3235 DDR_numa_size += nodes[i]->memory.local_memory;
3236 DDR_nbnodes++;
3238 assert(DDR_nbnodes + MCDRAM_nbnodes == nbnodes);
3240 /* there can be 0 MCDRAM_nbnodes, but we must have at least one DDR node (not cpuless) */
3241 assert(DDR_nbnodes);
3242 /* there are either no MCDRAM nodes, or as many as DDR nodes */
3243 assert(!MCDRAM_nbnodes || MCDRAM_nbnodes == DDR_nbnodes);
3245 if (!MCDRAM_nbnodes && DDR_numa_size <= 16UL*1024*1024*1024) {
3246 /* We only found DDR numa nodes, but they are <=16GB.
3247 * It could also be a DDR-less KNL where the NUMA nodes are actually MCDRAM; we can't know for sure.
3248 * Both cases are unlikely, so disable the heuristic for now.
3250 * In theory we could check if DDR_numa_size == 8/12/16GB exactly (amount of MCDRAM numa size in H50/H25/Flat modes),
3251 * but that's never the case since some kilobytes are always stolen by the system.
3253 hwloc_debug("Cannot guess if MCDRAM is in Cache or if the node is DDR-less (total NUMA node size %lu)\n",
3254 DDR_numa_size);
3255 return -1;
3258 /* all commercial KNL/KNM have 16GB of MCDRAM */
3259 total_cache_size = 16UL*1024*1024*1024 - MCDRAM_numa_size;
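/* e.g. (illustrative): in Cache mode no MCDRAM appears as NUMA memory, so all
 * 16GB is counted as cache; in Hybrid50 mode about 8GB of MCDRAM is exposed as
 * NUMA memory and the remaining ~8GB is cache; in Flat mode everything is NUMA
 * memory and total_cache_size ends up 0.
 */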
3261 if (!MCDRAM_nbnodes) {
3262 strcpy(memory_mode_str, "Cache");
3263 } else {
3264 if (!total_cache_size)
3265 strcpy(memory_mode_str, "Flat");
3266 else if (total_cache_size == 8UL*1024*1024*1024)
3267 strcpy(memory_mode_str, "Hybrid50");
3268 else if (total_cache_size == 4UL*1024*1024*1024)
3269 strcpy(memory_mode_str, "Hybrid25");
3270 else
3271 fprintf(stderr, "Unexpected KNL MCDRAM cache size %lu\n", total_cache_size);
3273 if (DDR_nbnodes == 4) {
3274 strcpy(cluster_mode_str, "SNC4");
3275 } else if (DDR_nbnodes == 2) {
3276 strcpy(cluster_mode_str, "SNC2");
3277 } else if (DDR_nbnodes == 1) {
3278 /* either Quadrant, All2ALL or Hemisphere */
3279 } else {
3280 fprintf(stderr, "Unexpected number of KNL non-MCDRAM NUMA nodes %u\n", DDR_nbnodes);
3283 cache_size = total_cache_size/DDR_nbnodes;
3284 associativity = 1;
3285 inclusiveness = 1;
3286 line_size = 64;
3288 version = 2;
3289 goto doit;
3294 /**************************************
3295 ****** Sysfs Topology Discovery ******
3296 **************************************/
3298 static int
3299 look_sysfsnode(struct hwloc_topology *topology,
3300 struct hwloc_linux_backend_data_s *data,
3301 const char *path, unsigned *found)
3303 unsigned osnode;
3304 unsigned nbnodes = 0;
3305 DIR *dir;
3306 struct dirent *dirent;
3307 hwloc_bitmap_t nodeset;
3309 *found = 0;
3311 /* Get the list of nodes first */
3312 dir = hwloc_opendir(path, data->root_fd);
3313 if (dir)
3315 nodeset = hwloc_bitmap_alloc();
3316 while ((dirent = readdir(dir)) != NULL)
3318 if (strncmp(dirent->d_name, "node", 4))
3319 continue;
3320 osnode = strtoul(dirent->d_name+4, NULL, 0);
3321 hwloc_bitmap_set(nodeset, osnode);
3322 nbnodes++;
3324 closedir(dir);
3326 else
3327 return -1;
3329 if (!nbnodes || (nbnodes == 1 && !data->is_knl)) { /* always keep NUMA for KNL, or configs might look too different */
3330 hwloc_bitmap_free(nodeset);
3331 return 0;
3334 /* For convenience, put these declarations inside a block. */
3337 hwloc_obj_t * nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
3338 unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
3339 float * distances = NULL;
3340 int failednodes = 0;
3341 unsigned index_;
3343 if (NULL == nodes || NULL == indexes) {
3344 free(nodes);
3345 free(indexes);
3346 hwloc_bitmap_free(nodeset);
3347 nbnodes = 0;
3348 goto out;
3351 /* Unsparsify node indexes.
3352 * We'll need them later because Linux packs the distances of sparse node indexes
3353 * together, in index order, in the sysfs distance files.
3354 * It also simplifies things in the meantime.
3356 index_ = 0;
3357 hwloc_bitmap_foreach_begin (osnode, nodeset) {
3358 indexes[index_] = osnode;
3359 index_++;
3360 } hwloc_bitmap_foreach_end();
3361 hwloc_bitmap_free(nodeset);
3363 #ifdef HWLOC_DEBUG
3364 hwloc_debug("%s", "NUMA indexes: ");
3365 for (index_ = 0; index_ < nbnodes; index_++) {
3366 hwloc_debug(" %u", indexes[index_]);
3368 hwloc_debug("%s", "\n");
3369 #endif
3371 /* Create NUMA objects */
3372 for (index_ = 0; index_ < nbnodes; index_++) {
3373 hwloc_obj_t node, res_obj;
3374 int annotate;
3376 osnode = indexes[index_];
3378 node = hwloc_get_numanode_obj_by_os_index(topology, osnode);
3379 annotate = (node != NULL);
3380 if (!annotate) {
3381 /* create a new node */
3382 char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
3383 hwloc_bitmap_t cpuset;
3384 sprintf(nodepath, "%s/node%u/cpumap", path, osnode);
3385 cpuset = hwloc__alloc_read_path_as_cpumask(nodepath, data->root_fd);
3386 if (!cpuset) {
3387 /* This NUMA object won't be inserted, we'll ignore distances */
3388 failednodes++;
3389 continue;
3392 node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, osnode);
3393 node->cpuset = cpuset;
3394 node->nodeset = hwloc_bitmap_alloc();
3395 hwloc_bitmap_set(node->nodeset, osnode);
3397 hwloc_sysfs_node_meminfo_info(topology, data, path, osnode, &node->memory);
3399 hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
3400 osnode, node->cpuset);
3402 if (annotate) {
3403 nodes[index_] = node;
3404 } else {
3405 res_obj = hwloc_insert_object_by_cpuset(topology, node);
3406 if (node == res_obj) {
3407 nodes[index_] = node;
3408 } else {
3409 /* We got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
3410 * This object disappeared, we'll ignore distances */
3411 failednodes++;
3416 if (!failednodes && data->is_knl)
3417 hwloc_linux_try_handle_knl_hwdata_properties(topology, data, nodes, nbnodes);
3419 if (failednodes) {
3420 /* failed to read/create some nodes, don't bother reading/fixing
3421 * a distance matrix that would likely be wrong anyway.
3423 nbnodes -= failednodes;
3424 } else if (nbnodes > 1) {
3425 distances = malloc(nbnodes*nbnodes*sizeof(*distances));
3428 if (NULL == distances) {
3429 free(nodes);
3430 free(indexes);
3431 goto out;
3434 if (hwloc_parse_nodes_distances(path, nbnodes, indexes, distances, data->root_fd) < 0) {
3435 free(nodes);
3436 free(distances);
3437 free(indexes);
3438 goto out;
3441 if (data->is_knl && distances) {
3442 char *env = getenv("HWLOC_KNL_NUMA_QUIRK");
3443 if (!(env && !atoi(env)) && nbnodes>=2) { /* SNC2 or SNC4, with 0 or 2/4 MCDRAM, and 0-4 DDR nodes */
3444 unsigned i, j, closest;
3445 for(i=0; i<nbnodes; i++) {
3446 if (!hwloc_bitmap_iszero(nodes[i]->cpuset))
3447 /* nodes with CPU, that's DDR, skip it */
3448 continue;
3449 hwloc_obj_add_info(nodes[i], "Type", "MCDRAM");
3451 /* DDR is the closest node with CPUs */
3452 closest = (unsigned)-1;
3453 for(j=0; j<nbnodes; j++) {
3454 if (j==i)
3455 continue;
3456 if (hwloc_bitmap_iszero(nodes[j]->cpuset))
3457 /* nodes without CPU, that's another MCDRAM, skip it */
3458 continue;
3459 if (closest == (unsigned)-1 || distances[i*nbnodes+j]<distances[i*nbnodes+closest])
3460 closest = j;
3462 if (closest != (unsigned) -1) {
3463 /* Add a Group for Cluster containing this MCDRAM + DDR */
3464 hwloc_obj_t cluster = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
3465 cluster->cpuset = hwloc_bitmap_dup(nodes[i]->cpuset);
3466 cluster->nodeset = hwloc_bitmap_dup(nodes[i]->nodeset);
3467 hwloc_bitmap_or(cluster->cpuset, cluster->cpuset, nodes[closest]->cpuset);
3468 hwloc_bitmap_or(cluster->nodeset, cluster->nodeset, nodes[closest]->nodeset);
3469 hwloc_obj_add_info(cluster, "Type", "Cluster");
3470 hwloc_insert_object_by_cpuset(topology, cluster);
3473 /* drop the distance matrix, it contradicts the above NUMA layout groups */
3474 free(distances);
3475 free(nodes);
3476 free(indexes);
3477 goto out;
3481 hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
3484 out:
3485 *found = nbnodes;
3486 return 0;
3489 /* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
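/* Example sysfs contents (illustrative values) for cpu3 on a dual-package,
 * hyper-threaded machine with 16 PUs, where cpu3 and cpu11 are the two threads
 * of the same core:
 *   cpu3/topology/physical_package_id -> 0
 *   cpu3/topology/core_id             -> 3
 *   cpu3/topology/core_siblings       -> 00000f0f   (PUs 0-3 and 8-11)
 *   cpu3/topology/thread_siblings     -> 00000808   (PUs 3 and 11)
 */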
3490 static int
3491 look_sysfscpu(struct hwloc_topology *topology,
3492 struct hwloc_linux_backend_data_s *data,
3493 const char *path,
3494 struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs, unsigned cpuinfo_numprocs)
3496 hwloc_bitmap_t cpuset; /* Set of cpus for which we have topology information */
3497 #define CPU_TOPOLOGY_STR_LEN 128
3498 char str[CPU_TOPOLOGY_STR_LEN];
3499 DIR *dir;
3500 int i,j;
3501 unsigned caches_added, merge_buggy_core_siblings;
3502 hwloc_obj_t packages = NULL; /* temporary list of packages before actual insert in the tree */
3503 int threadwithcoreid = data->is_amd_with_CU ? -1 : 0; /* -1 means we don't know yet if threads have their own coreids within thread_siblings */
3505 /* fill the cpuset of interesting cpus */
3506 dir = hwloc_opendir(path, data->root_fd);
3507 if (!dir)
3508 return -1;
3509 else {
3510 struct dirent *dirent;
3511 cpuset = hwloc_bitmap_alloc();
3513 while ((dirent = readdir(dir)) != NULL) {
3514 unsigned long cpu;
3515 char online[2];
3517 if (strncmp(dirent->d_name, "cpu", 3))
3518 continue;
3519 cpu = strtoul(dirent->d_name+3, NULL, 0);
3521 /* Maybe we don't have topology information, but at least the PU exists */
3522 hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, cpu);
3524 /* check whether this processor is online */
3525 sprintf(str, "%s/cpu%lu/online", path, cpu);
3526 if (hwloc_read_path_by_length(str, online, sizeof(online), data->root_fd) == 0) {
3527 if (atoi(online)) {
3528 hwloc_debug("os proc %lu is online\n", cpu);
3529 } else {
3530 hwloc_debug("os proc %lu is offline\n", cpu);
3531 hwloc_bitmap_clr(topology->levels[0][0]->online_cpuset, cpu);
3535 /* check whether the kernel exports topology information for this cpu */
3536 sprintf(str, "%s/cpu%lu/topology", path, cpu);
3537 if (hwloc_access(str, X_OK, data->root_fd) < 0 && errno == ENOENT) {
3538 hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
3539 cpu, path, cpu);
3540 continue;
3543 hwloc_bitmap_set(cpuset, cpu);
3545 closedir(dir);
3548 topology->support.discovery->pu = 1;
3549 hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
3550 hwloc_bitmap_weight(cpuset), cpuset);
3552 merge_buggy_core_siblings = (data->arch == HWLOC_LINUX_ARCH_X86);
3553 caches_added = 0;
3554 hwloc_bitmap_foreach_begin(i, cpuset) {
3555 hwloc_bitmap_t packageset, coreset, bookset, threadset;
3556 unsigned mypackageid, mycoreid, mybookid;
3557 int tmpint;
3559 /* look at the package */
3560 sprintf(str, "%s/cpu%d/topology/core_siblings", path, i);
3561 packageset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3562 if (packageset && hwloc_bitmap_first(packageset) == i) {
3563 /* first cpu in this package, add the package */
3564 struct hwloc_obj *package;
3566 mypackageid = (unsigned) -1;
3567 sprintf(str, "%s/cpu%d/topology/physical_package_id", path, i); /* contains %d at least up to 4.9 */
3568 if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3569 mypackageid = (unsigned) tmpint;
3571 if (merge_buggy_core_siblings) {
3572 /* check for another package with same physical_package_id */
3573 hwloc_obj_t curpackage = packages;
3574 while (curpackage) {
3575 if (curpackage->os_index == mypackageid) {
3576 /* found another package with same physical_package_id but different core_siblings.
3577 * looks like a buggy kernel on Intel Xeon E5 v3 processor with two rings.
3578 * merge these core_siblings to extend the existing first package object.
3580 static int reported = 0;
3581 if (!reported && !hwloc_hide_errors()) {
3582 char *a, *b;
3583 hwloc_bitmap_asprintf(&a, curpackage->cpuset);
3584 hwloc_bitmap_asprintf(&b, packageset);
3585 fprintf(stderr, "****************************************************************************\n");
3586 fprintf(stderr, "* hwloc %s has detected buggy sysfs package information: Two packages have\n", HWLOC_VERSION);
3587 fprintf(stderr, "* the same physical package id %u but different core_siblings %s and %s\n",
3588 mypackageid, a, b);
3589 fprintf(stderr, "* hwloc is merging these packages into a single one assuming your Linux kernel\n");
3590 fprintf(stderr, "* does not support this processor correctly.\n");
3591 fprintf(stderr, "* You may hide this warning by setting HWLOC_HIDE_ERRORS=1 in the environment.\n");
3592 fprintf(stderr, "*\n");
3593 fprintf(stderr, "* If hwloc does not report the right number of packages,\n");
3594 fprintf(stderr, "* please report this error message to the hwloc user's mailing list,\n");
3595 fprintf(stderr, "* along with the files generated by the hwloc-gather-topology script.\n");
3596 fprintf(stderr, "****************************************************************************\n");
3597 reported = 1;
3598 free(a);
3599 free(b);
3601 hwloc_bitmap_or(curpackage->cpuset, curpackage->cpuset, packageset);
3602 goto package_done;
3604 curpackage = curpackage->next_cousin;
3608 /* no package with same physical_package_id, create a new one */
3609 package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, mypackageid);
3610 package->cpuset = packageset;
3611 hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
3612 mypackageid, packageset);
3613 /* add cpuinfo */
3614 if (cpuinfo_Lprocs) {
3615 for(j=0; j<(int) cpuinfo_numprocs; j++)
3616 if ((int) cpuinfo_Lprocs[j].Pproc == i) {
3617 hwloc__move_infos(&package->infos, &package->infos_count,
3618 &cpuinfo_Lprocs[j].infos, &cpuinfo_Lprocs[j].infos_count);
3621 /* insert in a temporary list in case we have to modify the cpuset by merging other core_siblings later.
3622 * we'll actually insert them into the tree at the end of the whole sysfs cpu loop.
3624 package->next_cousin = packages;
3625 packages = package;
3627 packageset = NULL; /* don't free it */
3629 package_done:
3630 hwloc_bitmap_free(packageset);
3632 /* look at the core */
3633 sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
3634 coreset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3636 if (coreset) {
3637 int gotcoreid = 0; /* to avoid reading the coreid twice */
3638 if (hwloc_bitmap_weight(coreset) > 1 && threadwithcoreid == -1) {
3639 /* check if this is hyper-threading or different coreids */
3640 unsigned siblingid, siblingcoreid;
3642 mycoreid = (unsigned) -1;
3643 sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.9 */
3644 if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3645 mycoreid = (unsigned) tmpint;
3646 gotcoreid = 1;
3648 siblingid = hwloc_bitmap_first(coreset);
3649 if (siblingid == (unsigned) i)
3650 siblingid = hwloc_bitmap_next(coreset, i);
3651 siblingcoreid = (unsigned) -1;
3652 sprintf(str, "%s/cpu%u/topology/core_id", path, siblingid); /* contains %d at least up to 4.9 */
3653 if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3654 siblingcoreid = (unsigned) tmpint;
3655 threadwithcoreid = (siblingcoreid != mycoreid);
3657 if (hwloc_bitmap_first(coreset) == i || threadwithcoreid) {
3658 /* regular core */
3659 struct hwloc_obj *core;
3661 if (!gotcoreid) {
3662 mycoreid = (unsigned) -1;
3663 sprintf(str, "%s/cpu%d/topology/core_id", path, i); /* contains %d at least up to 4.9 */
3664 if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0)
3665 mycoreid = (unsigned) tmpint;
3668 core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, mycoreid);
3669 if (threadwithcoreid)
3670 /* amd multicore compute-unit, create one core per thread */
3671 hwloc_bitmap_only(coreset, i);
3672 core->cpuset = coreset;
3673 hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
3674 mycoreid, core->cpuset);
3675 hwloc_insert_object_by_cpuset(topology, core);
3676 coreset = NULL; /* don't free it */
3678 hwloc_bitmap_free(coreset);
3681 /* look at the books */
3682 sprintf(str, "%s/cpu%d/topology/book_siblings", path, i);
3683 bookset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3684 if (bookset) {
3685 if (hwloc_bitmap_first(bookset) == i) {
3686 struct hwloc_obj *book;
3688 mybookid = (unsigned) -1;
3689 sprintf(str, "%s/cpu%d/topology/book_id", path, i); /* contains %d at least up to 4.9 */
3690 if (hwloc_read_path_as_int(str, &tmpint, data->root_fd) == 0) {
3691 mybookid = (unsigned) tmpint;
3693 book = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, mybookid);
3694 book->cpuset = bookset;
3695 hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
3696 mybookid, bookset);
3697 hwloc_obj_add_info(book, "Type", "Book");
3698 hwloc_insert_object_by_cpuset(topology, book);
3699 bookset = NULL; /* don't free it */
3702 hwloc_bitmap_free(bookset);
3706 /* look at the thread */
3707 struct hwloc_obj *thread = hwloc_alloc_setup_object(HWLOC_OBJ_PU, i);
3708 threadset = hwloc_bitmap_alloc();
3709 hwloc_bitmap_only(threadset, i);
3710 thread->cpuset = threadset;
3711 hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
3712 i, threadset);
3713 hwloc_insert_object_by_cpuset(topology, thread);
3716 /* look at the caches */
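/* Iterate over the per-CPU cache index directories (index0..index9); only the first
 * CPU listed in each shared_cpu_map creates the corresponding Cache object. */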
3717 for(j=0; j<10; j++) {
3718 char str2[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
3719 hwloc_bitmap_t cacheset;
3721 sprintf(str, "%s/cpu%d/cache/index%d/shared_cpu_map", path, i, j);
3722 cacheset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3723 if (cacheset) {
3724 if (hwloc_bitmap_iszero(cacheset)) {
3725 hwloc_bitmap_t tmpset;
3726 /* ia64 returning empty L3 and L2i? use the core set instead */
3727 sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
3728 tmpset = hwloc__alloc_read_path_as_cpumask(str, data->root_fd);
3729 /* only use it if we actually got something */
3730 if (tmpset) {
3731 hwloc_bitmap_free(cacheset);
3732 cacheset = tmpset;
3736 if (hwloc_bitmap_first(cacheset) == i) {
3737 unsigned kB;
3738 unsigned linesize;
3739 unsigned sets, lines_per_tag;
3740 unsigned depth; /* 1 for L1, .... */
3741 hwloc_obj_cache_type_t type = HWLOC_OBJ_CACHE_UNIFIED; /* default */
3742 struct hwloc_obj *cache;
3744 /* get the cache level depth */
3745 sprintf(str, "%s/cpu%d/cache/index%d/level", path, i, j); /* contains %u at least up to 4.9 */
3746 if (hwloc_read_path_as_uint(str, &depth, data->root_fd) < 0) {
3747 hwloc_bitmap_free(cacheset);
3748 continue;
3751 /* cache type */
3752 sprintf(str, "%s/cpu%d/cache/index%d/type", path, i, j);
3753 if (hwloc_read_path_by_length(str, str2, sizeof(str2), data->root_fd) == 0) {
3754 if (!strncmp(str2, "Data", 4))
3755 type = HWLOC_OBJ_CACHE_DATA;
3756 else if (!strncmp(str2, "Unified", 7))
3757 type = HWLOC_OBJ_CACHE_UNIFIED;
3758 else if (!strncmp(str2, "Instruction", 11))
3759 type = HWLOC_OBJ_CACHE_INSTRUCTION;
3760 else {
3761 hwloc_bitmap_free(cacheset);
3762 continue;
3764 } else {
3765 hwloc_bitmap_free(cacheset);
3766 continue;
3769 /* get the cache size */
3770 kB = 0;
3771 sprintf(str, "%s/cpu%d/cache/index%d/size", path, i, j); /* contains %uK at least up to 4.9 */
3772 hwloc_read_path_as_uint(str, &kB, data->root_fd);
3773 /* KNL reports L3 with size=0 and full cpuset in cpuid.
3774 * Let hwloc_linux_try_add_knl_mcdram_cache() detect it better.
3776 if (!kB && depth == 3 && data->is_knl) {
3777 hwloc_bitmap_free(cacheset);
3778 continue;
3781 /* get the line size */
3782 linesize = 0;
3783 sprintf(str, "%s/cpu%d/cache/index%d/coherency_line_size", path, i, j); /* contains %u at least up to 4.9 */
3784 hwloc_read_path_as_uint(str, &linesize, data->root_fd);
3786 /* get the number of sets and lines per tag.
3787 * don't take the associativity directly in "ways_of_associativity" because
3788 * some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
3790 sets = 0;
3791 sprintf(str, "%s/cpu%d/cache/index%d/number_of_sets", path, i, j); /* contains %u at least up to 4.9 */
3792 hwloc_read_path_as_uint(str, &sets, data->root_fd);
3794 lines_per_tag = 1;
3795 sprintf(str, "%s/cpu%d/cache/index%d/physical_line_partition", path, i, j); /* contains %u at least up to 4.9 */
3796 hwloc_read_path_as_uint(str, &lines_per_tag, data->root_fd);
3798 /* first cpu in this cache, add the cache */
3799 cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
3800 cache->attr->cache.size = ((uint64_t)kB) << 10;
3801 cache->attr->cache.depth = depth;
3802 cache->attr->cache.linesize = linesize;
3803 cache->attr->cache.type = type;
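/* Derive the associativity from size = ways * linesize * lines_per_tag * sets,
 * i.e. ways = (kB << 10) / (linesize * lines_per_tag * sets); a reported sets == 1
 * would mean fully-associative, which sysfs reports inconsistently, so it is left
 * unknown below. */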
3804 if (!linesize || !lines_per_tag || !sets)
3805 cache->attr->cache.associativity = 0; /* unknown */
3806 else if (sets == 1)
3807 cache->attr->cache.associativity = 0; /* likely wrong, make it unknown */
3808 else
3809 cache->attr->cache.associativity = (kB << 10) / linesize / lines_per_tag / sets;
3810 cache->cpuset = cacheset;
3811 hwloc_debug_1arg_bitmap("cache depth %u has cpuset %s\n",
3812 depth, cacheset);
3813 hwloc_insert_object_by_cpuset(topology, cache);
3814 cacheset = NULL; /* don't free it */
3815 ++caches_added;
3818 hwloc_bitmap_free(cacheset);
3820 } hwloc_bitmap_foreach_end();
3822 /* actually insert in the tree now that package cpusets have been fixed-up */
3823 while (packages) {
3824 hwloc_obj_t next = packages->next_cousin;
3825 packages->next_cousin = NULL;
3826 hwloc_insert_object_by_cpuset(topology, packages);
3827 packages = next;
3830 if (0 == caches_added)
3831 look_powerpc_device_tree(topology, data);
3833 hwloc_bitmap_free(cpuset);
3835 return 0;
3840 /****************************************
3841 ****** cpuinfo Topology Discovery ******
3842 ****************************************/
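/* The per-architecture parsers below receive one (prefix, value) pair per "key : value"
 * line of /proc/cpuinfo and turn selected keys into hwloc info attributes.
 * Illustrative x86 example (field names as printed by the kernel):
 *   "vendor_id : GenuineIntel"  -> info ("CPUVendor", "GenuineIntel")
 *   "model name : ..."          -> info ("CPUModel", "...")
 */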
3844 static int
3845 hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
3846 struct hwloc_obj_info_s **infos, unsigned *infos_count,
3847 int is_global __hwloc_attribute_unused)
3849 if (!strcmp("vendor_id", prefix)) {
3850 hwloc__add_info(infos, infos_count, "CPUVendor", value);
3851 } else if (!strcmp("model name", prefix)) {
3852 hwloc__add_info(infos, infos_count, "CPUModel", value);
3853 } else if (!strcmp("model", prefix)) {
3854 hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
3855 } else if (!strcmp("cpu family", prefix)) {
3856 hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
3857 } else if (!strcmp("stepping", prefix)) {
3858 hwloc__add_info(infos, infos_count, "CPUStepping", value);
3860 return 0;
3863 static int
3864 hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
3865 struct hwloc_obj_info_s **infos, unsigned *infos_count,
3866 int is_global __hwloc_attribute_unused)
3868 if (!strcmp("vendor", prefix)) {
3869 hwloc__add_info(infos, infos_count, "CPUVendor", value);
3870 } else if (!strcmp("model name", prefix)) {
3871 hwloc__add_info(infos, infos_count, "CPUModel", value);
3872 } else if (!strcmp("model", prefix)) {
3873 hwloc__add_info(infos, infos_count, "CPUModelNumber", value);
3874 } else if (!strcmp("family", prefix)) {
3875 hwloc__add_info(infos, infos_count, "CPUFamilyNumber", value);
3877 return 0;
3880 static int
3881 hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
3882 struct hwloc_obj_info_s **infos, unsigned *infos_count,
3883 int is_global __hwloc_attribute_unused)
3885 if (!strcmp("Processor", prefix) /* old kernels with one Processor header */
3886 || !strcmp("model name", prefix) /* new kernels with one model name per core */) {
3887 hwloc__add_info(infos, infos_count, "CPUModel", value);
3888 } else if (!strcmp("CPU implementer", prefix)) {
3889 hwloc__add_info(infos, infos_count, "CPUImplementer", value);
3890 } else if (!strcmp("CPU architecture", prefix)) {
3891 hwloc__add_info(infos, infos_count, "CPUArchitecture", value);
3892 } else if (!strcmp("CPU variant", prefix)) {
3893 hwloc__add_info(infos, infos_count, "CPUVariant", value);
3894 } else if (!strcmp("CPU part", prefix)) {
3895 hwloc__add_info(infos, infos_count, "CPUPart", value);
3896 } else if (!strcmp("CPU revision", prefix)) {
3897 hwloc__add_info(infos, infos_count, "CPURevision", value);
3898 } else if (!strcmp("Hardware", prefix)) {
3899 hwloc__add_info(infos, infos_count, "HardwareName", value);
3900 } else if (!strcmp("Revision", prefix)) {
3901 hwloc__add_info(infos, infos_count, "HardwareRevision", value);
3902 } else if (!strcmp("Serial", prefix)) {
3903 hwloc__add_info(infos, infos_count, "HardwareSerial", value);
3905 return 0;
3908 static int
3909 hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
3910 struct hwloc_obj_info_s **infos, unsigned *infos_count,
3911 int is_global)
3913 /* common fields */
3914 if (!strcmp("cpu", prefix)) {
3915 hwloc__add_info(infos, infos_count, "CPUModel", value);
3916 } else if (!strcmp("platform", prefix)) {
3917 hwloc__add_info(infos, infos_count, "PlatformName", value);
3918 } else if (!strcmp("model", prefix)) {
3919 hwloc__add_info(infos, infos_count, "PlatformModel", value);
3921 /* platform-specific fields */
3922 else if (!strcasecmp("vendor", prefix)) {
3923 hwloc__add_info(infos, infos_count, "PlatformVendor", value);
3924 } else if (!strcmp("Board ID", prefix)) {
3925 hwloc__add_info(infos, infos_count, "PlatformBoardID", value);
3926 } else if (!strcmp("Board", prefix)
3927 || !strcasecmp("Machine", prefix)) {
3928 /* machine and board are similar to (and often more precise than) the model above */
3929 char **valuep = hwloc__find_info_slot(infos, infos_count, "PlatformModel");
3930 if (*valuep)
3931 free(*valuep);
3932 *valuep = strdup(value);
3933 } else if (!strcasecmp("Revision", prefix)
3934 || !strcmp("Hardware rev", prefix)) {
3935 hwloc__add_info(infos, infos_count, is_global ? "PlatformRevision" : "CPURevision", value);
3936 } else if (!strcmp("SVR", prefix)) {
3937 hwloc__add_info(infos, infos_count, "SystemVersionRegister", value);
3938 } else if (!strcmp("PVR", prefix)) {
3939 hwloc__add_info(infos, infos_count, "ProcessorVersionRegister", value);
3941 /* don't match 'board*' because there's also "board l2" on some platforms */
3942 return 0;
3946 * avr32: "chip type\t:" => OK
3947 * blackfin: "model name\t:" => OK
3948 * h8300: "CPU:" => OK
3949 * m68k: "CPU:" => OK
3950 * mips: "cpu model\t\t:" => OK
3951 * openrisc: "CPU:" => OK
3952 * sparc: "cpu\t\t:" => OK
3953 * tile: "model name\t:" => OK
3954 * unicore32: "Processor\t:" => OK
3955 * alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:" => "cpu" overwritten by "cpu model", no processor indexes
3956 * cris: "cpu\t\t:" + "cpu model\t:" => only "cpu"
3957 * frv: "CPU-Core:" + "CPU:" => only "CPU"
3958 * mn10300: "cpu core :" + "model name :" => only "model name"
3959 * parisc: "cpu family\t:" + "cpu\t\t:" => only "cpu"
3961 * not supported because of conflicts with other arch minor lines:
3962 * m32r: "cpu family\t:" => KO (adding "cpu family" would break "blackfin")
3963 * microblaze: "CPU-Family:" => KO
3964 * sh: "cpu family\t:" + "cpu type\t:" => KO
3965 * xtensa: "model\t\t:" => KO
3967 static int
3968 hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
3969 struct hwloc_obj_info_s **infos, unsigned *infos_count,
3970 int is_global __hwloc_attribute_unused)
3972 if (!strcmp("model name", prefix)
3973 || !strcmp("Processor", prefix)
3974 || !strcmp("chip type", prefix)
3975 || !strcmp("cpu model", prefix)
3976 || !strcasecmp("cpu", prefix)) {
3977 /* keep the last one, assume it's more precise than the first one.
3978 * we should have the Architecture keypair for basic information anyway.
3980 char **valuep = hwloc__find_info_slot(infos, infos_count, "CPUModel");
3981 if (*valuep)
3982 free(*valuep);
3983 *valuep = strdup(value);
3985 return 0;
3988 /* Lprocs_p set to NULL unless returns > 0 */
3989 static int
3990 hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s *data,
3991 const char *path,
3992 struct hwloc_linux_cpuinfo_proc ** Lprocs_p,
3993 struct hwloc_obj_info_s **global_infos, unsigned *global_infos_count)
3995 FILE *fd;
3996 char *str = NULL;
3997 char *endptr;
3998 unsigned len;
3999 unsigned allocated_Lprocs = 0;
4000 struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
4001 unsigned numprocs = 0;
4002 int curproc = -1;
4003 int (*parse_cpuinfo_func)(const char *, const char *, struct hwloc_obj_info_s **, unsigned *, int) = NULL;
4005 if (!(fd=hwloc_fopen(path,"r", data->root_fd)))
4007 hwloc_debug("could not open %s\n", path);
4008 return -1;
4011 # define PROCESSOR "processor"
4012 # define PACKAGEID "physical id" /* the longest one */
4013 # define COREID "core id"
4014 len = 128; /* vendor/model can be very long */
4015 str = malloc(len);
4016 hwloc_debug("\n\n * Topology extraction from %s *\n\n", path);
4017 while (fgets(str,len,fd)!=NULL) {
4018 unsigned long Ppkg, Pcore, Pproc;
4019 char *end, *dot, *prefix, *value;
4020 int noend = 0;
4022 /* remove the ending \n */
4023 end = strchr(str, '\n');
4024 if (end)
4025 *end = 0;
4026 else
4027 noend = 1;
4028 /* if empty line, skip and reset curproc */
4029 if (!*str) {
4030 curproc = -1;
4031 continue;
4033 /* skip lines with no colon separator */
4034 dot = strchr(str, ':');
4035 if (!dot)
4036 continue;
4037 /* skip lines not starting with a letter */
4038 if ((*str > 'z' || *str < 'a')
4039 && (*str > 'Z' || *str < 'A'))
4040 continue;
4042 /* mark the end of the prefix */
4043 prefix = str;
4044 end = dot;
4045 while (end[-1] == ' ' || end[-1] == '\t') end--; /* need a strrspn() */
4046 *end = 0;
4047 /* find beginning of value, its end is already marked */
4048 value = dot+1 + strspn(dot+1, " \t");
4050 /* defines for parsing numbers */
4051 # define getprocnb_begin(field, var) \
4052 if (!strcmp(field,prefix)) { \
4053 var = strtoul(value,&endptr,0); \
4054 if (endptr==value) { \
4055 hwloc_debug("no number in "field" field of %s\n", path); \
4056 goto err; \
4057 } else if (var==ULONG_MAX) { \
4058 hwloc_debug("too big "field" number in %s\n", path); \
4059 goto err; \
4061 hwloc_debug(field " %lu\n", var)
4062 # define getprocnb_end() \
4064 /* actually parse numbers */
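/* The chained getprocnb_begin()/getprocnb_end() blocks handle the numeric topology keys.
 * Illustrative /proc/cpuinfo excerpt (typical x86 layout, spacing varies per arch):
 *   processor   : 12
 *   physical id : 1
 *   core id     : 4
 */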
4065 getprocnb_begin(PROCESSOR, Pproc);
4066 curproc = numprocs++;
4067 if (numprocs > allocated_Lprocs) {
4068 struct hwloc_linux_cpuinfo_proc * tmp;
4069 if (!allocated_Lprocs)
4070 allocated_Lprocs = 8;
4071 else
4072 allocated_Lprocs *= 2;
4073 tmp = realloc(Lprocs, allocated_Lprocs * sizeof(*Lprocs));
4074 if (!tmp)
4075 goto err;
4076 Lprocs = tmp;
4078 Lprocs[curproc].Pproc = Pproc;
4079 Lprocs[curproc].Pcore = -1;
4080 Lprocs[curproc].Ppkg = -1;
4081 Lprocs[curproc].Lcore = -1;
4082 Lprocs[curproc].Lpkg = -1;
4083 Lprocs[curproc].infos = NULL;
4084 Lprocs[curproc].infos_count = 0;
4085 getprocnb_end() else
4086 getprocnb_begin(PACKAGEID, Ppkg);
4087 Lprocs[curproc].Ppkg = Ppkg;
4088 getprocnb_end() else
4089 getprocnb_begin(COREID, Pcore);
4090 Lprocs[curproc].Pcore = Pcore;
4091 getprocnb_end() else {
4093 /* architecture specific or default routine for parsing cpumodel */
4094 switch (data->arch) {
4095 case HWLOC_LINUX_ARCH_X86:
4096 parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_x86;
4097 break;
4098 case HWLOC_LINUX_ARCH_ARM:
4099 parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_arm;
4100 break;
4101 case HWLOC_LINUX_ARCH_POWER:
4102 parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ppc;
4103 break;
4104 case HWLOC_LINUX_ARCH_IA64:
4105 parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ia64;
4106 break;
4107 default:
4108 parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_generic;
4111 /* we can't assume that we already got a processor index line:
4112 * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
4113 * tile has a global section with model name before the list of processor lines.
4115 parse_cpuinfo_func(prefix, value,
4116 curproc >= 0 ? &Lprocs[curproc].infos : global_infos,
4117 curproc >= 0 ? &Lprocs[curproc].infos_count : global_infos_count,
4118 curproc < 0);
4121 if (noend) {
4122 /* ignore end of line */
4123 if (fscanf(fd,"%*[^\n]") == EOF)
4124 break;
4125 getc(fd);
4128 fclose(fd);
4129 free(str);
4131 *Lprocs_p = Lprocs;
4132 return numprocs;
4134 err:
4135 fclose(fd);
4136 free(str);
4137 free(Lprocs);
4138 *Lprocs_p = NULL;
4139 return -1;
4142 static void
4143 hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
4144 struct hwloc_obj_info_s *global_infos, unsigned global_infos_count)
4146 if (Lprocs) {
4147 unsigned i;
4148 for(i=0; i<numprocs; i++) {
4149 hwloc__free_infos(Lprocs[i].infos, Lprocs[i].infos_count);
4151 free(Lprocs);
4153 hwloc__free_infos(global_infos, global_infos_count);
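/* look_cpuinfo() builds the CPU hierarchy purely from the parsed /proc/cpuinfo records:
 * one PU per "processor" entry, then Package and Core objects derived from the
 * "physical id"/"core id" numbering; a level is skipped entirely whenever any processor
 * lacks the corresponding id (see the missingpkg/missingcore checks below). */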
4156 static int
4157 look_cpuinfo(struct hwloc_topology *topology,
4158 struct hwloc_linux_cpuinfo_proc * Lprocs,
4159 unsigned numprocs, hwloc_bitmap_t online_cpuset)
4161 /* P for physical/OS index, L for logical (i.e. in the order we get them, not in the final hwloc logical order) */
4162 unsigned *Lcore_to_Pcore;
4163 unsigned *Lcore_to_Ppkg; /* needed because Lcore is equivalent to Pcore+Ppkg, not to Pcore alone */
4164 unsigned *Lpkg_to_Ppkg;
4165 unsigned numpkgs=0;
4166 unsigned numcores=0;
4167 unsigned long Lproc;
4168 unsigned missingpkg;
4169 unsigned missingcore;
4170 unsigned i,j;
4171 hwloc_bitmap_t cpuset;
4173 /* initialize misc arrays, there can be at most numprocs entries */
4174 Lcore_to_Pcore = malloc(numprocs * sizeof(*Lcore_to_Pcore));
4175 Lcore_to_Ppkg = malloc(numprocs * sizeof(*Lcore_to_Ppkg));
4176 Lpkg_to_Ppkg = malloc(numprocs * sizeof(*Lpkg_to_Ppkg));
4177 for (i = 0; i < numprocs; i++) {
4178 Lcore_to_Pcore[i] = -1;
4179 Lcore_to_Ppkg[i] = -1;
4180 Lpkg_to_Ppkg[i] = -1;
4183 cpuset = hwloc_bitmap_alloc();
4185 /* create PU objects */
4186 for(Lproc=0; Lproc<numprocs; Lproc++) {
4187 unsigned long Pproc = Lprocs[Lproc].Pproc;
4188 hwloc_obj_t obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, Pproc);
4189 hwloc_bitmap_set(cpuset, Pproc);
4190 obj->cpuset = hwloc_bitmap_alloc();
4191 hwloc_bitmap_only(obj->cpuset, Pproc);
4192 hwloc_debug_2args_bitmap("cpu %lu (os %lu) has cpuset %s\n",
4193 Lproc, Pproc, obj->cpuset);
4194 hwloc_insert_object_by_cpuset(topology, obj);
4197 topology->support.discovery->pu = 1;
4198 hwloc_bitmap_copy(online_cpuset, cpuset);
4199 hwloc_bitmap_free(cpuset);
4201 hwloc_debug("%u online processors found\n", numprocs);
4202 hwloc_debug_bitmap("online processor cpuset: %s\n", online_cpuset);
4204 hwloc_debug("%s", "\n * Topology summary *\n");
4205 hwloc_debug("%u processors\n", numprocs);
4207 /* fill Lprocs[].Lpkg and Lpkg_to_Ppkg */
4208 for(Lproc=0; Lproc<numprocs; Lproc++) {
4209 long Ppkg = Lprocs[Lproc].Ppkg;
4210 if (Ppkg != -1) {
4211 unsigned long Pproc = Lprocs[Lproc].Pproc;
4212 for (i=0; i<numpkgs; i++)
4213 if ((unsigned) Ppkg == Lpkg_to_Ppkg[i])
4214 break;
4215 Lprocs[Lproc].Lpkg = i;
4216 hwloc_debug("%lu on package %u (%lx)\n", Pproc, i, (unsigned long) Ppkg);
4217 if (i==numpkgs) {
4218 Lpkg_to_Ppkg[numpkgs] = Ppkg;
4219 numpkgs++;
4223 /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
4224 * provide bogus information. We should rather drop it. */
4225 missingpkg=0;
4226 for(j=0; j<numprocs; j++)
4227 if (Lprocs[j].Ppkg == -1) {
4228 missingpkg=1;
4229 break;
4231 /* create package objects */
4232 hwloc_debug("%u pkgs%s\n", numpkgs, missingpkg ? ", but some missing package" : "");
4233 if (!missingpkg && numpkgs>0) {
4234 for (i = 0; i < numpkgs; i++) {
4235 struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, Lpkg_to_Ppkg[i]);
4236 int doneinfos = 0;
4237 obj->cpuset = hwloc_bitmap_alloc();
4238 for(j=0; j<numprocs; j++)
4239 if ((unsigned) Lprocs[j].Lpkg == i) {
4240 hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
4241 if (!doneinfos) {
4242 hwloc__move_infos(&obj->infos, &obj->infos_count, &Lprocs[j].infos, &Lprocs[j].infos_count);
4243 doneinfos = 1;
4246 hwloc_debug_1arg_bitmap("Package %u has cpuset %s\n", i, obj->cpuset);
4247 hwloc_insert_object_by_cpuset(topology, obj);
4249 hwloc_debug("%s", "\n");
4252 /* fill Lprocs[].Lcore, Lcore_to_Ppkg and Lcore_to_Pcore */
4253 for(Lproc=0; Lproc<numprocs; Lproc++) {
4254 long Pcore = Lprocs[Lproc].Pcore;
4255 if (Pcore != -1) {
4256 for (i=0; i<numcores; i++)
4257 if ((unsigned) Pcore == Lcore_to_Pcore[i] && (unsigned) Lprocs[Lproc].Ppkg == Lcore_to_Ppkg[i])
4258 break;
4259 Lprocs[Lproc].Lcore = i;
4260 if (i==numcores) {
4261 Lcore_to_Ppkg[numcores] = Lprocs[Lproc].Ppkg;
4262 Lcore_to_Pcore[numcores] = Pcore;
4263 numcores++;
4267 /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
4268 * provide bogus information. We should rather drop it. */
4269 missingcore=0;
4270 for(j=0; j<numprocs; j++)
4271 if (Lprocs[j].Pcore == -1) {
4272 missingcore=1;
4273 break;
4275 /* create Core objects */
4276 hwloc_debug("%u cores%s\n", numcores, missingcore ? ", but some missing core" : "");
4277 if (!missingcore && numcores>0) {
4278 for (i = 0; i < numcores; i++) {
4279 struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, Lcore_to_Pcore[i]);
4280 obj->cpuset = hwloc_bitmap_alloc();
4281 for(j=0; j<numprocs; j++)
4282 if ((unsigned) Lprocs[j].Lcore == i)
4283 hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
4284 hwloc_debug_1arg_bitmap("Core %u has cpuset %s\n", i, obj->cpuset);
4285 hwloc_insert_object_by_cpuset(topology, obj);
4287 hwloc_debug("%s", "\n");
4290 free(Lcore_to_Pcore);
4291 free(Lcore_to_Ppkg);
4292 free(Lpkg_to_Ppkg);
4293 return 0;
4298 /*************************************
4299 ****** Main Topology Discovery ******
4300 *************************************/
4302 static void
4303 hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
4305 char line[64], *tmp, *end;
4306 if (hwloc_read_path_by_length("/proc/elog", line, sizeof(line), data->root_fd) < 0)
4307 return;
4308 if (strncmp(line, "Card ", 5))
4309 return;
4310 tmp = line + 5;
4311 end = strchr(tmp, ':');
4312 if (!end)
4313 return;
4314 *end = '\0';
4315 hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", tmp);
4318 static void
4319 hwloc_gather_system_info(struct hwloc_topology *topology,
4320 struct hwloc_linux_backend_data_s *data)
4322 FILE *file;
4323 char line[128]; /* enough for utsname fields */
4324 const char *env;
4326 /* initialize to something sane, in case !is_thissystem and we can't find things in /proc/hwloc-nofile-info */
4327 memset(&data->utsname, 0, sizeof(data->utsname));
4328 data->fallback_nbprocessors = 1;
4329 data->pagesize = 4096;
4331 /* read thissystem info */
4332 if (topology->is_thissystem) {
4333 uname(&data->utsname);
4334 data->fallback_nbprocessors = hwloc_fallback_nbprocessors(topology);
4335 data->pagesize = hwloc_getpagesize();
4338 /* overwrite with optional /proc/hwloc-nofile-info */
4339 file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
4340 if (file) {
4341 while (fgets(line, sizeof(line), file)) {
4342 char *tmp = strchr(line, '\n');
4343 if (!strncmp("OSName: ", line, 8)) {
4344 if (tmp)
4345 *tmp = '\0';
4346 strncpy(data->utsname.sysname, line+8, sizeof(data->utsname.sysname));
4347 data->utsname.sysname[sizeof(data->utsname.sysname)-1] = '\0';
4348 } else if (!strncmp("OSRelease: ", line, 11)) {
4349 if (tmp)
4350 *tmp = '\0';
4351 strncpy(data->utsname.release, line+11, sizeof(data->utsname.release));
4352 data->utsname.release[sizeof(data->utsname.release)-1] = '\0';
4353 } else if (!strncmp("OSVersion: ", line, 11)) {
4354 if (tmp)
4355 *tmp = '\0';
4356 strncpy(data->utsname.version, line+11, sizeof(data->utsname.version));
4357 data->utsname.version[sizeof(data->utsname.version)-1] = '\0';
4358 } else if (!strncmp("HostName: ", line, 10)) {
4359 if (tmp)
4360 *tmp = '\0';
4361 strncpy(data->utsname.nodename, line+10, sizeof(data->utsname.nodename));
4362 data->utsname.nodename[sizeof(data->utsname.nodename)-1] = '\0';
4363 } else if (!strncmp("Architecture: ", line, 14)) {
4364 if (tmp)
4365 *tmp = '\0';
4366 strncpy(data->utsname.machine, line+14, sizeof(data->utsname.machine));
4367 data->utsname.machine[sizeof(data->utsname.machine)-1] = '\0';
4368 } else if (!strncmp("FallbackNbProcessors: ", line, 22)) {
4369 if (tmp)
4370 *tmp = '\0';
4371 data->fallback_nbprocessors = atoi(line+22);
4372 } else if (!strncmp("PageSize: ", line, 10)) {
4373 if (tmp)
4374 *tmp = '\0';
4375 data->pagesize = strtoull(line+10, NULL, 10);
4376 } else {
4377 hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
4378 /* ignored */
4381 fclose(file);
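/* Optionally dump the values gathered above to the file named by HWLOC_DUMP_NOFILE_INFO,
 * so that a later run on a gathered fsroot can reload them through /proc/hwloc-nofile-info
 * (presumably what topology-gathering tools rely on). */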
4384 env = getenv("HWLOC_DUMP_NOFILE_INFO");
4385 if (env && *env) {
4386 file = fopen(env, "w");
4387 if (file) {
4388 if (*data->utsname.sysname)
4389 fprintf(file, "OSName: %s\n", data->utsname.sysname);
4390 if (*data->utsname.release)
4391 fprintf(file, "OSRelease: %s\n", data->utsname.release);
4392 if (*data->utsname.version)
4393 fprintf(file, "OSVersion: %s\n", data->utsname.version);
4394 if (*data->utsname.nodename)
4395 fprintf(file, "HostName: %s\n", data->utsname.nodename);
4396 if (*data->utsname.machine)
4397 fprintf(file, "Architecture: %s\n", data->utsname.machine);
4398 fprintf(file, "FallbackNbProcessors: %u\n", data->fallback_nbprocessors);
4399 fprintf(file, "PageSize: %llu\n", (unsigned long long) data->pagesize);
4400 fclose(file);
4404 /* detect arch for quirks, using configure #defines if possible, or uname */
4405 #if (defined HWLOC_X86_32_ARCH) || (defined HWLOC_X86_64_ARCH) /* does not cover KNC */
4406 if (topology->is_thissystem)
4407 data->arch = HWLOC_LINUX_ARCH_X86;
4408 #endif
4409 if (data->arch == HWLOC_LINUX_ARCH_UNKNOWN && *data->utsname.machine) {
4410 if (!strcmp(data->utsname.machine, "x86_64")
4411 || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
4412 || !strcmp(data->utsname.machine, "k1om"))
4413 data->arch = HWLOC_LINUX_ARCH_X86;
4414 else if (!strncmp(data->utsname.machine, "arm", 3))
4415 data->arch = HWLOC_LINUX_ARCH_ARM;
4416 else if (!strncmp(data->utsname.machine, "ppc", 3)
4417 || !strncmp(data->utsname.machine, "power", 5))
4418 data->arch = HWLOC_LINUX_ARCH_POWER;
4419 else if (!strcmp(data->utsname.machine, "ia64"))
4420 data->arch = HWLOC_LINUX_ARCH_IA64;
4424 /* returns 0 on success, -1 on non-match or error during hardwired load */
4425 static int
4426 hwloc_linux_try_hardwired_cpuinfo(struct hwloc_backend *backend)
4428 struct hwloc_topology *topology = backend->topology;
4429 struct hwloc_linux_backend_data_s *data = backend->private_data;
4431 if (getenv("HWLOC_NO_HARDWIRED_TOPOLOGY"))
4432 return -1;
4434 if (!strcmp(data->utsname.machine, "s64fx")) {
4435 char line[128];
4436 /* Fujitsu K-computer, FX10, and FX100 use specific processors
4437 * whose Linux topology support is broken until 4.1 (acc455cffa75070d55e74fc7802b49edbc080e92)
4438 * and existing machines will likely never be fixed by kernel upgrade.
4441 /* /proc/cpuinfo starts with one of these lines:
4442 * "cpu : Fujitsu SPARC64 VIIIfx"
4443 * "cpu : Fujitsu SPARC64 XIfx"
4444 * "cpu : Fujitsu SPARC64 IXfx"
4446 if (hwloc_read_path_by_length("/proc/cpuinfo", line, sizeof(line), data->root_fd) < 0)
4447 return -1;
4449 if (strncmp(line, "cpu\t", 4))
4450 return -1;
4452 if (strstr(line, "Fujitsu SPARC64 VIIIfx"))
4453 return hwloc_look_hardwired_fujitsu_k(topology);
4454 else if (strstr(line, "Fujitsu SPARC64 IXfx"))
4455 return hwloc_look_hardwired_fujitsu_fx10(topology);
4456 else if (strstr(line, "FUJITSU SPARC64 XIfx"))
4457 return hwloc_look_hardwired_fujitsu_fx100(topology);
4459 return -1;
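/* Restrict the root allowed_cpuset/allowed_nodeset according to the Linux cpuset/cgroup
 * the target process belongs to, by reading its "cpus" and "mems" masks from the cgroup
 * or cpuset mount point found on the (possibly chrooted) filesystem root. */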
4462 static void hwloc_linux__get_allowed_resources(hwloc_topology_t topology, const char *root_path, int root_fd, char **cpuset_namep)
4464 char *cpuset_mntpnt, *cgroup_mntpnt, *cpuset_name = NULL;
4465 hwloc_find_linux_cpuset_mntpnt(&cgroup_mntpnt, &cpuset_mntpnt, root_path);
4466 if (cgroup_mntpnt || cpuset_mntpnt) {
4467 cpuset_name = hwloc_read_linux_cpuset_name(root_fd, topology->pid);
4468 if (cpuset_name) {
4469 hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "cpus", topology->levels[0][0]->allowed_cpuset);
4470 hwloc_admin_disable_set_from_cpuset(root_fd, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "mems", topology->levels[0][0]->allowed_nodeset);
4472 free(cgroup_mntpnt);
4473 free(cpuset_mntpnt);
4475 *cpuset_namep = cpuset_name;
4478 static int
4479 hwloc_look_linuxfs(struct hwloc_backend *backend)
4481 struct hwloc_topology *topology = backend->topology;
4482 struct hwloc_linux_backend_data_s *data = backend->private_data;
4483 DIR *nodes_dir;
4484 unsigned nbnodes;
4485 char *cpuset_name;
4486 struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
4487 struct hwloc_obj_info_s *global_infos = NULL;
4488 unsigned global_infos_count = 0;
4489 int numprocs;
4490 int already_pus;
4491 int err;
4493 already_pus = (topology->levels[0][0]->complete_cpuset != NULL
4494 && !hwloc_bitmap_iszero(topology->levels[0][0]->complete_cpuset));
4495 /* if there are PUs, still look at memory information
4496 * since x86 misses NUMA node information (unless the processor supports topoext)
4497 * and memory sizes.
4500 /* allocate root sets in case not done yet */
4501 hwloc_alloc_obj_cpusets(topology->levels[0][0]);
4503 /*********************************
4504 * Platform information for later
4506 hwloc_gather_system_info(topology, data);
4508 /**********************
4509 * /proc/cpuinfo
4511 numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
4512 if (numprocs < 0)
4513 numprocs = 0;
4515 /**************************
4516 * detect model for quirks
4518 if (data->arch == HWLOC_LINUX_ARCH_X86 && numprocs > 0) {
4519 unsigned i;
4520 const char *cpuvendor = NULL, *cpufamilynumber = NULL, *cpumodelnumber = NULL;
4521 for(i=0; i<Lprocs[0].infos_count; i++) {
4522 if (!strcmp(Lprocs[0].infos[i].name, "CPUVendor")) {
4523 cpuvendor = Lprocs[0].infos[i].value;
4524 } else if (!strcmp(Lprocs[0].infos[i].name, "CPUFamilyNumber")) {
4525 cpufamilynumber = Lprocs[0].infos[i].value;
4526 } else if (!strcmp(Lprocs[0].infos[i].name, "CPUModelNumber")) {
4527 cpumodelnumber = Lprocs[0].infos[i].value;
4530 if (cpuvendor && !strcmp(cpuvendor, "GenuineIntel")
4531 && cpufamilynumber && !strcmp(cpufamilynumber, "6")
4532 && cpumodelnumber && (!strcmp(cpumodelnumber, "87")
4533 || !strcmp(cpumodelnumber, "133")))
4534 data->is_knl = 1;
4535 if (cpuvendor && !strcmp(cpuvendor, "AuthenticAMD")
4536 && cpufamilynumber
4537 && (!strcmp(cpufamilynumber, "21")
4538 || !strcmp(cpufamilynumber, "22")))
4539 data->is_amd_with_CU = 1;
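/* These quirk flags only record what cpuinfo reported: Intel family 6 models 87/133 are
 * assumed to be Xeon Phi (KNL/KNM) and enable the MCDRAM/L3 special-casing during cache
 * discovery, while AMD families 21/22 are flagged so that thread_siblings sharing a
 * compute unit can be told apart from SMT threads elsewhere in the discovery. */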
4542 /**********************
4543 * Gather the list of admin-disabled cpus and mems
4545 hwloc_linux__get_allowed_resources(topology, data->root_path, data->root_fd, &cpuset_name);
4547 nodes_dir = hwloc_opendir("/proc/nodes", data->root_fd);
4548 if (nodes_dir) {
4549 /* Kerrighed */
4550 struct dirent *dirent;
4551 char path[128];
4552 hwloc_obj_t machine;
4553 hwloc_bitmap_t machine_online_set;
4555 if (already_pus) {
4556 /* we don't support extending kerrighed topologies */
4557 free(cpuset_name);
4558 hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
4559 return 0;
4562 /* replace top-level object type with SYSTEM and add some MACHINE underneath */
4564 topology->levels[0][0]->type = HWLOC_OBJ_SYSTEM;
4565 topology->levels[0][0]->name = strdup("Kerrighed");
4567 /* No cpuset support for now. */
4568 /* No sys support for now. */
4569 while ((dirent = readdir(nodes_dir)) != NULL) {
4570 struct hwloc_linux_cpuinfo_proc * machine_Lprocs = NULL;
4571 struct hwloc_obj_info_s *machine_global_infos = NULL;
4572 unsigned machine_global_infos_count = 0;
4573 int machine_numprocs = 0;
4574 unsigned long node;
4575 if (strncmp(dirent->d_name, "node", 4))
4576 continue;
4577 machine_online_set = hwloc_bitmap_alloc();
4578 node = strtoul(dirent->d_name+4, NULL, 0);
4579 snprintf(path, sizeof(path), "/proc/nodes/node%lu/cpuinfo", node);
4580 machine_numprocs = hwloc_linux_parse_cpuinfo(data, path, &machine_Lprocs, &machine_global_infos, &machine_global_infos_count);
4581 if (machine_numprocs < 0) {
4582 err = -1;
4583 machine_numprocs = 0;
4584 } else {
4585 err = look_cpuinfo(topology, machine_Lprocs, machine_numprocs, machine_online_set);
4588 hwloc_linux_free_cpuinfo(machine_Lprocs, machine_numprocs, machine_global_infos, machine_global_infos_count);
4589 if (err < 0) {
4590 hwloc_bitmap_free(machine_online_set);
4591 continue;
4593 hwloc_bitmap_or(topology->levels[0][0]->online_cpuset, topology->levels[0][0]->online_cpuset, machine_online_set);
4594 machine = hwloc_alloc_setup_object(HWLOC_OBJ_MACHINE, node);
4595 machine->cpuset = machine_online_set;
4596 hwloc_debug_1arg_bitmap("machine number %lu has cpuset %s\n",
4597 node, machine_online_set);
4599 /* Get the machine memory attributes */
4600 hwloc_get_kerrighed_node_meminfo_info(topology, data, node, &machine->memory);
4602 /* Gather DMI info */
4603 /* FIXME: get the right DMI info of each machine */
4604 hwloc__get_dmi_id_info(data, machine);
4606 hwloc_insert_object_by_cpuset(topology, machine);
4608 closedir(nodes_dir);
4609 } else {
4610 /*********************
4611 * Memory information
4614 /* Get the machine memory attributes */
4615 hwloc_get_procfs_meminfo_info(topology, data, &topology->levels[0][0]->memory);
4617 /* Gather NUMA information. Must be after hwloc_get_procfs_meminfo_info so that the hugepage size is known */
4618 if (look_sysfsnode(topology, data, "/sys/bus/node/devices", &nbnodes) < 0)
4619 look_sysfsnode(topology, data, "/sys/devices/system/node", &nbnodes);
4621 /* if we found some numa nodes, the machine object has no local memory */
4622 if (nbnodes) {
4623 unsigned i;
4624 topology->levels[0][0]->memory.local_memory = 0;
4625 if (topology->levels[0][0]->memory.page_types)
4626 for(i=0; i<topology->levels[0][0]->memory.page_types_len; i++)
4627 topology->levels[0][0]->memory.page_types[i].count = 0;
4630 /**********************
4631 * CPU information
4634 /* Don't rediscover CPU resources if already done */
4635 if (already_pus)
4636 goto done;
4638 /* Gather the list of cpus now */
4639 err = hwloc_linux_try_hardwired_cpuinfo(backend);
4640 if (!err)
4641 goto done;
4643 /* setup root info */
4644 hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
4645 &global_infos, &global_infos_count);
4647 if (getenv("HWLOC_LINUX_USE_CPUINFO")
4648 || (hwloc_access("/sys/devices/system/cpu/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0
4649 && hwloc_access("/sys/devices/system/cpu/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
4650 && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
4651 && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0)) {
4652 /* revert to reading cpuinfo only if /sys/.../topology unavailable (before 2.6.16)
4653 * or not containing anything interesting */
4654 if (numprocs > 0)
4655 err = look_cpuinfo(topology, Lprocs, numprocs, topology->levels[0][0]->online_cpuset);
4656 else
4657 err = -1;
4658 if (err < 0)
4659 hwloc_setup_pu_level(topology, data->fallback_nbprocessors);
4660 look_powerpc_device_tree(topology, data);
4662 } else {
4663 /* sysfs */
4664 if (look_sysfscpu(topology, data, "/sys/bus/cpu/devices", Lprocs, numprocs) < 0)
4665 if (look_sysfscpu(topology, data, "/sys/devices/system/cpu", Lprocs, numprocs) < 0)
4666 /* sysfs but we failed to read cpu topology, fallback */
4667 hwloc_setup_pu_level(topology, data->fallback_nbprocessors);
4670 done:
4672 /**********************
4673 * Misc
4676 /* Gather DMI info */
4677 hwloc__get_dmi_id_info(data, topology->levels[0][0]);
4678 if (hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))
4679 hwloc__get_firmware_dmi_memory_info(topology, data);
4682 hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
4683 if (cpuset_name) {
4684 hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
4685 free(cpuset_name);
4688 hwloc__linux_get_mic_sn(topology, data);
4690 /* data->utsname was filled with real uname or \0, so we can safely pass it */
4691 hwloc_add_uname_info(topology, &data->utsname);
4693 hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
4694 return 1;
4699 /****************************************
4700 ***** Linux PCI backend callbacks ******
4701 ****************************************
4702 * Do not support changing the fsroot (use sysfs)
4705 static hwloc_obj_t
4706 hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
4708 struct hwloc_topology *topology = backend->topology;
4709 struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
4710 obj->name = strdup(name);
4711 obj->logical_index = -1;
4712 obj->attr->osdev.type = type;
4714 hwloc_insert_object_by_parent(topology, pcidev, obj);
4715 /* insert_object_by_parent() doesn't merge during insert, so obj is still valid */
4717 return obj;
4720 typedef void (*hwloc_linux_class_fillinfos_t)(struct hwloc_backend *backend, struct hwloc_obj *osdev, const char *osdevpath);
4722 /* cannot be used in fsroot-aware code, would have to move to a per-topology variable */
4724 static void
4725 hwloc_linux_check_deprecated_classlinks_model(struct hwloc_linux_backend_data_s *data)
4727 int root_fd = data->root_fd;
4728 DIR *dir;
4729 struct dirent *dirent;
4730 char path[128];
4731 struct stat st;
4733 data->deprecated_classlinks_model = -1;
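/* Probe /sys/class/net to find out which sysfs layout this kernel uses:
 * model 0 if device links look like <device>/net/<name> (modern),
 * model 1 if they look like <device>/net:<name> (deprecated). */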
4735 dir = hwloc_opendir("/sys/class/net", root_fd);
4736 if (!dir)
4737 return;
4738 while ((dirent = readdir(dir)) != NULL) {
4739 int err;
4740 if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, "..") || !strcmp(dirent->d_name, "lo"))
4741 continue;
4742 err = snprintf(path, sizeof(path), "/sys/class/net/%s/device/net/%s", dirent->d_name, dirent->d_name);
4743 if ((size_t) err < sizeof(path)
4744 && hwloc_stat(path, &st, root_fd) == 0) {
4745 data->deprecated_classlinks_model = 0;
4746 goto out;
4748 err = snprintf(path, sizeof(path), "/sys/class/net/%s/device/net:%s", dirent->d_name, dirent->d_name);
4749 if ((size_t) err < sizeof(path)
4750 && hwloc_stat(path, &st, root_fd) == 0) {
4751 data->deprecated_classlinks_model = 1;
4752 goto out;
4755 out:
4756 closedir(dir);
4759 /* class objects that are immediately below pci devices:
4760 * look for objects of the given classname below a sysfs (pcidev) directory
4762 static int
4763 hwloc_linux_class_readdir(struct hwloc_backend *backend,
4764 struct hwloc_obj *pcidev, const char *devicepath,
4765 hwloc_obj_osdev_type_t type, const char *classname,
4766 hwloc_linux_class_fillinfos_t fillinfo)
4768 struct hwloc_linux_backend_data_s *data = backend->private_data;
4769 int root_fd = data->root_fd;
4770 size_t classnamelen = strlen(classname);
4771 char path[256];
4772 DIR *dir;
4773 struct dirent *dirent;
4774 hwloc_obj_t obj;
4775 int res = 0, err;
4777 if (data->deprecated_classlinks_model == -2)
4778 hwloc_linux_check_deprecated_classlinks_model(data);
4780 if (data->deprecated_classlinks_model != 1) {
4781 /* modern sysfs: <device>/<class>/<name> */
4782 struct stat st;
4784 err = snprintf(path, sizeof(path), "%s/%s", devicepath, classname);
4785 if ((size_t) err >= sizeof(path))
4786 goto trydeprecated;
4788 /* some very old kernels (2.6.9/RHEL4) have a <device>/<class> symlink without any way to find <name>.
4789 * make sure <device>/<class> is a directory to avoid this case.
4791 err = hwloc_lstat(path, &st, root_fd);
4792 if (err < 0 || !S_ISDIR(st.st_mode))
4793 goto trydeprecated;
4795 dir = hwloc_opendir(path, root_fd);
4796 if (dir) {
4797 data->deprecated_classlinks_model = 0;
4798 while ((dirent = readdir(dir)) != NULL) {
4799 if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
4800 continue;
4801 obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name);
4802 if (fillinfo) {
4803 err = snprintf(path, sizeof(path), "%s/%s/%s", devicepath, classname, dirent->d_name);
4804 if ((size_t) err < sizeof(path))
4805 fillinfo(backend, obj, path);
4807 res++;
4809 closedir(dir);
4810 return res;
4814 trydeprecated:
4815 if (data->deprecated_classlinks_model != 0) {
4816 /* deprecated sysfs: <device>/<class>:<name> */
4817 dir = hwloc_opendir(devicepath, root_fd);
4818 if (dir) {
4819 while ((dirent = readdir(dir)) != NULL) {
4820 if (strncmp(dirent->d_name, classname, classnamelen) || dirent->d_name[classnamelen] != ':')
4821 continue;
4822 data->deprecated_classlinks_model = 1;
4823 obj = hwloc_linux_add_os_device(backend, pcidev, type, dirent->d_name + classnamelen+1);
4824 if (fillinfo) {
4825 err = snprintf(path, sizeof(path), "%s/%s", devicepath, dirent->d_name);
4826 if ((size_t) err < sizeof(path))
4827 fillinfo(backend, obj, path);
4829 res++;
4831 closedir(dir);
4832 return res;
4836 return 0;
4840 * look for net objects below a pcidev in sysfs
4842 static void
4843 hwloc_linux_net_class_fillinfos(struct hwloc_backend *backend,
4844 struct hwloc_obj *obj, const char *osdevpath)
4846 struct hwloc_linux_backend_data_s *data = backend->private_data;
4847 int root_fd = data->root_fd;
4848 struct stat st;
4849 char path[256];
4850 char address[128];
4851 snprintf(path, sizeof(path), "%s/address", osdevpath);
4852 if (!hwloc_read_path_by_length(path, address, sizeof(address), root_fd)) {
4853 char *eol = strchr(address, '\n');
4854 if (eol)
4855 *eol = 0;
4856 hwloc_obj_add_info(obj, "Address", address);
4858 snprintf(path, sizeof(path), "%s/device/infiniband", osdevpath);
4859 if (!hwloc_stat(path, &st, root_fd)) {
4860 char hexid[16];
4861 snprintf(path, sizeof(path), "%s/dev_id", osdevpath);
4862 if (!hwloc_read_path_by_length(path, hexid, sizeof(hexid), root_fd)) {
4863 char *eoid;
4864 unsigned long port;
4865 port = strtoul(hexid, &eoid, 0);
4866 if (eoid != hexid) {
4867 char portstr[16];
4868 snprintf(portstr, sizeof(portstr), "%lu", port+1);
4869 hwloc_obj_add_info(obj, "Port", portstr);
4875 static int
4876 hwloc_linux_lookup_net_class(struct hwloc_backend *backend,
4877 struct hwloc_obj *pcidev, const char *pcidevpath)
4879 return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_NETWORK, "net", hwloc_linux_net_class_fillinfos);
4883 * look for infiniband objects below a pcidev in sysfs
4885 static void
4886 hwloc_linux_infiniband_class_fillinfos(struct hwloc_backend *backend,
4887 struct hwloc_obj *obj, const char *osdevpath)
4889 struct hwloc_linux_backend_data_s *data = backend->private_data;
4890 int root_fd = data->root_fd;
4891 char path[256];
4892 char guidvalue[20];
4893 unsigned i,j;
4895 snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
4896 if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
4897 size_t len;
4898 len = strspn(guidvalue, "0123456789abcdefx:");
4899 guidvalue[len] = '\0';
4900 hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
4903 snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
4904 if (!hwloc_read_path_by_length(path, guidvalue, sizeof(guidvalue), root_fd)) {
4905 size_t len;
4906 len = strspn(guidvalue, "0123456789abcdefx:");
4907 guidvalue[len] = '\0';
4908 hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
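/* Ports are numbered from 1; the loop stops at the first missing ports/<i>/state file.
 * For each port we record the state, LID, LID mask count, and any initialized GIDs. */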
4911 for(i=1; ; i++) {
4912 char statevalue[2];
4913 char lidvalue[11];
4914 char gidvalue[40];
4916 snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
4917 if (!hwloc_read_path_by_length(path, statevalue, sizeof(statevalue), root_fd)) {
4918 char statename[32];
4919 statevalue[1] = '\0'; /* only keep the first byte/digit */
4920 snprintf(statename, sizeof(statename), "Port%uState", i);
4921 hwloc_obj_add_info(obj, statename, statevalue);
4922 } else {
4923 /* no such port */
4924 break;
4927 snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
4928 if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
4929 char lidname[32];
4930 size_t len;
4931 len = strspn(lidvalue, "0123456789abcdefx");
4932 lidvalue[len] = '\0';
4933 snprintf(lidname, sizeof(lidname), "Port%uLID", i);
4934 hwloc_obj_add_info(obj, lidname, lidvalue);
4937 snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
4938 if (!hwloc_read_path_by_length(path, lidvalue, sizeof(lidvalue), root_fd)) {
4939 char lidname[32];
4940 size_t len;
4941 len = strspn(lidvalue, "0123456789");
4942 lidvalue[len] = '\0';
4943 snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
4944 hwloc_obj_add_info(obj, lidname, lidvalue);
4947 for(j=0; ; j++) {
4948 snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
4949 if (!hwloc_read_path_by_length(path, gidvalue, sizeof(gidvalue), root_fd)) {
4950 char gidname[32];
4951 size_t len;
4952 len = strspn(gidvalue, "0123456789abcdefx:");
4953 gidvalue[len] = '\0';
4954 if (strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) {
4955 /* only keep initialized GIDs */
4956 snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
4957 hwloc_obj_add_info(obj, gidname, gidvalue);
4959 } else {
4960 /* no such port */
4961 break;
4967 static int
4968 hwloc_linux_lookup_openfabrics_class(struct hwloc_backend *backend,
4969 struct hwloc_obj *pcidev, const char *pcidevpath)
4971 return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_OPENFABRICS, "infiniband", hwloc_linux_infiniband_class_fillinfos);
4974 /* look for dma objects below a pcidev in sysfs */
4975 static int
4976 hwloc_linux_lookup_dma_class(struct hwloc_backend *backend,
4977 struct hwloc_obj *pcidev, const char *pcidevpath)
4979 return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_DMA, "dma", NULL);
4982 /* look for drm objects below a pcidev in sysfs */
4983 static int
4984 hwloc_linux_lookup_drm_class(struct hwloc_backend *backend,
4985 struct hwloc_obj *pcidev, const char *pcidevpath)
4987 return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_GPU, "drm", NULL);
4989 /* we could look at the "graphics" class too, but it doesn't help for proprietary drivers either */
4991 /* GPU devices (even with a proprietary driver) seem to have a boot_vga field in their PCI device directory (since 2.6.30),
4992 * so we could create an OS device for each PCI device with such a field.
4993 * boot_vga is actually created when class >> 8 == VGA (it contains 1 for the boot VGA device), so it's trivial anyway.
4998 * look for block objects below a pcidev in sysfs
5001 static void
5002 hwloc_linux_block_class_fillinfos(struct hwloc_backend *backend,
5003 struct hwloc_obj *obj, const char *osdevpath)
5005 struct hwloc_linux_backend_data_s *data = backend->private_data;
5006 int root_fd = data->root_fd;
5007 FILE *file;
5008 char path[256];
5009 char line[128];
5010 char vendor[64] = "";
5011 char model[64] = "";
5012 char serial[64] = "";
5013 char revision[64] = "";
5014 char blocktype[64] = "";
5015 unsigned major_id, minor_id;
5016 char *tmp;
5018 snprintf(path, sizeof(path), "%s/dev", osdevpath);
5019 if (hwloc_read_path_by_length(path, line, sizeof(line), root_fd) < 0)
5020 goto done;
5022 if (sscanf(line, "%u:%u", &major_id, &minor_id) != 2)
5023 goto done;
5024 tmp = strchr(line, '\n');
5025 if (tmp)
5026 *tmp = '\0';
5027 hwloc_obj_add_info(obj, "LinuxDeviceID", line);
5029 #ifdef HWLOC_HAVE_LIBUDEV
5030 if (data->udev) {
5031 struct udev_device *dev;
5032 const char *prop;
5033 dev = udev_device_new_from_subsystem_sysname(data->udev, "block", obj->name);
5034 if (!dev)
5035 goto done;
5036 prop = udev_device_get_property_value(dev, "ID_VENDOR");
5037 if (prop) {
5038 strncpy(vendor, prop, sizeof(vendor));
5039 vendor[sizeof(vendor)-1] = '\0';
5041 prop = udev_device_get_property_value(dev, "ID_MODEL");
5042 if (prop) {
5043 strncpy(model, prop, sizeof(model));
5044 model[sizeof(model)-1] = '\0';
5046 prop = udev_device_get_property_value(dev, "ID_REVISION");
5047 if (prop) {
5048 strncpy(revision, prop, sizeof(revision));
5049 revision[sizeof(revision)-1] = '\0';
5051 prop = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
5052 if (prop) {
5053 strncpy(serial, prop, sizeof(serial));
5054 serial[sizeof(serial)-1] = '\0';
5056 prop = udev_device_get_property_value(dev, "ID_TYPE");
5057 if (prop) {
5058 strncpy(blocktype, prop, sizeof(blocktype));
5059 blocktype[sizeof(blocktype)-1] = '\0';
5062 udev_device_unref(dev);
5063 } else
5064 /* fallback to reading files, works with any fsroot */
5065 #endif
5067 snprintf(path, sizeof(path), "/run/udev/data/b%u:%u", major_id, minor_id);
5068 file = hwloc_fopen(path, "r", root_fd);
5069 if (!file)
5070 goto done;
5072 while (NULL != fgets(line, sizeof(line), file)) {
5073 tmp = strchr(line, '\n');
5074 if (tmp)
5075 *tmp = '\0';
5076 if (!strncmp(line, "E:ID_VENDOR=", strlen("E:ID_VENDOR="))) {
5077 strncpy(vendor, line+strlen("E:ID_VENDOR="), sizeof(vendor));
5078 vendor[sizeof(vendor)-1] = '\0';
5079 } else if (!strncmp(line, "E:ID_MODEL=", strlen("E:ID_MODEL="))) {
5080 strncpy(model, line+strlen("E:ID_MODEL="), sizeof(model));
5081 model[sizeof(model)-1] = '\0';
5082 } else if (!strncmp(line, "E:ID_REVISION=", strlen("E:ID_REVISION="))) {
5083 strncpy(revision, line+strlen("E:ID_REVISION="), sizeof(revision));
5084 revision[sizeof(revision)-1] = '\0';
5085 } else if (!strncmp(line, "E:ID_SERIAL_SHORT=", strlen("E:ID_SERIAL_SHORT="))) {
5086 strncpy(serial, line+strlen("E:ID_SERIAL_SHORT="), sizeof(serial));
5087 serial[sizeof(serial)-1] = '\0';
5088 } else if (!strncmp(line, "E:ID_TYPE=", strlen("E:ID_TYPE="))) {
5089 strncpy(blocktype, line+strlen("E:ID_TYPE="), sizeof(blocktype));
5090 blocktype[sizeof(blocktype)-1] = '\0';
5093 fclose(file);
5096 done:
5097 /* clear fake "ATA" vendor name */
5098 if (!strcasecmp(vendor, "ATA"))
5099 *vendor = '\0';
5100 /* overwrite vendor name from model when possible */
5101 if (!*vendor) {
5102 if (!strncasecmp(model, "wd", 2))
5103 strcpy(vendor, "Western Digital");
5104 else if (!strncasecmp(model, "st", 2))
5105 strcpy(vendor, "Seagate");
5106 else if (!strncasecmp(model, "samsung", 7))
5107 strcpy(vendor, "Samsung");
5108 else if (!strncasecmp(model, "sandisk", 7))
5109 strcpy(vendor, "SanDisk");
5110 else if (!strncasecmp(model, "toshiba", 7))
5111 strcpy(vendor, "Toshiba");
5114 if (*vendor)
5115 hwloc_obj_add_info(obj, "Vendor", vendor);
5116 if (*model)
5117 hwloc_obj_add_info(obj, "Model", model);
5118 if (*revision)
5119 hwloc_obj_add_info(obj, "Revision", revision);
5120 if (*serial)
5121 hwloc_obj_add_info(obj, "SerialNumber", serial);
5123 if (!strcmp(blocktype, "disk") || !strncmp(obj->name, "nvme", 4))
5124 hwloc_obj_add_info(obj, "Type", "Disk");
5125 else if (!strcmp(blocktype, "tape"))
5126 hwloc_obj_add_info(obj, "Type", "Tape");
5127 else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
5128 hwloc_obj_add_info(obj, "Type", "Removable Media Device");
5129 else /* generic, usb mass storage/rbc, usb mass storage/scsi */
5130 hwloc_obj_add_info(obj, "Type", "Other");
5133 /* block class objects are in
5134 * host%d/target%d:%d:%d/%d:%d:%d:%d/
5135 * or
5136 * host%d/port-%d:%d/end_device-%d:%d/target%d:%d:%d/%d:%d:%d:%d/
5137 * or
5138 * ide%d/%d.%d/
5139 * below pci devices */
5140 static int
5141 hwloc_linux_lookup_host_block_class(struct hwloc_backend *backend,
5142 struct hwloc_obj *pcidev, char *path, size_t pathlen)
5144 struct hwloc_linux_backend_data_s *data = backend->private_data;
5145 int root_fd = data->root_fd;
5146 DIR *hostdir, *portdir, *targetdir;
5147 struct dirent *hostdirent, *portdirent, *targetdirent;
5148 size_t hostdlen, portdlen, targetdlen;
5149 int dummy;
5150 int res = 0;
5152 hostdir = hwloc_opendir(path, root_fd);
5153 if (!hostdir)
5154 return 0;
5156 while ((hostdirent = readdir(hostdir)) != NULL) {
5157 if (sscanf(hostdirent->d_name, "port-%d:%d", &dummy, &dummy) == 2)
5159 /* found host%d/port-%d:%d */
5160 path[pathlen] = '/';
5161 strcpy(&path[pathlen+1], hostdirent->d_name);
5162 pathlen += hostdlen = 1+strlen(hostdirent->d_name);
5163 portdir = hwloc_opendir(path, root_fd);
5164 if (!portdir)
5165 continue;
5166 while ((portdirent = readdir(portdir)) != NULL) {
5167 if (sscanf(portdirent->d_name, "end_device-%d:%d", &dummy, &dummy) == 2) {
5168 /* found host%d/port-%d:%d/end_device-%d:%d */
5169 path[pathlen] = '/';
5170 strcpy(&path[pathlen+1], portdirent->d_name);
5171 pathlen += portdlen = 1+strlen(portdirent->d_name);
5172 res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
5173 /* restore parent path */
5174 pathlen -= portdlen;
5175 path[pathlen] = '\0';
5178 closedir(portdir);
5179 /* restore parent path */
5180 pathlen -= hostdlen;
5181 path[pathlen] = '\0';
5182 continue;
5183 } else if (sscanf(hostdirent->d_name, "target%d:%d:%d", &dummy, &dummy, &dummy) == 3) {
5184 /* found host%d/target%d:%d:%d */
5185 path[pathlen] = '/';
5186 strcpy(&path[pathlen+1], hostdirent->d_name);
5187 pathlen += hostdlen = 1+strlen(hostdirent->d_name);
5188 targetdir = hwloc_opendir(path, root_fd);
5189 if (!targetdir)
5190 continue;
5191 while ((targetdirent = readdir(targetdir)) != NULL) {
5192 if (sscanf(targetdirent->d_name, "%d:%d:%d:%d", &dummy, &dummy, &dummy, &dummy) != 4)
5193 continue;
5194 /* found host%d/target%d:%d:%d/%d:%d:%d:%d */
5195 path[pathlen] = '/';
5196 strcpy(&path[pathlen+1], targetdirent->d_name);
5197 pathlen += targetdlen = 1+strlen(targetdirent->d_name);
5198 /* lookup block class for real */
5199 res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", hwloc_linux_block_class_fillinfos);
5200 /* restore parent path */
5201 pathlen -= targetdlen;
5202 path[pathlen] = '\0';
5204 closedir(targetdir);
5205 /* restore parent path */
5206 pathlen -= hostdlen;
5207 path[pathlen] = '\0';
5210 closedir(hostdir);
5212 return res;
5215 static int
5216 hwloc_linux_lookup_block_class(struct hwloc_backend *backend,
5217 struct hwloc_obj *pcidev, const char *pcidevpath)
5219 struct hwloc_linux_backend_data_s *data = backend->private_data;
5220 int root_fd = data->root_fd;
5221 size_t pathlen;
5222 DIR *devicedir, *hostdir, *nvmedir;
5223 struct dirent *devicedirent, *hostdirent;
5224 size_t devicedlen, hostdlen;
5225 char path[256];
5226 int dummy;
5227 int res = 0;
5229 strcpy(path, pcidevpath);
5230 pathlen = strlen(path);
5232 /* look for a NVMe class (Linux 4.0+) under nvme/nvme%d/nvme%dn%d/ */
5233 strcpy(&path[pathlen], "/nvme");
5234 nvmedir = hwloc_opendir(path, root_fd);
5235 if (nvmedir) {
5236 struct dirent *nvmedirent;
5237 while ((nvmedirent = readdir(nvmedir)) != NULL) {
5238 DIR *nvmesubdir;
5239 if (strncmp(nvmedirent->d_name, "nvme", 4))
5240 continue;
5241 path[pathlen+5] = '/';
5242 strcpy(&path[pathlen+6], nvmedirent->d_name);
5243 nvmesubdir = hwloc_opendir(path, root_fd);
5244 if (nvmesubdir) {
5245 struct dirent *nvmesubdirent;
5246 while ((nvmesubdirent = readdir(nvmesubdir)) != NULL) {
5247 hwloc_obj_t obj;
5248 size_t nvmednamelen = strlen(nvmedirent->d_name);
5249 if (strncmp(nvmedirent->d_name, nvmesubdirent->d_name, nvmednamelen))
5250 continue;
5251 obj = hwloc_linux_add_os_device(backend, pcidev, HWLOC_OBJ_OSDEV_BLOCK, nvmesubdirent->d_name);
5252 if (obj) {
5253 path[pathlen+6+nvmednamelen] = '/';
5254 strcpy(&path[pathlen+6+nvmednamelen+1], nvmesubdirent->d_name);
5255 hwloc_linux_block_class_fillinfos(backend, obj, path);
5256 res++;
5259 closedir(nvmesubdir);
5262 closedir(nvmedir);
5263 return res;
5265 path[pathlen] = '\0';
5267 /* look for a direct block device here (such as NVMe before Linux 4.0,
5268 * or something without controller subdirs in the middle)
5270 res += hwloc_linux_class_readdir(backend, pcidev, path,
5271 HWLOC_OBJ_OSDEV_BLOCK, "block",
5272 hwloc_linux_block_class_fillinfos);
5273 if (res)
5274 return res;
5275 /* otherwise try to find controller subdirectories */
5277 devicedir = hwloc_opendir(pcidevpath, root_fd);
5278 if (!devicedir)
5279 return 0;
5281 while ((devicedirent = readdir(devicedir)) != NULL) {
5282 if (sscanf(devicedirent->d_name, "ide%d", &dummy) == 1) {
5283 /* found ide%d */
5284 path[pathlen] = '/';
5285 strcpy(&path[pathlen+1], devicedirent->d_name);
5286 pathlen += devicedlen = 1+strlen(devicedirent->d_name);
5287 hostdir = hwloc_opendir(path, root_fd);
5288 if (!hostdir)
5289 continue;
5290 while ((hostdirent = readdir(hostdir)) != NULL) {
5291 if (sscanf(hostdirent->d_name, "%d.%d", &dummy, &dummy) == 2) {
5292 /* found ide%d/%d.%d */
5293 path[pathlen] = '/';
5294 strcpy(&path[pathlen+1], hostdirent->d_name);
5295 pathlen += hostdlen = 1+strlen(hostdirent->d_name);
5296 /* lookup block class for real */
5297 res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", NULL);
5298 /* restore parent path */
5299 pathlen -= hostdlen;
5300 path[pathlen] = '\0';
5303 closedir(hostdir);
5304 /* restore parent path */
5305 pathlen -= devicedlen;
5306 path[pathlen] = '\0';
5307 } else if (sscanf(devicedirent->d_name, "host%d", &dummy) == 1) {
5308 /* found host%d */
5309 path[pathlen] = '/';
5310 strcpy(&path[pathlen+1], devicedirent->d_name);
5311 pathlen += devicedlen = 1+strlen(devicedirent->d_name);
5312 res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
5313 /* restore parent path */
5314 pathlen -= devicedlen;
5315 path[pathlen] = '\0';
5316 } else if (sscanf(devicedirent->d_name, "ata%d", &dummy) == 1) {
5317 /* found ata%d */
5318 path[pathlen] = '/';
5319 strcpy(&path[pathlen+1], devicedirent->d_name);
5320 pathlen += devicedlen = 1+strlen(devicedirent->d_name);
5321 hostdir = hwloc_opendir(path, root_fd);
5322 if (!hostdir)
5323 continue;
5324 while ((hostdirent = readdir(hostdir)) != NULL) {
5325 if (sscanf(hostdirent->d_name, "host%d", &dummy) == 1) {
5326 /* found ata%d/host%d */
5327 path[pathlen] = '/';
5328 strcpy(&path[pathlen+1], hostdirent->d_name);
5329 pathlen += hostdlen = 1+strlen(hostdirent->d_name);
5330 /* lookup block class for real */
5331 res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
5332 /* restore parent path */
5333 pathlen -= hostdlen;
5334 path[pathlen] = '\0';
5335 }
5336 }
5337 closedir(hostdir);
5338 /* restore parent path */
5339 pathlen -= devicedlen;
5340 path[pathlen] = '\0';
5341 }
5342 }
5343 closedir(devicedir);
5345 return res;
5346 }
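/* Fill in the sysfs attributes of a Xeon Phi (MIC) coprocessor OS device: family, SKU, serial number, active core count and memory size, each read from a file below the device directory. */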
5348 static void
5349 hwloc_linux_mic_class_fillinfos(struct hwloc_backend *backend,
5350 struct hwloc_obj *obj, const char *osdevpath)
5351 {
5352 struct hwloc_linux_backend_data_s *data = backend->private_data;
5353 int root_fd = data->root_fd;
5354 char path[256];
5355 char family[64];
5356 char sku[64];
5357 char sn[64];
5358 char string[20];
5360 hwloc_obj_add_info(obj, "CoProcType", "MIC");
5362 snprintf(path, sizeof(path), "%s/family", osdevpath);
5363 if (!hwloc_read_path_by_length(path, family, sizeof(family), root_fd)) {
5364 char *eol = strchr(family, '\n');
5365 if (eol)
5366 *eol = 0;
5367 hwloc_obj_add_info(obj, "MICFamily", family);
5370 snprintf(path, sizeof(path), "%s/sku", osdevpath);
5371 if (!hwloc_read_path_by_length(path, sku, sizeof(sku), root_fd)) {
5372 char *eol = strchr(sku, '\n');
5373 if (eol)
5374 *eol = 0;
5375 hwloc_obj_add_info(obj, "MICSKU", sku);
5378 snprintf(path, sizeof(path), "%s/serialnumber", osdevpath);
5379 if (!hwloc_read_path_by_length(path, sn, sizeof(sn), root_fd)) {
5380 char *eol;
5381 eol = strchr(sn, '\n');
5382 if (eol)
5383 *eol = 0;
5384 hwloc_obj_add_info(obj, "MICSerialNumber", sn);
5387 snprintf(path, sizeof(path), "%s/active_cores", osdevpath);
5388 if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
5389 unsigned long count = strtoul(string, NULL, 16);
5390 snprintf(string, sizeof(string), "%lu", count);
5391 hwloc_obj_add_info(obj, "MICActiveCores", string);
5394 snprintf(path, sizeof(path), "%s/memsize", osdevpath);
5395 if (!hwloc_read_path_by_length(path, string, sizeof(string), root_fd)) {
5396 unsigned long count = strtoul(string, NULL, 16);
5397 snprintf(string, sizeof(string), "%lu", count);
5398 hwloc_obj_add_info(obj, "MICMemorySize", string);
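/* Look for MIC coprocessor OS devices through the usual "mic" class directory below the PCI device. */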
5402 static int
5403 hwloc_linux_lookup_mic_class(struct hwloc_backend *backend,
5404 struct hwloc_obj *pcidev, const char *pcidevpath)
5405 {
5406 return hwloc_linux_class_readdir(backend, pcidev, pcidevpath, HWLOC_OBJ_OSDEV_COPROC, "mic", hwloc_linux_mic_class_fillinfos);
5407 }
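/* Fallback for old MIC drivers whose pcidev sysfs directories lack mic/mic%u entries: scan the global mic class once to find the highest mic%u index, then match each mic%u back to this PCI device through its pci_<bus>:<dev>.<func> attribute. */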
5409 static int
5410 hwloc_linux_directlookup_mic_class(struct hwloc_backend *backend,
5411 struct hwloc_obj *pcidev)
5412 {
5413 struct hwloc_linux_backend_data_s *data = backend->private_data;
5414 int root_fd = data->root_fd;
5415 char path[256];
5416 struct stat st;
5417 hwloc_obj_t obj;
5418 unsigned idx;
5419 int res = 0;
5421 if (!data->mic_directlookup_id_max)
5422 /* already tried, nothing to do */
5423 return 0;
5425 if (data->mic_directlookup_id_max == (unsigned) -1) {
5426 /* never tried, find out the max id */
5427 DIR *dir;
5428 struct dirent *dirent;
5430 /* make sure we never do this lookup again */
5431 data->mic_directlookup_id_max = 0;
5433 /* read the entire class and find the max id of mic%u dirents */
5434 dir = hwloc_opendir("/sys/devices/virtual/mic", root_fd);
5435 if (!dir) {
5436 dir = hwloc_opendir("/sys/class/mic", root_fd);
5437 if (!dir)
5438 return 0;
5439 }
5440 while ((dirent = readdir(dir)) != NULL) {
5441 if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
5442 continue;
5443 if (sscanf(dirent->d_name, "mic%u", &idx) != 1)
5444 continue;
5445 if (idx >= data->mic_directlookup_id_max)
5446 data->mic_directlookup_id_max = idx+1;
5447 }
5448 closedir(dir);
5449 }
5451 /* now iterate over the mic ids and see if one matches our pcidev */
5452 for(idx=0; idx<data->mic_directlookup_id_max; idx++) {
5453 snprintf(path, sizeof(path), "/sys/class/mic/mic%u/pci_%02x:%02x.%02x",
5454 idx, pcidev->attr->pcidev.bus, pcidev->attr->pcidev.dev, pcidev->attr->pcidev.func);
5455 if (hwloc_stat(path, &st, root_fd) < 0)
5456 continue;
5457 snprintf(path, sizeof(path), "mic%u", idx);
5458 obj = hwloc_linux_add_os_device(backend, pcidev, HWLOC_OBJ_OSDEV_COPROC, path);
5459 snprintf(path, sizeof(path), "/sys/class/mic/mic%u", idx);
5460 hwloc_linux_mic_class_fillinfos(backend, obj, path);
5461 res++;
5462 }
5464 return res;
5465 }
5467 /*
5468 * backend callback for inserting objects inside a pci device
5469 */
5470 static int
5471 hwloc_linux_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
5472 struct hwloc_obj *obj)
5473 {
5474 struct hwloc_linux_backend_data_s *data = backend->private_data;
5475 char pcidevpath[256];
5476 int res = 0;
5478 /* this callback is only used in the libpci backend for now */
5479 assert(obj->type == HWLOC_OBJ_PCI_DEVICE);
5481 snprintf(pcidevpath, sizeof(pcidevpath), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
5482 obj->attr->pcidev.domain, obj->attr->pcidev.bus,
5483 obj->attr->pcidev.dev, obj->attr->pcidev.func);
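/* Scan the device's sysfs directory for OS devices of each known class: network interfaces, OpenFabrics (e.g. InfiniBand), DMA engines, DRM devices, and block devices. */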
5485 res += hwloc_linux_lookup_net_class(backend, obj, pcidevpath);
5486 res += hwloc_linux_lookup_openfabrics_class(backend, obj, pcidevpath);
5487 res += hwloc_linux_lookup_dma_class(backend, obj, pcidevpath);
5488 res += hwloc_linux_lookup_drm_class(backend, obj, pcidevpath);
5489 res += hwloc_linux_lookup_block_class(backend, obj, pcidevpath);
5491 if (data->mic_need_directlookup == -1) {
5492 struct stat st;
5493 if (hwloc_stat("/sys/class/mic/mic0", &st, data->root_fd) == 0
5494 && hwloc_stat("/sys/class/mic/mic0/device/mic/mic0", &st, data->root_fd) == -1)
5495 /* hwloc_linux_lookup_mic_class will fail because pcidev sysfs directories
5496 * do not have mic/mic%u symlinks to mic devices (old mic driver).
5497 * if so, try from the mic class.
5498 */
5499 data->mic_need_directlookup = 1;
5500 else
5501 data->mic_need_directlookup = 0;
5502 }
5503 if (data->mic_need_directlookup)
5504 res += hwloc_linux_directlookup_mic_class(backend, obj);
5505 else
5506 res += hwloc_linux_lookup_mic_class(backend, obj, pcidevpath);
5508 return res;
5509 }
5511 /*
5512 * backend callback for retrieving the location of a pci device
5513 */
5514 static int
5515 hwloc_linux_backend_get_obj_cpuset(struct hwloc_backend *backend,
5516 struct hwloc_backend *caller __hwloc_attribute_unused,
5517 struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
5518 {
5519 struct hwloc_linux_backend_data_s *data = backend->private_data;
5520 char path[256];
5522 /* this callback is only used in the libpci backend for now */
5523 assert(obj->type == HWLOC_OBJ_PCI_DEVICE
5524 || (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI));
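/* local_cpus exposes the device's locality as a hexadecimal kernel cpumask (comma-separated 32-bit words), e.g. "00000000,0000ffff" for PUs 0-15; parse it and report success only if the resulting set is non-empty. */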
5526 snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
5527 obj->attr->pcidev.domain, obj->attr->pcidev.bus,
5528 obj->attr->pcidev.dev, obj->attr->pcidev.func);
5529 if (!hwloc__read_path_as_cpumask(path, cpuset, data->root_fd)
5530 && !hwloc_bitmap_iszero(cpuset))
5531 return 0;
5532 return -1;
5533 }
5537 /*******************************
5538 ******* Linux component *******
5539 *******************************/
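/* Backend destructor: close the filesystem root descriptor, free the recorded root path, release the udev context if any, and free the private data. */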
5541 static void
5542 hwloc_linux_backend_disable(struct hwloc_backend *backend)
5543 {
5544 struct hwloc_linux_backend_data_s *data = backend->private_data;
5545 #ifdef HAVE_OPENAT
5546 if (data->root_path)
5547 free(data->root_path);
5548 close(data->root_fd);
5549 #endif
5550 #ifdef HWLOC_HAVE_LIBUDEV
5551 if (data->udev)
5552 udev_unref(data->udev);
5553 #endif
5554 free(data);
5555 }
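/* Instantiate the Linux backend: open the filesystem root passed in _data1 (defaulting to "/"), mark it close-on-exec, and register the discovery, cpuset and notification callbacks along with default private state. */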
5557 static struct hwloc_backend *
5558 hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
5559 const void *_data1,
5560 const void *_data2 __hwloc_attribute_unused,
5561 const void *_data3 __hwloc_attribute_unused)
5562 {
5563 struct hwloc_backend *backend;
5564 struct hwloc_linux_backend_data_s *data;
5565 const char * fsroot_path = _data1;
5566 int flags, root = -1;
5568 backend = hwloc_backend_alloc(component);
5569 if (!backend)
5570 goto out;
5572 data = malloc(sizeof(*data));
5573 if (!data) {
5574 errno = ENOMEM;
5575 goto out_with_backend;
5576 }
5578 backend->private_data = data;
5579 backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
5580 backend->discover = hwloc_look_linuxfs;
5581 backend->get_obj_cpuset = hwloc_linux_backend_get_obj_cpuset;
5582 backend->notify_new_object = hwloc_linux_backend_notify_new_object;
5583 backend->disable = hwloc_linux_backend_disable;
5585 /* default values */
5586 data->arch = HWLOC_LINUX_ARCH_UNKNOWN;
5587 data->is_knl = 0;
5588 data->is_amd_with_CU = 0;
5589 data->is_real_fsroot = 1;
5590 data->root_path = NULL;
5591 if (!fsroot_path)
5592 fsroot_path = "/";
5594 #ifdef HAVE_OPENAT
5595 root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
5596 if (root < 0)
5597 goto out_with_data;
5599 if (strcmp(fsroot_path, "/")) {
5600 backend->is_thissystem = 0;
5601 data->is_real_fsroot = 0;
5602 data->root_path = strdup(fsroot_path);
5603 }
5605 /* Since this fd stays open after hwloc returns, mark it as
5606 close-on-exec so that children don't inherit it. Stevens says
5607 that we should GETFD before we SETFD, so we do. */
5608 flags = fcntl(root, F_GETFD, 0);
5609 if (-1 == flags ||
5610 -1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
5611 close(root);
5612 root = -1;
5613 goto out_with_data;
5614 }
5615 #else
5616 if (strcmp(fsroot_path, "/")) {
5617 errno = ENOSYS;
5618 goto out_with_data;
5619 }
5620 #endif
5621 data->root_fd = root;
5623 #ifdef HWLOC_HAVE_LIBUDEV
5624 data->udev = NULL;
5625 if (data->is_real_fsroot) {
5626 data->udev = udev_new();
5627 }
5628 #endif
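/* Directory where dumped hardware data (as written by the hwloc-dump-hwdata tool) is looked up; the HWLOC_DUMPED_HWDATA_DIR environment variable takes precedence over the built-in default. */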
5630 data->dumped_hwdata_dirname = getenv("HWLOC_DUMPED_HWDATA_DIR");
5631 if (!data->dumped_hwdata_dirname) {
5632 if (_data1)
5633 data->dumped_hwdata_dirname = (char *) "/var/run/hwloc";
5634 else
5635 data->dumped_hwdata_dirname = (char *) RUNSTATEDIR "/hwloc";
5636 }
5638 data->deprecated_classlinks_model = -2; /* never tried */
5639 data->mic_need_directlookup = -1; /* not initialized */
5640 data->mic_directlookup_id_max = -1; /* not initialized */
5642 return backend;
5644 out_with_data:
5645 #ifdef HAVE_OPENAT
5646 if (data->root_path)
5647 free(data->root_path);
5648 #endif
5649 free(data);
5650 out_with_backend:
5651 free(backend);
5652 out:
5653 return NULL;
5654 }
5656 static struct hwloc_disc_component hwloc_linux_disc_component = {
5657 HWLOC_DISC_COMPONENT_TYPE_CPU,
5658 "linux",
5659 HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
5660 hwloc_linux_component_instantiate,
5662 NULL
5663 };
5665 const struct hwloc_component hwloc_linux_component = {
5666 HWLOC_COMPONENT_ABI,
5667 NULL, NULL,
5668 HWLOC_COMPONENT_TYPE_DISC,
5670 &hwloc_linux_disc_component
5671 };
5676 #ifdef HWLOC_HAVE_LINUXPCI
5678 /***********************************
5679 ******* Linux PCI component *******
5680 ***********************************/
5682 #define HWLOC_PCI_REVISION_ID 0x08
5683 #define HWLOC_PCI_CAP_ID_EXP 0x10
5684 #define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000
5686 static int
5687 hwloc_look_linuxfs_pci(struct hwloc_backend *backend)
5688 {
5689 struct hwloc_topology *topology = backend->topology;
5690 struct hwloc_backend *tmpbackend;
5691 hwloc_obj_t first_obj = NULL, last_obj = NULL;
5692 int root_fd = -1;
5693 DIR *dir;
5694 struct dirent *dirent;
5695 int res = 0;
5697 if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
5698 return 0;
5700 if (hwloc_get_next_pcidev(topology, NULL)) {
5701 hwloc_debug("%s", "PCI objects already added, ignoring linuxpci backend.\n");
5702 return 0;
5703 }
5705 /* hackily find the linux backend to steal its fsroot */
5706 tmpbackend = topology->backends;
5707 while (tmpbackend) {
5708 if (tmpbackend->component == &hwloc_linux_disc_component) {
5709 root_fd = ((struct hwloc_linux_backend_data_s *) tmpbackend->private_data)->root_fd;
5710 hwloc_debug("linuxpci backend stole linux backend root_fd %d\n", root_fd);
5711 break; }
5712 tmpbackend = tmpbackend->next;
5713 }
5714 /* take our own descriptor, either pointing to linux fsroot, or to / if not found */
5715 if (root_fd >= 0)
5716 root_fd = dup(root_fd);
5717 else
5718 root_fd = open("/", O_RDONLY | O_DIRECTORY);
5720 dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
5721 if (!dir)
5722 goto out_with_rootfd;
5724 while ((dirent = readdir(dir)) != NULL) {
5725 unsigned domain, bus, dev, func;
5726 hwloc_obj_t obj;
5727 struct hwloc_pcidev_attr_s *attr;
5728 unsigned os_index;
5729 char path[64];
5730 char value[16];
5731 size_t ret;
5732 int fd, err;
5734 if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
5735 continue;
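/* Encode the PCI address into the object's OS index: domain<<20 | bus<<12 | dev<<4 | func. */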
5737 os_index = (domain << 20) + (bus << 12) + (dev << 4) + func;
5738 obj = hwloc_alloc_setup_object(HWLOC_OBJ_PCI_DEVICE, os_index);
5739 if (!obj)
5740 break;
5741 attr = &obj->attr->pcidev;
5743 attr->domain = domain;
5744 attr->bus = bus;
5745 attr->dev = dev;
5746 attr->func = func;
5748 /* default (unknown) values */
5749 attr->vendor_id = 0;
5750 attr->device_id = 0;
5751 attr->class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
5752 attr->revision = 0;
5753 attr->subvendor_id = 0;
5754 attr->subdevice_id = 0;
5755 attr->linkspeed = 0;
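/* Read the vendor, device, class and subsystem IDs from the per-device sysfs attribute files; each file contains a single hexadecimal value (the class value is shifted right to drop the programming-interface byte). */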
5757 err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/vendor", dirent->d_name);
5758 if ((size_t) err < sizeof(path)
5759 && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5760 attr->vendor_id = strtoul(value, NULL, 16);
5762 err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/device", dirent->d_name);
5763 if ((size_t) err < sizeof(path)
5764 && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5765 attr->device_id = strtoul(value, NULL, 16);
5767 err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/class", dirent->d_name);
5768 if ((size_t) err < sizeof(path)
5769 && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5770 attr->class_id = strtoul(value, NULL, 16) >> 8;
5772 err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_vendor", dirent->d_name);
5773 if ((size_t) err < sizeof(path)
5774 && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5775 attr->subvendor_id = strtoul(value, NULL, 16);
5777 err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/subsystem_device", dirent->d_name);
5778 if ((size_t) err < sizeof(path)
5779 && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5780 attr->subdevice_id = strtoul(value, NULL, 16);
5782 err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
5783 if ((size_t) err < sizeof(path)) {
5784 /* don't use hwloc_read_path_by_length() because we don't want the ending \0 */
5785 fd = hwloc_open(path, root_fd);
5786 if (fd >= 0) {
5787 #define CONFIG_SPACE_CACHESIZE 256
5788 unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
5789 unsigned offset;
5791 /* initialize the config space in case we fail to read it (missing permissions, etc). */
5792 memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
5793 ret = read(fd, config_space_cache, CONFIG_SPACE_CACHESIZE);
5794 (void) ret; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
5795 close(fd);
5797 /* is this a bridge? */
5798 if (hwloc_pci_prepare_bridge(obj, config_space_cache) < 0)
5799 continue;
5801 /* get the revision */
5802 attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];
5804 /* try to get the link speed */
5805 offset = hwloc_pci_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
5806 if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE) {
5807 hwloc_pci_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
5808 } else {
5809 /* if not available from config-space (extended part is root-only), look in sysfs files added in 4.13 */
5810 float speed = 0.f;
5811 unsigned width = 0;
5812 err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/current_link_speed", dirent->d_name);
5813 if ((size_t) err < sizeof(path)
5814 && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5815 speed = hwloc_linux_pci_link_speed_from_string(value);
5816 err = snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/current_link_width", dirent->d_name);
5817 if ((size_t) err < sizeof(path)
5818 && !hwloc_read_path_by_length(path, value, sizeof(value), root_fd))
5819 width = atoi(value);
5820 attr->linkspeed = speed*width/8;
5821 }
5822 }
5823 }
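/* Queue the new PCI object on a temporary sibling list; the whole list is handed to hwloc_insert_pci_device_list() once the directory scan is complete. */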
5825 if (first_obj)
5826 last_obj->next_sibling = obj;
5827 else
5828 first_obj = obj;
5829 last_obj = obj;
5830 }
5832 closedir(dir);
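/* Annotate PCI objects with their physical slot: each /sys/bus/pci/slots/<name>/address file gives the domain:bus:device of the slot, which is matched against the devices found above. */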
5834 dir = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
5835 if (dir) {
5836 while ((dirent = readdir(dir)) != NULL) {
5837 char path[64];
5838 char buf[64];
5839 unsigned domain, bus, dev;
5840 int err;
5841 if (dirent->d_name[0] == '.')
5842 continue;
5843 err = snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", dirent->d_name);
5844 if ((size_t) err < sizeof(path)
5845 && !hwloc_read_path_by_length(path, buf, sizeof(buf), root_fd)
5846 && sscanf(buf, "%x:%x:%x", &domain, &bus, &dev) == 3) {
5847 hwloc_obj_t obj = first_obj;
5848 while (obj) {
5849 if (obj->attr->pcidev.domain == domain
5850 && obj->attr->pcidev.bus == bus
5851 && obj->attr->pcidev.dev == dev) {
5852 hwloc_obj_add_info(obj, "PCISlot", dirent->d_name);
5854 obj = obj->next_sibling;
5858 closedir(dir);
5861 res = hwloc_insert_pci_device_list(backend, first_obj);
5863 out_with_rootfd:
5864 close(root_fd);
5865 return res;
5866 }
5868 static struct hwloc_backend *
5869 hwloc_linuxpci_component_instantiate(struct hwloc_disc_component *component,
5870 const void *_data1 __hwloc_attribute_unused,
5871 const void *_data2 __hwloc_attribute_unused,
5872 const void *_data3 __hwloc_attribute_unused)
5873 {
5874 struct hwloc_backend *backend;
5876 /* thissystem may not be fully initialized yet, we'll check flags in discover() */
5878 backend = hwloc_backend_alloc(component);
5879 if (!backend)
5880 return NULL;
5881 backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
5882 backend->discover = hwloc_look_linuxfs_pci;
5883 return backend;
5884 }
5886 static struct hwloc_disc_component hwloc_linuxpci_disc_component = {
5887 HWLOC_DISC_COMPONENT_TYPE_MISC,
5888 "linuxpci",
5889 HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
5890 hwloc_linuxpci_component_instantiate,
5891 19, /* after pci */
5892 NULL
5893 };
5895 const struct hwloc_component hwloc_linuxpci_component = {
5896 HWLOC_COMPONENT_ABI,
5897 NULL, NULL,
5898 HWLOC_COMPONENT_TYPE_DISC,
5900 &hwloc_linuxpci_disc_component
5901 };
5903 #endif /* HWLOC_HAVE_LINUXPCI */