LLVM OpenMP* Runtime Library
kmp_affinity.h
1/*
2 * kmp_affinity.h -- header for affinity management
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef KMP_AFFINITY_H
14#define KMP_AFFINITY_H
15
16#include "kmp.h"
17#include "kmp_os.h"
18#include <limits>
19
20#if KMP_AFFINITY_SUPPORTED
21#if KMP_USE_HWLOC
22class KMPHwlocAffinity : public KMPAffinity {
23public:
24 class Mask : public KMPAffinity::Mask {
25 hwloc_cpuset_t mask;
26
27 public:
28 Mask() {
29 mask = hwloc_bitmap_alloc();
30 this->zero();
31 }
32 ~Mask() { hwloc_bitmap_free(mask); }
33 void set(int i) override { hwloc_bitmap_set(mask, i); }
34 bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
35 void clear(int i) override { hwloc_bitmap_clr(mask, i); }
36 void zero() override { hwloc_bitmap_zero(mask); }
37 bool empty() const override { return hwloc_bitmap_iszero(mask); }
38 void copy(const KMPAffinity::Mask *src) override {
39 const Mask *convert = static_cast<const Mask *>(src);
40 hwloc_bitmap_copy(mask, convert->mask);
41 }
42 void bitwise_and(const KMPAffinity::Mask *rhs) override {
43 const Mask *convert = static_cast<const Mask *>(rhs);
44 hwloc_bitmap_and(mask, mask, convert->mask);
45 }
46 void bitwise_or(const KMPAffinity::Mask *rhs) override {
47 const Mask *convert = static_cast<const Mask *>(rhs);
48 hwloc_bitmap_or(mask, mask, convert->mask);
49 }
50 void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
51 bool is_equal(const KMPAffinity::Mask *rhs) const override {
52 const Mask *convert = static_cast<const Mask *>(rhs);
53 return hwloc_bitmap_isequal(mask, convert->mask);
54 }
55 int begin() const override { return hwloc_bitmap_first(mask); }
56 int end() const override { return -1; }
57 int next(int previous) const override {
58 return hwloc_bitmap_next(mask, previous);
59 }
60 int get_system_affinity(bool abort_on_error) override {
61 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
62 "Illegal get affinity operation when not capable");
63 long retval =
64 hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
65 if (retval >= 0) {
66 return 0;
67 }
68 int error = errno;
69 if (abort_on_error) {
70 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
71 KMP_ERR(error), __kmp_msg_null);
72 }
73 return error;
74 }
75 int set_system_affinity(bool abort_on_error) const override {
76 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
77 "Illegal set affinity operation when not capable");
78 long retval =
79 hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
80 if (retval >= 0) {
81 return 0;
82 }
83 int error = errno;
84 if (abort_on_error) {
85 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
86 KMP_ERR(error), __kmp_msg_null);
87 }
88 return error;
89 }
90#if KMP_OS_WINDOWS
91 int set_process_affinity(bool abort_on_error) const override {
92 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
93 "Illegal set process affinity operation when not capable");
94 int error = 0;
95 const hwloc_topology_support *support =
96 hwloc_topology_get_support(__kmp_hwloc_topology);
97 if (support->cpubind->set_proc_cpubind) {
98 int retval;
99 retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
100 HWLOC_CPUBIND_PROCESS);
101 if (retval >= 0)
102 return 0;
103 error = errno;
104 if (abort_on_error)
105 __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
106 KMP_ERR(error), __kmp_msg_null);
107 }
108 return error;
109 }
110#endif
111 int get_proc_group() const override {
112 int group = -1;
113#if KMP_OS_WINDOWS
114 if (__kmp_num_proc_groups == 1) {
115 return 1;
116 }
117 for (int i = 0; i < __kmp_num_proc_groups; i++) {
118 // On Windows, the long type is always 32 bits
119 unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
120 unsigned long second_32_bits =
121 hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
122 if (first_32_bits == 0 && second_32_bits == 0) {
123 continue;
124 }
125 if (group >= 0) {
126 return -1;
127 }
128 group = i;
129 }
130#endif /* KMP_OS_WINDOWS */
131 return group;
132 }
133 };
134 void determine_capable(const char *var) override {
135 const hwloc_topology_support *topology_support;
136 if (__kmp_hwloc_topology == NULL) {
137 if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
138 __kmp_hwloc_error = TRUE;
139 if (__kmp_affinity.flags.verbose) {
140 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
141 }
142 }
143 if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
144 __kmp_hwloc_error = TRUE;
145 if (__kmp_affinity.flags.verbose) {
146 KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
147 }
148 }
149 }
150 topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
151 // Is the system capable of setting/getting this thread's affinity?
152 // Also, is topology discovery possible? (pu indicates ability to discover
153 // processing units). And finally, were there no errors when calling any
154 // hwloc_* API functions?
155 if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
156 topology_support->cpubind->get_thisthread_cpubind &&
157 topology_support->discovery->pu && !__kmp_hwloc_error) {
158 // enables affinity according to KMP_AFFINITY_CAPABLE() macro
159 KMP_AFFINITY_ENABLE(TRUE);
160 } else {
161 // indicate that hwloc didn't work and disable affinity
162 __kmp_hwloc_error = TRUE;
163 KMP_AFFINITY_DISABLE();
164 }
165 }
166 void bind_thread(int which) override {
167 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
168 "Illegal set affinity operation when not capable");
169 KMPAffinity::Mask *mask;
170 KMP_CPU_ALLOC_ON_STACK(mask);
171 KMP_CPU_ZERO(mask);
172 KMP_CPU_SET(which, mask);
173 __kmp_set_system_affinity(mask, TRUE);
174 KMP_CPU_FREE_FROM_STACK(mask);
175 }
176 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
177 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
178 KMPAffinity::Mask *allocate_mask_array(int num) override {
179 return new Mask[num];
180 }
181 void deallocate_mask_array(KMPAffinity::Mask *array) override {
182 Mask *hwloc_array = static_cast<Mask *>(array);
183 delete[] hwloc_array;
184 }
185 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
186 int index) override {
187 Mask *hwloc_array = static_cast<Mask *>(array);
188 return &(hwloc_array[index]);
189 }
190 api_type get_api_type() const override { return HWLOC; }
191};
192#endif /* KMP_USE_HWLOC */
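// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition, not part of the upstream header):
// a minimal example of driving the abstract KMPAffinity interface above to
// pin the calling thread to a single logical CPU. It assumes affinity has
// already been determined capable and that `which` is a valid OS processor
// id; the function name is hypothetical. Guarded out so it never affects the
// build.
#if 0
static void example_pin_current_thread(KMPAffinity *api, int which) {
  KMPAffinity::Mask *m = api->allocate_mask(); // concrete Mask for this API
  m->zero();                                   // start from an empty set
  m->set(which);                               // select one logical CPU
  m->set_system_affinity(/*abort_on_error=*/false); // bind the calling thread
  api->deallocate_mask(m);
}
#endif
// ---------------------------------------------------------------------------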
193
194#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
195 KMP_OS_AIX
196#if KMP_OS_LINUX
197/* On some of the older OSes that we build on, these constants aren't present
198 in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
199 all systems of the same arch where they are defined, and they cannot change.
200 They are set in stone forever. */
201#include <sys/syscall.h>
202#if KMP_ARCH_X86 || KMP_ARCH_ARM
203#ifndef __NR_sched_setaffinity
204#define __NR_sched_setaffinity 241
205#elif __NR_sched_setaffinity != 241
206#error Wrong code for setaffinity system call.
207#endif /* __NR_sched_setaffinity */
208#ifndef __NR_sched_getaffinity
209#define __NR_sched_getaffinity 242
210#elif __NR_sched_getaffinity != 242
211#error Wrong code for getaffinity system call.
212#endif /* __NR_sched_getaffinity */
213#elif KMP_ARCH_AARCH64
214#ifndef __NR_sched_setaffinity
215#define __NR_sched_setaffinity 122
216#elif __NR_sched_setaffinity != 122
217#error Wrong code for setaffinity system call.
218#endif /* __NR_sched_setaffinity */
219#ifndef __NR_sched_getaffinity
220#define __NR_sched_getaffinity 123
221#elif __NR_sched_getaffinity != 123
222#error Wrong code for getaffinity system call.
223#endif /* __NR_sched_getaffinity */
235#elif KMP_ARCH_X86_64
236#ifndef __NR_sched_setaffinity
237#define __NR_sched_setaffinity 203
238#elif __NR_sched_setaffinity != 203
239#error Wrong code for setaffinity system call.
240#endif /* __NR_sched_setaffinity */
241#ifndef __NR_sched_getaffinity
242#define __NR_sched_getaffinity 204
243#elif __NR_sched_getaffinity != 204
244#error Wrong code for getaffinity system call.
245#endif /* __NR_sched_getaffinity */
246#elif KMP_ARCH_PPC64
247#ifndef __NR_sched_setaffinity
248#define __NR_sched_setaffinity 222
249#elif __NR_sched_setaffinity != 222
250#error Wrong code for setaffinity system call.
251#endif /* __NR_sched_setaffinity */
252#ifndef __NR_sched_getaffinity
253#define __NR_sched_getaffinity 223
254#elif __NR_sched_getaffinity != 223
255#error Wrong code for getaffinity system call.
256#endif /* __NR_sched_getaffinity */
257#elif KMP_ARCH_MIPS
258#ifndef __NR_sched_setaffinity
259#define __NR_sched_setaffinity 4239
260#elif __NR_sched_setaffinity != 4239
261#error Wrong code for setaffinity system call.
262#endif /* __NR_sched_setaffinity */
263#ifndef __NR_sched_getaffinity
264#define __NR_sched_getaffinity 4240
265#elif __NR_sched_getaffinity != 4240
266#error Wrong code for getaffinity system call.
267#endif /* __NR_sched_getaffinity */
268#elif KMP_ARCH_MIPS64
269#ifndef __NR_sched_setaffinity
270#define __NR_sched_setaffinity 5195
271#elif __NR_sched_setaffinity != 5195
272#error Wrong code for setaffinity system call.
273#endif /* __NR_sched_setaffinity */
274#ifndef __NR_sched_getaffinity
275#define __NR_sched_getaffinity 5196
276#elif __NR_sched_getaffinity != 5196
277#error Wrong code for getaffinity system call.
278#endif /* __NR_sched_getaffinity */
279#elif KMP_ARCH_LOONGARCH64
280#ifndef __NR_sched_setaffinity
281#define __NR_sched_setaffinity 122
282#elif __NR_sched_setaffinity != 122
283#error Wrong code for setaffinity system call.
284#endif /* __NR_sched_setaffinity */
285#ifndef __NR_sched_getaffinity
286#define __NR_sched_getaffinity 123
287#elif __NR_sched_getaffinity != 123
288#error Wrong code for getaffinity system call.
289#endif /* __NR_sched_getaffinity */
290#elif KMP_ARCH_RISCV64
291#ifndef __NR_sched_setaffinity
292#define __NR_sched_setaffinity 122
293#elif __NR_sched_setaffinity != 122
294#error Wrong code for setaffinity system call.
295#endif /* __NR_sched_setaffinity */
296#ifndef __NR_sched_getaffinity
297#define __NR_sched_getaffinity 123
298#elif __NR_sched_getaffinity != 123
299#error Wrong code for getaffinity system call.
300#endif /* __NR_sched_getaffinity */
301#elif KMP_ARCH_VE
302#ifndef __NR_sched_setaffinity
303#define __NR_sched_setaffinity 203
304#elif __NR_sched_setaffinity != 203
305#error Wrong code for setaffinity system call.
306#endif /* __NR_sched_setaffinity */
307#ifndef __NR_sched_getaffinity
308#define __NR_sched_getaffinity 204
309#elif __NR_sched_getaffinity != 204
310#error Wrong code for getaffinity system call.
311#endif /* __NR_sched_getaffinity */
312#elif KMP_ARCH_S390X
313#ifndef __NR_sched_setaffinity
314#define __NR_sched_setaffinity 239
315#elif __NR_sched_setaffinity != 239
316#error Wrong code for setaffinity system call.
317#endif /* __NR_sched_setaffinity */
318#ifndef __NR_sched_getaffinity
319#define __NR_sched_getaffinity 240
320#elif __NR_sched_getaffinity != 240
321#error Wrong code for getaffinity system call.
322#endif /* __NR_sched_getaffinity */
323#else
324#error Unknown or unsupported architecture
325#endif /* KMP_ARCH_* */
326#elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
327#include <pthread.h>
328#include <pthread_np.h>
329#elif KMP_OS_NETBSD
330#include <pthread.h>
331#include <sched.h>
332#elif KMP_OS_AIX
333#include <sys/dr.h>
334#include <sys/rset.h>
335#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
336#endif
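// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition, not part of the upstream header):
// on Linux the __NR_sched_*affinity numbers defined above are consumed via a
// raw syscall(2) rather than the libc wrappers, mirroring the calls made in
// KMPNativeAffinity::Mask below. The function name is hypothetical and the
// block is guarded out of the build.
#if 0
#include <unistd.h> // syscall()
static long example_raw_getaffinity(void *buf, size_t size) {
  // pid 0 means "the calling thread"; returns the kernel mask size or -1.
  return syscall(__NR_sched_getaffinity, 0, size, buf);
}
#endif
// ---------------------------------------------------------------------------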
337class KMPNativeAffinity : public KMPAffinity {
338 class Mask : public KMPAffinity::Mask {
339 typedef unsigned long mask_t;
340 typedef decltype(__kmp_affin_mask_size) mask_size_type;
341 static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
342 static const mask_t ONE = 1;
343 mask_size_type get_num_mask_types() const {
344 return __kmp_affin_mask_size / sizeof(mask_t);
345 }
346
347 public:
348 mask_t *mask;
349 Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
350 ~Mask() {
351 if (mask)
352 __kmp_free(mask);
353 }
354 void set(int i) override {
355 mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
356 }
357 bool is_set(int i) const override {
358 return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
359 }
360 void clear(int i) override {
361 mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
362 }
363 void zero() override {
364 mask_size_type e = get_num_mask_types();
365 for (mask_size_type i = 0; i < e; ++i)
366 mask[i] = (mask_t)0;
367 }
368 bool empty() const override {
369 mask_size_type e = get_num_mask_types();
370 for (mask_size_type i = 0; i < e; ++i)
371 if (mask[i] != (mask_t)0)
372 return false;
373 return true;
374 }
375 void copy(const KMPAffinity::Mask *src) override {
376 const Mask *convert = static_cast<const Mask *>(src);
377 mask_size_type e = get_num_mask_types();
378 for (mask_size_type i = 0; i < e; ++i)
379 mask[i] = convert->mask[i];
380 }
381 void bitwise_and(const KMPAffinity::Mask *rhs) override {
382 const Mask *convert = static_cast<const Mask *>(rhs);
383 mask_size_type e = get_num_mask_types();
384 for (mask_size_type i = 0; i < e; ++i)
385 mask[i] &= convert->mask[i];
386 }
387 void bitwise_or(const KMPAffinity::Mask *rhs) override {
388 const Mask *convert = static_cast<const Mask *>(rhs);
389 mask_size_type e = get_num_mask_types();
390 for (mask_size_type i = 0; i < e; ++i)
391 mask[i] |= convert->mask[i];
392 }
393 void bitwise_not() override {
394 mask_size_type e = get_num_mask_types();
395 for (mask_size_type i = 0; i < e; ++i)
396 mask[i] = ~(mask[i]);
397 }
398 bool is_equal(const KMPAffinity::Mask *rhs) const override {
399 const Mask *convert = static_cast<const Mask *>(rhs);
400 mask_size_type e = get_num_mask_types();
401 for (mask_size_type i = 0; i < e; ++i)
402 if (mask[i] != convert->mask[i])
403 return false;
404 return true;
405 }
406 int begin() const override {
407 int retval = 0;
408 while (retval < end() && !is_set(retval))
409 ++retval;
410 return retval;
411 }
412 int end() const override {
413 int e;
414 __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
415 return e;
416 }
417 int next(int previous) const override {
418 int retval = previous + 1;
419 while (retval < end() && !is_set(retval))
420 ++retval;
421 return retval;
422 }
423#if KMP_OS_AIX
424 // On AIX, there is no way to query which CPU(s) a thread is bound to.
425 // This routine is only used to get the full mask.
426 int get_system_affinity(bool abort_on_error) override {
427 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
428 "Illegal get affinity operation when not capable");
429
430 (void)abort_on_error;
431
432 // Set the mask with all CPUs that are available.
433 for (int i = 0; i < __kmp_xproc; ++i)
434 KMP_CPU_SET(i, this);
435 return 0;
436 }
437 int set_system_affinity(bool abort_on_error) const override {
438 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
440 "Illegal set affinity operation when not capable");
441
442 int location;
443 int gtid = __kmp_entry_gtid();
444 int tid = thread_self();
445
446 // Unbind the thread if it was bound to any processors before so that
447 // we can bind it only to the CPUs specified by the mask.
448 int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
449
450 // On AIX, the bindprocessor() system call binds a thread to a single
451 // CPU at a time rather than to a set of CPUs.
452 KMP_CPU_SET_ITERATE(location, this) {
453 if (KMP_CPU_ISSET(location, this)) {
454 retval = bindprocessor(BINDTHREAD, tid, location);
455 if (retval == -1 && errno == 1) {
456 rsid_t rsid;
457 rsethandle_t rsh;
458 // Put something in rsh to prevent compiler warning
459 // about uninitialized use
460 rsh = rs_alloc(RS_EMPTY);
461 rsid.at_pid = getpid();
462 if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
463 retval = ra_detachrset(R_PROCESS, rsid, 0);
464 retval = bindprocessor(BINDTHREAD, tid, location);
465 }
466 }
467 if (retval == 0) {
468 KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
469 "T#%d to cpu=%d.\n",
470 gtid, location));
471 continue;
472 }
473 int error = errno;
474 if (abort_on_error) {
475 __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
476 KMP_ERR(error), __kmp_msg_null);
477 KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
478 "T#%d to cpu=%d, errno=%d.\n",
479 gtid, location, error));
480 return error;
481 }
482 }
483 }
484 return 0;
485 }
486#else // !KMP_OS_AIX
487 int get_system_affinity(bool abort_on_error) override {
488 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
489 "Illegal get affinity operation when not capable");
490#if KMP_OS_LINUX
491 long retval =
492 syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
493#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
494 int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
495 reinterpret_cast<cpuset_t *>(mask));
496 int retval = (r == 0 ? 0 : -1);
497#endif
498 if (retval >= 0) {
499 return 0;
500 }
501 int error = errno;
502 if (abort_on_error) {
503 __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
504 KMP_ERR(error), __kmp_msg_null);
505 }
506 return error;
507 }
508 int set_system_affinity(bool abort_on_error) const override {
509 KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
510 "Illegal set affinity operation when not capable");
511#if KMP_OS_LINUX
512 long retval =
513 syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
514#elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
515 int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
516 reinterpret_cast<cpuset_t *>(mask));
517 int retval = (r == 0 ? 0 : -1);
518#endif
519 if (retval >= 0) {
520 return 0;
521 }
522 int error = errno;
523 if (abort_on_error) {
524 __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
525 KMP_ERR(error), __kmp_msg_null);
526 }
527 return error;
528 }
529#endif // KMP_OS_AIX
530 };
531 void determine_capable(const char *env_var) override {
532 __kmp_affinity_determine_capable(env_var);
533 }
534 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
535 KMPAffinity::Mask *allocate_mask() override {
536 KMPNativeAffinity::Mask *retval = new Mask();
537 return retval;
538 }
539 void deallocate_mask(KMPAffinity::Mask *m) override {
540 KMPNativeAffinity::Mask *native_mask =
541 static_cast<KMPNativeAffinity::Mask *>(m);
542 delete native_mask;
543 }
544 KMPAffinity::Mask *allocate_mask_array(int num) override {
545 return new Mask[num];
546 }
547 void deallocate_mask_array(KMPAffinity::Mask *array) override {
548 Mask *linux_array = static_cast<Mask *>(array);
549 delete[] linux_array;
550 }
551 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
552 int index) override {
553 Mask *linux_array = static_cast<Mask *>(array);
554 return &(linux_array[index]);
555 }
556 api_type get_api_type() const override { return NATIVE_OS; }
557};
558#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
559 || KMP_OS_AIX */
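// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition, not part of the upstream header):
// the native Mask above stores the affinity set as an array of unsigned-long
// words, so logical CPU i maps to word i / BITS_PER_MASK_T and bit
// i % BITS_PER_MASK_T. A stand-alone restatement of that indexing, guarded
// out of the build:
#if 0
#include <limits.h> // CHAR_BIT
static bool example_mask_bit_is_set(const unsigned long *words, int i) {
  const unsigned int bits = sizeof(unsigned long) * CHAR_BIT;
  return (words[i / bits] >> (i % bits)) & 1UL;
}
#endif
// ---------------------------------------------------------------------------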
560
561#if KMP_OS_WINDOWS
562class KMPNativeAffinity : public KMPAffinity {
563 class Mask : public KMPAffinity::Mask {
564 typedef ULONG_PTR mask_t;
565 static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
566 mask_t *mask;
567
568 public:
569 Mask() {
570 mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
571 }
572 ~Mask() {
573 if (mask)
574 __kmp_free(mask);
575 }
576 void set(int i) override {
577 mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
578 }
579 bool is_set(int i) const override {
580 return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
581 }
582 void clear(int i) override {
583 mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
584 }
585 void zero() override {
586 for (int i = 0; i < __kmp_num_proc_groups; ++i)
587 mask[i] = 0;
588 }
589 bool empty() const override {
590 for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
591 if (mask[i])
592 return false;
593 return true;
594 }
595 void copy(const KMPAffinity::Mask *src) override {
596 const Mask *convert = static_cast<const Mask *>(src);
597 for (int i = 0; i < __kmp_num_proc_groups; ++i)
598 mask[i] = convert->mask[i];
599 }
600 void bitwise_and(const KMPAffinity::Mask *rhs) override {
601 const Mask *convert = static_cast<const Mask *>(rhs);
602 for (int i = 0; i < __kmp_num_proc_groups; ++i)
603 mask[i] &= convert->mask[i];
604 }
605 void bitwise_or(const KMPAffinity::Mask *rhs) override {
606 const Mask *convert = static_cast<const Mask *>(rhs);
607 for (int i = 0; i < __kmp_num_proc_groups; ++i)
608 mask[i] |= convert->mask[i];
609 }
610 void bitwise_not() override {
611 for (int i = 0; i < __kmp_num_proc_groups; ++i)
612 mask[i] = ~(mask[i]);
613 }
614 bool is_equal(const KMPAffinity::Mask *rhs) const override {
615 const Mask *convert = static_cast<const Mask *>(rhs);
616 for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
617 if (mask[i] != convert->mask[i])
618 return false;
619 return true;
620 }
621 int begin() const override {
622 int retval = 0;
623 while (retval < end() && !is_set(retval))
624 ++retval;
625 return retval;
626 }
627 int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
628 int next(int previous) const override {
629 int retval = previous + 1;
630 while (retval < end() && !is_set(retval))
631 ++retval;
632 return retval;
633 }
634 int set_process_affinity(bool abort_on_error) const override {
635 if (__kmp_num_proc_groups <= 1) {
636 if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
637 DWORD error = GetLastError();
638 if (abort_on_error) {
639 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
640 __kmp_msg_null);
641 }
642 return error;
643 }
644 }
645 return 0;
646 }
647 int set_system_affinity(bool abort_on_error) const override {
648 if (__kmp_num_proc_groups > 1) {
649 // Check for a valid mask.
650 GROUP_AFFINITY ga;
651 int group = get_proc_group();
652 if (group < 0) {
653 if (abort_on_error) {
654 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
655 }
656 return -1;
657 }
658 // Transform the bit vector into a GROUP_AFFINITY struct
659 // and make the system call to set affinity.
660 ga.Group = group;
661 ga.Mask = mask[group];
662 ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
663
664 KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
665 if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
666 DWORD error = GetLastError();
667 if (abort_on_error) {
668 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
669 __kmp_msg_null);
670 }
671 return error;
672 }
673 } else {
674 if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
675 DWORD error = GetLastError();
676 if (abort_on_error) {
677 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
678 __kmp_msg_null);
679 }
680 return error;
681 }
682 }
683 return 0;
684 }
685 int get_system_affinity(bool abort_on_error) override {
686 if (__kmp_num_proc_groups > 1) {
687 this->zero();
688 GROUP_AFFINITY ga;
689 KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
690 if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
691 DWORD error = GetLastError();
692 if (abort_on_error) {
693 __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
694 KMP_ERR(error), __kmp_msg_null);
695 }
696 return error;
697 }
698 if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
699 (ga.Mask == 0)) {
700 return -1;
701 }
702 mask[ga.Group] = ga.Mask;
703 } else {
704 mask_t newMask, sysMask, retval;
705 if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
706 DWORD error = GetLastError();
707 if (abort_on_error) {
708 __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
709 KMP_ERR(error), __kmp_msg_null);
710 }
711 return error;
712 }
713 retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
714 if (!retval) {
715 DWORD error = GetLastError();
716 if (abort_on_error) {
717 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
718 KMP_ERR(error), __kmp_msg_null);
719 }
720 return error;
721 }
722 newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
723 if (!newMask) {
724 DWORD error = GetLastError();
725 if (abort_on_error) {
726 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
727 KMP_ERR(error), __kmp_msg_null);
728 }
729 }
730 *mask = retval;
731 }
732 return 0;
733 }
734 int get_proc_group() const override {
735 int group = -1;
736 if (__kmp_num_proc_groups == 1) {
737 return 1;
738 }
739 for (int i = 0; i < __kmp_num_proc_groups; i++) {
740 if (mask[i] == 0)
741 continue;
742 if (group >= 0)
743 return -1;
744 group = i;
745 }
746 return group;
747 }
748 };
749 void determine_capable(const char *env_var) override {
750 __kmp_affinity_determine_capable(env_var);
751 }
752 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
753 KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
754 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
755 KMPAffinity::Mask *allocate_mask_array(int num) override {
756 return new Mask[num];
757 }
758 void deallocate_mask_array(KMPAffinity::Mask *array) override {
759 Mask *windows_array = static_cast<Mask *>(array);
760 delete[] windows_array;
761 }
762 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
763 int index) override {
764 Mask *windows_array = static_cast<Mask *>(array);
765 return &(windows_array[index]);
766 }
767 api_type get_api_type() const override { return NATIVE_OS; }
768};
769#endif /* KMP_OS_WINDOWS */
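// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition, not part of the upstream header):
// with more than 64 logical processors, Windows exposes them as processor
// groups of up to 64 CPUs each. The Windows Mask above keeps one ULONG_PTR
// word per group, so a flat CPU index splits into (group, bit-in-group) as
// sketched here for a 64-bit build; the function name is hypothetical and the
// block is not compiled.
#if 0
static void example_split_flat_cpu(int flat_cpu, int *group, int *bit) {
  const int bits_per_group = 64; // sizeof(ULONG_PTR) * CHAR_BIT on 64-bit
  *group = flat_cpu / bits_per_group;
  *bit = flat_cpu % bits_per_group;
}
#endif
// ---------------------------------------------------------------------------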
770#endif /* KMP_AFFINITY_SUPPORTED */
771
772// Describe an attribute for a level in the machine topology
773struct kmp_hw_attr_t {
774 int core_type : 8;
775 int core_eff : 8;
776 unsigned valid : 1;
777 unsigned reserved : 15;
778
779 static const int UNKNOWN_CORE_EFF = -1;
780
781 kmp_hw_attr_t()
782 : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
783 valid(0), reserved(0) {}
784 void set_core_type(kmp_hw_core_type_t type) {
785 valid = 1;
786 core_type = type;
787 }
788 void set_core_eff(int eff) {
789 valid = 1;
790 core_eff = eff;
791 }
792 kmp_hw_core_type_t get_core_type() const {
793 return (kmp_hw_core_type_t)core_type;
794 }
795 int get_core_eff() const { return core_eff; }
796 bool is_core_type_valid() const {
797 return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
798 }
799 bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
800 operator bool() const { return valid; }
801 void clear() {
802 core_type = KMP_HW_CORE_TYPE_UNKNOWN;
803 core_eff = UNKNOWN_CORE_EFF;
804 valid = 0;
805 }
806 bool contains(const kmp_hw_attr_t &other) const {
807 if (!valid && !other.valid)
808 return true;
809 if (valid && other.valid) {
810 if (other.is_core_type_valid()) {
811 if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
812 return false;
813 }
814 if (other.is_core_eff_valid()) {
815 if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
816 return false;
817 }
818 return true;
819 }
820 return false;
821 }
822#if KMP_AFFINITY_SUPPORTED
823 bool contains(const kmp_affinity_attrs_t &attr) const {
824 if (!valid && !attr.valid)
825 return true;
826 if (valid && attr.valid) {
827 if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
828 return (is_core_type_valid() &&
829 (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
830 if (attr.core_eff != UNKNOWN_CORE_EFF)
831 return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
832 return true;
833 }
834 return false;
835 }
836#endif // KMP_AFFINITY_SUPPORTED
837 bool operator==(const kmp_hw_attr_t &rhs) const {
838 return (rhs.valid == valid && rhs.core_eff == core_eff &&
839 rhs.core_type == core_type);
840 }
841 bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
842};
843
844#if KMP_AFFINITY_SUPPORTED
845KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
846#endif
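// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition, not part of the upstream header):
// building a kmp_hw_attr_t that selects a particular core type and testing
// whether a hardware thread's attributes satisfy it. KMP_HW_CORE_TYPE_CORE is
// the x86-only "performance core" enumerator from kmp.h, so treat that choice
// as an assumption; the block is not compiled.
#if 0
static bool example_matches_perf_core(const kmp_hw_attr_t &hw_thread_attr) {
  kmp_hw_attr_t want;
  want.set_core_type(KMP_HW_CORE_TYPE_CORE); // x86-only enumerator
  // contains() succeeds only if every constraint in 'want' is satisfied.
  return hw_thread_attr.contains(want);
}
#endif
// ---------------------------------------------------------------------------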
847
848class kmp_hw_thread_t {
849public:
850 static const int UNKNOWN_ID = -1;
851 static const int MULTIPLE_ID = -2;
852 static int compare_ids(const void *a, const void *b);
853 static int compare_compact(const void *a, const void *b);
854 int ids[KMP_HW_LAST];
855 int sub_ids[KMP_HW_LAST];
856 bool leader;
857 int os_id;
858 kmp_hw_attr_t attrs;
859
860 void print() const;
861 void clear() {
862 for (int i = 0; i < (int)KMP_HW_LAST; ++i)
863 ids[i] = UNKNOWN_ID;
864 leader = false;
865 attrs.clear();
866 }
867};
868
869class kmp_topology_t {
870
871 struct flags_t {
872 int uniform : 1;
873 int reserved : 31;
874 };
875
876 int depth;
877
878 // The following arrays are all 'depth' long and have been allocated
879 // to hold up to KMP_HW_LAST objects if needed, so layers can be
880 // added without reallocating any array.
881
882 // Ordered array of the types in the topology
883 kmp_hw_t *types;
884
885 // Keep quick topology ratios; for non-uniform topologies,
886 // each entry holds the max number of itemAs per itemB,
887 // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
888 int *ratio;
889
890 // Storage containing the absolute number of each topology layer
891 int *count;
892
893 // The number of core efficiencies. This is only useful for hybrid
894 // topologies. Core efficiencies will range from 0 to num efficiencies - 1
895 int num_core_efficiencies;
896 int num_core_types;
897 kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
898
899 // The hardware threads array
900 // hw_threads is num_hw_threads long
901 // Each hw_thread's ids and sub_ids are depth deep
902 int num_hw_threads;
903 kmp_hw_thread_t *hw_threads;
904
905 // Equivalence hash where the key is the hardware topology item
906 // and the value is the equivalent hardware topology type in the
907 // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
908 // known equivalence for the topology type
909 kmp_hw_t equivalent[KMP_HW_LAST];
910
911 // Flags describing the topology
912 flags_t flags;
913
914 // Compact value used during sort_compact()
915 int compact;
916
917 // Insert a new topology layer after allocation
918 void _insert_layer(kmp_hw_t type, const int *ids);
919
920#if KMP_GROUP_AFFINITY
921 // Insert topology information about Windows Processor groups
922 void _insert_windows_proc_groups();
923#endif
924
925 // Count each item & get the num x's per y
926 // e.g., get the number of cores and the number of threads per core
927 // for each (x, y) in (KMP_HW_* , KMP_HW_*)
928 void _gather_enumeration_information();
929
930 // Remove layers that don't add information to the topology.
931 // This is done by having the layer take on the id = UNKNOWN_ID (-1)
932 void _remove_radix1_layers();
933
934 // Find out if the topology is uniform
935 void _discover_uniformity();
936
937 // Set all the sub_ids for each hardware thread
938 void _set_sub_ids();
939
940 // Set global affinity variables describing the number of threads per
941 // core, the number of packages, the number of cores per package, and
942 // the number of cores.
943 void _set_globals();
944
945 // Set the last level cache equivalent type
946 void _set_last_level_cache();
947
948 // Return the number of cores with a particular attribute, 'attr'.
949 // If 'find_all' is true, then find all cores on the machine, otherwise find
950 // all cores per the layer 'above'
951 int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
952 bool find_all = false) const;
953
954public:
955 // Force use of allocate()/deallocate()
956 kmp_topology_t() = delete;
957 kmp_topology_t(const kmp_topology_t &t) = delete;
958 kmp_topology_t(kmp_topology_t &&t) = delete;
959 kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
960 kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
961
962 static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
963 static void deallocate(kmp_topology_t *);
964
965 // Functions used in create_map() routines
966 kmp_hw_thread_t &at(int index) {
967 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
968 return hw_threads[index];
969 }
970 const kmp_hw_thread_t &at(int index) const {
971 KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
972 return hw_threads[index];
973 }
974 int get_num_hw_threads() const { return num_hw_threads; }
975 void sort_ids() {
976 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
977 kmp_hw_thread_t::compare_ids);
978 }
979 // Check if the hardware ids are unique; return true if they are,
980 // false otherwise
981 bool check_ids() const;
982
983 // Function to call after the create_map() routine
984 void canonicalize();
985 void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
986
987// Functions used after canonicalize() is called
988
989#if KMP_AFFINITY_SUPPORTED
990 // Set the granularity for affinity settings
991 void set_granularity(kmp_affinity_t &stgs) const;
992 bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
993 bool restrict_to_mask(const kmp_affin_mask_t *mask);
994 bool filter_hw_subset();
995#endif
996 bool is_uniform() const { return flags.uniform; }
997 // Return the equivalent type for 'type' in the topology;
998 // returns KMP_HW_UNKNOWN when there is no equivalent type
999 kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
1000 if (type == KMP_HW_UNKNOWN)
1001 return KMP_HW_UNKNOWN;
1002 return equivalent[type];
1003 }
1004 // Set type1 = type2
1005 void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
1006 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
1007 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
1008 kmp_hw_t real_type2 = equivalent[type2];
1009 if (real_type2 == KMP_HW_UNKNOWN)
1010 real_type2 = type2;
1011 equivalent[type1] = real_type2;
1012 // This loop is required since any of the types may have been set to
1013 // be equivalent to type1. They all must be checked and reset to type2.
1014 KMP_FOREACH_HW_TYPE(type) {
1015 if (equivalent[type] == type1) {
1016 equivalent[type] = real_type2;
1017 }
1018 }
1019 }
1020 // Calculate number of types corresponding to level1
1021 // per types corresponding to level2 (e.g., number of threads per core)
1022 int calculate_ratio(int level1, int level2) const {
1023 KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1024 KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1025 int r = 1;
1026 for (int level = level1; level > level2; --level)
1027 r *= ratio[level];
1028 return r;
1029 }
1030 int get_ratio(int level) const {
1031 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1032 return ratio[level];
1033 }
1034 int get_depth() const { return depth; }
1035 kmp_hw_t get_type(int level) const {
1036 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1037 return types[level];
1038 }
1039 int get_level(kmp_hw_t type) const {
1040 KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1041 int eq_type = equivalent[type];
1042 if (eq_type == KMP_HW_UNKNOWN)
1043 return -1;
1044 for (int i = 0; i < depth; ++i)
1045 if (types[i] == eq_type)
1046 return i;
1047 return -1;
1048 }
1049 int get_count(int level) const {
1050 KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1051 return count[level];
1052 }
1053 // Return the total number of cores with attribute 'attr'
1054 int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1055 return _get_ncores_with_attr(attr, -1, true);
1056 }
1057 // Return the number of cores with attribute
1058 // 'attr' per topology level 'above'
1059 int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1060 return _get_ncores_with_attr(attr, above, false);
1061 }
1062
1063#if KMP_AFFINITY_SUPPORTED
1064 friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1065 void sort_compact(kmp_affinity_t &affinity) {
1066 compact = affinity.compact;
1067 qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1068 kmp_hw_thread_t::compare_compact);
1069 }
1070#endif
1071 void print(const char *env_var = "KMP_AFFINITY") const;
1072 void dump() const;
1073};
1074extern kmp_topology_t *__kmp_topology;
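// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition, not part of the upstream header):
// querying a canonicalized topology for the number of hardware threads per
// core with the accessors defined above. Levels are indexed from outermost
// (0) to innermost (depth - 1); the function name is hypothetical and the
// block is not compiled.
#if 0
static int example_threads_per_core(const kmp_topology_t *top) {
  int thread_level = top->get_level(KMP_HW_THREAD);
  int core_level = top->get_level(KMP_HW_CORE);
  if (thread_level < 0 || core_level < 0)
    return 1; // the layer is not represented in this topology
  return top->calculate_ratio(thread_level, core_level);
}
#endif
// ---------------------------------------------------------------------------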
1075
1076class kmp_hw_subset_t {
1077 const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1078
1079public:
1080 // Describe a machine topology item in KMP_HW_SUBSET
1081 struct item_t {
1082 kmp_hw_t type;
1083 int num_attrs;
1084 int num[MAX_ATTRS];
1085 int offset[MAX_ATTRS];
1086 kmp_hw_attr_t attr[MAX_ATTRS];
1087 };
1088 // Put parentheses around max to avoid accidental use of Windows max macro.
1089 const static int USE_ALL = (std::numeric_limits<int>::max)();
1090
1091private:
1092 int depth;
1093 int capacity;
1094 item_t *items;
1095 kmp_uint64 set;
1096 bool absolute;
1097 // The set must be able to handle up to KMP_HW_LAST number of layers
1098 KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1099 // Comparator for sorting the KMP_HW_SUBSET items into topology order.
1100 // All unknown topology types sort to the beginning of the subset.
1101 static int hw_subset_compare(const void *i1, const void *i2) {
1102 kmp_hw_t type1 = ((const item_t *)i1)->type;
1103 kmp_hw_t type2 = ((const item_t *)i2)->type;
1104 int level1 = __kmp_topology->get_level(type1);
1105 int level2 = __kmp_topology->get_level(type2);
1106 return level1 - level2;
1107 }
1108
1109public:
1110 // Force use of allocate()/deallocate()
1111 kmp_hw_subset_t() = delete;
1112 kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1113 kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1114 kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1115 kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1116
1117 static kmp_hw_subset_t *allocate() {
1118 int initial_capacity = 5;
1119 kmp_hw_subset_t *retval =
1120 (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1121 retval->depth = 0;
1122 retval->capacity = initial_capacity;
1123 retval->set = 0ull;
1124 retval->absolute = false;
1125 retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1126 return retval;
1127 }
1128 static void deallocate(kmp_hw_subset_t *subset) {
1129 __kmp_free(subset->items);
1130 __kmp_free(subset);
1131 }
1132 void set_absolute() { absolute = true; }
1133 bool is_absolute() const { return absolute; }
1134 void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1135 for (int i = 0; i < depth; ++i) {
1136 // Found an existing item for this layer type
1137 // Add the num, offset, and attr to this item
1138 if (items[i].type == type) {
1139 int idx = items[i].num_attrs++;
1140 if ((size_t)idx >= MAX_ATTRS)
1141 return;
1142 items[i].num[idx] = num;
1143 items[i].offset[idx] = offset;
1144 items[i].attr[idx] = attr;
1145 return;
1146 }
1147 }
1148 if (depth == capacity - 1) {
1149 capacity *= 2;
1150 item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1151 for (int i = 0; i < depth; ++i)
1152 new_items[i] = items[i];
1153 __kmp_free(items);
1154 items = new_items;
1155 }
1156 items[depth].num_attrs = 1;
1157 items[depth].type = type;
1158 items[depth].num[0] = num;
1159 items[depth].offset[0] = offset;
1160 items[depth].attr[0] = attr;
1161 depth++;
1162 set |= (1ull << type);
1163 }
1164 int get_depth() const { return depth; }
1165 const item_t &at(int index) const {
1166 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1167 return items[index];
1168 }
1169 item_t &at(int index) {
1170 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1171 return items[index];
1172 }
1173 void remove(int index) {
1174 KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1175 set &= ~(1ull << items[index].type);
1176 for (int j = index + 1; j < depth; ++j) {
1177 items[j - 1] = items[j];
1178 }
1179 depth--;
1180 }
1181 void sort() {
1182 KMP_DEBUG_ASSERT(__kmp_topology);
1183 qsort(items, depth, sizeof(item_t), hw_subset_compare);
1184 }
1185 bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1186
1187 // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
1188 // This means putting each of {sockets, cores, threads} in the subset if
1189 // they are not specified:
1190 // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
1191 // e.g., 3module => *s,3module,*c,*t
1192 // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
1193 // are expecting the traditional sockets/cores/threads topology. For newer
1194 // hardware, there can be intervening layers like dies/tiles/modules
1195 // (usually corresponding to a cache level). So when a user asks for
1196 // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
1197 // should get 12 hardware threads across 6 cores and effectively ignore the
1198 // module layer.
1199 void canonicalize(const kmp_topology_t *top) {
1200 // Layers to target for KMP_HW_SUBSET canonicalization
1201 kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1202
1203 // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
1204 if (is_absolute())
1205 return;
1206
1207 // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
1208 // topology doesn't have these layers
1209 for (kmp_hw_t type : targeted)
1210 if (top->get_level(type) == KMP_HW_UNKNOWN)
1211 return;
1212
1213 // Put targeted layers in the subset if they do not exist
1214 for (kmp_hw_t type : targeted) {
1215 bool found = false;
1216 for (int i = 0; i < get_depth(); ++i) {
1217 if (top->get_equivalent_type(items[i].type) == type) {
1218 found = true;
1219 break;
1220 }
1221 }
1222 if (!found) {
1223 push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
1224 }
1225 }
1226 sort();
1227 // Mark as an absolute subset that targets only the layers above
1228 set_absolute();
1229 }
1230 void dump() const {
1231 printf("**********************\n");
1232 printf("*** kmp_hw_subset: ***\n");
1233 printf("* depth: %d\n", depth);
1234 printf("* items:\n");
1235 for (int i = 0; i < depth; ++i) {
1236 printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1237 for (int j = 0; j < items[i].num_attrs; ++j) {
1238 printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1239 items[i].offset[j]);
1240 if (!items[i].attr[j]) {
1241 printf(" (none)\n");
1242 } else {
1243 printf(
1244 " core_type = %s, core_eff = %d\n",
1245 __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1246 items[i].attr[j].get_core_eff());
1247 }
1248 }
1249 }
1250 printf("* set: 0x%llx\n", set);
1251 printf("* absolute: %d\n", absolute);
1252 printf("**********************\n");
1253 }
1254};
1255extern kmp_hw_subset_t *__kmp_hw_subset;
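// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition, not part of the upstream header):
// the runtime parses a string such as KMP_HW_SUBSET=1s,2c into push_back()
// calls roughly like the ones below and later canonicalizes the result
// against the detected topology. The function name is hypothetical and the
// block is not compiled.
#if 0
static kmp_hw_subset_t *example_build_1s_2c_subset() {
  kmp_hw_subset_t *s = kmp_hw_subset_t::allocate();
  s->push_back(1, KMP_HW_SOCKET, /*offset=*/0, kmp_hw_attr_t{});
  s->push_back(2, KMP_HW_CORE, /*offset=*/0, kmp_hw_attr_t{});
  // The caller eventually releases it with kmp_hw_subset_t::deallocate(s).
  return s;
}
#endif
// ---------------------------------------------------------------------------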
1256
1257/* A structure for holding machine-specific hierarchy info to be computed once
1258 at init. This structure represents a mapping of threads to the actual machine
1259 hierarchy, or to our best guess at what the hierarchy might be, for the
1260 purpose of performing an efficient barrier. In the worst case, when there is
1261 no machine hierarchy information, it produces a tree suitable for a barrier,
1262 similar to the tree used in the hyper barrier. */
1263class hierarchy_info {
1264public:
1265 /* Good default values for number of leaves and branching factor, given no
1266 affinity information. Behaves a bit like hyper barrier. */
1267 static const kmp_uint32 maxLeaves = 4;
1268 static const kmp_uint32 minBranch = 4;
1274 kmp_uint32 maxLevels;
1275
1280 kmp_uint32 depth;
1281 kmp_uint32 base_num_threads;
1282 enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1283 volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1284 // 2=initialization in progress
1285 volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1286
1291 kmp_uint32 *numPerLevel;
1292 kmp_uint32 *skipPerLevel;
1293
1294 void deriveLevels() {
1295 int hier_depth = __kmp_topology->get_depth();
1296 for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1297 numPerLevel[level] = __kmp_topology->get_ratio(i);
1298 }
1299 }
1300
1301 hierarchy_info()
1302 : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1303
1304 void fini() {
1305 if (!uninitialized && numPerLevel) {
1306 __kmp_free(numPerLevel);
1307 numPerLevel = NULL;
1308 uninitialized = not_initialized;
1309 }
1310 }
1311
1312 void init(int num_addrs) {
1313 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1314 &uninitialized, not_initialized, initializing);
1315 if (bool_result == 0) { // Wait for initialization
1316 while (TCR_1(uninitialized) != initialized)
1317 KMP_CPU_PAUSE();
1318 return;
1319 }
1320 KMP_DEBUG_ASSERT(bool_result == 1);
1321
1322 /* Added explicit initialization of the data fields here to prevent use of
1323 dirty values observed when the static library is re-initialized multiple
1324 times (e.g., when a non-OpenMP thread repeatedly launches/joins a thread
1325 that uses OpenMP). */
1326 depth = 1;
1327 resizing = 0;
1328 maxLevels = 7;
1329 numPerLevel =
1330 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1331 skipPerLevel = &(numPerLevel[maxLevels]);
1332 for (kmp_uint32 i = 0; i < maxLevels;
1333 ++i) { // init numPerLevel[*] to 1 item per level
1334 numPerLevel[i] = 1;
1335 skipPerLevel[i] = 1;
1336 }
1337
1338 // Derive the hierarchy from the machine topology if one was detected
1339 if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1340 deriveLevels();
1341 } else {
1342 numPerLevel[0] = maxLeaves;
1343 numPerLevel[1] = num_addrs / maxLeaves;
1344 if (num_addrs % maxLeaves)
1345 numPerLevel[1]++;
1346 }
1347
1348 base_num_threads = num_addrs;
1349 for (int i = maxLevels - 1; i >= 0;
1350 --i) // count non-empty levels to get depth
1351 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1352 depth++;
1353
1354 kmp_uint32 branch = minBranch;
1355 if (numPerLevel[0] == 1)
1356 branch = num_addrs / maxLeaves;
1357 if (branch < minBranch)
1358 branch = minBranch;
1359 for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1360 while (numPerLevel[d] > branch ||
1361 (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1362 if (numPerLevel[d] & 1)
1363 numPerLevel[d]++;
1364 numPerLevel[d] = numPerLevel[d] >> 1;
1365 if (numPerLevel[d + 1] == 1)
1366 depth++;
1367 numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1368 }
1369 if (numPerLevel[0] == 1) {
1370 branch = branch >> 1;
1371 if (branch < 4)
1372 branch = minBranch;
1373 }
1374 }
1375
1376 for (kmp_uint32 i = 1; i < depth; ++i)
1377 skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1378 // Fill in hierarchy in the case of oversubscription
1379 for (kmp_uint32 i = depth; i < maxLevels; ++i)
1380 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1381
1382 uninitialized = initialized; // One writer
1383 }
1384
1385 // Resize the hierarchy if nproc changes to something larger than before
1386 void resize(kmp_uint32 nproc) {
1387 kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1388 while (bool_result == 0) { // someone else is trying to resize
1389 KMP_CPU_PAUSE();
1390 if (nproc <= base_num_threads) // happy with other thread's resize
1391 return;
1392 else // try to resize
1393 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1394 }
1395 KMP_DEBUG_ASSERT(bool_result != 0);
1396 if (nproc <= base_num_threads)
1397 return; // happy with other thread's resize
1398
1399 // Calculate new maxLevels
1400 kmp_uint32 old_sz = skipPerLevel[depth - 1];
1401 kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1402 // First see if old maxLevels is enough to contain new size
1403 for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1404 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1405 numPerLevel[i - 1] *= 2;
1406 old_sz *= 2;
1407 depth++;
1408 }
1409 if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1410 while (nproc > old_sz) {
1411 old_sz *= 2;
1412 incs++;
1413 depth++;
1414 }
1415 maxLevels += incs;
1416
1417 // Resize arrays
1418 kmp_uint32 *old_numPerLevel = numPerLevel;
1419 kmp_uint32 *old_skipPerLevel = skipPerLevel;
1420 numPerLevel = skipPerLevel = NULL;
1421 numPerLevel =
1422 (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1423 skipPerLevel = &(numPerLevel[maxLevels]);
1424
1425 // Copy old elements from old arrays
1426 for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1427 // copy numPerLevel[*] and skipPerLevel[*] from the old arrays
1428 numPerLevel[i] = old_numPerLevel[i];
1429 skipPerLevel[i] = old_skipPerLevel[i];
1430 }
1431
1432 // Init new elements in arrays to 1
1433 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1434 // init numPerLevel[*] to 1 item per level
1435 numPerLevel[i] = 1;
1436 skipPerLevel[i] = 1;
1437 }
1438
1439 // Free old arrays
1440 __kmp_free(old_numPerLevel);
1441 }
1442
1443 // Fill in oversubscription levels of hierarchy
1444 for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1445 skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1446
1447 base_num_threads = nproc;
1448 resizing = 0; // One writer
1449 }
1450};
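// ---------------------------------------------------------------------------
// Illustrative sketch (editorial addition, not part of the upstream header):
// for a machine described as 4 packages x 4 cores/package x 2 threads/core,
// deriveLevels() fills numPerLevel (leaves first) as {2, 4, 4, 1, ...} and
// init() then derives skipPerLevel as the running product {1, 2, 8, 32, ...},
// i.e. how many leaves a node at each level spans. A stand-alone restatement
// of that recurrence, not compiled into the build:
#if 0
static void example_fill_skip_per_level(const kmp_uint32 *numPerLevel,
                                        kmp_uint32 *skipPerLevel,
                                        kmp_uint32 depth) {
  skipPerLevel[0] = 1; // a leaf spans exactly itself
  for (kmp_uint32 i = 1; i < depth; ++i)
    skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
}
#endif
// ---------------------------------------------------------------------------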
1451#endif // KMP_AFFINITY_H