LLVM OpenMP* Runtime Library
kmp_affinity.h
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_USE_HWLOC
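// KMPAffinity implementation layered on top of the hwloc library. The global
// topology handle __kmp_hwloc_topology is initialized (if needed) in
// determine_capable() and is used for all cpubind queries below.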
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24  class Mask : public KMPAffinity::Mask {
25  hwloc_cpuset_t mask;
26 
27  public:
28  Mask() {
29  mask = hwloc_bitmap_alloc();
30  this->zero();
31  }
32  Mask(const Mask &other) = delete;
33  Mask &operator=(const Mask &other) = delete;
34  ~Mask() { hwloc_bitmap_free(mask); }
35  void set(int i) override { hwloc_bitmap_set(mask, i); }
36  bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
37  void clear(int i) override { hwloc_bitmap_clr(mask, i); }
38  void zero() override { hwloc_bitmap_zero(mask); }
39  bool empty() const override { return hwloc_bitmap_iszero(mask); }
40  void copy(const KMPAffinity::Mask *src) override {
41  const Mask *convert = static_cast<const Mask *>(src);
42  hwloc_bitmap_copy(mask, convert->mask);
43  }
44  void bitwise_and(const KMPAffinity::Mask *rhs) override {
45  const Mask *convert = static_cast<const Mask *>(rhs);
46  hwloc_bitmap_and(mask, mask, convert->mask);
47  }
48  void bitwise_or(const KMPAffinity::Mask *rhs) override {
49  const Mask *convert = static_cast<const Mask *>(rhs);
50  hwloc_bitmap_or(mask, mask, convert->mask);
51  }
52  void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
53  bool is_equal(const KMPAffinity::Mask *rhs) const override {
54  const Mask *convert = static_cast<const Mask *>(rhs);
55  return hwloc_bitmap_isequal(mask, convert->mask);
56  }
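// Iteration protocol: begin() yields the first set bit and next() the
// following set bit; end() is the sentinel -1, which is what
// hwloc_bitmap_first()/hwloc_bitmap_next() return when no bit remains.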
57  int begin() const override { return hwloc_bitmap_first(mask); }
58  int end() const override { return -1; }
59  int next(int previous) const override {
60  return hwloc_bitmap_next(mask, previous);
61  }
62  int get_system_affinity(bool abort_on_error) override {
63  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
64  "Illegal get affinity operation when not capable");
65  long retval =
66  hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
67  if (retval >= 0) {
68  return 0;
69  }
70  int error = errno;
71  if (abort_on_error) {
72  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
73  KMP_ERR(error), __kmp_msg_null);
74  }
75  return error;
76  }
77  int set_system_affinity(bool abort_on_error) const override {
78  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
79  "Illegal set affinity operation when not capable");
80  long retval =
81  hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
82  if (retval >= 0) {
83  return 0;
84  }
85  int error = errno;
86  if (abort_on_error) {
87  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
88  KMP_ERR(error), __kmp_msg_null);
89  }
90  return error;
91  }
92 #if KMP_OS_WINDOWS
93  int set_process_affinity(bool abort_on_error) const override {
94  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
95  "Illegal set process affinity operation when not capable");
96  int error = 0;
97  const hwloc_topology_support *support =
98  hwloc_topology_get_support(__kmp_hwloc_topology);
99  if (support->cpubind->set_proc_cpubind) {
100  int retval;
101  retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
102  HWLOC_CPUBIND_PROCESS);
103  if (retval >= 0)
104  return 0;
105  error = errno;
106  if (abort_on_error)
107  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
108  KMP_ERR(error), __kmp_msg_null);
109  }
110  return error;
111  }
112 #endif
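// Return the Windows processor group covered by this mask: 1 if the system
// has only one group, otherwise the index of the single group with bits set,
// or -1 if the mask spans several groups (or none). On builds without
// processor-group support this always returns -1.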
113  int get_proc_group() const override {
114  int group = -1;
115 #if KMP_OS_WINDOWS
116  if (__kmp_num_proc_groups == 1) {
117  return 1;
118  }
119  for (int i = 0; i < __kmp_num_proc_groups; i++) {
120  // On Windows, the long type is always 32 bits
121  unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
122  unsigned long second_32_bits =
123  hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
124  if (first_32_bits == 0 && second_32_bits == 0) {
125  continue;
126  }
127  if (group >= 0) {
128  return -1;
129  }
130  group = i;
131  }
132 #endif /* KMP_OS_WINDOWS */
133  return group;
134  }
135  };
136  void determine_capable(const char *var) override {
137  const hwloc_topology_support *topology_support;
138  if (__kmp_hwloc_topology == NULL) {
139  if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
140  __kmp_hwloc_error = TRUE;
141  if (__kmp_affinity.flags.verbose) {
142  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
143  }
144  }
145  if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
146  __kmp_hwloc_error = TRUE;
147  if (__kmp_affinity.flags.verbose) {
148  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
149  }
150  }
151  }
152  topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
153  // Is the system capable of setting/getting this thread's affinity?
154  // Also, is topology discovery possible? (pu indicates ability to discover
155  // processing units). And finally, were there no errors when calling any
156  // hwloc_* API functions?
157  if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
158  topology_support->cpubind->get_thisthread_cpubind &&
159  topology_support->discovery->pu && !__kmp_hwloc_error) {
160  // enables affinity according to KMP_AFFINITY_CAPABLE() macro
161  KMP_AFFINITY_ENABLE(TRUE);
162  } else {
163  // indicate that hwloc didn't work and disable affinity
164  __kmp_hwloc_error = TRUE;
165  KMP_AFFINITY_DISABLE();
166  }
167  }
168  void bind_thread(int which) override {
169  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
170  "Illegal set affinity operation when not capable");
171  KMPAffinity::Mask *mask;
172  KMP_CPU_ALLOC_ON_STACK(mask);
173  KMP_CPU_ZERO(mask);
174  KMP_CPU_SET(which, mask);
175  __kmp_set_system_affinity(mask, TRUE);
176  KMP_CPU_FREE_FROM_STACK(mask);
177  }
178  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
179  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
180  KMPAffinity::Mask *allocate_mask_array(int num) override {
181  return new Mask[num];
182  }
183  void deallocate_mask_array(KMPAffinity::Mask *array) override {
184  Mask *hwloc_array = static_cast<Mask *>(array);
185  delete[] hwloc_array;
186  }
187  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
188  int index) override {
189  Mask *hwloc_array = static_cast<Mask *>(array);
190  return &(hwloc_array[index]);
191  }
192  api_type get_api_type() const override { return HWLOC; }
193 };
194 #endif /* KMP_USE_HWLOC */
195 
196 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
197  KMP_OS_AIX
198 #if KMP_OS_LINUX
199 /* On some of the older OSes that we build on, these constants aren't present
200  in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
201  all systems of the same arch where they are defined, and they cannot change;
202  they are set in stone forever. */
203 #include <sys/syscall.h>
204 #if KMP_ARCH_X86 || KMP_ARCH_ARM
205 #ifndef __NR_sched_setaffinity
206 #define __NR_sched_setaffinity 241
207 #elif __NR_sched_setaffinity != 241
208 #error Wrong code for setaffinity system call.
209 #endif /* __NR_sched_setaffinity */
210 #ifndef __NR_sched_getaffinity
211 #define __NR_sched_getaffinity 242
212 #elif __NR_sched_getaffinity != 242
213 #error Wrong code for getaffinity system call.
214 #endif /* __NR_sched_getaffinity */
215 #elif KMP_ARCH_AARCH64
216 #ifndef __NR_sched_setaffinity
217 #define __NR_sched_setaffinity 122
218 #elif __NR_sched_setaffinity != 122
219 #error Wrong code for setaffinity system call.
220 #endif /* __NR_sched_setaffinity */
221 #ifndef __NR_sched_getaffinity
222 #define __NR_sched_getaffinity 123
223 #elif __NR_sched_getaffinity != 123
224 #error Wrong code for getaffinity system call.
225 #endif /* __NR_sched_getaffinity */
226 #elif KMP_ARCH_RISCV64
227 #ifndef __NR_sched_setaffinity
228 #define __NR_sched_setaffinity 122
229 #elif __NR_sched_setaffinity != 122
230 #error Wrong code for setaffinity system call.
231 #endif /* __NR_sched_setaffinity */
232 #ifndef __NR_sched_getaffinity
233 #define __NR_sched_getaffinity 123
234 #elif __NR_sched_getaffinity != 123
235 #error Wrong code for getaffinity system call.
236 #endif /* __NR_sched_getaffinity */
237 #elif KMP_ARCH_X86_64
238 #ifndef __NR_sched_setaffinity
239 #define __NR_sched_setaffinity 203
240 #elif __NR_sched_setaffinity != 203
241 #error Wrong code for setaffinity system call.
242 #endif /* __NR_sched_setaffinity */
243 #ifndef __NR_sched_getaffinity
244 #define __NR_sched_getaffinity 204
245 #elif __NR_sched_getaffinity != 204
246 #error Wrong code for getaffinity system call.
247 #endif /* __NR_sched_getaffinity */
248 #elif KMP_ARCH_PPC64
249 #ifndef __NR_sched_setaffinity
250 #define __NR_sched_setaffinity 222
251 #elif __NR_sched_setaffinity != 222
252 #error Wrong code for setaffinity system call.
253 #endif /* __NR_sched_setaffinity */
254 #ifndef __NR_sched_getaffinity
255 #define __NR_sched_getaffinity 223
256 #elif __NR_sched_getaffinity != 223
257 #error Wrong code for getaffinity system call.
258 #endif /* __NR_sched_getaffinity */
259 #elif KMP_ARCH_MIPS
260 #ifndef __NR_sched_setaffinity
261 #define __NR_sched_setaffinity 4239
262 #elif __NR_sched_setaffinity != 4239
263 #error Wrong code for setaffinity system call.
264 #endif /* __NR_sched_setaffinity */
265 #ifndef __NR_sched_getaffinity
266 #define __NR_sched_getaffinity 4240
267 #elif __NR_sched_getaffinity != 4240
268 #error Wrong code for getaffinity system call.
269 #endif /* __NR_sched_getaffinity */
270 #elif KMP_ARCH_MIPS64
271 #ifndef __NR_sched_setaffinity
272 #define __NR_sched_setaffinity 5195
273 #elif __NR_sched_setaffinity != 5195
274 #error Wrong code for setaffinity system call.
275 #endif /* __NR_sched_setaffinity */
276 #ifndef __NR_sched_getaffinity
277 #define __NR_sched_getaffinity 5196
278 #elif __NR_sched_getaffinity != 5196
279 #error Wrong code for getaffinity system call.
280 #endif /* __NR_sched_getaffinity */
281 #elif KMP_ARCH_LOONGARCH64
282 #ifndef __NR_sched_setaffinity
283 #define __NR_sched_setaffinity 122
284 #elif __NR_sched_setaffinity != 122
285 #error Wrong code for setaffinity system call.
286 #endif /* __NR_sched_setaffinity */
287 #ifndef __NR_sched_getaffinity
288 #define __NR_sched_getaffinity 123
289 #elif __NR_sched_getaffinity != 123
290 #error Wrong code for getaffinity system call.
291 #endif /* __NR_sched_getaffinity */
303 #elif KMP_ARCH_VE
304 #ifndef __NR_sched_setaffinity
305 #define __NR_sched_setaffinity 203
306 #elif __NR_sched_setaffinity != 203
307 #error Wrong code for setaffinity system call.
308 #endif /* __NR_sched_setaffinity */
309 #ifndef __NR_sched_getaffinity
310 #define __NR_sched_getaffinity 204
311 #elif __NR_sched_getaffinity != 204
312 #error Wrong code for getaffinity system call.
313 #endif /* __NR_sched_getaffinity */
314 #elif KMP_ARCH_S390X
315 #ifndef __NR_sched_setaffinity
316 #define __NR_sched_setaffinity 239
317 #elif __NR_sched_setaffinity != 239
318 #error Wrong code for setaffinity system call.
319 #endif /* __NR_sched_setaffinity */
320 #ifndef __NR_sched_getaffinity
321 #define __NR_sched_getaffinity 240
322 #elif __NR_sched_getaffinity != 240
323 #error Wrong code for getaffinity system call.
324 #endif /* __NR_sched_getaffinity */
325 #elif KMP_ARCH_SPARC
326 #ifndef __NR_sched_setaffinity
327 #define __NR_sched_setaffinity 261
328 #elif __NR_sched_setaffinity != 261
329 #error Wrong code for setaffinity system call.
330 #endif /* __NR_sched_setaffinity */
331 #ifndef __NR_sched_getaffinity
332 #define __NR_sched_getaffinity 260
333 #elif __NR_sched_getaffinity != 260
334 #error Wrong code for getaffinity system call.
335 #endif /* __NR_sched_getaffinity */
336 #else
337 #error Unknown or unsupported architecture
338 #endif /* KMP_ARCH_* */
339 #elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
340 #include <pthread.h>
341 #include <pthread_np.h>
342 #elif KMP_OS_NETBSD
343 #include <pthread.h>
344 #include <sched.h>
345 #elif KMP_OS_AIX
346 #include <sys/dr.h>
347 #include <sys/rset.h>
348 #define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
349 #define GET_NUMBER_SMT_SETS 0x0004
350 extern "C" int syssmt(int flags, int, int, int *);
351 #endif
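// KMPAffinity implementation backed by the native OS interfaces:
// sched_{get,set}affinity on Linux, pthread_{get,set}affinity_np on the BSDs,
// and bindprocessor()/resource-set APIs on AIX. The mask is stored as an
// array of unsigned long words totaling __kmp_affin_mask_size bytes.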
352 class KMPNativeAffinity : public KMPAffinity {
353  class Mask : public KMPAffinity::Mask {
354  typedef unsigned long mask_t;
355  typedef decltype(__kmp_affin_mask_size) mask_size_type;
356  static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
357  static const mask_t ONE = 1;
358  mask_size_type get_num_mask_types() const {
359  return __kmp_affin_mask_size / sizeof(mask_t);
360  }
361 
362  public:
363  mask_t *mask;
364  Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
365  ~Mask() {
366  if (mask)
367  __kmp_free(mask);
368  }
369  void set(int i) override {
370  mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
371  }
372  bool is_set(int i) const override {
373  return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
374  }
375  void clear(int i) override {
376  mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
377  }
378  void zero() override {
379  mask_size_type e = get_num_mask_types();
380  for (mask_size_type i = 0; i < e; ++i)
381  mask[i] = (mask_t)0;
382  }
383  bool empty() const override {
384  mask_size_type e = get_num_mask_types();
385  for (mask_size_type i = 0; i < e; ++i)
386  if (mask[i] != (mask_t)0)
387  return false;
388  return true;
389  }
390  void copy(const KMPAffinity::Mask *src) override {
391  const Mask *convert = static_cast<const Mask *>(src);
392  mask_size_type e = get_num_mask_types();
393  for (mask_size_type i = 0; i < e; ++i)
394  mask[i] = convert->mask[i];
395  }
396  void bitwise_and(const KMPAffinity::Mask *rhs) override {
397  const Mask *convert = static_cast<const Mask *>(rhs);
398  mask_size_type e = get_num_mask_types();
399  for (mask_size_type i = 0; i < e; ++i)
400  mask[i] &= convert->mask[i];
401  }
402  void bitwise_or(const KMPAffinity::Mask *rhs) override {
403  const Mask *convert = static_cast<const Mask *>(rhs);
404  mask_size_type e = get_num_mask_types();
405  for (mask_size_type i = 0; i < e; ++i)
406  mask[i] |= convert->mask[i];
407  }
408  void bitwise_not() override {
409  mask_size_type e = get_num_mask_types();
410  for (mask_size_type i = 0; i < e; ++i)
411  mask[i] = ~(mask[i]);
412  }
413  bool is_equal(const KMPAffinity::Mask *rhs) const override {
414  const Mask *convert = static_cast<const Mask *>(rhs);
415  mask_size_type e = get_num_mask_types();
416  for (mask_size_type i = 0; i < e; ++i)
417  if (mask[i] != convert->mask[i])
418  return false;
419  return true;
420  }
421  int begin() const override {
422  int retval = 0;
423  while (retval < end() && !is_set(retval))
424  ++retval;
425  return retval;
426  }
427  int end() const override {
428  int e;
429  __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
430  return e;
431  }
432  int next(int previous) const override {
433  int retval = previous + 1;
434  while (retval < end() && !is_set(retval))
435  ++retval;
436  return retval;
437  }
438 #if KMP_OS_AIX
439  // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
440  // This routine is only used to get the full mask.
441  int get_system_affinity(bool abort_on_error) override {
442  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
443  "Illegal get affinity operation when not capable");
444 
445  (void)abort_on_error;
446 
447  // Set the mask with all CPUs that are available.
448  for (int i = 0; i < __kmp_xproc; ++i)
449  KMP_CPU_SET(i, this);
450  return 0;
451  }
452  int set_system_affinity(bool abort_on_error) const override {
453  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
454  "Illegal set affinity operation when not capable");
455 
457  int location;
458  int gtid = __kmp_entry_gtid();
459  int tid = thread_self();
460 
461  // Unbind the thread if it was previously bound to any processors so that
462  // we can bind it only to the CPUs specified by the mask.
463  int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
464 
465  // On AIX, the bindprocessor() system call can only bind to a single CPU
466  // at a time, not to a set of CPUs.
467  KMP_CPU_SET_ITERATE(location, this) {
468  if (KMP_CPU_ISSET(location, this)) {
469  retval = bindprocessor(BINDTHREAD, tid, location);
470  if (retval == -1 && errno == 1) {
471  rsid_t rsid;
472  rsethandle_t rsh;
473  // Put something in rsh to prevent compiler warning
474  // about uninitialized use
475  rsh = rs_alloc(RS_EMPTY);
476  rsid.at_pid = getpid();
477  if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
478  retval = ra_detachrset(R_PROCESS, rsid, 0);
479  retval = bindprocessor(BINDTHREAD, tid, location);
480  }
481  }
482  if (retval == 0) {
483  KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
484  "T#%d to cpu=%d.\n",
485  gtid, location));
486  continue;
487  }
488  int error = errno;
489  if (abort_on_error) {
490  __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
491  KMP_ERR(error), __kmp_msg_null);
492  KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
493  "T#%d to cpu=%d, errno=%d.\n",
494  gtid, location, error));
495  return error;
496  }
497  }
498  }
499  return 0;
500  }
501 #else // !KMP_OS_AIX
502  int get_system_affinity(bool abort_on_error) override {
503  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
504  "Illegal get affinity operation when not capable");
505 #if KMP_OS_LINUX
506  long retval =
507  syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
508 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
509  int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
510  reinterpret_cast<cpuset_t *>(mask));
511  int retval = (r == 0 ? 0 : -1);
512 #endif
513  if (retval >= 0) {
514  return 0;
515  }
516  int error = errno;
517  if (abort_on_error) {
518  __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
519  KMP_ERR(error), __kmp_msg_null);
520  }
521  return error;
522  }
523  int set_system_affinity(bool abort_on_error) const override {
524  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
525  "Illegal set affinity operation when not capable");
526 #if KMP_OS_LINUX
527  long retval =
528  syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
529 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
530  int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
531  reinterpret_cast<cpuset_t *>(mask));
532  int retval = (r == 0 ? 0 : -1);
533 #endif
534  if (retval >= 0) {
535  return 0;
536  }
537  int error = errno;
538  if (abort_on_error) {
539  __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
540  KMP_ERR(error), __kmp_msg_null);
541  }
542  return error;
543  }
544 #endif // KMP_OS_AIX
545  };
546  void determine_capable(const char *env_var) override {
547  __kmp_affinity_determine_capable(env_var);
548  }
549  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
550  KMPAffinity::Mask *allocate_mask() override {
551  KMPNativeAffinity::Mask *retval = new Mask();
552  return retval;
553  }
554  void deallocate_mask(KMPAffinity::Mask *m) override {
555  KMPNativeAffinity::Mask *native_mask =
556  static_cast<KMPNativeAffinity::Mask *>(m);
557  delete native_mask;
558  }
559  KMPAffinity::Mask *allocate_mask_array(int num) override {
560  return new Mask[num];
561  }
562  void deallocate_mask_array(KMPAffinity::Mask *array) override {
563  Mask *linux_array = static_cast<Mask *>(array);
564  delete[] linux_array;
565  }
566  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
567  int index) override {
568  Mask *linux_array = static_cast<Mask *>(array);
569  return &(linux_array[index]);
570  }
571  api_type get_api_type() const override { return NATIVE_OS; }
572 };
573 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
574  || KMP_OS_AIX */
575 
576 #if KMP_OS_WINDOWS
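// Windows flavor of KMPAffinity. The mask holds one ULONG_PTR word per
// processor group; single-group systems go through
// {Get,Set}ThreadAffinityMask, while multi-group systems use the
// {Get,Set}ThreadGroupAffinity entry points resolved at runtime.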
577 class KMPNativeAffinity : public KMPAffinity {
578  class Mask : public KMPAffinity::Mask {
579  typedef ULONG_PTR mask_t;
580  static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
581  mask_t *mask;
582 
583  public:
584  Mask() {
585  mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
586  }
587  ~Mask() {
588  if (mask)
589  __kmp_free(mask);
590  }
591  void set(int i) override {
592  mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
593  }
594  bool is_set(int i) const override {
595  return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
596  }
597  void clear(int i) override {
598  mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
599  }
600  void zero() override {
601  for (int i = 0; i < __kmp_num_proc_groups; ++i)
602  mask[i] = 0;
603  }
604  bool empty() const override {
605  for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
606  if (mask[i])
607  return false;
608  return true;
609  }
610  void copy(const KMPAffinity::Mask *src) override {
611  const Mask *convert = static_cast<const Mask *>(src);
612  for (int i = 0; i < __kmp_num_proc_groups; ++i)
613  mask[i] = convert->mask[i];
614  }
615  void bitwise_and(const KMPAffinity::Mask *rhs) override {
616  const Mask *convert = static_cast<const Mask *>(rhs);
617  for (int i = 0; i < __kmp_num_proc_groups; ++i)
618  mask[i] &= convert->mask[i];
619  }
620  void bitwise_or(const KMPAffinity::Mask *rhs) override {
621  const Mask *convert = static_cast<const Mask *>(rhs);
622  for (int i = 0; i < __kmp_num_proc_groups; ++i)
623  mask[i] |= convert->mask[i];
624  }
625  void bitwise_not() override {
626  for (int i = 0; i < __kmp_num_proc_groups; ++i)
627  mask[i] = ~(mask[i]);
628  }
629  bool is_equal(const KMPAffinity::Mask *rhs) const override {
630  const Mask *convert = static_cast<const Mask *>(rhs);
631  for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
632  if (mask[i] != convert->mask[i])
633  return false;
634  return true;
635  }
636  int begin() const override {
637  int retval = 0;
638  while (retval < end() && !is_set(retval))
639  ++retval;
640  return retval;
641  }
642  int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
643  int next(int previous) const override {
644  int retval = previous + 1;
645  while (retval < end() && !is_set(retval))
646  ++retval;
647  return retval;
648  }
649  int set_process_affinity(bool abort_on_error) const override {
650  if (__kmp_num_proc_groups <= 1) {
651  if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
652  DWORD error = GetLastError();
653  if (abort_on_error) {
654  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
655  __kmp_msg_null);
656  }
657  return error;
658  }
659  }
660  return 0;
661  }
662  int set_system_affinity(bool abort_on_error) const override {
663  if (__kmp_num_proc_groups > 1) {
664  // Check for a valid mask.
665  GROUP_AFFINITY ga;
666  int group = get_proc_group();
667  if (group < 0) {
668  if (abort_on_error) {
669  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
670  }
671  return -1;
672  }
673  // Transform the bit vector into a GROUP_AFFINITY struct
674  // and make the system call to set affinity.
675  ga.Group = group;
676  ga.Mask = mask[group];
677  ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
678 
679  KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
680  if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
681  DWORD error = GetLastError();
682  if (abort_on_error) {
683  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
684  __kmp_msg_null);
685  }
686  return error;
687  }
688  } else {
689  if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
690  DWORD error = GetLastError();
691  if (abort_on_error) {
692  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
693  __kmp_msg_null);
694  }
695  return error;
696  }
697  }
698  return 0;
699  }
700  int get_system_affinity(bool abort_on_error) override {
701  if (__kmp_num_proc_groups > 1) {
702  this->zero();
703  GROUP_AFFINITY ga;
704  KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
705  if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
706  DWORD error = GetLastError();
707  if (abort_on_error) {
708  __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
709  KMP_ERR(error), __kmp_msg_null);
710  }
711  return error;
712  }
713  if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
714  (ga.Mask == 0)) {
715  return -1;
716  }
717  mask[ga.Group] = ga.Mask;
718  } else {
719  mask_t newMask, sysMask, retval;
720  if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
721  DWORD error = GetLastError();
722  if (abort_on_error) {
723  __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
724  KMP_ERR(error), __kmp_msg_null);
725  }
726  return error;
727  }
728  retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
729  if (!retval) {
730  DWORD error = GetLastError();
731  if (abort_on_error) {
732  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
733  KMP_ERR(error), __kmp_msg_null);
734  }
735  return error;
736  }
737  newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
738  if (!newMask) {
739  DWORD error = GetLastError();
740  if (abort_on_error) {
741  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
742  KMP_ERR(error), __kmp_msg_null);
743  }
744  }
745  *mask = retval;
746  }
747  return 0;
748  }
749  int get_proc_group() const override {
750  int group = -1;
751  if (__kmp_num_proc_groups == 1) {
752  return 1;
753  }
754  for (int i = 0; i < __kmp_num_proc_groups; i++) {
755  if (mask[i] == 0)
756  continue;
757  if (group >= 0)
758  return -1;
759  group = i;
760  }
761  return group;
762  }
763  };
764  void determine_capable(const char *env_var) override {
765  __kmp_affinity_determine_capable(env_var);
766  }
767  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
768  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
769  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
770  KMPAffinity::Mask *allocate_mask_array(int num) override {
771  return new Mask[num];
772  }
773  void deallocate_mask_array(KMPAffinity::Mask *array) override {
774  Mask *windows_array = static_cast<Mask *>(array);
775  delete[] windows_array;
776  }
777  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
778  int index) override {
779  Mask *windows_array = static_cast<Mask *>(array);
780  return &(windows_array[index]);
781  }
782  api_type get_api_type() const override { return NATIVE_OS; }
783 };
784 #endif /* KMP_OS_WINDOWS */
785 #endif /* KMP_AFFINITY_SUPPORTED */
786 
787 // Describe an attribute for a level in the machine topology
788 struct kmp_hw_attr_t {
789  int core_type : 8;
790  int core_eff : 8;
791  unsigned valid : 1;
792  unsigned reserved : 15;
793 
794  static const int UNKNOWN_CORE_EFF = -1;
795 
796  kmp_hw_attr_t()
797  : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
798  valid(0), reserved(0) {}
799  void set_core_type(kmp_hw_core_type_t type) {
800  valid = 1;
801  core_type = type;
802  }
803  void set_core_eff(int eff) {
804  valid = 1;
805  core_eff = eff;
806  }
807  kmp_hw_core_type_t get_core_type() const {
808  return (kmp_hw_core_type_t)core_type;
809  }
810  int get_core_eff() const { return core_eff; }
811  bool is_core_type_valid() const {
812  return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
813  }
814  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
815  operator bool() const { return valid; }
816  void clear() {
817  core_type = KMP_HW_CORE_TYPE_UNKNOWN;
818  core_eff = UNKNOWN_CORE_EFF;
819  valid = 0;
820  }
821  bool contains(const kmp_hw_attr_t &other) const {
822  if (!valid && !other.valid)
823  return true;
824  if (valid && other.valid) {
825  if (other.is_core_type_valid()) {
826  if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
827  return false;
828  }
829  if (other.is_core_eff_valid()) {
830  if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
831  return false;
832  }
833  return true;
834  }
835  return false;
836  }
837 #if KMP_AFFINITY_SUPPORTED
838  bool contains(const kmp_affinity_attrs_t &attr) const {
839  if (!valid && !attr.valid)
840  return true;
841  if (valid && attr.valid) {
842  if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
843  return (is_core_type_valid() &&
844  (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
845  if (attr.core_eff != UNKNOWN_CORE_EFF)
846  return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
847  return true;
848  }
849  return false;
850  }
851 #endif // KMP_AFFINITY_SUPPORTED
852  bool operator==(const kmp_hw_attr_t &rhs) const {
853  return (rhs.valid == valid && rhs.core_eff == core_eff &&
854  rhs.core_type == core_type);
855  }
856  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
857 };
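// Illustrative sketch (not part of this header), assuming the x86 hybrid
// core-type enumerators from kmp.h, of how contains() composes:
//   kmp_hw_attr_t narrow, broad;
//   narrow.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   narrow.set_core_eff(1);
//   broad.set_core_type(KMP_HW_CORE_TYPE_CORE);
//   // narrow.contains(broad) is true: every field 'broad' specifies matches.
//   // broad.contains(narrow) is false: 'broad' does not pin a core efficiency.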
858 
859 #if KMP_AFFINITY_SUPPORTED
860 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
861 #endif
862 
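// One hardware thread (logical CPU). ids[] holds the id at every topology
// level, sub_ids[] the relative indices filled in by
// kmp_topology_t::_set_sub_ids(), os_id the OS proc number, and attrs any
// core type/efficiency attributes of the owning core.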
863 class kmp_hw_thread_t {
864 public:
865  static const int UNKNOWN_ID = -1;
866  static const int MULTIPLE_ID = -2;
867  static int compare_ids(const void *a, const void *b);
868  static int compare_compact(const void *a, const void *b);
869  int ids[KMP_HW_LAST];
870  int sub_ids[KMP_HW_LAST];
871  bool leader;
872  int os_id;
873  int original_idx;
874  kmp_hw_attr_t attrs;
875 
876  void print() const;
877  void clear() {
878  for (int i = 0; i < (int)KMP_HW_LAST; ++i)
879  ids[i] = UNKNOWN_ID;
880  leader = false;
881  attrs.clear();
882  }
883 };
884 
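// Canonical description of the machine topology detected at runtime: an
// ordered list of layer types (socket, core, thread, ...), per-layer ratios
// and counts, and a flat array of hardware threads. A single global instance
// is exposed below as __kmp_topology.
//
// Illustrative sketch (not part of this header) of the query interface:
//   int core_level = __kmp_topology->get_level(KMP_HW_CORE);
//   int ncores = __kmp_topology->get_count(core_level);
//   int threads_per_core = __kmp_topology->calculate_ratio(
//       __kmp_topology->get_level(KMP_HW_THREAD), core_level);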
885 class kmp_topology_t {
886 
887  struct flags_t {
888  int uniform : 1;
889  int reserved : 31;
890  };
891 
892  int depth;
893 
894  // The following arrays are all 'depth' long. They are allocated to hold up
895  // to KMP_HW_LAST objects so that layers can be added later without
896  // reallocating any array.
897 
898  // Ordered array of the types in the topology
899  kmp_hw_t *types;
900 
901  // Quick topology ratios. For a non-uniform topology, this holds the max
902  // number of itemAs per itemB,
903  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
904  int *ratio;
905 
906  // Storage containing the absolute number of each topology layer
907  int *count;
908 
909  // The number of core efficiencies. This is only useful for hybrid
910  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
911  int num_core_efficiencies;
912  int num_core_types;
913  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
914 
915  // The hardware threads array
916  // hw_threads is num_hw_threads long
917  // Each hw_thread's ids and sub_ids are depth deep
918  int num_hw_threads;
919  kmp_hw_thread_t *hw_threads;
920 
921  // Equivalence map indexed by hardware topology type; the value is the
922  // equivalent type present in the types[] array. If the value is
923  // KMP_HW_UNKNOWN, then there is no known equivalence for that topology
924  // type.
925  kmp_hw_t equivalent[KMP_HW_LAST];
926 
927  // Flags describing the topology
928  flags_t flags;
929 
930  // Compact value used during sort_compact()
931  int compact;
932 
933 #if KMP_GROUP_AFFINITY
934  // Insert topology information about Windows Processor groups
935  void _insert_windows_proc_groups();
936 #endif
937 
938  // Count each item & get the num x's per y
939  // e.g., get the number of cores and the number of threads per core
940  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
941  void _gather_enumeration_information();
942 
943  // Remove layers that don't add information to the topology.
944  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
945  void _remove_radix1_layers();
946 
947  // Find out if the topology is uniform
948  void _discover_uniformity();
949 
950  // Set all the sub_ids for each hardware thread
951  void _set_sub_ids();
952 
953  // Set global affinity variables describing the number of threads per
954  // core, the number of packages, the number of cores per package, and
955  // the number of cores.
956  void _set_globals();
957 
958  // Set the last level cache equivalent type
959  void _set_last_level_cache();
960 
961  // Return the number of cores with a particular attribute, 'attr'.
962  // If 'find_all' is true, then find all cores on the machine, otherwise find
963  // all cores per the layer 'above'
964  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
965  bool find_all = false) const;
966 
967 public:
968  // Force use of allocate()/deallocate()
969  kmp_topology_t() = delete;
970  kmp_topology_t(const kmp_topology_t &t) = delete;
971  kmp_topology_t(kmp_topology_t &&t) = delete;
972  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
973  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
974 
975  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
976  static void deallocate(kmp_topology_t *);
977 
978  // Functions used in create_map() routines
979  kmp_hw_thread_t &at(int index) {
980  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
981  return hw_threads[index];
982  }
983  const kmp_hw_thread_t &at(int index) const {
984  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
985  return hw_threads[index];
986  }
987  int get_num_hw_threads() const { return num_hw_threads; }
988  void sort_ids() {
989  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
990  kmp_hw_thread_t::compare_ids);
991  }
992 
993  // Insert a new topology layer after allocation
994  void insert_layer(kmp_hw_t type, const int *ids);
995 
996  // Check if the hardware ids are unique; return true if they are,
997  // false otherwise
998  bool check_ids() const;
999 
1000  // Function to call after the create_map() routine
1001  void canonicalize();
1002  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
1003 
1004 // Functions used after canonicalize() called
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007  // Set the granularity for affinity settings
1008  void set_granularity(kmp_affinity_t &stgs) const;
1009  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
1010  bool restrict_to_mask(const kmp_affin_mask_t *mask);
1011  bool filter_hw_subset();
1012 #endif
1013  bool is_uniform() const { return flags.uniform; }
1014  // Map a type to its equivalent type in the topology;
1015  // returns KMP_HW_UNKNOWN when there is no equivalent type
1016  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
1017  if (type == KMP_HW_UNKNOWN)
1018  return KMP_HW_UNKNOWN;
1019  return equivalent[type];
1020  }
1021  // Set type1 = type2
1022  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
1023  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
1024  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
1025  kmp_hw_t real_type2 = equivalent[type2];
1026  if (real_type2 == KMP_HW_UNKNOWN)
1027  real_type2 = type2;
1028  equivalent[type1] = real_type2;
1029  // This loop is required since any of the types may have been set to
1030  // be equivalent to type1. They all must be checked and reset to type2.
1031  KMP_FOREACH_HW_TYPE(type) {
1032  if (equivalent[type] == type1) {
1033  equivalent[type] = real_type2;
1034  }
1035  }
1036  }
1037  // Calculate number of types corresponding to level1
1038  // per types corresponding to level2 (e.g., number of threads per core)
1039  int calculate_ratio(int level1, int level2) const {
1040  KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1041  KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1042  int r = 1;
1043  for (int level = level1; level > level2; --level)
1044  r *= ratio[level];
1045  return r;
1046  }
1047  int get_ratio(int level) const {
1048  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1049  return ratio[level];
1050  }
1051  int get_depth() const { return depth; }
1052  kmp_hw_t get_type(int level) const {
1053  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1054  return types[level];
1055  }
1056  int get_level(kmp_hw_t type) const {
1057  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1058  int eq_type = equivalent[type];
1059  if (eq_type == KMP_HW_UNKNOWN)
1060  return -1;
1061  for (int i = 0; i < depth; ++i)
1062  if (types[i] == eq_type)
1063  return i;
1064  return -1;
1065  }
1066  int get_count(int level) const {
1067  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1068  return count[level];
1069  }
1070  // Return the total number of cores with attribute 'attr'
1071  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1072  return _get_ncores_with_attr(attr, -1, true);
1073  }
1074  // Return the number of cores with attribute
1075  // 'attr' per topology level 'above'
1076  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1077  return _get_ncores_with_attr(attr, above, false);
1078  }
1079 
1080 #if KMP_AFFINITY_SUPPORTED
1081  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1082  void sort_compact(kmp_affinity_t &affinity) {
1083  compact = affinity.compact;
1084  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1085  kmp_hw_thread_t::compare_compact);
1086  }
1087 #endif
1088  void print(const char *env_var = "KMP_AFFINITY") const;
1089  void dump() const;
1090 };
1091 extern kmp_topology_t *__kmp_topology;
1092 
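// Parsed representation of the KMP_HW_SUBSET environment variable: one item_t
// per requested topology layer, each carrying the requested count, offset,
// and optional core attributes.
//
// Illustrative sketch (not part of this header): the settings parser would
// populate a request such as KMP_HW_SUBSET=1s,4c,2t roughly as
//   subset->push_back(1, KMP_HW_SOCKET, 0, kmp_hw_attr_t{});
//   subset->push_back(4, KMP_HW_CORE, 0, kmp_hw_attr_t{});
//   subset->push_back(2, KMP_HW_THREAD, 0, kmp_hw_attr_t{});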
1093 class kmp_hw_subset_t {
1094  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1095 
1096 public:
1097  // Describe a machine topology item in KMP_HW_SUBSET
1098  struct item_t {
1099  kmp_hw_t type;
1100  int num_attrs;
1101  int num[MAX_ATTRS];
1102  int offset[MAX_ATTRS];
1103  kmp_hw_attr_t attr[MAX_ATTRS];
1104  };
1105  // Put parentheses around max to avoid accidental use of the Windows max macro.
1106  const static int USE_ALL = (std::numeric_limits<int>::max)();
1107 
1108 private:
1109  int depth;
1110  int capacity;
1111  item_t *items;
1112  kmp_uint64 set;
1113  bool absolute;
1114  // The set must be able to handle up to KMP_HW_LAST number of layers
1115  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1116  // Comparator used to sort the KMP_HW_SUBSET items into topology order.
1117  // All unknown topology types end up at the beginning of the subset.
1118  static int hw_subset_compare(const void *i1, const void *i2) {
1119  kmp_hw_t type1 = ((const item_t *)i1)->type;
1120  kmp_hw_t type2 = ((const item_t *)i2)->type;
1121  int level1 = __kmp_topology->get_level(type1);
1122  int level2 = __kmp_topology->get_level(type2);
1123  return level1 - level2;
1124  }
1125 
1126 public:
1127  // Force use of allocate()/deallocate()
1128  kmp_hw_subset_t() = delete;
1129  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1130  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1131  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1132  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1133 
1134  static kmp_hw_subset_t *allocate() {
1135  int initial_capacity = 5;
1136  kmp_hw_subset_t *retval =
1137  (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1138  retval->depth = 0;
1139  retval->capacity = initial_capacity;
1140  retval->set = 0ull;
1141  retval->absolute = false;
1142  retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1143  return retval;
1144  }
1145  static void deallocate(kmp_hw_subset_t *subset) {
1146  __kmp_free(subset->items);
1147  __kmp_free(subset);
1148  }
1149  void set_absolute() { absolute = true; }
1150  bool is_absolute() const { return absolute; }
1151  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1152  for (int i = 0; i < depth; ++i) {
1153  // Found an existing item for this layer type
1154  // Add the num, offset, and attr to this item
1155  if (items[i].type == type) {
1156  int idx = items[i].num_attrs++;
1157  if ((size_t)idx >= MAX_ATTRS)
1158  return;
1159  items[i].num[idx] = num;
1160  items[i].offset[idx] = offset;
1161  items[i].attr[idx] = attr;
1162  return;
1163  }
1164  }
1165  if (depth == capacity - 1) {
1166  capacity *= 2;
1167  item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1168  for (int i = 0; i < depth; ++i)
1169  new_items[i] = items[i];
1170  __kmp_free(items);
1171  items = new_items;
1172  }
1173  items[depth].num_attrs = 1;
1174  items[depth].type = type;
1175  items[depth].num[0] = num;
1176  items[depth].offset[0] = offset;
1177  items[depth].attr[0] = attr;
1178  depth++;
1179  set |= (1ull << type);
1180  }
1181  int get_depth() const { return depth; }
1182  const item_t &at(int index) const {
1183  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1184  return items[index];
1185  }
1186  item_t &at(int index) {
1187  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1188  return items[index];
1189  }
1190  void remove(int index) {
1191  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1192  set &= ~(1ull << items[index].type);
1193  for (int j = index + 1; j < depth; ++j) {
1194  items[j - 1] = items[j];
1195  }
1196  depth--;
1197  }
1198  void sort() {
1199  KMP_DEBUG_ASSERT(__kmp_topology);
1200  qsort(items, depth, sizeof(item_t), hw_subset_compare);
1201  }
1202  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1203 
1204  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
1205  // This means putting each of {sockets, cores, threads} in the topology if
1206  // they are not specified:
1207  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
1208  // e.g., 3module => *s,3module,*c,*t
1209  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
1210  // are expecting the traditional sockets/cores/threads topology. For newer
1211  // hardware, there can be intervening layers like dies/tiles/modules
1212  // (usually corresponding to a cache level). So when a user asks for
1213  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
1214  // should get 12 hardware threads across 6 cores and effectively ignore the
1215  // module layer.
1216  void canonicalize(const kmp_topology_t *top) {
1217  // Layers to target for KMP_HW_SUBSET canonicalization
1218  kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1219 
1220  // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
1221  if (is_absolute())
1222  return;
1223 
1224  // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
1225  // topology doesn't have these layers
1226  for (kmp_hw_t type : targeted)
1227  if (top->get_level(type) == KMP_HW_UNKNOWN)
1228  return;
1229 
1230  // Put targeted layers in topology if they do not exist
1231  for (kmp_hw_t type : targeted) {
1232  bool found = false;
1233  for (int i = 0; i < get_depth(); ++i) {
1234  if (top->get_equivalent_type(items[i].type) == type) {
1235  found = true;
1236  break;
1237  }
1238  }
1239  if (!found) {
1240  push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
1241  }
1242  }
1243  sort();
1244  // Set as an absolute topology that only targets the targeted layers
1245  set_absolute();
1246  }
1247  void dump() const {
1248  printf("**********************\n");
1249  printf("*** kmp_hw_subset: ***\n");
1250  printf("* depth: %d\n", depth);
1251  printf("* items:\n");
1252  for (int i = 0; i < depth; ++i) {
1253  printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1254  for (int j = 0; j < items[i].num_attrs; ++j) {
1255  printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1256  items[i].offset[j]);
1257  if (!items[i].attr[j]) {
1258  printf(" (none)\n");
1259  } else {
1260  printf(
1261  " core_type = %s, core_eff = %d\n",
1262  __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1263  items[i].attr[j].get_core_eff());
1264  }
1265  }
1266  }
1267  printf("* set: 0x%llx\n", set);
1268  printf("* absolute: %d\n", absolute);
1269  printf("**********************\n");
1270  }
1271 };
1272 extern kmp_hw_subset_t *__kmp_hw_subset;
1273 
1274 /* A structure for holding machine-specific hierarchy info to be computed once
1275  at init. This structure represents a mapping of threads to the actual machine
1276  hierarchy, or to our best guess at what the hierarchy might be, for the
1277  purpose of performing an efficient barrier. In the worst case, when there is
1278  no machine hierarchy information, it produces a tree suitable for a barrier,
1279  similar to the tree used in the hyper barrier. */
1280 class hierarchy_info {
1281 public:
1282  /* Good default values for number of leaves and branching factor, given no
1283  affinity information. Behaves a bit like hyper barrier. */
1284  static const kmp_uint32 maxLeaves = 4;
1285  static const kmp_uint32 minBranch = 4;
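// Allocated capacity, in levels, of the numPerLevel/skipPerLevel arrays;
// grown by resize() when oversubscription requires a deeper hierarchy.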
1291  kmp_uint32 maxLevels;
1292 
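// Number of hierarchy levels currently in use; base_num_threads is the
// thread count the hierarchy was last sized for.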
1297  kmp_uint32 depth;
1298  kmp_uint32 base_num_threads = 0;
1299  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1300  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1301  // 2=initialization in progress
1302  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1303 
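// numPerLevel[i] is the branching factor at level i (level 0 = leaves);
// skipPerLevel[i] is the product of the branching factors below level i,
// i.e. the number of leaves spanned by one node at that level. Both live in
// a single allocation of maxLevels * 2 entries.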
1308  kmp_uint32 *numPerLevel = nullptr;
1309  kmp_uint32 *skipPerLevel = nullptr;
1310 
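// Populate numPerLevel from the detected machine topology, innermost layer
// first, so numPerLevel[0] receives the threads-per-core ratio.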
1311  void deriveLevels() {
1312  int hier_depth = __kmp_topology->get_depth();
1313  for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1314  numPerLevel[level] = __kmp_topology->get_ratio(i);
1315  }
1316  }
1317 
1318  hierarchy_info()
1319  : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1320 
1321  void fini() {
1322  if (!uninitialized && numPerLevel) {
1323  __kmp_free(numPerLevel);
1324  numPerLevel = NULL;
1325  uninitialized = not_initialized;
1326  }
1327  }
1328 
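// Build the hierarchy for num_addrs threads: derive per-level branching
// factors from __kmp_topology when available (otherwise fall back to
// maxLeaves/minBranch defaults), narrow overly wide levels, and compute the
// cumulative skipPerLevel values. Guarded by a CAS on 'uninitialized' so
// concurrent callers wait for a single initializer.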
1329  void init(int num_addrs) {
1330  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1331  &uninitialized, not_initialized, initializing);
1332  if (bool_result == 0) { // Wait for initialization
1333  while (TCR_1(uninitialized) != initialized)
1334  KMP_CPU_PAUSE();
1335  return;
1336  }
1337  KMP_DEBUG_ASSERT(bool_result == 1);
1338 
1339  /* Added explicit initialization of the data fields here to prevent usage of
1340  dirty value observed when static library is re-initialized multiple times
1341  (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
1342  OpenMP). */
1343  depth = 1;
1344  resizing = 0;
1345  maxLevels = 7;
1346  numPerLevel =
1347  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1348  skipPerLevel = &(numPerLevel[maxLevels]);
1349  for (kmp_uint32 i = 0; i < maxLevels;
1350  ++i) { // init numPerLevel[*] to 1 item per level
1351  numPerLevel[i] = 1;
1352  skipPerLevel[i] = 1;
1353  }
1354 
1355  // Derive level sizes from the machine topology when available
1356  if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1357  deriveLevels();
1358  } else {
1359  numPerLevel[0] = maxLeaves;
1360  numPerLevel[1] = num_addrs / maxLeaves;
1361  if (num_addrs % maxLeaves)
1362  numPerLevel[1]++;
1363  }
1364 
1365  base_num_threads = num_addrs;
1366  for (int i = maxLevels - 1; i >= 0;
1367  --i) // count non-empty levels to get depth
1368  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1369  depth++;
1370 
1371  kmp_uint32 branch = minBranch;
1372  if (numPerLevel[0] == 1)
1373  branch = num_addrs / maxLeaves;
1374  if (branch < minBranch)
1375  branch = minBranch;
1376  for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1377  while (numPerLevel[d] > branch ||
1378  (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1379  if (numPerLevel[d] & 1)
1380  numPerLevel[d]++;
1381  numPerLevel[d] = numPerLevel[d] >> 1;
1382  if (numPerLevel[d + 1] == 1)
1383  depth++;
1384  numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1385  }
1386  if (numPerLevel[0] == 1) {
1387  branch = branch >> 1;
1388  if (branch < 4)
1389  branch = minBranch;
1390  }
1391  }
1392 
1393  for (kmp_uint32 i = 1; i < depth; ++i)
1394  skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1395  // Fill in hierarchy in the case of oversubscription
1396  for (kmp_uint32 i = depth; i < maxLevels; ++i)
1397  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1398 
1399  uninitialized = initialized; // One writer
1400  }
1401 
1402  // Resize the hierarchy if nproc changes to something larger than before
1403  void resize(kmp_uint32 nproc) {
1404  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1405  while (bool_result == 0) { // someone else is trying to resize
1406  KMP_CPU_PAUSE();
1407  if (nproc <= base_num_threads) // happy with other thread's resize
1408  return;
1409  else // try to resize
1410  bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1411  }
1412  KMP_DEBUG_ASSERT(bool_result != 0);
1413  if (nproc <= base_num_threads)
1414  return; // happy with other thread's resize
1415 
1416  // Calculate new maxLevels
1417  kmp_uint32 old_sz = skipPerLevel[depth - 1];
1418  kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1419  // First see if old maxLevels is enough to contain new size
1420  for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1421  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1422  numPerLevel[i - 1] *= 2;
1423  old_sz *= 2;
1424  depth++;
1425  }
1426  if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1427  while (nproc > old_sz) {
1428  old_sz *= 2;
1429  incs++;
1430  depth++;
1431  }
1432  maxLevels += incs;
1433 
1434  // Resize arrays
1435  kmp_uint32 *old_numPerLevel = numPerLevel;
1436  kmp_uint32 *old_skipPerLevel = skipPerLevel;
1437  numPerLevel = skipPerLevel = NULL;
1438  numPerLevel =
1439  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1440  skipPerLevel = &(numPerLevel[maxLevels]);
1441 
1442  // Copy old elements from old arrays
1443  for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1444  // init numPerLevel[*] to 1 item per level
1445  numPerLevel[i] = old_numPerLevel[i];
1446  skipPerLevel[i] = old_skipPerLevel[i];
1447  }
1448 
1449  // Init new elements in arrays to 1
1450  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1451  // init numPerLevel[*] to 1 item per level
1452  numPerLevel[i] = 1;
1453  skipPerLevel[i] = 1;
1454  }
1455 
1456  // Free old arrays
1457  __kmp_free(old_numPerLevel);
1458  }
1459 
1460  // Fill in oversubscription levels of hierarchy
1461  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1462  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1463 
1464  base_num_threads = nproc;
1465  resizing = 0; // One writer
1466  }
1467 };
1468 #endif // KMP_AFFINITY_H