LLVM OpenMP* Runtime Library
kmp_affinity.h
1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 
19 #if KMP_AFFINITY_SUPPORTED
20 #if KMP_USE_HWLOC
21 class KMPHwlocAffinity : public KMPAffinity {
22 public:
23  class Mask : public KMPAffinity::Mask {
24  hwloc_cpuset_t mask;
25 
26  public:
27  Mask() {
28  mask = hwloc_bitmap_alloc();
29  this->zero();
30  }
31  ~Mask() { hwloc_bitmap_free(mask); }
32  void set(int i) override { hwloc_bitmap_set(mask, i); }
33  bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
34  void clear(int i) override { hwloc_bitmap_clr(mask, i); }
35  void zero() override { hwloc_bitmap_zero(mask); }
36  void copy(const KMPAffinity::Mask *src) override {
37  const Mask *convert = static_cast<const Mask *>(src);
38  hwloc_bitmap_copy(mask, convert->mask);
39  }
40  void bitwise_and(const KMPAffinity::Mask *rhs) override {
41  const Mask *convert = static_cast<const Mask *>(rhs);
42  hwloc_bitmap_and(mask, mask, convert->mask);
43  }
44  void bitwise_or(const KMPAffinity::Mask *rhs) override {
45  const Mask *convert = static_cast<const Mask *>(rhs);
46  hwloc_bitmap_or(mask, mask, convert->mask);
47  }
48  void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
49  int begin() const override { return hwloc_bitmap_first(mask); }
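 // hwloc_bitmap_next() returns -1 once no set bit remains, so -1 doubles as
 // the end() sentinel for iterating over this mask.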
50  int end() const override { return -1; }
51  int next(int previous) const override {
52  return hwloc_bitmap_next(mask, previous);
53  }
54  int get_system_affinity(bool abort_on_error) override {
55  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
56  "Illegal get affinity operation when not capable");
57  long retval =
58  hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
59  if (retval >= 0) {
60  return 0;
61  }
62  int error = errno;
63  if (abort_on_error) {
64  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
65  }
66  return error;
67  }
68  int set_system_affinity(bool abort_on_error) const override {
69  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
70  "Illegal set affinity operation when not capable");
71  long retval =
72  hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
73  if (retval >= 0) {
74  return 0;
75  }
76  int error = errno;
77  if (abort_on_error) {
78  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
79  }
80  return error;
81  }
82 #if KMP_OS_WINDOWS
83  int set_process_affinity(bool abort_on_error) const override {
84  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
85  "Illegal set process affinity operation when not capable");
86  int error = 0;
87  const hwloc_topology_support *support =
88  hwloc_topology_get_support(__kmp_hwloc_topology);
89  if (support->cpubind->set_proc_cpubind) {
90  int retval;
91  retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
92  HWLOC_CPUBIND_PROCESS);
93  if (retval >= 0)
94  return 0;
95  error = errno;
96  if (abort_on_error)
97  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
98  }
99  return error;
100  }
101 #endif
102  int get_proc_group() const override {
103  int group = -1;
104 #if KMP_OS_WINDOWS
105  if (__kmp_num_proc_groups == 1) {
106  return 1;
107  }
108  for (int i = 0; i < __kmp_num_proc_groups; i++) {
109  // On Windows, the long type is always 32 bits
110  unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
111  unsigned long second_32_bits =
112  hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
113  if (first_32_bits == 0 && second_32_bits == 0) {
114  continue;
115  }
116  if (group >= 0) {
117  return -1;
118  }
119  group = i;
120  }
121 #endif /* KMP_OS_WINDOWS */
122  return group;
123  }
124  };
125  void determine_capable(const char *var) override {
126  const hwloc_topology_support *topology_support;
127  if (__kmp_hwloc_topology == NULL) {
128  if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
129  __kmp_hwloc_error = TRUE;
130  if (__kmp_affinity_verbose)
131  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
132  }
133  if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
134  __kmp_hwloc_error = TRUE;
135  if (__kmp_affinity_verbose)
136  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
137  }
138  }
139  topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
140  // Is the system capable of setting/getting this thread's affinity?
141  // Also, is topology discovery possible? (pu indicates ability to discover
142  // processing units). And finally, were there no errors when calling any
143  // hwloc_* API functions?
144  if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
145  topology_support->cpubind->get_thisthread_cpubind &&
146  topology_support->discovery->pu && !__kmp_hwloc_error) {
147  // enables affinity according to KMP_AFFINITY_CAPABLE() macro
148  KMP_AFFINITY_ENABLE(TRUE);
149  } else {
150  // indicate that hwloc didn't work and disable affinity
151  __kmp_hwloc_error = TRUE;
152  KMP_AFFINITY_DISABLE();
153  }
154  }
155  void bind_thread(int which) override {
156  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
157  "Illegal set affinity operation when not capable");
158  KMPAffinity::Mask *mask;
159  KMP_CPU_ALLOC_ON_STACK(mask);
160  KMP_CPU_ZERO(mask);
161  KMP_CPU_SET(which, mask);
162  __kmp_set_system_affinity(mask, TRUE);
163  KMP_CPU_FREE_FROM_STACK(mask);
164  }
165  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
166  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
167  KMPAffinity::Mask *allocate_mask_array(int num) override {
168  return new Mask[num];
169  }
170  void deallocate_mask_array(KMPAffinity::Mask *array) override {
171  Mask *hwloc_array = static_cast<Mask *>(array);
172  delete[] hwloc_array;
173  }
174  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
175  int index) override {
176  Mask *hwloc_array = static_cast<Mask *>(array);
177  return &(hwloc_array[index]);
178  }
179  api_type get_api_type() const override { return HWLOC; }
180 };
181 #endif /* KMP_USE_HWLOC */
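
// Illustrative sketch (not part of the runtime): the hwloc-backed Mask above is
// a thin wrapper over hwloc's bitmap/cpubind API. A standalone program that
// binds the calling thread to logical processor 2 would look roughly like:
//
//   #include <hwloc.h>
//   int main() {
//     hwloc_topology_t topo;
//     hwloc_topology_init(&topo);                 // cf. determine_capable()
//     hwloc_topology_load(topo);
//     hwloc_cpuset_t set = hwloc_bitmap_alloc();  // cf. Mask::Mask()
//     hwloc_bitmap_zero(set);                     // cf. Mask::zero()
//     hwloc_bitmap_set(set, 2);                   // cf. Mask::set(2)
//     hwloc_set_cpubind(topo, set, HWLOC_CPUBIND_THREAD); // cf. set_system_affinity()
//     hwloc_bitmap_free(set);
//     hwloc_topology_destroy(topo);
//     return 0;
//   }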
182 
183 #if KMP_OS_LINUX || KMP_OS_FREEBSD
184 #if KMP_OS_LINUX
185 /* On some of the older OSes that we build on, these constants aren't present
186  in <asm/unistd.h>, #included from <sys/syscall.h>. They must be the same on
187  all systems of the same arch where they are defined, and they cannot change;
188  they are set in stone forever. */
189 #include <sys/syscall.h>
190 #if KMP_ARCH_X86 || KMP_ARCH_ARM
191 #ifndef __NR_sched_setaffinity
192 #define __NR_sched_setaffinity 241
193 #elif __NR_sched_setaffinity != 241
194 #error Wrong code for setaffinity system call.
195 #endif /* __NR_sched_setaffinity */
196 #ifndef __NR_sched_getaffinity
197 #define __NR_sched_getaffinity 242
198 #elif __NR_sched_getaffinity != 242
199 #error Wrong code for getaffinity system call.
200 #endif /* __NR_sched_getaffinity */
201 #elif KMP_ARCH_AARCH64
202 #ifndef __NR_sched_setaffinity
203 #define __NR_sched_setaffinity 122
204 #elif __NR_sched_setaffinity != 122
205 #error Wrong code for setaffinity system call.
206 #endif /* __NR_sched_setaffinity */
207 #ifndef __NR_sched_getaffinity
208 #define __NR_sched_getaffinity 123
209 #elif __NR_sched_getaffinity != 123
210 #error Wrong code for getaffinity system call.
211 #endif /* __NR_sched_getaffinity */
212 #elif KMP_ARCH_X86_64
213 #ifndef __NR_sched_setaffinity
214 #define __NR_sched_setaffinity 203
215 #elif __NR_sched_setaffinity != 203
216 #error Wrong code for setaffinity system call.
217 #endif /* __NR_sched_setaffinity */
218 #ifndef __NR_sched_getaffinity
219 #define __NR_sched_getaffinity 204
220 #elif __NR_sched_getaffinity != 204
221 #error Wrong code for getaffinity system call.
222 #endif /* __NR_sched_getaffinity */
223 #elif KMP_ARCH_PPC64
224 #ifndef __NR_sched_setaffinity
225 #define __NR_sched_setaffinity 222
226 #elif __NR_sched_setaffinity != 222
227 #error Wrong code for setaffinity system call.
228 #endif /* __NR_sched_setaffinity */
229 #ifndef __NR_sched_getaffinity
230 #define __NR_sched_getaffinity 223
231 #elif __NR_sched_getaffinity != 223
232 #error Wrong code for getaffinity system call.
233 #endif /* __NR_sched_getaffinity */
234 #elif KMP_ARCH_MIPS
235 #ifndef __NR_sched_setaffinity
236 #define __NR_sched_setaffinity 4239
237 #elif __NR_sched_setaffinity != 4239
238 #error Wrong code for setaffinity system call.
239 #endif /* __NR_sched_setaffinity */
240 #ifndef __NR_sched_getaffinity
241 #define __NR_sched_getaffinity 4240
242 #elif __NR_sched_getaffinity != 4240
243 #error Wrong code for getaffinity system call.
244 #endif /* __NR_sched_getaffinity */
245 #elif KMP_ARCH_MIPS64
246 #ifndef __NR_sched_setaffinity
247 #define __NR_sched_setaffinity 5195
248 #elif __NR_sched_setaffinity != 5195
249 #error Wrong code for setaffinity system call.
250 #endif /* __NR_sched_setaffinity */
251 #ifndef __NR_sched_getaffinity
252 #define __NR_sched_getaffinity 5196
253 #elif __NR_sched_getaffinity != 5196
254 #error Wrong code for getaffinity system call.
255 #endif /* __NR_sched_getaffinity */
256 #else
257 #error Unknown or unsupported architecture
258 #endif /* KMP_ARCH_* */
259 #elif KMP_OS_FREEBSD
260 #include <pthread.h>
261 #include <pthread_np.h>
262 #endif
263 class KMPNativeAffinity : public KMPAffinity {
264  class Mask : public KMPAffinity::Mask {
265  typedef unsigned long mask_t;
266  typedef decltype(__kmp_affin_mask_size) mask_size_type;
267  static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
268  static const mask_t ONE = 1;
269  mask_size_type get_num_mask_types() const {
270  return __kmp_affin_mask_size / sizeof(mask_t);
271  }
272 
273  public:
274  mask_t *mask;
275  Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
276  ~Mask() {
277  if (mask)
278  __kmp_free(mask);
279  }
280  void set(int i) override {
281  mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
282  }
283  bool is_set(int i) const override {
284  return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
285  }
286  void clear(int i) override {
287  mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
288  }
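 // For example, on LP64 Linux mask_t is a 64-bit unsigned long, so set(70)
 // sets bit 70 % 64 == 6 of word mask[70 / 64] == mask[1].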
289  void zero() override {
290  mask_size_type e = get_num_mask_types();
291  for (mask_size_type i = 0; i < e; ++i)
292  mask[i] = (mask_t)0;
293  }
294  void copy(const KMPAffinity::Mask *src) override {
295  const Mask *convert = static_cast<const Mask *>(src);
296  mask_size_type e = get_num_mask_types();
297  for (mask_size_type i = 0; i < e; ++i)
298  mask[i] = convert->mask[i];
299  }
300  void bitwise_and(const KMPAffinity::Mask *rhs) override {
301  const Mask *convert = static_cast<const Mask *>(rhs);
302  mask_size_type e = get_num_mask_types();
303  for (mask_size_type i = 0; i < e; ++i)
304  mask[i] &= convert->mask[i];
305  }
306  void bitwise_or(const KMPAffinity::Mask *rhs) override {
307  const Mask *convert = static_cast<const Mask *>(rhs);
308  mask_size_type e = get_num_mask_types();
309  for (mask_size_type i = 0; i < e; ++i)
310  mask[i] |= convert->mask[i];
311  }
312  void bitwise_not() override {
313  mask_size_type e = get_num_mask_types();
314  for (mask_size_type i = 0; i < e; ++i)
315  mask[i] = ~(mask[i]);
316  }
317  int begin() const override {
318  int retval = 0;
319  while (retval < end() && !is_set(retval))
320  ++retval;
321  return retval;
322  }
323  int end() const override {
324  int e;
325  __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
326  return e;
327  }
328  int next(int previous) const override {
329  int retval = previous + 1;
330  while (retval < end() && !is_set(retval))
331  ++retval;
332  return retval;
333  }
334  int get_system_affinity(bool abort_on_error) override {
335  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
336  "Illegal get affinity operation when not capable");
337 #if KMP_OS_LINUX
338  long retval =
339  syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
340 #elif KMP_OS_FREEBSD
341  int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
342  reinterpret_cast<cpuset_t *>(mask));
343  int retval = (r == 0 ? 0 : -1);
344 #endif
345  if (retval >= 0) {
346  return 0;
347  }
348  int error = errno;
349  if (abort_on_error) {
350  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
351  }
352  return error;
353  }
354  int set_system_affinity(bool abort_on_error) const override {
355  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
356  "Illegal set affinity operation when not capable");
357 #if KMP_OS_LINUX
358  long retval =
359  syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
360 #elif KMP_OS_FREEBSD
361  int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
362  reinterpret_cast<cpuset_t *>(mask));
363  int retval = (r == 0 ? 0 : -1);
364 #endif
365  if (retval >= 0) {
366  return 0;
367  }
368  int error = errno;
369  if (abort_on_error) {
370  __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
371  }
372  return error;
373  }
374  };
375  void determine_capable(const char *env_var) override {
376  __kmp_affinity_determine_capable(env_var);
377  }
378  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
379  KMPAffinity::Mask *allocate_mask() override {
380  KMPNativeAffinity::Mask *retval = new Mask();
381  return retval;
382  }
383  void deallocate_mask(KMPAffinity::Mask *m) override {
384  KMPNativeAffinity::Mask *native_mask =
385  static_cast<KMPNativeAffinity::Mask *>(m);
386  delete native_mask;
387  }
388  KMPAffinity::Mask *allocate_mask_array(int num) override {
389  return new Mask[num];
390  }
391  void deallocate_mask_array(KMPAffinity::Mask *array) override {
392  Mask *linux_array = static_cast<Mask *>(array);
393  delete[] linux_array;
394  }
395  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
396  int index) override {
397  Mask *linux_array = static_cast<Mask *>(array);
398  return &(linux_array[index]);
399  }
400  api_type get_api_type() const override { return NATIVE_OS; }
401 };
402 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */
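
// Illustrative sketch (not part of the runtime): on Linux the Mask above is a
// raw bit array handed directly to the sched_{set,get}affinity syscalls. The
// glibc wrappers express the same operations as set()/is_set() and
// set_system_affinity()/get_system_affinity():
//
//   #define _GNU_SOURCE
//   #include <sched.h>
//   #include <stdio.h>
//   int main() {
//     cpu_set_t set;
//     CPU_ZERO(&set);                            // cf. Mask::zero()
//     CPU_SET(3, &set);                          // cf. Mask::set(3)
//     sched_setaffinity(0, sizeof(set), &set);   // cf. set_system_affinity()
//     sched_getaffinity(0, sizeof(set), &set);   // cf. get_system_affinity()
//     printf("bound to cpu 3: %d\n", CPU_ISSET(3, &set) ? 1 : 0); // cf. is_set(3)
//     return 0;
//   }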
403 
404 #if KMP_OS_WINDOWS
405 class KMPNativeAffinity : public KMPAffinity {
406  class Mask : public KMPAffinity::Mask {
407  typedef ULONG_PTR mask_t;
408  static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
409  mask_t *mask;
410 
411  public:
412  Mask() {
413  mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
414  }
415  ~Mask() {
416  if (mask)
417  __kmp_free(mask);
418  }
419  void set(int i) override {
420  mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
421  }
422  bool is_set(int i) const override {
423  return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
424  }
425  void clear(int i) override {
426  mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
427  }
428  void zero() override {
429  for (int i = 0; i < __kmp_num_proc_groups; ++i)
430  mask[i] = 0;
431  }
432  void copy(const KMPAffinity::Mask *src) override {
433  const Mask *convert = static_cast<const Mask *>(src);
434  for (int i = 0; i < __kmp_num_proc_groups; ++i)
435  mask[i] = convert->mask[i];
436  }
437  void bitwise_and(const KMPAffinity::Mask *rhs) override {
438  const Mask *convert = static_cast<const Mask *>(rhs);
439  for (int i = 0; i < __kmp_num_proc_groups; ++i)
440  mask[i] &= convert->mask[i];
441  }
442  void bitwise_or(const KMPAffinity::Mask *rhs) override {
443  const Mask *convert = static_cast<const Mask *>(rhs);
444  for (int i = 0; i < __kmp_num_proc_groups; ++i)
445  mask[i] |= convert->mask[i];
446  }
447  void bitwise_not() override {
448  for (int i = 0; i < __kmp_num_proc_groups; ++i)
449  mask[i] = ~(mask[i]);
450  }
451  int begin() const override {
452  int retval = 0;
453  while (retval < end() && !is_set(retval))
454  ++retval;
455  return retval;
456  }
457  int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
458  int next(int previous) const override {
459  int retval = previous + 1;
460  while (retval < end() && !is_set(retval))
461  ++retval;
462  return retval;
463  }
464  int set_process_affinity(bool abort_on_error) const override {
465  if (__kmp_num_proc_groups <= 1) {
466  if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
467  DWORD error = GetLastError();
468  if (abort_on_error) {
469  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
470  __kmp_msg_null);
471  }
472  return error;
473  }
474  }
475  return 0;
476  }
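 // Each mask_t word corresponds to one Windows processor group (on 64-bit
 // builds BITS_PER_MASK_T == 64, matching the per-group limit of 64 logical
 // processors). For example, logical processor 70 on a two-group machine is
 // stored as bit 6 of mask[1]; get_proc_group() then returns 1, and
 // set_system_affinity() below sets ga.Group = 1 and ga.Mask = mask[1].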
477  int set_system_affinity(bool abort_on_error) const override {
478  if (__kmp_num_proc_groups > 1) {
479  // Check for a valid mask.
480  GROUP_AFFINITY ga;
481  int group = get_proc_group();
482  if (group < 0) {
483  if (abort_on_error) {
484  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
485  }
486  return -1;
487  }
488  // Transform the bit vector into a GROUP_AFFINITY struct
489  // and make the system call to set affinity.
490  ga.Group = group;
491  ga.Mask = mask[group];
492  ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
493 
494  KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
495  if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
496  DWORD error = GetLastError();
497  if (abort_on_error) {
498  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
499  __kmp_msg_null);
500  }
501  return error;
502  }
503  } else {
504  if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
505  DWORD error = GetLastError();
506  if (abort_on_error) {
507  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
508  __kmp_msg_null);
509  }
510  return error;
511  }
512  }
513  return 0;
514  }
515  int get_system_affinity(bool abort_on_error) override {
516  if (__kmp_num_proc_groups > 1) {
517  this->zero();
518  GROUP_AFFINITY ga;
519  KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
520  if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
521  DWORD error = GetLastError();
522  if (abort_on_error) {
523  __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
524  KMP_ERR(error), __kmp_msg_null);
525  }
526  return error;
527  }
528  if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
529  (ga.Mask == 0)) {
530  return -1;
531  }
532  mask[ga.Group] = ga.Mask;
533  } else {
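 // SetThreadAffinityMask() returns the thread's previous affinity mask, so
 // the code below temporarily sets the thread mask to the process mask,
 // captures the returned previous mask, restores it, and stores that
 // previous mask as the result.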
534  mask_t newMask, sysMask, retval;
535  if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
536  DWORD error = GetLastError();
537  if (abort_on_error) {
538  __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
539  KMP_ERR(error), __kmp_msg_null);
540  }
541  return error;
542  }
543  retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
544  if (!retval) {
545  DWORD error = GetLastError();
546  if (abort_on_error) {
547  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
548  KMP_ERR(error), __kmp_msg_null);
549  }
550  return error;
551  }
552  newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
553  if (!newMask) {
554  DWORD error = GetLastError();
555  if (abort_on_error) {
556  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
557  KMP_ERR(error), __kmp_msg_null);
558  }
559  }
560  *mask = retval;
561  }
562  return 0;
563  }
564  int get_proc_group() const override {
565  int group = -1;
566  if (__kmp_num_proc_groups == 1) {
567  return 1;
568  }
569  for (int i = 0; i < __kmp_num_proc_groups; i++) {
570  if (mask[i] == 0)
571  continue;
572  if (group >= 0)
573  return -1;
574  group = i;
575  }
576  return group;
577  }
578  };
579  void determine_capable(const char *env_var) override {
580  __kmp_affinity_determine_capable(env_var);
581  }
582  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
583  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
584  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
585  KMPAffinity::Mask *allocate_mask_array(int num) override {
586  return new Mask[num];
587  }
588  void deallocate_mask_array(KMPAffinity::Mask *array) override {
589  Mask *windows_array = static_cast<Mask *>(array);
590  delete[] windows_array;
591  }
592  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
593  int index) override {
594  Mask *windows_array = static_cast<Mask *>(array);
595  return &(windows_array[index]);
596  }
597  api_type get_api_type() const override { return NATIVE_OS; }
598 };
599 #endif /* KMP_OS_WINDOWS */
600 #endif /* KMP_AFFINITY_SUPPORTED */
601 
602 typedef enum kmp_hw_core_type_t {
603  KMP_HW_CORE_TYPE_UNKNOWN = 0x0,
604 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
605  KMP_HW_CORE_TYPE_ATOM = 0x20,
606  KMP_HW_CORE_TYPE_CORE = 0x40,
607  KMP_HW_MAX_NUM_CORE_TYPES = 3,
608 #else
609  KMP_HW_MAX_NUM_CORE_TYPES = 1,
610 #endif
611 } kmp_hw_core_type_t;
612 
613 class kmp_hw_thread_t {
614 public:
615  static const int UNKNOWN_ID = -1;
616  static int compare_ids(const void *a, const void *b);
617  static int compare_compact(const void *a, const void *b);
618  int ids[KMP_HW_LAST];
619  int sub_ids[KMP_HW_LAST];
620  bool leader;
621  int os_id;
622  kmp_hw_core_type_t core_type;
623 
624  void print() const;
625  void clear() {
626  for (int i = 0; i < (int)KMP_HW_LAST; ++i)
627  ids[i] = UNKNOWN_ID;
628  leader = false;
629  core_type = KMP_HW_CORE_TYPE_UNKNOWN;
630  }
631 };
632 
633 class kmp_topology_t {
634 
635  struct flags_t {
636  int uniform : 1;
637  int reserved : 31;
638  };
639 
640  int depth;
641 
642  // The following arrays are all 'depth' long. They are allocated to hold
643  // up to KMP_HW_LAST objects so that layers can be added later without
644  // reallocating any array.
645 
646  // Ordered array of the types in the topology
647  kmp_hw_t *types;
648 
649  // Keep quick topology ratios; for non-uniform topologies,
650  // this ratio holds the max number of itemAs per itemB,
651  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
652  int *ratio;
653 
654  // Storage containing the absolute number of each topology layer
655  int *count;
656 
657  // Storage containing the core types and the number of
658  // each core type for hybrid processors
659  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
660  int core_types_count[KMP_HW_MAX_NUM_CORE_TYPES];
661 
662  // The hardware threads array
663  // hw_threads is num_hw_threads long
664  // Each hw_thread's ids and sub_ids are depth deep
665  int num_hw_threads;
666  kmp_hw_thread_t *hw_threads;
667 
668  // Equivalence hash where the key is the hardware topology item
669  // and the value is the equivalent hardware topology type in the
670  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
671  // known equivalence for the topology type
672  kmp_hw_t equivalent[KMP_HW_LAST];
673 
674  // Flags describing the topology
675  flags_t flags;
676 
677  // Insert a new topology layer after allocation
678  void _insert_layer(kmp_hw_t type, const int *ids);
679 
680 #if KMP_GROUP_AFFINITY
681  // Insert topology information about Windows Processor groups
682  void _insert_windows_proc_groups();
683 #endif
684 
685  // Count each item & get the num x's per y
686  // e.g., get the number of cores and the number of threads per core
687  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
688  void _gather_enumeration_information();
689 
690  // Remove layers that don't add information to the topology.
691  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
692  void _remove_radix1_layers();
693 
694  // Find out if the topology is uniform
695  void _discover_uniformity();
696 
697  // Set all the sub_ids for each hardware thread
698  void _set_sub_ids();
699 
700  // Set global affinity variables describing the number of threads per
701  // core, the number of packages, the number of cores per package, and
702  // the number of cores.
703  void _set_globals();
704 
705  // Set the last level cache equivalent type
706  void _set_last_level_cache();
707 
708  // Increments the number of cores of type 'type'
709  void _increment_core_type(kmp_hw_core_type_t type) {
710  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
711  if (core_types[i] == KMP_HW_CORE_TYPE_UNKNOWN) {
712  core_types[i] = type;
713  core_types_count[i] = 1;
714  break;
715  } else if (core_types[i] == type) {
716  core_types_count[i]++;
717  break;
718  }
719  }
720  }
721 
722 public:
723  // Force use of allocate()/deallocate()
724  kmp_topology_t() = delete;
725  kmp_topology_t(const kmp_topology_t &t) = delete;
726  kmp_topology_t(kmp_topology_t &&t) = delete;
727  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
728  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
729 
730  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
731  static void deallocate(kmp_topology_t *);
732 
733  // Functions used in create_map() routines
734  kmp_hw_thread_t &at(int index) {
735  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
736  return hw_threads[index];
737  }
738  const kmp_hw_thread_t &at(int index) const {
739  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
740  return hw_threads[index];
741  }
742  int get_num_hw_threads() const { return num_hw_threads; }
743  void sort_ids() {
744  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
745  kmp_hw_thread_t::compare_ids);
746  }
747  // Check if the hardware ids are unique; return true
748  // if they are, false otherwise
749  bool check_ids() const;
750 
751  // Function to call after the create_map() routine
752  void canonicalize();
753  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
754 
755  // Functions used after canonicalize() called
756  bool filter_hw_subset();
757  bool is_close(int hwt1, int hwt2, int level) const;
758  bool is_uniform() const { return flags.uniform; }
759  // Tell whether a type is a valid type in the topology
760  // returns KMP_HW_UNKNOWN when there is no equivalent type
761  kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
762  // Set type1 = type2
763  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
764  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
765  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
766  kmp_hw_t real_type2 = equivalent[type2];
767  if (real_type2 == KMP_HW_UNKNOWN)
768  real_type2 = type2;
769  equivalent[type1] = real_type2;
770  // This loop is required since any of the types may have been set to
771  // be equivalent to type1. They all must be checked and reset to type2.
772  KMP_FOREACH_HW_TYPE(type) {
773  if (equivalent[type] == type1) {
774  equivalent[type] = real_type2;
775  }
776  }
777  }
778  // Calculate the number of objects at level1 per object at level2
779  // (e.g., the number of threads per core); see the worked example below
780  int calculate_ratio(int level1, int level2) const {
781  KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
782  KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
783  int r = 1;
784  for (int level = level1; level > level2; --level)
785  r *= ratio[level];
786  return r;
787  }
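 // For example, with types [package, core, thread] and ratio [4, 6, 2]
 // (see the ratio array comment above), calculate_ratio(2, 0) returns
 // 2 * 6 == 12 hardware threads per package and calculate_ratio(2, 1)
 // returns 2 threads per core.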
788  int get_ratio(int level) const {
789  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
790  return ratio[level];
791  }
792  int get_depth() const { return depth; }
793  kmp_hw_t get_type(int level) const {
794  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
795  return types[level];
796  }
797  int get_level(kmp_hw_t type) const {
798  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
799  int eq_type = equivalent[type];
800  if (eq_type == KMP_HW_UNKNOWN)
801  return -1;
802  for (int i = 0; i < depth; ++i)
803  if (types[i] == eq_type)
804  return i;
805  return -1;
806  }
807  int get_count(int level) const {
808  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
809  return count[level];
810  }
811 #if KMP_AFFINITY_SUPPORTED
812  void sort_compact() {
813  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
814  kmp_hw_thread_t::compare_compact);
815  }
816 #endif
817  void print(const char *env_var = "KMP_AFFINITY") const;
818  void dump() const;
819 };
820 extern kmp_topology_t *__kmp_topology;
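
// Illustrative sketch (hypothetical values, not how the runtime builds its
// map): a create_map()-style routine allocates the topology, fills in one
// kmp_hw_thread_t per OS processor, canonicalizes, and then queries it.
// Shown for a 1-package, 2-core, 2-threads-per-core machine:
//
//   kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
//   kmp_topology_t *topo = kmp_topology_t::allocate(/*nproc=*/4, /*ndepth=*/3, types);
//   for (int i = 0; i < 4; ++i) {
//     kmp_hw_thread_t &hwt = topo->at(i);
//     hwt.clear();
//     hwt.os_id = i;
//     hwt.ids[0] = 0;     // package id
//     hwt.ids[1] = i / 2; // core id within package
//     hwt.ids[2] = i % 2; // thread id within core
//   }
//   topo->canonicalize();                        // derive counts, ratios, globals
//   int core_level = topo->get_level(KMP_HW_CORE);
//   bool uniform = topo->is_uniform();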
821 
822 class kmp_hw_subset_t {
823 public:
824  struct item_t {
825  int num;
826  kmp_hw_t type;
827  int offset;
828  };
829 
830 private:
831  int depth;
832  int capacity;
833  item_t *items;
834  kmp_uint64 set;
835  bool absolute;
836  // The set must be able to handle up to KMP_HW_LAST number of layers
837  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
838  // Comparator used to sort the KMP_HW_SUBSET items into topology order.
839  // All unknown topology types end up at the beginning of the subset.
840  static int hw_subset_compare(const void *i1, const void *i2) {
841  kmp_hw_t type1 = ((const item_t *)i1)->type;
842  kmp_hw_t type2 = ((const item_t *)i2)->type;
843  int level1 = __kmp_topology->get_level(type1);
844  int level2 = __kmp_topology->get_level(type2);
845  return level1 - level2;
846  }
847 
848 public:
849  // Force use of allocate()/deallocate()
850  kmp_hw_subset_t() = delete;
851  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
852  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
853  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
854  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
855 
856  static kmp_hw_subset_t *allocate() {
857  int initial_capacity = 5;
858  kmp_hw_subset_t *retval =
859  (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
860  retval->depth = 0;
861  retval->capacity = initial_capacity;
862  retval->set = 0ull;
863  retval->absolute = false;
864  retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
865  return retval;
866  }
867  static void deallocate(kmp_hw_subset_t *subset) {
868  __kmp_free(subset->items);
869  __kmp_free(subset);
870  }
871  void set_absolute() { absolute = true; }
872  bool is_absolute() const { return absolute; }
873  void push_back(int num, kmp_hw_t type, int offset) {
874  if (depth == capacity - 1) {
875  capacity *= 2;
876  item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
877  for (int i = 0; i < depth; ++i)
878  new_items[i] = items[i];
879  __kmp_free(items);
880  items = new_items;
881  }
882  items[depth].num = num;
883  items[depth].type = type;
884  items[depth].offset = offset;
885  depth++;
886  set |= (1ull << type);
887  }
888  int get_depth() const { return depth; }
889  const item_t &at(int index) const {
890  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
891  return items[index];
892  }
893  item_t &at(int index) {
894  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
895  return items[index];
896  }
897  void remove(int index) {
898  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
899  set &= ~(1ull << items[index].type);
900  for (int j = index + 1; j < depth; ++j) {
901  items[j - 1] = items[j];
902  }
903  depth--;
904  }
905  void sort() {
906  KMP_DEBUG_ASSERT(__kmp_topology);
907  qsort(items, depth, sizeof(item_t), hw_subset_compare);
908  }
909  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
910  void dump() const {
911  printf("**********************\n");
912  printf("*** kmp_hw_subset: ***\n");
913  printf("* depth: %d\n", depth);
914  printf("* items:\n");
915  for (int i = 0; i < depth; ++i) {
916  printf("num: %d, type: %s, offset: %d\n", items[i].num,
917  __kmp_hw_get_keyword(items[i].type), items[i].offset);
918  }
919  printf("* set: 0x%llx\n", set);
920  printf("* absolute: %d\n", absolute);
921  printf("**********************\n");
922  }
923 };
924 extern kmp_hw_subset_t *__kmp_hw_subset;
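
// Illustrative sketch (hypothetical values): the KMP_HW_SUBSET parser builds
// one item per requested layer, and the items are later sorted into topology
// order before kmp_topology_t::filter_hw_subset() applies them. A subset of
// 2 sockets x 4 cores x 2 threads with no offsets would be assembled roughly
// like this:
//
//   kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
//   subset->push_back(2, KMP_HW_SOCKET, /*offset=*/0);
//   subset->push_back(4, KMP_HW_CORE, 0);
//   subset->push_back(2, KMP_HW_THREAD, 0);
//   subset->sort();      // requires __kmp_topology to be initialized
//   if (subset->specified(KMP_HW_CORE)) {
//     // ... e.g. honor the core-level restriction
//   }
//   subset->dump();
//   kmp_hw_subset_t::deallocate(subset);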
925 
926 /* A structure for holding machine-specific hierarchy info to be computed once
927  at init. This structure represents a mapping of threads to the actual machine
928  hierarchy, or to our best guess at what the hierarchy might be, for the
929  purpose of performing an efficient barrier. In the worst case, when there is
930  no machine hierarchy information, it produces a tree suitable for a barrier,
931  similar to the tree used in the hyper barrier. */
932 class hierarchy_info {
933 public:
934  /* Good default values for number of leaves and branching factor, given no
935  affinity information. Behaves a bit like hyper barrier. */
936  static const kmp_uint32 maxLeaves = 4;
937  static const kmp_uint32 minBranch = 4;
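 // Number of levels allocated in numPerLevel/skipPerLevel. Levels beyond the
 // machine depth are used to extend the tree when the machine is
 // oversubscribed (see resize()).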
943  kmp_uint32 maxLevels;
944 
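 // Depth of the hierarchy currently in use, i.e. the number of meaningful
 // entries in numPerLevel/skipPerLevel.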
949  kmp_uint32 depth;
950  kmp_uint32 base_num_threads;
951  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
952  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
953  // 2=initialization in progress
954  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
955 
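 // numPerLevel[i] is the branching factor at level i (level 0 being the
 // leaves); skipPerLevel[i] is the cumulative product of the levels below i,
 // i.e. the number of leaves covered by one node at level i.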
960  kmp_uint32 *numPerLevel;
961  kmp_uint32 *skipPerLevel;
962 
963  void deriveLevels() {
964  int hier_depth = __kmp_topology->get_depth();
965  for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
966  numPerLevel[level] = __kmp_topology->get_ratio(i);
967  }
968  }
969 
970  hierarchy_info()
971  : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
972 
973  void fini() {
974  if (!uninitialized && numPerLevel) {
975  __kmp_free(numPerLevel);
976  numPerLevel = NULL;
977  uninitialized = not_initialized;
978  }
979  }
980 
981  void init(int num_addrs) {
982  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
983  &uninitialized, not_initialized, initializing);
984  if (bool_result == 0) { // Wait for initialization
985  while (TCR_1(uninitialized) != initialized)
986  KMP_CPU_PAUSE();
987  return;
988  }
989  KMP_DEBUG_ASSERT(bool_result == 1);
990 
991  /* Explicitly initialize the data fields here to prevent use of dirty values
992  left over when the static library is re-initialized multiple times (e.g.,
993  when a non-OpenMP thread repeatedly launches/joins a thread that uses
994  OpenMP). */
995  depth = 1;
996  resizing = 0;
997  maxLevels = 7;
998  numPerLevel =
999  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1000  skipPerLevel = &(numPerLevel[maxLevels]);
1001  for (kmp_uint32 i = 0; i < maxLevels;
1002  ++i) { // init numPerLevel[*] to 1 item per level
1003  numPerLevel[i] = 1;
1004  skipPerLevel[i] = 1;
1005  }
1006 
1007  // Sort table by physical ID
1008  if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1009  deriveLevels();
1010  } else {
1011  numPerLevel[0] = maxLeaves;
1012  numPerLevel[1] = num_addrs / maxLeaves;
1013  if (num_addrs % maxLeaves)
1014  numPerLevel[1]++;
1015  }
1016 
1017  base_num_threads = num_addrs;
1018  for (int i = maxLevels - 1; i >= 0;
1019  --i) // count non-empty levels to get depth
1020  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1021  depth++;
1022 
1023  kmp_uint32 branch = minBranch;
1024  if (numPerLevel[0] == 1)
1025  branch = num_addrs / maxLeaves;
1026  if (branch < minBranch)
1027  branch = minBranch;
1028  for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1029  while (numPerLevel[d] > branch ||
1030  (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1031  if (numPerLevel[d] & 1)
1032  numPerLevel[d]++;
1033  numPerLevel[d] = numPerLevel[d] >> 1;
1034  if (numPerLevel[d + 1] == 1)
1035  depth++;
1036  numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1037  }
1038  if (numPerLevel[0] == 1) {
1039  branch = branch >> 1;
1040  if (branch < 4)
1041  branch = minBranch;
1042  }
1043  }
1044 
1045  for (kmp_uint32 i = 1; i < depth; ++i)
1046  skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1047  // Fill in hierarchy in the case of oversubscription
1048  for (kmp_uint32 i = depth; i < maxLevels; ++i)
1049  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1050 
1051  uninitialized = initialized; // One writer
1052  }
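 // Worked example for init(): with num_addrs == 8 and no topology
 // information, numPerLevel becomes [4, 2, 1, 1, 1, 1, 1], depth becomes 3,
 // and skipPerLevel becomes [1, 4, 8, 16, 32, 64, 128] (the tail entries are
 // the doubled oversubscription levels).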
1053 
1054  // Resize the hierarchy if nproc changes to something larger than before
1055  void resize(kmp_uint32 nproc) {
1056  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1057  while (bool_result == 0) { // someone else is trying to resize
1058  KMP_CPU_PAUSE();
1059  if (nproc <= base_num_threads) // happy with other thread's resize
1060  return;
1061  else // try to resize
1062  bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1063  }
1064  KMP_DEBUG_ASSERT(bool_result != 0);
1065  if (nproc <= base_num_threads)
1066  return; // happy with other thread's resize
1067 
1068  // Calculate new maxLevels
1069  kmp_uint32 old_sz = skipPerLevel[depth - 1];
1070  kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1071  // First see if old maxLevels is enough to contain new size
1072  for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1073  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1074  numPerLevel[i - 1] *= 2;
1075  old_sz *= 2;
1076  depth++;
1077  }
1078  if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1079  while (nproc > old_sz) {
1080  old_sz *= 2;
1081  incs++;
1082  depth++;
1083  }
1084  maxLevels += incs;
1085 
1086  // Resize arrays
1087  kmp_uint32 *old_numPerLevel = numPerLevel;
1088  kmp_uint32 *old_skipPerLevel = skipPerLevel;
1089  numPerLevel = skipPerLevel = NULL;
1090  numPerLevel =
1091  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1092  skipPerLevel = &(numPerLevel[maxLevels]);
1093 
1094  // Copy old elements from old arrays
1095  for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1096  // copy the old per-level values
1097  numPerLevel[i] = old_numPerLevel[i];
1098  skipPerLevel[i] = old_skipPerLevel[i];
1099  }
1100 
1101  // Init new elements in arrays to 1
1102  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1103  // init numPerLevel[*] to 1 item per level
1104  numPerLevel[i] = 1;
1105  skipPerLevel[i] = 1;
1106  }
1107 
1108  // Free old arrays
1109  __kmp_free(old_numPerLevel);
1110  }
1111 
1112  // Fill in oversubscription levels of hierarchy
1113  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1114  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1115 
1116  base_num_threads = nproc;
1117  resizing = 0; // One writer
1118  }
1119 };
1120 #endif // KMP_AFFINITY_H