1 /*
2  * kmp_affinity.h -- header for affinity management
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef KMP_AFFINITY_H
14 #define KMP_AFFINITY_H
15 
16 #include "kmp.h"
17 #include "kmp_os.h"
18 #include <limits>
19 
20 #if KMP_AFFINITY_SUPPORTED
21 #if KMP_USE_HWLOC
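// Affinity implementation backed by the hwloc library: an hwloc_cpuset_t is
// wrapped behind the generic KMPAffinity::Mask interface, and binding goes
// through hwloc_get_cpubind()/hwloc_set_cpubind().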
22 class KMPHwlocAffinity : public KMPAffinity {
23 public:
24  class Mask : public KMPAffinity::Mask {
25  hwloc_cpuset_t mask;
26 
27  public:
28  Mask() {
29  mask = hwloc_bitmap_alloc();
30  this->zero();
31  }
32  Mask(const Mask &other) = delete;
33  Mask &operator=(const Mask &other) = delete;
34  ~Mask() { hwloc_bitmap_free(mask); }
35  void set(int i) override { hwloc_bitmap_set(mask, i); }
36  bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
37  void clear(int i) override { hwloc_bitmap_clr(mask, i); }
38  void zero() override { hwloc_bitmap_zero(mask); }
39  bool empty() const override { return hwloc_bitmap_iszero(mask); }
40  void copy(const KMPAffinity::Mask *src) override {
41  const Mask *convert = static_cast<const Mask *>(src);
42  hwloc_bitmap_copy(mask, convert->mask);
43  }
44  void bitwise_and(const KMPAffinity::Mask *rhs) override {
45  const Mask *convert = static_cast<const Mask *>(rhs);
46  hwloc_bitmap_and(mask, mask, convert->mask);
47  }
48  void bitwise_or(const KMPAffinity::Mask *rhs) override {
49  const Mask *convert = static_cast<const Mask *>(rhs);
50  hwloc_bitmap_or(mask, mask, convert->mask);
51  }
52  void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
53  bool is_equal(const KMPAffinity::Mask *rhs) const override {
54  const Mask *convert = static_cast<const Mask *>(rhs);
55  return hwloc_bitmap_isequal(mask, convert->mask);
56  }
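// Iteration protocol: begin() yields the first set bit and next() the one
// after 'previous'; hwloc_bitmap_next() returns -1 once the bits are
// exhausted, so end() is simply -1. Callers loop with
//   for (int i = mask->begin(); i != mask->end(); i = mask->next(i)) ...
// which is the pattern the KMP_CPU_SET_ITERATE macro (used later in this
// file) follows.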
57  int begin() const override { return hwloc_bitmap_first(mask); }
58  int end() const override { return -1; }
59  int next(int previous) const override {
60  return hwloc_bitmap_next(mask, previous);
61  }
62  int get_system_affinity(bool abort_on_error) override {
63  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
64  "Illegal get affinity operation when not capable");
65  long retval =
66  hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
67  if (retval >= 0) {
68  return 0;
69  }
70  int error = errno;
71  if (abort_on_error) {
72  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
73  KMP_ERR(error), __kmp_msg_null);
74  }
75  return error;
76  }
77  int set_system_affinity(bool abort_on_error) const override {
78  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
79  "Illegal set affinity operation when not capable");
80  long retval =
81  hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
82  if (retval >= 0) {
83  return 0;
84  }
85  int error = errno;
86  if (abort_on_error) {
87  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
88  KMP_ERR(error), __kmp_msg_null);
89  }
90  return error;
91  }
92 #if KMP_OS_WINDOWS
93  int set_process_affinity(bool abort_on_error) const override {
94  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
95  "Illegal set process affinity operation when not capable");
96  int error = 0;
97  const hwloc_topology_support *support =
98  hwloc_topology_get_support(__kmp_hwloc_topology);
99  if (support->cpubind->set_proc_cpubind) {
100  int retval;
101  retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
102  HWLOC_CPUBIND_PROCESS);
103  if (retval >= 0)
104  return 0;
105  error = errno;
106  if (abort_on_error)
107  __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
108  KMP_ERR(error), __kmp_msg_null);
109  }
110  return error;
111  }
112 #endif
113  int get_proc_group() const override {
114  int group = -1;
115 #if KMP_OS_WINDOWS
116  if (__kmp_num_proc_groups == 1) {
117  return 1;
118  }
119  for (int i = 0; i < __kmp_num_proc_groups; i++) {
120  // On Windows, the long type is always 32 bits
121  unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
122  unsigned long second_32_bits =
123  hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
124  if (first_32_bits == 0 && second_32_bits == 0) {
125  continue;
126  }
127  if (group >= 0) {
128  return -1;
129  }
130  group = i;
131  }
132 #endif /* KMP_OS_WINDOWS */
133  return group;
134  }
135  };
136  void determine_capable(const char *var) override {
137  const hwloc_topology_support *topology_support;
138  if (__kmp_hwloc_topology == NULL) {
139  if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
140  __kmp_hwloc_error = TRUE;
141  if (__kmp_affinity.flags.verbose) {
142  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
143  }
144  }
145  if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
146  __kmp_hwloc_error = TRUE;
147  if (__kmp_affinity.flags.verbose) {
148  KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
149  }
150  }
151  }
152  topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
153  // Is the system capable of setting/getting this thread's affinity?
154  // Also, is topology discovery possible? (pu indicates ability to discover
155  // processing units). And finally, were there no errors when calling any
156  // hwloc_* API functions?
157  if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
158  topology_support->cpubind->get_thisthread_cpubind &&
159  topology_support->discovery->pu && !__kmp_hwloc_error) {
160  // enables affinity according to KMP_AFFINITY_CAPABLE() macro
161  KMP_AFFINITY_ENABLE(TRUE);
162  } else {
163  // indicate that hwloc didn't work and disable affinity
164  __kmp_hwloc_error = TRUE;
165  KMP_AFFINITY_DISABLE();
166  }
167  }
168  void bind_thread(int which) override {
169  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
170  "Illegal set affinity operation when not capable");
171  KMPAffinity::Mask *mask;
172  KMP_CPU_ALLOC_ON_STACK(mask);
173  KMP_CPU_ZERO(mask);
174  KMP_CPU_SET(which, mask);
175  __kmp_set_system_affinity(mask, TRUE);
176  KMP_CPU_FREE_FROM_STACK(mask);
177  }
178  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
179  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
180  KMPAffinity::Mask *allocate_mask_array(int num) override {
181  return new Mask[num];
182  }
183  void deallocate_mask_array(KMPAffinity::Mask *array) override {
184  Mask *hwloc_array = static_cast<Mask *>(array);
185  delete[] hwloc_array;
186  }
187  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
188  int index) override {
189  Mask *hwloc_array = static_cast<Mask *>(array);
190  return &(hwloc_array[index]);
191  }
192  api_type get_api_type() const override { return HWLOC; }
193 };
194 #endif /* KMP_USE_HWLOC */
195 
196 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY || \
197  KMP_OS_AIX
198 #if KMP_OS_LINUX
199 /* On some of the older OS's that we build on, these constants aren't present
200  in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
201  all systems of the same arch where they are defined, and they cannot
202  change; they are set in stone forever. */
203 #include <sys/syscall.h>
204 #if KMP_ARCH_X86 || KMP_ARCH_ARM
205 #ifndef __NR_sched_setaffinity
206 #define __NR_sched_setaffinity 241
207 #elif __NR_sched_setaffinity != 241
208 #error Wrong code for setaffinity system call.
209 #endif /* __NR_sched_setaffinity */
210 #ifndef __NR_sched_getaffinity
211 #define __NR_sched_getaffinity 242
212 #elif __NR_sched_getaffinity != 242
213 #error Wrong code for getaffinity system call.
214 #endif /* __NR_sched_getaffinity */
215 #elif KMP_ARCH_AARCH64
216 #ifndef __NR_sched_setaffinity
217 #define __NR_sched_setaffinity 122
218 #elif __NR_sched_setaffinity != 122
219 #error Wrong code for setaffinity system call.
220 #endif /* __NR_sched_setaffinity */
221 #ifndef __NR_sched_getaffinity
222 #define __NR_sched_getaffinity 123
223 #elif __NR_sched_getaffinity != 123
224 #error Wrong code for getaffinity system call.
225 #endif /* __NR_sched_getaffinity */
226 #elif KMP_ARCH_RISCV64
227 #ifndef __NR_sched_setaffinity
228 #define __NR_sched_setaffinity 122
229 #elif __NR_sched_setaffinity != 122
230 #error Wrong code for setaffinity system call.
231 #endif /* __NR_sched_setaffinity */
232 #ifndef __NR_sched_getaffinity
233 #define __NR_sched_getaffinity 123
234 #elif __NR_sched_getaffinity != 123
235 #error Wrong code for getaffinity system call.
236 #endif /* __NR_sched_getaffinity */
237 #elif KMP_ARCH_X86_64
238 #ifndef __NR_sched_setaffinity
239 #define __NR_sched_setaffinity 203
240 #elif __NR_sched_setaffinity != 203
241 #error Wrong code for setaffinity system call.
242 #endif /* __NR_sched_setaffinity */
243 #ifndef __NR_sched_getaffinity
244 #define __NR_sched_getaffinity 204
245 #elif __NR_sched_getaffinity != 204
246 #error Wrong code for getaffinity system call.
247 #endif /* __NR_sched_getaffinity */
248 #elif KMP_ARCH_PPC64
249 #ifndef __NR_sched_setaffinity
250 #define __NR_sched_setaffinity 222
251 #elif __NR_sched_setaffinity != 222
252 #error Wrong code for setaffinity system call.
253 #endif /* __NR_sched_setaffinity */
254 #ifndef __NR_sched_getaffinity
255 #define __NR_sched_getaffinity 223
256 #elif __NR_sched_getaffinity != 223
257 #error Wrong code for getaffinity system call.
258 #endif /* __NR_sched_getaffinity */
259 #elif KMP_ARCH_MIPS
260 #ifndef __NR_sched_setaffinity
261 #define __NR_sched_setaffinity 4239
262 #elif __NR_sched_setaffinity != 4239
263 #error Wrong code for setaffinity system call.
264 #endif /* __NR_sched_setaffinity */
265 #ifndef __NR_sched_getaffinity
266 #define __NR_sched_getaffinity 4240
267 #elif __NR_sched_getaffinity != 4240
268 #error Wrong code for getaffinity system call.
269 #endif /* __NR_sched_getaffinity */
270 #elif KMP_ARCH_MIPS64
271 #ifndef __NR_sched_setaffinity
272 #define __NR_sched_setaffinity 5195
273 #elif __NR_sched_setaffinity != 5195
274 #error Wrong code for setaffinity system call.
275 #endif /* __NR_sched_setaffinity */
276 #ifndef __NR_sched_getaffinity
277 #define __NR_sched_getaffinity 5196
278 #elif __NR_sched_getaffinity != 5196
279 #error Wrong code for getaffinity system call.
280 #endif /* __NR_sched_getaffinity */
281 #elif KMP_ARCH_LOONGARCH64
282 #ifndef __NR_sched_setaffinity
283 #define __NR_sched_setaffinity 122
284 #elif __NR_sched_setaffinity != 122
285 #error Wrong code for setaffinity system call.
286 #endif /* __NR_sched_setaffinity */
287 #ifndef __NR_sched_getaffinity
288 #define __NR_sched_getaffinity 123
289 #elif __NR_sched_getaffinity != 123
290 #error Wrong code for getaffinity system call.
291 #endif /* __NR_sched_getaffinity */
303 #elif KMP_ARCH_VE
304 #ifndef __NR_sched_setaffinity
305 #define __NR_sched_setaffinity 203
306 #elif __NR_sched_setaffinity != 203
307 #error Wrong code for setaffinity system call.
308 #endif /* __NR_sched_setaffinity */
309 #ifndef __NR_sched_getaffinity
310 #define __NR_sched_getaffinity 204
311 #elif __NR_sched_getaffinity != 204
312 #error Wrong code for getaffinity system call.
313 #endif /* __NR_sched_getaffinity */
314 #elif KMP_ARCH_S390X
315 #ifndef __NR_sched_setaffinity
316 #define __NR_sched_setaffinity 239
317 #elif __NR_sched_setaffinity != 239
318 #error Wrong code for setaffinity system call.
319 #endif /* __NR_sched_setaffinity */
320 #ifndef __NR_sched_getaffinity
321 #define __NR_sched_getaffinity 240
322 #elif __NR_sched_getaffinity != 240
323 #error Wrong code for getaffinity system call.
324 #endif /* __NR_sched_getaffinity */
325 #else
326 #error Unknown or unsupported architecture
327 #endif /* KMP_ARCH_* */
328 #elif KMP_OS_FREEBSD || KMP_OS_DRAGONFLY
329 #include <pthread.h>
330 #include <pthread_np.h>
331 #elif KMP_OS_NETBSD
332 #include <pthread.h>
333 #include <sched.h>
334 #elif KMP_OS_AIX
335 #include <sys/dr.h>
336 #include <sys/rset.h>
337 #define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
338 #define GET_NUMBER_SMT_SETS 0x0004
339 extern "C" int syssmt(int flags, int, int, int *);
340 #endif
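// Native affinity implementation: raw sched_{get,set}affinity syscalls on
// Linux, pthread_{get,set}affinity_np() on the BSDs, and bindprocessor()
// plus resource sets on AIX. The mask is a plain array of unsigned long
// words totalling __kmp_affin_mask_size bytes.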
341 class KMPNativeAffinity : public KMPAffinity {
342  class Mask : public KMPAffinity::Mask {
343  typedef unsigned long mask_t;
344  typedef decltype(__kmp_affin_mask_size) mask_size_type;
345  static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
346  static const mask_t ONE = 1;
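// Number of mask_t words that make up one affinity mask
// (__kmp_affin_mask_size is in bytes).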
347  mask_size_type get_num_mask_types() const {
348  return __kmp_affin_mask_size / sizeof(mask_t);
349  }
350 
351  public:
352  mask_t *mask;
353  Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
354  ~Mask() {
355  if (mask)
356  __kmp_free(mask);
357  }
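// CPU i lives in word i / BITS_PER_MASK_T at bit position i % BITS_PER_MASK_T.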
358  void set(int i) override {
359  mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
360  }
361  bool is_set(int i) const override {
362  return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
363  }
364  void clear(int i) override {
365  mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
366  }
367  void zero() override {
368  mask_size_type e = get_num_mask_types();
369  for (mask_size_type i = 0; i < e; ++i)
370  mask[i] = (mask_t)0;
371  }
372  bool empty() const override {
373  mask_size_type e = get_num_mask_types();
374  for (mask_size_type i = 0; i < e; ++i)
375  if (mask[i] != (mask_t)0)
376  return false;
377  return true;
378  }
379  void copy(const KMPAffinity::Mask *src) override {
380  const Mask *convert = static_cast<const Mask *>(src);
381  mask_size_type e = get_num_mask_types();
382  for (mask_size_type i = 0; i < e; ++i)
383  mask[i] = convert->mask[i];
384  }
385  void bitwise_and(const KMPAffinity::Mask *rhs) override {
386  const Mask *convert = static_cast<const Mask *>(rhs);
387  mask_size_type e = get_num_mask_types();
388  for (mask_size_type i = 0; i < e; ++i)
389  mask[i] &= convert->mask[i];
390  }
391  void bitwise_or(const KMPAffinity::Mask *rhs) override {
392  const Mask *convert = static_cast<const Mask *>(rhs);
393  mask_size_type e = get_num_mask_types();
394  for (mask_size_type i = 0; i < e; ++i)
395  mask[i] |= convert->mask[i];
396  }
397  void bitwise_not() override {
398  mask_size_type e = get_num_mask_types();
399  for (mask_size_type i = 0; i < e; ++i)
400  mask[i] = ~(mask[i]);
401  }
402  bool is_equal(const KMPAffinity::Mask *rhs) const override {
403  const Mask *convert = static_cast<const Mask *>(rhs);
404  mask_size_type e = get_num_mask_types();
405  for (mask_size_type i = 0; i < e; ++i)
406  if (mask[i] != convert->mask[i])
407  return false;
408  return true;
409  }
410  int begin() const override {
411  int retval = 0;
412  while (retval < end() && !is_set(retval))
413  ++retval;
414  return retval;
415  }
416  int end() const override {
417  int e;
418  __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
419  return e;
420  }
421  int next(int previous) const override {
422  int retval = previous + 1;
423  while (retval < end() && !is_set(retval))
424  ++retval;
425  return retval;
426  }
427 #if KMP_OS_AIX
428  // On AIX, we don't have a way to get the CPU(s) a thread is bound to.
429  // This routine is only used to get the full mask.
430  int get_system_affinity(bool abort_on_error) override {
431  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
432  "Illegal get affinity operation when not capable");
433 
434  (void)abort_on_error;
435 
436  // Set the mask with all CPUs that are available.
437  for (int i = 0; i < __kmp_xproc; ++i)
438  KMP_CPU_SET(i, this);
439  return 0;
440  }
441  int set_system_affinity(bool abort_on_error) const override {
442  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
444  "Illegal set affinity operation when not capable");
445 
446  int location;
447  int gtid = __kmp_entry_gtid();
448  int tid = thread_self();
449 
450  // Unbind the thread if it was bound to any processors before, so that it
451  // ends up bound only to the CPUs specified by the mask.
452  int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);
453 
454  // On AIX, the bindprocessor() system call can only bind the thread to a
455  // single CPU at a time, not to a whole set of CPUs.
456  KMP_CPU_SET_ITERATE(location, this) {
457  if (KMP_CPU_ISSET(location, this)) {
458  retval = bindprocessor(BINDTHREAD, tid, location);
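// errno 1 is EPERM: the process is likely attached to a resource set
// (rset) that excludes this CPU, so detach the rset and retry the bind.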
459  if (retval == -1 && errno == 1) {
460  rsid_t rsid;
461  rsethandle_t rsh;
462  // Put something in rsh to prevent compiler warning
463  // about uninitialized use
464  rsh = rs_alloc(RS_EMPTY);
465  rsid.at_pid = getpid();
466  if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
467  retval = ra_detachrset(R_PROCESS, rsid, 0);
468  retval = bindprocessor(BINDTHREAD, tid, location);
469  }
470  }
471  if (retval == 0) {
472  KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
473  "T#%d to cpu=%d.\n",
474  gtid, location));
475  continue;
476  }
477  int error = errno;
478  if (abort_on_error) {
479  __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
480  KMP_ERR(error), __kmp_msg_null);
481  KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
482  "T#%d to cpu=%d, errno=%d.\n",
483  gtid, location, error));
484  return error;
485  }
486  }
487  }
488  return 0;
489  }
490 #else // !KMP_OS_AIX
491  int get_system_affinity(bool abort_on_error) override {
492  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
493  "Illegal get affinity operation when not capable");
494 #if KMP_OS_LINUX
495  long retval =
496  syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
497 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
498  int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
499  reinterpret_cast<cpuset_t *>(mask));
500  int retval = (r == 0 ? 0 : -1);
501 #endif
502  if (retval >= 0) {
503  return 0;
504  }
505  int error = errno;
506  if (abort_on_error) {
507  __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
508  KMP_ERR(error), __kmp_msg_null);
509  }
510  return error;
511  }
512  int set_system_affinity(bool abort_on_error) const override {
513  KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
514  "Illegal set affinity operation when not capable");
515 #if KMP_OS_LINUX
516  long retval =
517  syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
518 #elif KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY
519  int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
520  reinterpret_cast<cpuset_t *>(mask));
521  int retval = (r == 0 ? 0 : -1);
522 #endif
523  if (retval >= 0) {
524  return 0;
525  }
526  int error = errno;
527  if (abort_on_error) {
528  __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
529  KMP_ERR(error), __kmp_msg_null);
530  }
531  return error;
532  }
533 #endif // KMP_OS_AIX
534  };
535  void determine_capable(const char *env_var) override {
536  __kmp_affinity_determine_capable(env_var);
537  }
538  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
539  KMPAffinity::Mask *allocate_mask() override {
540  KMPNativeAffinity::Mask *retval = new Mask();
541  return retval;
542  }
543  void deallocate_mask(KMPAffinity::Mask *m) override {
544  KMPNativeAffinity::Mask *native_mask =
545  static_cast<KMPNativeAffinity::Mask *>(m);
546  delete native_mask;
547  }
548  KMPAffinity::Mask *allocate_mask_array(int num) override {
549  return new Mask[num];
550  }
551  void deallocate_mask_array(KMPAffinity::Mask *array) override {
552  Mask *linux_array = static_cast<Mask *>(array);
553  delete[] linux_array;
554  }
555  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
556  int index) override {
557  Mask *linux_array = static_cast<Mask *>(array);
558  return &(linux_array[index]);
559  }
560  api_type get_api_type() const override { return NATIVE_OS; }
561 };
562 #endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY \
563  || KMP_OS_AIX */
564 
565 #if KMP_OS_WINDOWS
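// Native Windows affinity implementation. The mask stores one mask_t
// (ULONG_PTR) word per processor group; single-group systems use
// {Get,Set}ThreadAffinityMask(), while multi-group systems go through the
// {Get,Set}ThreadGroupAffinity() entry points.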
566 class KMPNativeAffinity : public KMPAffinity {
567  class Mask : public KMPAffinity::Mask {
568  typedef ULONG_PTR mask_t;
569  static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
570  mask_t *mask;
571 
572  public:
573  Mask() {
574  mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
575  }
576  ~Mask() {
577  if (mask)
578  __kmp_free(mask);
579  }
580  void set(int i) override {
581  mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
582  }
583  bool is_set(int i) const override {
584  return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
585  }
586  void clear(int i) override {
587  mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
588  }
589  void zero() override {
590  for (int i = 0; i < __kmp_num_proc_groups; ++i)
591  mask[i] = 0;
592  }
593  bool empty() const override {
594  for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
595  if (mask[i])
596  return false;
597  return true;
598  }
599  void copy(const KMPAffinity::Mask *src) override {
600  const Mask *convert = static_cast<const Mask *>(src);
601  for (int i = 0; i < __kmp_num_proc_groups; ++i)
602  mask[i] = convert->mask[i];
603  }
604  void bitwise_and(const KMPAffinity::Mask *rhs) override {
605  const Mask *convert = static_cast<const Mask *>(rhs);
606  for (int i = 0; i < __kmp_num_proc_groups; ++i)
607  mask[i] &= convert->mask[i];
608  }
609  void bitwise_or(const KMPAffinity::Mask *rhs) override {
610  const Mask *convert = static_cast<const Mask *>(rhs);
611  for (int i = 0; i < __kmp_num_proc_groups; ++i)
612  mask[i] |= convert->mask[i];
613  }
614  void bitwise_not() override {
615  for (int i = 0; i < __kmp_num_proc_groups; ++i)
616  mask[i] = ~(mask[i]);
617  }
618  bool is_equal(const KMPAffinity::Mask *rhs) const override {
619  const Mask *convert = static_cast<const Mask *>(rhs);
620  for (size_t i = 0; i < __kmp_num_proc_groups; ++i)
621  if (mask[i] != convert->mask[i])
622  return false;
623  return true;
624  }
625  int begin() const override {
626  int retval = 0;
627  while (retval < end() && !is_set(retval))
628  ++retval;
629  return retval;
630  }
631  int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
632  int next(int previous) const override {
633  int retval = previous + 1;
634  while (retval < end() && !is_set(retval))
635  ++retval;
636  return retval;
637  }
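// Process-wide affinity is only applied when the machine has a single
// processor group; with multiple groups the request is a no-op (returns 0).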
638  int set_process_affinity(bool abort_on_error) const override {
639  if (__kmp_num_proc_groups <= 1) {
640  if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
641  DWORD error = GetLastError();
642  if (abort_on_error) {
643  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
644  __kmp_msg_null);
645  }
646  return error;
647  }
648  }
649  return 0;
650  }
651  int set_system_affinity(bool abort_on_error) const override {
652  if (__kmp_num_proc_groups > 1) {
653  // Check for a valid mask.
654  GROUP_AFFINITY ga;
655  int group = get_proc_group();
656  if (group < 0) {
657  if (abort_on_error) {
658  KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
659  }
660  return -1;
661  }
662  // Transform the bit vector into a GROUP_AFFINITY struct
663  // and make the system call to set affinity.
664  ga.Group = group;
665  ga.Mask = mask[group];
666  ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;
667 
668  KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
669  if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
670  DWORD error = GetLastError();
671  if (abort_on_error) {
672  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
673  __kmp_msg_null);
674  }
675  return error;
676  }
677  } else {
678  if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
679  DWORD error = GetLastError();
680  if (abort_on_error) {
681  __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
682  __kmp_msg_null);
683  }
684  return error;
685  }
686  }
687  return 0;
688  }
689  int get_system_affinity(bool abort_on_error) override {
690  if (__kmp_num_proc_groups > 1) {
691  this->zero();
692  GROUP_AFFINITY ga;
693  KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
694  if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
695  DWORD error = GetLastError();
696  if (abort_on_error) {
697  __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
698  KMP_ERR(error), __kmp_msg_null);
699  }
700  return error;
701  }
702  if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
703  (ga.Mask == 0)) {
704  return -1;
705  }
706  mask[ga.Group] = ga.Mask;
707  } else {
708  mask_t newMask, sysMask, retval;
709  if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
710  DWORD error = GetLastError();
711  if (abort_on_error) {
712  __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
713  KMP_ERR(error), __kmp_msg_null);
714  }
715  return error;
716  }
717  retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
718  if (!retval) {
719  DWORD error = GetLastError();
720  if (abort_on_error) {
721  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
722  KMP_ERR(error), __kmp_msg_null);
723  }
724  return error;
725  }
726  newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
727  if (!newMask) {
728  DWORD error = GetLastError();
729  if (abort_on_error) {
730  __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
731  KMP_ERR(error), __kmp_msg_null);
732  }
733  }
734  *mask = retval;
735  }
736  return 0;
737  }
738  int get_proc_group() const override {
739  int group = -1;
740  if (__kmp_num_proc_groups == 1) {
741  return 1;
742  }
743  for (int i = 0; i < __kmp_num_proc_groups; i++) {
744  if (mask[i] == 0)
745  continue;
746  if (group >= 0)
747  return -1;
748  group = i;
749  }
750  return group;
751  }
752  };
753  void determine_capable(const char *env_var) override {
754  __kmp_affinity_determine_capable(env_var);
755  }
756  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
757  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
758  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
759  KMPAffinity::Mask *allocate_mask_array(int num) override {
760  return new Mask[num];
761  }
762  void deallocate_mask_array(KMPAffinity::Mask *array) override {
763  Mask *windows_array = static_cast<Mask *>(array);
764  delete[] windows_array;
765  }
766  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
767  int index) override {
768  Mask *windows_array = static_cast<Mask *>(array);
769  return &(windows_array[index]);
770  }
771  api_type get_api_type() const override { return NATIVE_OS; }
772 };
773 #endif /* KMP_OS_WINDOWS */
774 #endif /* KMP_AFFINITY_SUPPORTED */
775 
776 // Describe an attribute for a level in the machine topology
777 struct kmp_hw_attr_t {
778  int core_type : 8;
779  int core_eff : 8;
780  unsigned valid : 1;
781  unsigned reserved : 15;
782 
783  static const int UNKNOWN_CORE_EFF = -1;
784 
785  kmp_hw_attr_t()
786  : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
787  valid(0), reserved(0) {}
788  void set_core_type(kmp_hw_core_type_t type) {
789  valid = 1;
790  core_type = type;
791  }
792  void set_core_eff(int eff) {
793  valid = 1;
794  core_eff = eff;
795  }
796  kmp_hw_core_type_t get_core_type() const {
797  return (kmp_hw_core_type_t)core_type;
798  }
799  int get_core_eff() const { return core_eff; }
800  bool is_core_type_valid() const {
801  return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
802  }
803  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
804  operator bool() const { return valid; }
805  void clear() {
806  core_type = KMP_HW_CORE_TYPE_UNKNOWN;
807  core_eff = UNKNOWN_CORE_EFF;
808  valid = 0;
809  }
810  bool contains(const kmp_hw_attr_t &other) const {
811  if (!valid && !other.valid)
812  return true;
813  if (valid && other.valid) {
814  if (other.is_core_type_valid()) {
815  if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
816  return false;
817  }
818  if (other.is_core_eff_valid()) {
819  if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
820  return false;
821  }
822  return true;
823  }
824  return false;
825  }
826 #if KMP_AFFINITY_SUPPORTED
827  bool contains(const kmp_affinity_attrs_t &attr) const {
828  if (!valid && !attr.valid)
829  return true;
830  if (valid && attr.valid) {
831  if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
832  return (is_core_type_valid() &&
833  (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
834  if (attr.core_eff != UNKNOWN_CORE_EFF)
835  return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
836  return true;
837  }
838  return false;
839  }
840 #endif // KMP_AFFINITY_SUPPORTED
841  bool operator==(const kmp_hw_attr_t &rhs) const {
842  return (rhs.valid == valid && rhs.core_eff == core_eff &&
843  rhs.core_type == core_type);
844  }
845  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
846 };
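// A small illustrative sketch of the containment rule above (hypothetical
// values, not part of the runtime): a.contains(b) holds when every field
// that is valid in 'b' matches the corresponding field in 'a'.
//   kmp_hw_attr_t machine, wanted;
//   machine.set_core_eff(1);            // machine core: efficiency 1
//   wanted.set_core_eff(1);             // request: any core with efficiency 1
//   bool ok = machine.contains(wanted); // true: efficiencies match,
//                                       // core type unconstrained in 'wanted'
//   wanted.set_core_eff(0);
//   ok = machine.contains(wanted);      // false: efficiencies differ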
847 
848 #if KMP_AFFINITY_SUPPORTED
849 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
850 #endif
851 
852 class kmp_hw_thread_t {
853 public:
854  static const int UNKNOWN_ID = -1;
855  static const int MULTIPLE_ID = -2;
856  static int compare_ids(const void *a, const void *b);
857  static int compare_compact(const void *a, const void *b);
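// ids[i] is this thread's hardware id at topology level i (in the order of
// kmp_topology_t::types[]); UNKNOWN_ID means the id could not be determined,
// and MULTIPLE_ID that more than one hardware id applies at that level.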
858  int ids[KMP_HW_LAST];
859  int sub_ids[KMP_HW_LAST];
860  bool leader;
861  int os_id;
862  int original_idx;
863  kmp_hw_attr_t attrs;
864 
865  void print() const;
866  void clear() {
867  for (int i = 0; i < (int)KMP_HW_LAST; ++i)
868  ids[i] = UNKNOWN_ID;
869  leader = false;
870  attrs.clear();
871  }
872 };
873 
874 class kmp_topology_t {
875 
876  struct flags_t {
877  int uniform : 1;
878  int reserved : 31;
879  };
880 
881  int depth;
882 
883  // The following arrays are all 'depth' long. They are allocated to hold up
884  // to KMP_HW_LAST objects so that layers can be added without reallocating
885  // any array.
886 
887  // Ordered array of the types in the topology
888  kmp_hw_t *types;
889 
890  // Keep quick topology ratios; for non-uniform topologies, this ratio holds
891  // the max number of itemAs per itemB,
892  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
893  int *ratio;
894 
895  // Storage containing the absolute number of each topology layer
896  int *count;
897 
898  // The number of core efficiencies. This is only useful for hybrid
899  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
900  int num_core_efficiencies;
901  int num_core_types;
902  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];
903 
904  // The hardware threads array
905  // hw_threads is num_hw_threads long
906  // Each hw_thread's ids and sub_ids are depth deep
907  int num_hw_threads;
908  kmp_hw_thread_t *hw_threads;
909 
910  // Equivalence map (indexed by hardware topology type) where the value is
911  // the equivalent hardware topology type in the types[] array; if the value
912  // is KMP_HW_UNKNOWN, then there is no known equivalence for the topology
913  // type
914  kmp_hw_t equivalent[KMP_HW_LAST];
915 
916  // Flags describing the topology
917  flags_t flags;
918 
919  // Compact value used during sort_compact()
920  int compact;
921 
922 #if KMP_GROUP_AFFINITY
923  // Insert topology information about Windows Processor groups
924  void _insert_windows_proc_groups();
925 #endif
926 
927  // Count each item & get the num x's per y
928  // e.g., get the number of cores and the number of threads per core
929  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
930  void _gather_enumeration_information();
931 
932  // Remove layers that don't add information to the topology.
933  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
934  void _remove_radix1_layers();
935 
936  // Find out if the topology is uniform
937  void _discover_uniformity();
938 
939  // Set all the sub_ids for each hardware thread
940  void _set_sub_ids();
941 
942  // Set global affinity variables describing the number of threads per
943  // core, the number of packages, the number of cores per package, and
944  // the number of cores.
945  void _set_globals();
946 
947  // Set the last level cache equivalent type
948  void _set_last_level_cache();
949 
950  // Return the number of cores with a particular attribute, 'attr'.
951  // If 'find_all' is true, then find all cores on the machine, otherwise find
952  // all cores per the layer 'above'
953  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
954  bool find_all = false) const;
955 
956 public:
957  // Force use of allocate()/deallocate()
958  kmp_topology_t() = delete;
959  kmp_topology_t(const kmp_topology_t &t) = delete;
960  kmp_topology_t(kmp_topology_t &&t) = delete;
961  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
962  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;
963 
964  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
965  static void deallocate(kmp_topology_t *);
966 
967  // Functions used in create_map() routines
968  kmp_hw_thread_t &at(int index) {
969  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
970  return hw_threads[index];
971  }
972  const kmp_hw_thread_t &at(int index) const {
973  KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
974  return hw_threads[index];
975  }
976  int get_num_hw_threads() const { return num_hw_threads; }
977  void sort_ids() {
978  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
979  kmp_hw_thread_t::compare_ids);
980  }
981 
982  // Insert a new topology layer after allocation
983  void insert_layer(kmp_hw_t type, const int *ids);
984 
985  // Check if the hardware ids are unique; return true if they are,
986  // false otherwise
987  bool check_ids() const;
988 
989  // Function to call after the create_map() routine
990  void canonicalize();
991  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);
992 
993 // Functions used after canonicalize() called
994 
995 #if KMP_AFFINITY_SUPPORTED
996  // Set the granularity for affinity settings
997  void set_granularity(kmp_affinity_t &stgs) const;
998  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
999  bool restrict_to_mask(const kmp_affin_mask_t *mask);
1000  bool filter_hw_subset();
1001 #endif
1002  bool is_uniform() const { return flags.uniform; }
1003  // Return the equivalent type for 'type' in the topology;
1004  // returns KMP_HW_UNKNOWN when there is no known equivalent type
1005  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
1006  if (type == KMP_HW_UNKNOWN)
1007  return KMP_HW_UNKNOWN;
1008  return equivalent[type];
1009  }
1010  // Set type1 = type2
1011  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
1012  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
1013  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
1014  kmp_hw_t real_type2 = equivalent[type2];
1015  if (real_type2 == KMP_HW_UNKNOWN)
1016  real_type2 = type2;
1017  equivalent[type1] = real_type2;
1018  // This loop is required since any of the types may have been set to
1019  // be equivalent to type1. They all must be checked and reset to type2.
1020  KMP_FOREACH_HW_TYPE(type) {
1021  if (equivalent[type] == type1) {
1022  equivalent[type] = real_type2;
1023  }
1024  }
1025  }
1026  // Calculate the number of objects at level1 per object at level2
1027  // (e.g., the number of threads per core)
1028  int calculate_ratio(int level1, int level2) const {
1029  KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
1030  KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
1031  int r = 1;
1032  for (int level = level1; level > level2; --level)
1033  r *= ratio[level];
1034  return r;
1035  }
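// Example: with types = { SOCKET, CORE, THREAD } and ratio = { 2, 8, 2 }
// (2 sockets, 8 cores per socket, 2 threads per core),
// calculate_ratio(/*level1=*/2, /*level2=*/0) == 2 * 8 == 16 hardware
// threads per socket.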
1036  int get_ratio(int level) const {
1037  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1038  return ratio[level];
1039  }
1040  int get_depth() const { return depth; }
1041  kmp_hw_t get_type(int level) const {
1042  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1043  return types[level];
1044  }
1045  int get_level(kmp_hw_t type) const {
1046  KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
1047  int eq_type = equivalent[type];
1048  if (eq_type == KMP_HW_UNKNOWN)
1049  return -1;
1050  for (int i = 0; i < depth; ++i)
1051  if (types[i] == eq_type)
1052  return i;
1053  return -1;
1054  }
1055  int get_count(int level) const {
1056  KMP_DEBUG_ASSERT(level >= 0 && level < depth);
1057  return count[level];
1058  }
1059  // Return the total number of cores with attribute 'attr'
1060  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
1061  return _get_ncores_with_attr(attr, -1, true);
1062  }
1063  // Return the number of cores with attribute
1064  // 'attr' per topology level 'above'
1065  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
1066  return _get_ncores_with_attr(attr, above, false);
1067  }
1068 
1069 #if KMP_AFFINITY_SUPPORTED
1070  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
1071  void sort_compact(kmp_affinity_t &affinity) {
1072  compact = affinity.compact;
1073  qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
1074  kmp_hw_thread_t::compare_compact);
1075  }
1076 #endif
1077  void print(const char *env_var = "KMP_AFFINITY") const;
1078  void dump() const;
1079 };
1080 extern kmp_topology_t *__kmp_topology;
1081 
1082 class kmp_hw_subset_t {
1083  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;
1084 
1085 public:
1086  // Describe a machine topology item in KMP_HW_SUBSET
1087  struct item_t {
1088  kmp_hw_t type;
1089  int num_attrs;
1090  int num[MAX_ATTRS];
1091  int offset[MAX_ATTRS];
1092  kmp_hw_attr_t attr[MAX_ATTRS];
1093  };
1094  // Put parentheses around max to avoid accidental use of the Windows max macro.
1095  const static int USE_ALL = (std::numeric_limits<int>::max)();
1096 
1097 private:
1098  int depth;
1099  int capacity;
1100  item_t *items;
1101  kmp_uint64 set;
1102  bool absolute;
1103  // The set must be able to handle up to KMP_HW_LAST number of layers
1104  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
1105  // Comparator for sorting the KMP_HW_SUBSET items into topology order.
1106  // All unknown topology types sort to the beginning of the subset.
1107  static int hw_subset_compare(const void *i1, const void *i2) {
1108  kmp_hw_t type1 = ((const item_t *)i1)->type;
1109  kmp_hw_t type2 = ((const item_t *)i2)->type;
1110  int level1 = __kmp_topology->get_level(type1);
1111  int level2 = __kmp_topology->get_level(type2);
1112  return level1 - level2;
1113  }
1114 
1115 public:
1116  // Force use of allocate()/deallocate()
1117  kmp_hw_subset_t() = delete;
1118  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
1119  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
1120  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
1121  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;
1122 
1123  static kmp_hw_subset_t *allocate() {
1124  int initial_capacity = 5;
1125  kmp_hw_subset_t *retval =
1126  (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
1127  retval->depth = 0;
1128  retval->capacity = initial_capacity;
1129  retval->set = 0ull;
1130  retval->absolute = false;
1131  retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
1132  return retval;
1133  }
1134  static void deallocate(kmp_hw_subset_t *subset) {
1135  __kmp_free(subset->items);
1136  __kmp_free(subset);
1137  }
1138  void set_absolute() { absolute = true; }
1139  bool is_absolute() const { return absolute; }
1140  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
1141  for (int i = 0; i < depth; ++i) {
1142  // Found an existing item for this layer type
1143  // Add the num, offset, and attr to this item
1144  if (items[i].type == type) {
1145  int idx = items[i].num_attrs++;
1146  if ((size_t)idx >= MAX_ATTRS)
1147  return;
1148  items[i].num[idx] = num;
1149  items[i].offset[idx] = offset;
1150  items[i].attr[idx] = attr;
1151  return;
1152  }
1153  }
1154  if (depth == capacity - 1) {
1155  capacity *= 2;
1156  item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
1157  for (int i = 0; i < depth; ++i)
1158  new_items[i] = items[i];
1159  __kmp_free(items);
1160  items = new_items;
1161  }
1162  items[depth].num_attrs = 1;
1163  items[depth].type = type;
1164  items[depth].num[0] = num;
1165  items[depth].offset[0] = offset;
1166  items[depth].attr[0] = attr;
1167  depth++;
1168  set |= (1ull << type);
1169  }
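// For example, KMP_HW_SUBSET=2s,4c,2t is stored as three items:
//   { type=KMP_HW_SOCKET, num[0]=2 }, { type=KMP_HW_CORE, num[0]=4 },
//   { type=KMP_HW_THREAD, num[0]=2 }, each with offset[0]=0 and an empty
// attribute (the parsing itself happens outside this header).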
1170  int get_depth() const { return depth; }
1171  const item_t &at(int index) const {
1172  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1173  return items[index];
1174  }
1175  item_t &at(int index) {
1176  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1177  return items[index];
1178  }
1179  void remove(int index) {
1180  KMP_DEBUG_ASSERT(index >= 0 && index < depth);
1181  set &= ~(1ull << items[index].type);
1182  for (int j = index + 1; j < depth; ++j) {
1183  items[j - 1] = items[j];
1184  }
1185  depth--;
1186  }
1187  void sort() {
1188  KMP_DEBUG_ASSERT(__kmp_topology);
1189  qsort(items, depth, sizeof(item_t), hw_subset_compare);
1190  }
1191  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
1192 
1193  // Canonicalize the KMP_HW_SUBSET value if it is not an absolute subset.
1194  // This means putting each of {sockets, cores, threads} in the topology if
1195  // they are not specified:
1196  // e.g., 1s,2c => 1s,2c,*t | 2c,1t => *s,2c,1t | 1t => *s,*c,1t | etc.
1197  // e.g., 3module => *s,3module,*c,*t
1198  // By doing this, the runtime assumes users who fiddle with KMP_HW_SUBSET
1199  // are expecting the traditional sockets/cores/threads topology. For newer
1200  // hardware, there can be intervening layers like dies/tiles/modules
1201  // (usually corresponding to a cache level). So when a user asks for
1202  // 1s,6c,2t and the topology is really 1s,2modules,4cores,2threads, the user
1203  // should get 12 hardware threads across 6 cores and effectively ignore the
1204  // module layer.
1205  void canonicalize(const kmp_topology_t *top) {
1206  // Layers to target for KMP_HW_SUBSET canonicalization
1207  kmp_hw_t targeted[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};
1208 
1209  // Do not target-layer-canonicalize absolute KMP_HW_SUBSETS
1210  if (is_absolute())
1211  return;
1212 
1213  // Do not target-layer-canonicalize KMP_HW_SUBSETS when the
1214  // topology doesn't have these layers
1215  for (kmp_hw_t type : targeted)
1216  if (top->get_level(type) == KMP_HW_UNKNOWN)
1217  return;
1218 
1219  // Put targeted layers in topology if they do not exist
1220  for (kmp_hw_t type : targeted) {
1221  bool found = false;
1222  for (int i = 0; i < get_depth(); ++i) {
1223  if (top->get_equivalent_type(items[i].type) == type) {
1224  found = true;
1225  break;
1226  }
1227  }
1228  if (!found) {
1229  push_back(USE_ALL, type, 0, kmp_hw_attr_t{});
1230  }
1231  }
1232  sort();
1233  // Set as an absolute topology that only targets the targeted layers
1234  set_absolute();
1235  }
1236  void dump() const {
1237  printf("**********************\n");
1238  printf("*** kmp_hw_subset: ***\n");
1239  printf("* depth: %d\n", depth);
1240  printf("* items:\n");
1241  for (int i = 0; i < depth; ++i) {
1242  printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
1243  for (int j = 0; j < items[i].num_attrs; ++j) {
1244  printf(" num: %d, offset: %d, attr: ", items[i].num[j],
1245  items[i].offset[j]);
1246  if (!items[i].attr[j]) {
1247  printf(" (none)\n");
1248  } else {
1249  printf(
1250  " core_type = %s, core_eff = %d\n",
1251  __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
1252  items[i].attr[j].get_core_eff());
1253  }
1254  }
1255  }
1256  printf("* set: 0x%llx\n", set);
1257  printf("* absolute: %d\n", absolute);
1258  printf("**********************\n");
1259  }
1260 };
1261 extern kmp_hw_subset_t *__kmp_hw_subset;
1262 
1263 /* A structure for holding machine-specific hierarchy info to be computed once
1264  at init. This structure represents a mapping of threads to the actual machine
1265  hierarchy, or to our best guess at what the hierarchy might be, for the
1266  purpose of performing an efficient barrier. In the worst case, when there is
1267  no machine hierarchy information, it produces a tree suitable for a barrier,
1268  similar to the tree used in the hyper barrier. */
1269 class hierarchy_info {
1270 public:
1271  /* Good default values for number of leaves and branching factor, given no
1272  affinity information. Behaves a bit like hyper barrier. */
1273  static const kmp_uint32 maxLeaves = 4;
1274  static const kmp_uint32 minBranch = 4;
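// Maximum number of levels the hierarchy arrays can currently hold. When the
// machine is oversubscribed, resize() appends levels, doubling the capacity
// of the hierarchy with each extra level.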
1280  kmp_uint32 maxLevels;
1281 
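// Depth of the hierarchy actually in use: the number of meaningful entries
// in numPerLevel[]/skipPerLevel[] along the path from the leaves to the root.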
1286  kmp_uint32 depth;
1287  kmp_uint32 base_num_threads = 0;
1288  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
1289  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
1290  // 2=initialization in progress
1291  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
1292 
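// Level 0 corresponds to the leaves (individual threads). numPerLevel[d] is
// the number of level-d nodes under one node at level d+1; skipPerLevel[d]
// is the product numPerLevel[0] * ... * numPerLevel[d-1], i.e. the number of
// leaves spanned by one node at level d. skipPerLevel is carved out of the
// same allocation as numPerLevel (see init()).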
1297  kmp_uint32 *numPerLevel = nullptr;
1298  kmp_uint32 *skipPerLevel = nullptr;
1299 
1300  void deriveLevels() {
1301  int hier_depth = __kmp_topology->get_depth();
1302  for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
1303  numPerLevel[level] = __kmp_topology->get_ratio(i);
1304  }
1305  }
1306 
1307  hierarchy_info()
1308  : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
1309 
1310  void fini() {
1311  if (!uninitialized && numPerLevel) {
1312  __kmp_free(numPerLevel);
1313  numPerLevel = NULL;
1314  uninitialized = not_initialized;
1315  }
1316  }
1317 
1318  void init(int num_addrs) {
1319  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
1320  &uninitialized, not_initialized, initializing);
1321  if (bool_result == 0) { // Wait for initialization
1322  while (TCR_1(uninitialized) != initialized)
1323  KMP_CPU_PAUSE();
1324  return;
1325  }
1326  KMP_DEBUG_ASSERT(bool_result == 1);
1327 
1328  /* Explicitly initialize the data fields here to prevent use of dirty values
1329  observed when the static library is re-initialized multiple times (e.g.,
1330  when a non-OpenMP thread repeatedly launches/joins a thread that uses
1331  OpenMP). */
1332  depth = 1;
1333  resizing = 0;
1334  maxLevels = 7;
1335  numPerLevel =
1336  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1337  skipPerLevel = &(numPerLevel[maxLevels]);
1338  for (kmp_uint32 i = 0; i < maxLevels;
1339  ++i) { // init numPerLevel[*] to 1 item per level
1340  numPerLevel[i] = 1;
1341  skipPerLevel[i] = 1;
1342  }
1343 
1344  // Derive the hierarchy shape from the machine topology when one is available
1345  if (__kmp_topology && __kmp_topology->get_depth() > 0) {
1346  deriveLevels();
1347  } else {
1348  numPerLevel[0] = maxLeaves;
1349  numPerLevel[1] = num_addrs / maxLeaves;
1350  if (num_addrs % maxLeaves)
1351  numPerLevel[1]++;
1352  }
1353 
1354  base_num_threads = num_addrs;
1355  for (int i = maxLevels - 1; i >= 0;
1356  --i) // count non-empty levels to get depth
1357  if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
1358  depth++;
1359 
1360  kmp_uint32 branch = minBranch;
1361  if (numPerLevel[0] == 1)
1362  branch = num_addrs / maxLeaves;
1363  if (branch < minBranch)
1364  branch = minBranch;
1365  for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
1366  while (numPerLevel[d] > branch ||
1367  (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
1368  if (numPerLevel[d] & 1)
1369  numPerLevel[d]++;
1370  numPerLevel[d] = numPerLevel[d] >> 1;
1371  if (numPerLevel[d + 1] == 1)
1372  depth++;
1373  numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
1374  }
1375  if (numPerLevel[0] == 1) {
1376  branch = branch >> 1;
1377  if (branch < 4)
1378  branch = minBranch;
1379  }
1380  }
1381 
1382  for (kmp_uint32 i = 1; i < depth; ++i)
1383  skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
1384  // Fill in hierarchy in the case of oversubscription
1385  for (kmp_uint32 i = depth; i < maxLevels; ++i)
1386  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1387 
1388  uninitialized = initialized; // One writer
1389  }
1390 
1391  // Resize the hierarchy if nproc changes to something larger than before
1392  void resize(kmp_uint32 nproc) {
1393  kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1394  while (bool_result == 0) { // someone else is trying to resize
1395  KMP_CPU_PAUSE();
1396  if (nproc <= base_num_threads) // happy with other thread's resize
1397  return;
1398  else // try to resize
1399  bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
1400  }
1401  KMP_DEBUG_ASSERT(bool_result != 0);
1402  if (nproc <= base_num_threads)
1403  return; // happy with other thread's resize
1404 
1405  // Calculate new maxLevels
1406  kmp_uint32 old_sz = skipPerLevel[depth - 1];
1407  kmp_uint32 incs = 0, old_maxLevels = maxLevels;
1408  // First see if old maxLevels is enough to contain new size
1409  for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
1410  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1411  numPerLevel[i - 1] *= 2;
1412  old_sz *= 2;
1413  depth++;
1414  }
1415  if (nproc > old_sz) { // Not enough space, need to expand hierarchy
1416  while (nproc > old_sz) {
1417  old_sz *= 2;
1418  incs++;
1419  depth++;
1420  }
1421  maxLevels += incs;
1422 
1423  // Resize arrays
1424  kmp_uint32 *old_numPerLevel = numPerLevel;
1425  kmp_uint32 *old_skipPerLevel = skipPerLevel;
1426  numPerLevel = skipPerLevel = NULL;
1427  numPerLevel =
1428  (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
1429  skipPerLevel = &(numPerLevel[maxLevels]);
1430 
1431  // Copy old elements from old arrays
1432  for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
1433  // init numPerLevel[*] to 1 item per level
1434  numPerLevel[i] = old_numPerLevel[i];
1435  skipPerLevel[i] = old_skipPerLevel[i];
1436  }
1437 
1438  // Init new elements in arrays to 1
1439  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
1440  // init numPerLevel[*] to 1 item per level
1441  numPerLevel[i] = 1;
1442  skipPerLevel[i] = 1;
1443  }
1444 
1445  // Free old arrays
1446  __kmp_free(old_numPerLevel);
1447  }
1448 
1449  // Fill in oversubscription levels of hierarchy
1450  for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
1451  skipPerLevel[i] = 2 * skipPerLevel[i - 1];
1452 
1453  base_num_threads = nproc;
1454  resizing = 0; // One writer
1455  }
1456 };
1457 #endif // KMP_AFFINITY_H