LLVM OpenMP* Runtime Library
kmp_runtime.cpp
/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#include "kmp_utils.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#if OMPD_SUPPORT
#include "ompd-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#ifndef KMP_USE_SHM
// Windows and WASI do not need these include files as they don't use shared
// memory.
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
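// Note (descriptive, not from the original source): this is the size in bytes
// of the small shared-memory segment used for inter-process library
// registration on platforms that define KMP_USE_SHM.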
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
#if ENABLE_LIBOMPTARGET
static void __kmp_target_init(void);
#endif // ENABLE_LIBOMPTARGET
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                               int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);

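// Descriptive note: build a thread-private nested-nthreads list for a thread
// that supplied its own list via th_set_nested_nth. Slots 0..level are
// zero-filled, the thread's requested values are copied in above that, and
// the caller takes ownership of the returned allocation.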
static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
                                                        int level) {
  kmp_nested_nthreads_t *new_nested_nth =
      (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
          sizeof(kmp_nested_nthreads_t));
  int new_size = level + thr->th.th_set_nested_nth_sz;
  new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
  for (int i = 0; i < level + 1; ++i)
    new_nested_nth->nth[i] = 0;
  for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
    new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
  new_nested_nth->size = new_nested_nth->used = new_size;
  return new_nested_nth;
}

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
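/* Lookup order, cheapest first: (1) a compiler thread-local variable when
   __kmp_gtid_mode >= 3 (KMP_TDATA_GTID builds), (2) keyed thread-specific
   storage when __kmp_gtid_mode >= 2, (3) a scan of __kmp_threads that matches
   a local stack address against each registered thread's stack extent. */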
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        // __kmp_gtid_get_specific can return negative value because this
        // function can be called by thread destructor. However, before the
        // thread destructor is called, the value of the corresponding
        // thread-specific data will be reset to NULL.
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
                         __kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */

  /* if we haven't been assigned a gtid, then return the code */
  if (i < 0)
    return i;

  // other_threads[i] can be nullptr at this point because the corresponding
  // thread could already have been destroyed; this can happen when the
  // function is called from the library shutdown path.
  if (!TCR_SYNC_PTR(other_threads[i]))
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}
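
/* Illustrative note (not from the original source): runtime entry points
   typically obtain their gtid through a wrapper around the function above
   (e.g. __kmp_entry_gtid() in kmp.h), so a foreign thread that touches the
   runtime for the first time is registered as a new root here rather than
   failing the lookup. */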

/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    }
  }

  /* No point in checking ubermaster threads since they use refinement and
   * cannot overlap */
  gtid = __kmp_gtid_from_thread(th);
  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
    KA_TRACE(10,
             ("__kmp_check_stack_overlap: performing extensive checking\n"));
    if (stack_beg == NULL) {
      stack_end = (char *)th->th.th_info.ds.ds_stackbase;
      stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
    }

    for (f = 0; f < __kmp_threads_capacity; f++) {
      kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);

      if (f_th && f_th != th) {
        char *other_stack_end =
            (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
        char *other_stack_beg =
            other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
        if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
            (stack_end > other_stack_beg && stack_end < other_stack_end)) {

          /* Print the other stack values before the abort */
          if (__kmp_storage_map)
            __kmp_print_storage_map_gtid(
                -1, other_stack_beg, other_stack_end,
                (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
                "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));

          __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
                      __kmp_msg_null);
        }
      }
    }
  }
  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
}

/* ------------------------------------------------------------------------ */

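// Descriptive note: park the calling thread in a yield loop; used on abort
// paths where a thread must not return into runtime state being torn down.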
void __kmp_infinite_loop(void) {
  static int done = FALSE;

  while (!done) {
    KMP_YIELD(TRUE);
  }
}

#define MAX_MESSAGE 512

void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
                                  char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  va_start(ap, format);
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
               p2, (unsigned long)size, format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
#if KMP_PRINT_DATA_PLACEMENT
  int node;
  if (gtid >= 0) {
    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
      if (__kmp_storage_map_verbose) {
        node = __kmp_get_host_node(p1);
        if (node < 0) /* doesn't work, so don't try this next time */
          __kmp_storage_map_verbose = FALSE;
        else {
          char *last;
          int lastNode;
          int localProc = __kmp_get_cpu_from_gtid(gtid);

          const int page_size = KMP_GET_PAGE_SIZE();

          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
          if (localProc >= 0)
            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
                                 localProc >> 1);
          else
            __kmp_printf_no_lock("  GTID %d\n", gtid);
#if KMP_USE_PRCTL
          /* The more elaborate format is disabled for now because of the prctl
           * hanging bug. */
          do {
            last = (char *)p1;
            lastNode = node;
            /* This loop collates adjacent pages with the same host node. */
            do {
              p1 = (char *)p1 + page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

#if KMP_OS_WINDOWS
  // Let other threads know of abnormal termination and prevent deadlock
  // if abort happened during library initialization or shutdown
  __kmp_global.g.g_abort = SIGABRT;

  /* On Windows* OS by default abort() causes pop-up error box, which stalls
     nightly testing. Unfortunately, we cannot reliably suppress pop-up error
     boxes. _set_abort_behavior() works well, but this function is not
     available in VS7 (this is not problem for DLL, but it is a problem for
     static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
     help, at least in some versions of MS C RTL.

     It seems following sequence is the only way to simulate abort() and
     avoid pop-up error box. */
  raise(SIGABRT);
  _exit(3); // Just in case, if signal ignored, exit anyway.
#else
  __kmp_unregister_library();
  abort();
#endif

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */

static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}

static void __kmp_init_allocator() {
  __kmp_init_memkind();
  __kmp_init_target_mem();
}
static void __kmp_fini_allocator() {
  __kmp_fini_target_mem();
  __kmp_fini_memkind();
}

/* ------------------------------------------------------------------------ */

#if ENABLE_LIBOMPTARGET
static void __kmp_init_omptarget() {
  __kmp_init_target_task();
}
#endif

/* ------------------------------------------------------------------------ */

#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    // lpReserved == NULL when FreeLibrary() is called,
    // lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

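/* The deo/dxo pair below implements the ordered-section handshake: the team
   keeps a ticket in t_ordered.dt.t_value; __kmp_parallel_deo spins until the
   ticket equals the calling thread's tid, and __kmp_parallel_dxo advances it
   to (tid + 1) % t_nproc, releasing the next thread in tid order. */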
/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      // Only report metadata by primary thread of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nthreads is the number of threads requested for the team
 * returns 1 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
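/* The caps below are applied in order: the dynamic-mode adjustment (load
   balance, thread limit, or random), then KMP_ALL_THREADS /
   KMP_DEVICE_THREAD_LIMIT (__kmp_max_nth), then the OMP_THREAD_LIMIT
   contention-group limit, and finally the capacity of the __kmp_threads
   array itself. */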
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
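      // Pick a pseudo-random team size in the range [1, set_nthreads].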
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG

  if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
    __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
                 this_thr->th.th_nt_msg);
  }
  return new_nthreads;
}

/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked on
   that earlier within the forkjoin critical section. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid,
                                    int fork_teams_workers) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the primary thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

  /* make sure we are not the optimized hot team */
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
  if (!use_hot_team) {

    /* install the primary thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    // Do not partition the places list for teams construct workers who
    // haven't actually been forked to do real work yet. This partitioning
    // will take place in the parallel region nested within the teams construct.
    if (!fork_teams_workers) {
      __kmp_partition_places(team);
    }
#endif

    if (team->t.t_nproc > 1 &&
        __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
      team->t.b->update_num_threads(team->t.t_nproc);
      __kmp_add_threads_to_team(team, team->t.t_nproc);
    }
  }

  // Take care of primary thread's task state
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    if (use_hot_team) {
      KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
      KA_TRACE(
          20,
          ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
           "%p, new task_team %p / team %p\n",
           __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
           team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
           team));

      // Store primary thread's current task state on new team
      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
                       master_th->th.th_task_state);

      // Restore primary thread's task state to hot team's state
      // by using thread 1's task state
      if (team->t.t_nproc > 1) {
        KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
                         team->t.t_threads[1]->th.th_task_state == 1);
        KMP_CHECK_UPDATE(master_th->th.th_task_state,
                         team->t.t_threads[1]->th.th_task_state);
      } else {
        master_th->th.th_task_state = 0;
      }
    } else {
      // Store primary thread's current task_state on new team
      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
                       master_th->th.th_task_state);
      // Are not using hot team, so set task state to 0.
      master_th->th.th_task_state = 0;
    }
  }

  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get primary thread's values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // by the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
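
// Descriptive note: the pair is used bracket-style around a region --
// propagateFPControl() snapshots the primary thread's x87/MXCSR state into
// the team at fork, and updateHWFPControl() restores any values the region
// changed at join; on other architectures both compile away to no-ops.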

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of the
   single primary thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

  // OpenMP 6.0 12.1.2 requires the num_threads 'strict' modifier to also have
  // effect when parallel execution is disabled by a corresponding if clause
  // attached to the parallel directive.
  if (this_thr->th.th_nt_strict && this_thr->th.th_set_nproc > 1)
    __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
                 this_thr->th.th_nt_msg);
  // Reset num_threads for next parallel region
  this_thr->th.th_set_nproc = 0;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO: increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team = __kmp_allocate_team(
          this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
          ompt_parallel_data,
#endif
          proc_bind, &this_thr->th.th_current_task->td_icvs, 0, NULL);
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;

      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));

      /* TODO the above breaks the requirement that if we run out of resources,
         then we can still guarantee that serialized teams are ok, since we may
         need to allocate a new one */
    } else {
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
           global_tid, serial_team));
    }

    /* we have to initialize this serial team */
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
    serial_team->t.t_ident = loc;
    serial_team->t.t_serialized = 1;
    serial_team->t.t_nproc = 1;
    serial_team->t.t_parent = this_thr->th.th_team;
    if (this_thr->th.th_team->t.t_nested_nth)
      serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
    else
      serial_team->t.t_nested_nth = &__kmp_nested_nth;
    // Save previous team's task state on serial team structure
    serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
    this_thr->th.th_team = serial_team;
    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;

    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
                  this_thr->th.th_current_task));
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
    this_thr->th.th_current_task->td_flags.executing = 0;

    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);

    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
       implicit task for each serialized task represented by
       team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);

    // Thread value exists in the nested nthreads array for the next nested
    // level
    kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
    if (this_thr->th.th_team->t.t_nested_nth)
      nested_nth = this_thr->th.th_team->t.t_nested_nth;
    if (nested_nth->used && (level + 1 < nested_nth->used)) {
      this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
    }

    if (__kmp_nested_proc_bind.used &&
        (level + 1 < __kmp_nested_proc_bind.used)) {
      this_thr->th.th_current_task->td_icvs.proc_bind =
          __kmp_nested_proc_bind.bind_types[level + 1];
    }

#if USE_DEBUGGER
    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;
    this_thr->th.th_task_team = NULL;
    this_thr->th.th_task_state = 0;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level

    kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
    if (serial_team->t.t_nested_nth)
      nested_nth = serial_team->t.t_nested_nth;
    if (nested_nth->used && (level + 1 < nested_nth->used)) {
      this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
    }

    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    /* allocate/push task team stack */
    __kmp_push_task_team_node(this_thr, serial_team);

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swapped

    /* OMPT implicit task begin */
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}

// Test if this fork is for a team closely nested in a teams construct
static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
                                          microtask_t microtask, int level,
                                          int teams_level, kmp_va_list ap) {
  return (master_th->th.th_teams_microtask && ap &&
          microtask != (microtask_t)__kmp_teams_master && level == teams_level);
}

// Test if this fork is for the teams construct, i.e. to form the outer league
// of teams
static inline bool __kmp_is_entering_teams(int active_level, int level,
                                           int teams_level, kmp_va_list ap) {
  return ((ap == NULL && active_level == 0) ||
          (ap && teams_level > 0 && teams_level == level));
}
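// Reading of the predicate above (descriptive, not from the original source):
// either the fork carries no microtask varargs and no parallel region is
// active yet, or the fork's nesting level coincides with an enclosing teams
// construct that is already in effect.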
1467 
1468 // AC: This is start of parallel that is nested inside teams construct.
1469 // The team is actual (hot), all workers are ready at the fork barrier.
1470 // No lock needed to initialize the team a bit, then free workers.
1471 static inline int
1472 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1473  kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1474  enum fork_context_e call_context, microtask_t microtask,
1475  launch_t invoker, int master_set_numthreads, int level,
1476 #if OMPT_SUPPORT
1477  ompt_data_t ompt_parallel_data, void *return_address,
1478 #endif
1479  kmp_va_list ap) {
1480  void **argv;
1481  int i;
1482 
1483  parent_team->t.t_ident = loc;
1484  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1485  parent_team->t.t_argc = argc;
1486  argv = (void **)parent_team->t.t_argv;
1487  for (i = argc - 1; i >= 0; --i) {
1488  *argv++ = va_arg(kmp_va_deref(ap), void *);
1489  }
1490  // Increment our nested depth levels, but not increase the serialization
1491  if (parent_team == master_th->th.th_serial_team) {
1492  // AC: we are in serialized parallel
1493  __kmpc_serialized_parallel(loc, gtid);
1494  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1495 
1496  if (call_context == fork_context_gnu) {
1497  // AC: need to decrement t_serialized for enquiry functions to work
1498  // correctly, will restore at join time
1499  parent_team->t.t_serialized--;
1500  return TRUE;
1501  }
1502 
1503 #if OMPD_SUPPORT
1504  parent_team->t.t_pkfn = microtask;
1505 #endif
1506 
1507 #if OMPT_SUPPORT
1508  void *dummy;
1509  void **exit_frame_p;
1510  ompt_data_t *implicit_task_data;
1511  ompt_lw_taskteam_t lw_taskteam;
1512 
1513  if (ompt_enabled.enabled) {
1514  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1515  &ompt_parallel_data, return_address);
1516  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1517 
1518  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1519  // Don't use lw_taskteam after linking. Content was swapped.
1520 
1521  /* OMPT implicit task begin */
1522  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1523  if (ompt_enabled.ompt_callback_implicit_task) {
1524  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1525  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1526  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1527  1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1528  }
1529 
1530  /* OMPT state */
1531  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1532  } else {
1533  exit_frame_p = &dummy;
1534  }
1535 #endif
1536 
1537  // AC: need to decrement t_serialized for enquiry functions to work
1538  // correctly, will restore at join time
1539  parent_team->t.t_serialized--;
1540 
1541  {
1542  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1543  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1544  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1545 #if OMPT_SUPPORT
1546  ,
1547  exit_frame_p
1548 #endif
1549  );
1550  }
1551 
1552 #if OMPT_SUPPORT
1553  if (ompt_enabled.enabled) {
1554  *exit_frame_p = NULL;
1555  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1556  if (ompt_enabled.ompt_callback_implicit_task) {
1557  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1558  ompt_scope_end, NULL, implicit_task_data, 1,
1559  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1560  }
1561  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1562  __ompt_lw_taskteam_unlink(master_th);
1563  if (ompt_enabled.ompt_callback_parallel_end) {
1564  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1565  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1566  OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1567  }
1568  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1569  }
1570 #endif
1571  return TRUE;
1572  }
1573 
1574  parent_team->t.t_pkfn = microtask;
1575  parent_team->t.t_invoke = invoker;
1576  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1577  parent_team->t.t_active_level++;
1578  parent_team->t.t_level++;
1579  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1580 
1581  // If the number of threads allocated to the team is less than the thread
1582  // limit, update the thread limit here. th_teams_size.nth is specific to
1583  // this team nested in a teams construct; the team is fully created, and
1584  // we're about to do the actual fork. Best to do this here so that the
1585  // subsequent uses below and in the join have the correct value.
1586  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1587 
1588 #if OMPT_SUPPORT
1589  if (ompt_enabled.enabled) {
1590  ompt_lw_taskteam_t lw_taskteam;
1591  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1592  return_address);
1593  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1594  }
1595 #endif
1596 
1597  /* Change number of threads in the team if requested */
1598  if (master_set_numthreads) { // The parallel region has a num_threads clause
1599  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1600  // AC: we can only reduce the number of threads dynamically, not increase it
1601  kmp_info_t **other_threads = parent_team->t.t_threads;
1602  // NOTE: if using distributed barrier, we need to run this code block
1603  // even when the team size appears not to have changed from the max.
1604  int old_proc = master_th->th.th_teams_size.nth;
1605  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1606  __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1607  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1608  }
1609  parent_team->t.t_nproc = master_set_numthreads;
1610  for (i = 0; i < master_set_numthreads; ++i) {
1611  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1612  }
1613  }
1614  // Keep extra threads hot in the team for possible next parallels
1615  master_th->th.th_set_nproc = 0;
1616  }
1617 
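  // A hedged sketch (added commentary, not part of the LLVM source) of the
  // reduction path above in user code: the inner parallel region may shrink
  // the team below thread_limit, but it can never grow past th_teams_size.nth.
  //
  //   #pragma omp teams num_teams(1) thread_limit(8)
  //   #pragma omp parallel num_threads(4) // team reduced from 8 to 4 threads
  //   { work(); }                         // work() is a hypothetical function
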
1618 #if USE_DEBUGGER
1619  if (__kmp_debugging) { // Let debugger override number of threads.
1620  int nth = __kmp_omp_num_threads(loc);
1621  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1622  master_set_numthreads = nth;
1623  }
1624  }
1625 #endif
1626 
1627  // Figure out the proc_bind policy for the nested parallel within teams
1628  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1629  // proc_bind_default means don't update
1630  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1631  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1632  proc_bind = proc_bind_false;
1633  } else {
1634  // No proc_bind clause specified; use current proc-bind-var
1635  if (proc_bind == proc_bind_default) {
1636  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1637  }
1638  /* else: The proc_bind policy was specified explicitly on parallel clause.
1639  This overrides proc-bind-var for this parallel region, but does not
1640  change proc-bind-var. */
1641  // Figure out the value of proc-bind-var for the child threads.
1642  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1643  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1644  master_th->th.th_current_task->td_icvs.proc_bind)) {
1645  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1646  }
1647  }
1648  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1649  // Need to change the bind-var ICV to correct value for each implicit task
1650  if (proc_bind_icv != proc_bind_default &&
1651  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1652  kmp_info_t **other_threads = parent_team->t.t_threads;
1653  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1654  other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1655  }
1656  }
1657  // Reset for next parallel region
1658  master_th->th.th_set_proc_bind = proc_bind_default;
1659 
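  // Illustrative sketch (added commentary, not part of the LLVM source): per
  // the logic above, a proc_bind clause overrides proc-bind-var for this
  // region only, and proc-bind-var itself is left unchanged.
  //
  //   #pragma omp teams num_teams(2)
  //   #pragma omp parallel proc_bind(close) // binding for this fork only
  //   { work(); }                           // (hypothetical user function)
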
1660 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1661  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1662  KMP_ITT_DEBUG) &&
1663  __kmp_forkjoin_frames_mode == 3 &&
1664  parent_team->t.t_active_level == 1 // only report frames at level 1
1665  && master_th->th.th_teams_size.nteams == 1) {
1666  kmp_uint64 tmp_time = __itt_get_timestamp();
1667  master_th->th.th_frame_time = tmp_time;
1668  parent_team->t.t_region_time = tmp_time;
1669  }
1670  if (__itt_stack_caller_create_ptr) {
1671  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1672  // create new stack stitching id before entering fork barrier
1673  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1674  }
1675 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1676 #if KMP_AFFINITY_SUPPORTED
1677  __kmp_partition_places(parent_team);
1678 #endif
1679 
1680  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1681  "master_th=%p, gtid=%d\n",
1682  root, parent_team, master_th, gtid));
1683  __kmp_internal_fork(loc, gtid, parent_team);
1684  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1685  "master_th=%p, gtid=%d\n",
1686  root, parent_team, master_th, gtid));
1687 
1688  if (call_context == fork_context_gnu)
1689  return TRUE;
1690 
1691  /* Invoke microtask for PRIMARY thread */
1692  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1693  parent_team->t.t_id, parent_team->t.t_pkfn));
1694 
1695  if (!parent_team->t.t_invoke(gtid)) {
1696  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1697  }
1698  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1699  parent_team->t.t_id, parent_team->t.t_pkfn));
1700  KMP_MB(); /* Flush all pending memory write invalidates. */
1701 
1702  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1703 
1704  return TRUE;
1705 }
1706 
1707 // Create a serialized parallel region
1708 static inline int
1709 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1710  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1711  kmp_info_t *master_th, kmp_team_t *parent_team,
1712 #if OMPT_SUPPORT
1713  ompt_data_t *ompt_parallel_data, void **return_address,
1714  ompt_data_t **parent_task_data,
1715 #endif
1716  kmp_va_list ap) {
1717  kmp_team_t *team;
1718  int i;
1719  void **argv;
1720 
1721 /* josh todo: hypothetical question: what do we do for OS X*? */
1722 #if KMP_OS_LINUX && \
1723  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1724  SimpleVLA<void *> args(argc);
1725 #else
1726  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1727 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1728  KMP_ARCH_AARCH64) */
1729 
1730  KA_TRACE(
1731  20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1732 
1733  __kmpc_serialized_parallel(loc, gtid);
1734 
1735 #if OMPD_SUPPORT
1736  master_th->th.th_serial_team->t.t_pkfn = microtask;
1737 #endif
1738 
1739  if (call_context == fork_context_intel) {
1740  /* TODO this sucks, use the compiler itself to pass args! :) */
1741  master_th->th.th_serial_team->t.t_ident = loc;
1742  if (!ap) {
1743  // revert change made in __kmpc_serialized_parallel()
1744  master_th->th.th_serial_team->t.t_level--;
1745 // Get args from parent team for teams construct
1746 
1747 #if OMPT_SUPPORT
1748  void *dummy;
1749  void **exit_frame_p;
1750  ompt_task_info_t *task_info;
1751  ompt_lw_taskteam_t lw_taskteam;
1752 
1753  if (ompt_enabled.enabled) {
1754  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1755  ompt_parallel_data, *return_address);
1756 
1757  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1758  // Don't use lw_taskteam after linking. Content was swapped.
1759  task_info = OMPT_CUR_TASK_INFO(master_th);
1760  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1761  if (ompt_enabled.ompt_callback_implicit_task) {
1762  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1763  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1764  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1765  &(task_info->task_data), 1,
1766  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1767  }
1768 
1769  /* OMPT state */
1770  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1771  } else {
1772  exit_frame_p = &dummy;
1773  }
1774 #endif
1775 
1776  {
1777  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1778  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1779  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1780 #if OMPT_SUPPORT
1781  ,
1782  exit_frame_p
1783 #endif
1784  );
1785  }
1786 
1787 #if OMPT_SUPPORT
1788  if (ompt_enabled.enabled) {
1789  *exit_frame_p = NULL;
1790  if (ompt_enabled.ompt_callback_implicit_task) {
1791  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1792  ompt_scope_end, NULL, &(task_info->task_data), 1,
1793  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1794  }
1795  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1796  __ompt_lw_taskteam_unlink(master_th);
1797  if (ompt_enabled.ompt_callback_parallel_end) {
1798  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1799  ompt_parallel_data, *parent_task_data,
1800  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1801  }
1802  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1803  }
1804 #endif
1805  } else if (microtask == (microtask_t)__kmp_teams_master) {
1806  KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1807  team = master_th->th.th_team;
1808  // team->t.t_pkfn = microtask;
1809  team->t.t_invoke = invoker;
1810  __kmp_alloc_argv_entries(argc, team, TRUE);
1811  team->t.t_argc = argc;
1812  argv = (void **)team->t.t_argv;
1813  for (i = argc - 1; i >= 0; --i)
1814  *argv++ = va_arg(kmp_va_deref(ap), void *);
1815  // AC: revert change made in __kmpc_serialized_parallel()
1816  // because initial code in teams should have level=0
1817  team->t.t_level--;
1818  // AC: call special invoker for outer "parallel" of teams construct
1819  invoker(gtid);
1820 #if OMPT_SUPPORT
1821  if (ompt_enabled.enabled) {
1822  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1823  if (ompt_enabled.ompt_callback_implicit_task) {
1824  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1825  ompt_scope_end, NULL, &(task_info->task_data), 0,
1826  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1827  }
1828  if (ompt_enabled.ompt_callback_parallel_end) {
1829  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1830  ompt_parallel_data, *parent_task_data,
1831  OMPT_INVOKER(call_context) | ompt_parallel_league,
1832  *return_address);
1833  }
1834  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1835  }
1836 #endif
1837  } else {
1838  argv = args;
1839  for (i = argc - 1; i >= 0; --i)
1840  *argv++ = va_arg(kmp_va_deref(ap), void *);
1841  KMP_MB();
1842 
1843 #if OMPT_SUPPORT
1844  void *dummy;
1845  void **exit_frame_p;
1846  ompt_task_info_t *task_info;
1847  ompt_lw_taskteam_t lw_taskteam;
1848  ompt_data_t *implicit_task_data;
1849 
1850  if (ompt_enabled.enabled) {
1851  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1852  ompt_parallel_data, *return_address);
1853  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1854  // Don't use lw_taskteam after linking. Content was swapped.
1855  task_info = OMPT_CUR_TASK_INFO(master_th);
1856  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1857 
1858  /* OMPT implicit task begin */
1859  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1860  if (ompt_enabled.ompt_callback_implicit_task) {
1861  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1862  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1863  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1864  ompt_task_implicit);
1865  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1866  }
1867 
1868  /* OMPT state */
1869  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1870  } else {
1871  exit_frame_p = &dummy;
1872  }
1873 #endif
1874 
1875  {
1876  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1877  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1878  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1879 #if OMPT_SUPPORT
1880  ,
1881  exit_frame_p
1882 #endif
1883  );
1884  }
1885 
1886 #if OMPT_SUPPORT
1887  if (ompt_enabled.enabled) {
1888  *exit_frame_p = NULL;
1889  if (ompt_enabled.ompt_callback_implicit_task) {
1890  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1891  ompt_scope_end, NULL, &(task_info->task_data), 1,
1892  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1893  }
1894 
1895  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1896  __ompt_lw_taskteam_unlink(master_th);
1897  if (ompt_enabled.ompt_callback_parallel_end) {
1898  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1899  ompt_parallel_data, *parent_task_data,
1900  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1901  }
1902  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1903  }
1904 #endif
1905  }
1906  } else if (call_context == fork_context_gnu) {
1907 #if OMPT_SUPPORT
1908  if (ompt_enabled.enabled) {
1909  ompt_lw_taskteam_t lwt;
1910  __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1911  *return_address);
1912 
1913  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1914  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1915  }
1916 // Don't use lw_taskteam after linking. Content was swapped.
1917 #endif
1918 
1919  // we were called from GNU native code
1920  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1921  return FALSE;
1922  } else {
1923  KMP_ASSERT2(call_context < fork_context_last,
1924  "__kmp_serial_fork_call: unknown fork_context parameter");
1925  }
1926 
1927  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1928  KMP_MB();
1929  return FALSE;
1930 }
1931 
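// A minimal sketch (added commentary, not part of the LLVM source): user
// code whose inner region is serialized, and which is therefore dispatched
// to __kmp_serial_fork_call() above by __kmp_fork_call() below.
//
//   #include <omp.h>
//   void example(void) {
//     omp_set_max_active_levels(1); // allow one active level of parallelism
//   #pragma omp parallel            // active; runs in parallel
//   #pragma omp parallel            // serialized: executes with one thread
//     { /* ... */ }
//   }
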
1932 /* most of the work for a fork */
1933 /* return true if we really went parallel, false if serialized */
1934 int __kmp_fork_call(ident_t *loc, int gtid,
1935  enum fork_context_e call_context, // Intel, GNU, ...
1936  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1937  kmp_va_list ap) {
1938  void **argv;
1939  int i;
1940  int master_tid;
1941  int master_this_cons;
1942  kmp_team_t *team;
1943  kmp_team_t *parent_team;
1944  kmp_info_t *master_th;
1945  kmp_root_t *root;
1946  int nthreads;
1947  int master_active;
1948  int master_set_numthreads;
1949  int task_thread_limit = 0;
1950  int level;
1951  int active_level;
1952  int teams_level;
1953  kmp_hot_team_ptr_t **p_hot_teams;
1954  { // KMP_TIME_BLOCK
1955  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1956  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1957 
1958  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1959  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1960  /* Some systems prefer the stack for the root thread(s) to start with */
1961  /* some gap from the parent stack to prevent false sharing. */
1962  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1963  /* These 2 lines below are so this does not get optimized out */
1964  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1965  __kmp_stkpadding += (short)((kmp_int64)dummy);
1966  }
1967 
1968  /* initialize if needed */
1969  KMP_DEBUG_ASSERT(
1970  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1971  if (!TCR_4(__kmp_init_parallel))
1972  __kmp_parallel_initialize();
1973  __kmp_resume_if_soft_paused();
1974 
1975  /* setup current data */
1976  // AC: potentially unsafe, not in sync with library shutdown,
1977  // __kmp_threads can be freed
1978  master_th = __kmp_threads[gtid];
1979 
1980  parent_team = master_th->th.th_team;
1981  master_tid = master_th->th.th_info.ds.ds_tid;
1982  master_this_cons = master_th->th.th_local.this_construct;
1983  root = master_th->th.th_root;
1984  master_active = root->r.r_active;
1985  master_set_numthreads = master_th->th.th_set_nproc;
1986  task_thread_limit =
1987  master_th->th.th_current_task->td_icvs.task_thread_limit;
1988 
1989 #if OMPT_SUPPORT
1990  ompt_data_t ompt_parallel_data = ompt_data_none;
1991  ompt_data_t *parent_task_data = NULL;
1992  ompt_frame_t *ompt_frame = NULL;
1993  void *return_address = NULL;
1994 
1995  if (ompt_enabled.enabled) {
1996  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1997  NULL, NULL);
1998  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1999  }
2000 #endif
2001 
2002  // Assign affinity to root thread if it hasn't happened yet
2003  __kmp_assign_root_init_mask();
2004 
2005  // Nested level will be an index in the nested nthreads array
2006  level = parent_team->t.t_level;
2007  // used to launch non-serial teams even if nesting is not allowed
2008  active_level = parent_team->t.t_active_level;
2009  // needed to check nesting inside the teams
2010  teams_level = master_th->th.th_teams_level;
2011  p_hot_teams = &master_th->th.th_hot_teams;
2012  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2013  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2014  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2015  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2016  // the hot team is either the actual one or not needed (when active_level > 0)
2017  (*p_hot_teams)[0].hot_team_nth = 1;
2018  }
2019 
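  // Hedged note (added commentary): hot teams keep worker threads alive
  // between parallel regions so that a subsequent fork can reuse them. Based
  // on the __kmp_hot_teams_max_level setting used above, the nesting depth
  // kept hot is presumably controlled by the KMP_HOT_TEAMS_MAX_LEVEL
  // environment variable, e.g.:
  //
  //   KMP_HOT_TEAMS_MAX_LEVEL=2 ./a.out  // keep teams hot two levels deep
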
2020 #if OMPT_SUPPORT
2021  if (ompt_enabled.enabled) {
2022  if (ompt_enabled.ompt_callback_parallel_begin) {
2023  int team_size = master_set_numthreads
2024  ? master_set_numthreads
2025  : get__nproc_2(parent_team, master_tid);
2026  int flags = OMPT_INVOKER(call_context) |
2027  ((microtask == (microtask_t)__kmp_teams_master)
2028  ? ompt_parallel_league
2029  : ompt_parallel_team);
2030  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2031  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2032  return_address);
2033  }
2034  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2035  }
2036 #endif
2037 
2038  master_th->th.th_ident = loc;
2039 
2040  // Parallel closely nested in teams construct:
2041  if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2042  return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2043  call_context, microtask, invoker,
2044  master_set_numthreads, level,
2045 #if OMPT_SUPPORT
2046  ompt_parallel_data, return_address,
2047 #endif
2048  ap);
2049  } // End parallel closely nested in teams construct
2050 
2051  // Need this to happen before we determine the number of threads, not while
2052  // we are allocating the team
2053  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2054 
2055  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2056 
2057  // Determine the number of threads
2058  int enter_teams =
2059  __kmp_is_entering_teams(active_level, level, teams_level, ap);
2060  if ((!enter_teams &&
2061  (parent_team->t.t_active_level >=
2062  master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2063  (__kmp_library == library_serial)) {
2064  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2065  nthreads = 1;
2066  } else {
2067  nthreads = master_set_numthreads
2068  ? master_set_numthreads
2069  // TODO: get nproc directly from current task
2070  : get__nproc_2(parent_team, master_tid);
2071  // Use the thread_limit set for the current target task if it exists; else
2072  // go with the deduced nthreads
2073  nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2074  ? task_thread_limit
2075  : nthreads;
2076  // Check if we need to take the forkjoin lock (no need for a serialized
2077  // parallel region outside of a teams construct).
2078  if (nthreads > 1) {
2079  /* determine how many new threads we can use */
2080  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2081  /* AC: If we execute teams from parallel region (on host), then teams
2082  should be created but each can only have 1 thread if nesting is
2083  disabled. If teams called from serial region, then teams and their
2084  threads should be created regardless of the nesting setting. */
2085  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2086  nthreads, enter_teams);
2087  if (nthreads == 1) {
2088  // Free the lock for single-thread execution here; for multi-thread
2089  // execution it will be freed later, after the team of threads has been
2090  // created and initialized
2091  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2092  }
2093  }
2094  }
2095  KMP_DEBUG_ASSERT(nthreads > 0);
2096 
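  // A worked sketch (added commentary, not part of the LLVM source) of the
  // precedence implemented above, assuming no serialization: a num_threads
  // clause beats nthreads-var, and a nonzero task_thread_limit (from a
  // target task's thread_limit clause) caps the result.
  //
  //   #include <omp.h>
  //   omp_set_num_threads(8);             // nthreads-var = 8
  //   #pragma omp parallel num_threads(6) // clause -> nthreads = 6
  //   { /* with task_thread_limit = 4 in effect, nthreads is clipped to 4 */ }
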
2097  // If we temporarily changed the set number of threads then restore it now
2098  master_th->th.th_set_nproc = 0;
2099 
2100  if (nthreads == 1) {
2101  return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2102  invoker, master_th, parent_team,
2103 #if OMPT_SUPPORT
2104  &ompt_parallel_data, &return_address,
2105  &parent_task_data,
2106 #endif
2107  ap);
2108  } // if (nthreads == 1)
2109 
2110  // GEH: only modify the executing flag in the case when not serialized;
2111  // the serialized case is handled in __kmpc_serialized_parallel
2112  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2113  "curtask=%p, curtask_max_aclevel=%d\n",
2114  parent_team->t.t_active_level, master_th,
2115  master_th->th.th_current_task,
2116  master_th->th.th_current_task->td_icvs.max_active_levels));
2117  // TODO: GEH - cannot do this assertion because root thread not set up as
2118  // executing
2119  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2120  master_th->th.th_current_task->td_flags.executing = 0;
2121 
2122  if (!master_th->th.th_teams_microtask || level > teams_level) {
2123  /* Increment our nested depth level */
2124  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2125  }
2126 
2127  // See if we need to make a copy of the ICVs.
2128  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2129  kmp_nested_nthreads_t *nested_nth = NULL;
2130  if (!master_th->th.th_set_nested_nth &&
2131  (level + 1 < parent_team->t.t_nested_nth->used) &&
2132  (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2133  nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2134  } else if (master_th->th.th_set_nested_nth) {
2135  nested_nth = __kmp_override_nested_nth(master_th, level);
2136  if ((level + 1 < nested_nth->used) &&
2137  (nested_nth->nth[level + 1] != nthreads_icv))
2138  nthreads_icv = nested_nth->nth[level + 1];
2139  else
2140  nthreads_icv = 0; // don't update
2141  } else {
2142  nthreads_icv = 0; // don't update
2143  }
2144 
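  // Illustrative sketch (added commentary): the per-level list consulted
  // above for nthreads_icv can be populated from a comma-separated
  // OMP_NUM_THREADS setting:
  //
  //   OMP_NUM_THREADS=4,2 ./a.out  // level 0 forks 4 threads, level 1 forks 2
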
2145  // Figure out the proc_bind policy for the new team.
2146  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2147  // proc_bind_default means don't update
2148  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2149  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2150  proc_bind = proc_bind_false;
2151  } else {
2152  // No proc_bind clause specified; use current proc-bind-var for this
2153  // parallel region
2154  if (proc_bind == proc_bind_default) {
2155  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2156  }
2157  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2158  if (master_th->th.th_teams_microtask &&
2159  microtask == (microtask_t)__kmp_teams_master) {
2160  proc_bind = __kmp_teams_proc_bind;
2161  }
2162  /* else: The proc_bind policy was specified explicitly on parallel clause.
2163  This overrides proc-bind-var for this parallel region, but does not
2164  change proc-bind-var. */
2165  // Figure out the value of proc-bind-var for the child threads.
2166  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2167  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2168  master_th->th.th_current_task->td_icvs.proc_bind)) {
2169  // Do not modify the proc-bind ICV for the two teams construct forks;
2170  // they just let the proc-bind ICV pass through
2171  if (!master_th->th.th_teams_microtask ||
2172  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2173  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2174  }
2175  }
2176 
2177  // Reset for next parallel region
2178  master_th->th.th_set_proc_bind = proc_bind_default;
2179 
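  // Illustrative sketch (added commentary): the per-level bind types
  // consulted above can likewise be set with a comma-separated OMP_PROC_BIND
  // list:
  //
  //   OMP_PROC_BIND=spread,close ./a.out // level 0 spreads, level 1 binds close
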
2180  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2181  kmp_internal_control_t new_icvs;
2182  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2183  new_icvs.next = NULL;
2184  if (nthreads_icv > 0) {
2185  new_icvs.nproc = nthreads_icv;
2186  }
2187  if (proc_bind_icv != proc_bind_default) {
2188  new_icvs.proc_bind = proc_bind_icv;
2189  }
2190 
2191  /* allocate a new parallel team */
2192  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2193  team = __kmp_allocate_team(root, nthreads, nthreads,
2194 #if OMPT_SUPPORT
2195  ompt_parallel_data,
2196 #endif
2197  proc_bind, &new_icvs, argc, master_th);
2198  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2199  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2200  } else {
2201  /* allocate a new parallel team */
2202  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2203  team = __kmp_allocate_team(
2204  root, nthreads, nthreads,
2205 #if OMPT_SUPPORT
2206  ompt_parallel_data,
2207 #endif
2208  proc_bind, &master_th->th.th_current_task->td_icvs, argc, master_th);
2209  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2210  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2211  &master_th->th.th_current_task->td_icvs);
2212  }
2213  KF_TRACE(
2214  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2215 
2216  /* setup the new team */
2217  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2218  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2219  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2220  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2221  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2222 #if OMPT_SUPPORT
2223  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2224  return_address);
2225 #endif
2226  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2227  // TODO: parent_team->t.t_level == INT_MAX ???
2228  if (!master_th->th.th_teams_microtask || level > teams_level) {
2229  int new_level = parent_team->t.t_level + 1;
2230  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2231  new_level = parent_team->t.t_active_level + 1;
2232  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2233  } else {
2234  // AC: Do not increase parallel level at start of the teams construct
2235  int new_level = parent_team->t.t_level;
2236  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2237  new_level = parent_team->t.t_active_level;
2238  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2239  }
2240  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2241  // set primary thread's schedule as new run-time schedule
2242  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2243 
2244  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2245  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2246 
2247  // Check if hot team has potentially outdated list, and if so, free it
2248  if (team->t.t_nested_nth &&
2249  team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2250  KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2251  KMP_INTERNAL_FREE(team->t.t_nested_nth);
2252  team->t.t_nested_nth = NULL;
2253  }
2254  team->t.t_nested_nth = parent_team->t.t_nested_nth;
2255  if (master_th->th.th_set_nested_nth) {
2256  if (!nested_nth)
2257  nested_nth = __kmp_override_nested_nth(master_th, level);
2258  team->t.t_nested_nth = nested_nth;
2259  KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2260  master_th->th.th_set_nested_nth = NULL;
2261  master_th->th.th_set_nested_nth_sz = 0;
2262  master_th->th.th_nt_strict = false;
2263  }
2264 
2265  // Update the floating point rounding in the team if required.
2266  propagateFPControl(team);
2267 #if OMPD_SUPPORT
2268  if (ompd_state & OMPD_ENABLE_BP)
2269  ompd_bp_parallel_begin();
2270 #endif
2271 
2272  KA_TRACE(
2273  20,
2274  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2275  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2276  team->t.t_nproc));
2277  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2278  (team->t.t_master_tid == 0 &&
2279  (team->t.t_parent == root->r.r_root_team ||
2280  team->t.t_parent->t.t_serialized)));
2281  KMP_MB();
2282 
2283  /* now, setup the arguments */
2284  argv = (void **)team->t.t_argv;
2285  if (ap) {
2286  for (i = argc - 1; i >= 0; --i) {
2287  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2288  KMP_CHECK_UPDATE(*argv, new_argv);
2289  argv++;
2290  }
2291  } else {
2292  for (i = 0; i < argc; ++i) {
2293  // Get args from parent team for teams construct
2294  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2295  }
2296  }
2297 
2298  /* now actually fork the threads */
2299  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2300  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2301  root->r.r_active = TRUE;
2302 
2303  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2304  __kmp_setup_icv_copy(team, nthreads,
2305  &master_th->th.th_current_task->td_icvs, loc);
2306 
2307 #if OMPT_SUPPORT
2308  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2309 #endif
2310 
2311  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2312 
2313 #if USE_ITT_BUILD
2314  if (team->t.t_active_level == 1 // only report frames at level 1
2315  && !master_th->th.th_teams_microtask) { // not in teams construct
2316 #if USE_ITT_NOTIFY
2317  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2318  (__kmp_forkjoin_frames_mode == 3 ||
2319  __kmp_forkjoin_frames_mode == 1)) {
2320  kmp_uint64 tmp_time = 0;
2321  if (__itt_get_timestamp_ptr)
2322  tmp_time = __itt_get_timestamp();
2323  // Internal fork - report frame begin
2324  master_th->th.th_frame_time = tmp_time;
2325  if (__kmp_forkjoin_frames_mode == 3)
2326  team->t.t_region_time = tmp_time;
2327  } else
2328 // only one notification scheme (either "submit" or "forking/joined", not both)
2329 #endif /* USE_ITT_NOTIFY */
2330  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2331  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2332  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2333  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2334  }
2335  }
2336 #endif /* USE_ITT_BUILD */
2337 
2338  /* now go on and do the work */
2339  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2340  KMP_MB();
2341  KF_TRACE(10,
2342  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2343  root, team, master_th, gtid));
2344 
2345 #if USE_ITT_BUILD
2346  if (__itt_stack_caller_create_ptr) {
2347  // create new stack stitching id before entering fork barrier
2348  if (!enter_teams) {
2349  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2350  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2351  } else if (parent_team->t.t_serialized) {
2352  // keep stack stitching id in the serialized parent_team;
2353  // current team will be used for parallel inside the teams;
2354  // if parent_team is active, then it already keeps stack stitching id
2355  // for the league of teams
2356  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2357  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2358  }
2359  }
2360 #endif /* USE_ITT_BUILD */
2361 
2362  // AC: skip __kmp_internal_fork at the teams construct; let only the
2363  // primary threads execute
2364  if (ap) {
2365  __kmp_internal_fork(loc, gtid, team);
2366  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2367  "master_th=%p, gtid=%d\n",
2368  root, team, master_th, gtid));
2369  }
2370 
2371  if (call_context == fork_context_gnu) {
2372  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2373  return TRUE;
2374  }
2375 
2376  /* Invoke microtask for PRIMARY thread */
2377  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2378  team->t.t_id, team->t.t_pkfn));
2379  } // END of timer KMP_fork_call block
2380 
2381 #if KMP_STATS_ENABLED
2382  // If beginning a teams construct, then change thread state
2383  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2384  if (!ap) {
2385  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2386  }
2387 #endif
2388 
2389  if (!team->t.t_invoke(gtid)) {
2390  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2391  }
2392 
2393 #if KMP_STATS_ENABLED
2394  // If this was the beginning of a teams construct, then reset the thread state
2395  if (!ap) {
2396  KMP_SET_THREAD_STATE(previous_state);
2397  }
2398 #endif
2399 
2400  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2401  team->t.t_id, team->t.t_pkfn));
2402  KMP_MB(); /* Flush all pending memory write invalidates. */
2403 
2404  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2405 #if OMPT_SUPPORT
2406  if (ompt_enabled.enabled) {
2407  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2408  }
2409 #endif
2410 
2411  return TRUE;
2412 }
2413 
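// A schematic sketch (added commentary, not part of the LLVM source): how a
// compiler typically reaches __kmp_fork_call() above. Clang lowers a parallel
// region to a call to __kmpc_fork_call(), passing the outlined body as the
// microtask; the outlined function name below is illustrative only.
//
//   Source:                  Lowered (schematically):
//   #pragma omp parallel     __kmpc_fork_call(&loc, /*argc=*/1,
//     x += 1;                    (kmpc_micro)outlined_body, &x);
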
2414 #if OMPT_SUPPORT
2415 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2416  kmp_team_t *team) {
2417  // restore state outside the region
2418  thread->th.ompt_thread_info.state =
2419  ((team->t.t_serialized) ? ompt_state_work_serial
2420  : ompt_state_work_parallel);
2421 }
2422 
2423 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2424  kmp_team_t *team, ompt_data_t *parallel_data,
2425  int flags, void *codeptr) {
2426  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2427  if (ompt_enabled.ompt_callback_parallel_end) {
2428  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2429  parallel_data, &(task_info->task_data), flags, codeptr);
2430  }
2431 
2432  task_info->frame.enter_frame = ompt_data_none;
2433  __kmp_join_restore_state(thread, team);
2434 }
2435 #endif
2436 
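// A hedged sketch (added commentary, not part of the LLVM source) of a
// minimal first-party OMPT tool that would receive the
// ompt_callback_parallel_end event dispatched by the helper above. The types
// and the lookup protocol come from the standard <omp-tools.h> header; the
// tool and handler names are illustrative.
//
//   #include <omp-tools.h>
//   #include <stdio.h>
//
//   static void on_parallel_end(ompt_data_t *parallel_data,
//                               ompt_data_t *encountering_task_data,
//                               int flags, const void *codeptr_ra) {
//     printf("parallel region ended (flags=%d)\n", flags);
//   }
//
//   static int tool_init(ompt_function_lookup_t lookup,
//                        int initial_device_num, ompt_data_t *tool_data) {
//     ompt_set_callback_t set_cb =
//         (ompt_set_callback_t)lookup("ompt_set_callback");
//     set_cb(ompt_callback_parallel_end, (ompt_callback_t)on_parallel_end);
//     return 1; // nonzero keeps the tool active
//   }
//
//   static void tool_fini(ompt_data_t *tool_data) {}
//
//   ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
//                                             const char *runtime_version) {
//     static ompt_start_tool_result_t result = {&tool_init, &tool_fini, {0}};
//     return &result; // returning non-NULL activates the tool
//   }
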
2437 void __kmp_join_call(ident_t *loc, int gtid
2438 #if OMPT_SUPPORT
2439  ,
2440  enum fork_context_e fork_context
2441 #endif
2442  ,
2443  int exit_teams) {
2444  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2445  kmp_team_t *team;
2446  kmp_team_t *parent_team;
2447  kmp_info_t *master_th;
2448  kmp_root_t *root;
2449  int master_active;
2450 
2451  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2452 
2453  /* setup current data */
2454  master_th = __kmp_threads[gtid];
2455  root = master_th->th.th_root;
2456  team = master_th->th.th_team;
2457  parent_team = team->t.t_parent;
2458 
2459  master_th->th.th_ident = loc;
2460 
2461 #if OMPT_SUPPORT
2462  void *team_microtask = (void *)team->t.t_pkfn;
2463  // For the GOMP interface with a serialized parallel region, we need
2464  // __kmpc_end_serialized_parallel to call the hooks for the OMPT
2465  // end-implicit-task and end-parallel events.
2466  if (ompt_enabled.enabled &&
2467  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2468  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2469  }
2470 #endif
2471 
2472 #if KMP_DEBUG
2473  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2474  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2475  "th_task_team = %p\n",
2476  __kmp_gtid_from_thread(master_th), team,
2477  team->t.t_task_team[master_th->th.th_task_state],
2478  master_th->th.th_task_team));
2479  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2480  }
2481 #endif
2482 
2483  if (team->t.t_serialized) {
2484  if (master_th->th.th_teams_microtask) {
2485  // We are in teams construct
2486  int level = team->t.t_level;
2487  int tlevel = master_th->th.th_teams_level;
2488  if (level == tlevel) {
2489  // AC: we haven't incremented it earlier, at the start of the teams
2490  // construct, so do it here, at the end of the teams construct
2491  team->t.t_level++;
2492  } else if (level == tlevel + 1) {
2493  // AC: we are exiting parallel inside teams, need to increment
2494  // serialization in order to restore it in the next call to
2495  // __kmpc_end_serialized_parallel
2496  team->t.t_serialized++;
2497  }
2498  }
2499  __kmpc_end_serialized_parallel(loc, gtid);
2500 
2501 #if OMPT_SUPPORT
2502  if (ompt_enabled.enabled) {
2503  if (fork_context == fork_context_gnu) {
2504  __ompt_lw_taskteam_unlink(master_th);
2505  }
2506  __kmp_join_restore_state(master_th, parent_team);
2507  }
2508 #endif
2509 
2510  return;
2511  }
2512 
2513  master_active = team->t.t_master_active;
2514 
2515  if (!exit_teams) {
2516  // AC: No barrier for internal teams at exit from the teams construct,
2517  // but there is a barrier for the external team (league).
2518  __kmp_internal_join(loc, gtid, team);
2519 #if USE_ITT_BUILD
2520  if (__itt_stack_caller_create_ptr) {
2521  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2522  // destroy the stack stitching id after join barrier
2523  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2524  team->t.t_stack_id = NULL;
2525  }
2526 #endif
2527  } else {
2528  master_th->th.th_task_state =
2529  0; // AC: no tasking in teams (out of any parallel)
2530 #if USE_ITT_BUILD
2531  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2532  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2533  // destroy the stack stitching id on exit from the teams construct
2534  // if parent_team is active, then the id will be destroyed later on
2535  // by master of the league of teams
2536  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2537  parent_team->t.t_stack_id = NULL;
2538  }
2539 #endif
2540  }
2541 
2542  KMP_MB();
2543 
2544 #if OMPT_SUPPORT
2545  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2546  void *codeptr = team->t.ompt_team_info.master_return_address;
2547 #endif
2548 
2549 #if USE_ITT_BUILD
2550  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2551  if (team->t.t_active_level == 1 &&
2552  (!master_th->th.th_teams_microtask || /* not in teams construct */
2553  master_th->th.th_teams_size.nteams == 1)) {
2554  master_th->th.th_ident = loc;
2555  // only one notification scheme (either "submit" or "forking/joined", not
2556  // both)
2557  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2558  __kmp_forkjoin_frames_mode == 3)
2559  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2560  master_th->th.th_frame_time, 0, loc,
2561  master_th->th.th_team_nproc, 1);
2562  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2563  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2564  __kmp_itt_region_joined(gtid);
2565  } // active_level == 1
2566 #endif /* USE_ITT_BUILD */
2567 
2568 #if KMP_AFFINITY_SUPPORTED
2569  if (!exit_teams) {
2570  // Restore master thread's partition.
2571  master_th->th.th_first_place = team->t.t_first_place;
2572  master_th->th.th_last_place = team->t.t_last_place;
2573  }
2574 #endif // KMP_AFFINITY_SUPPORTED
2575 
2576  if (master_th->th.th_teams_microtask && !exit_teams &&
2577  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2578  team->t.t_level == master_th->th.th_teams_level + 1) {
2579 // AC: We need to leave the team structure intact at the end of a parallel
2580 // region inside the teams construct, so that the same (hot) team works at
2581 // the next parallel region; only adjust the nesting levels
2582 #if OMPT_SUPPORT
2583  ompt_data_t ompt_parallel_data = ompt_data_none;
2584  if (ompt_enabled.enabled) {
2585  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2586  if (ompt_enabled.ompt_callback_implicit_task) {
2587  int ompt_team_size = team->t.t_nproc;
2588  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2589  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2590  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2591  }
2592  task_info->frame.exit_frame = ompt_data_none;
2593  task_info->task_data = ompt_data_none;
2594  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2595  __ompt_lw_taskteam_unlink(master_th);
2596  }
2597 #endif
2598  /* Decrement our nested depth level */
2599  team->t.t_level--;
2600  team->t.t_active_level--;
2601  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2602 
2603  // Restore number of threads in the team if needed. This code relies on
2604  // the proper adjustment of th_teams_size.nth after the fork in
2605  // __kmp_teams_master on each teams primary thread in the case that
2606  // __kmp_reserve_threads reduced it.
2607  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2608  int old_num = master_th->th.th_team_nproc;
2609  int new_num = master_th->th.th_teams_size.nth;
2610  kmp_info_t **other_threads = team->t.t_threads;
2611  team->t.t_nproc = new_num;
2612  for (int i = 0; i < old_num; ++i) {
2613  other_threads[i]->th.th_team_nproc = new_num;
2614  }
2615  // Adjust the states of the unused threads of the team
2616  for (int i = old_num; i < new_num; ++i) {
2617  // Re-initialize thread's barrier data.
2618  KMP_DEBUG_ASSERT(other_threads[i]);
2619  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2620  for (int b = 0; b < bs_last_barrier; ++b) {
2621  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2622  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2623 #if USE_DEBUGGER
2624  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2625 #endif
2626  }
2627  if (__kmp_tasking_mode != tskm_immediate_exec) {
2628  // Synchronize thread's task state
2629  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2630  }
2631  }
2632  }
2633 
2634 #if OMPT_SUPPORT
2635  if (ompt_enabled.enabled) {
2636  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2637  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2638  }
2639 #endif
2640 
2641  return;
2642  }
2643 
2644  /* do cleanup and restore the parent team */
2645  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2646  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2647 
2648  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2649 
2650  /* jc: The following lock has instructions with REL and ACQ semantics,
2651  separating the parallel user code called in this parallel region
2652  from the serial user code called after this function returns. */
2653  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2654 
2655  if (!master_th->th.th_teams_microtask ||
2656  team->t.t_level > master_th->th.th_teams_level) {
2657  /* Decrement our nested depth level */
2658  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2659  }
2660  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2661 
2662 #if OMPT_SUPPORT
2663  if (ompt_enabled.enabled) {
2664  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2665  if (ompt_enabled.ompt_callback_implicit_task) {
2666  int flags = (team_microtask == (void *)__kmp_teams_master)
2667  ? ompt_task_initial
2668  : ompt_task_implicit;
2669  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2670  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2671  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2672  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2673  }
2674  task_info->frame.exit_frame = ompt_data_none;
2675  task_info->task_data = ompt_data_none;
2676  }
2677 #endif
2678 
2679  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2680  master_th, team));
2681  __kmp_pop_current_task_from_thread(master_th);
2682 
2683  master_th->th.th_def_allocator = team->t.t_def_allocator;
2684 
2685 #if OMPD_SUPPORT
2686  if (ompd_state & OMPD_ENABLE_BP)
2687  ompd_bp_parallel_end();
2688 #endif
2689  updateHWFPControl(team);
2690 
2691  if (root->r.r_active != master_active)
2692  root->r.r_active = master_active;
2693 
2694  __kmp_free_team(root, team, master_th); // this will free worker threads
2695 
2696  /* this race was fun to find. make sure the following is in the critical
2697  region otherwise assertions may fail occasionally since the old team may be
2698  reallocated and the hierarchy appears inconsistent. it is actually safe to
2699  run and won't cause any bugs, but will cause those assertion failures. it's
2700  only one deref&assign so might as well put this in the critical region */
2701  master_th->th.th_team = parent_team;
2702  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2703  master_th->th.th_team_master = parent_team->t.t_threads[0];
2704  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2705 
2706  /* restore serialized team, if need be */
2707  if (parent_team->t.t_serialized &&
2708  parent_team != master_th->th.th_serial_team &&
2709  parent_team != root->r.r_root_team) {
2710  __kmp_free_team(root, master_th->th.th_serial_team, NULL);
2711  master_th->th.th_serial_team = parent_team;
2712  }
2713 
2714  if (__kmp_tasking_mode != tskm_immediate_exec) {
2715  // Restore primary thread's task state from team structure
2716  KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2717  team->t.t_primary_task_state == 1);
2718  master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2719 
2720  // Copy the task team from the parent team to the primary thread
2721  master_th->th.th_task_team =
2722  parent_team->t.t_task_team[master_th->th.th_task_state];
2723  KA_TRACE(20,
2724  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2725  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2726  parent_team));
2727  }
2728 
2729  // TODO: GEH - cannot do this assertion because root thread not set up as
2730  // executing
2731  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2732  master_th->th.th_current_task->td_flags.executing = 1;
2733 
2734  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2735 
2736 #if KMP_AFFINITY_SUPPORTED
2737  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2738  __kmp_reset_root_init_mask(gtid);
2739  }
2740 #endif
2741 #if OMPT_SUPPORT
2742  int flags =
2743  OMPT_INVOKER(fork_context) |
2744  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2745  : ompt_parallel_team);
2746  if (ompt_enabled.enabled) {
2747  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2748  codeptr);
2749  }
2750 #endif
2751 
2752  KMP_MB();
2753  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2754 }
2755 
2756 /* Check whether we should push an internal control record onto the
2757  serial team stack. If so, do it. */
2758 void __kmp_save_internal_controls(kmp_info_t *thread) {
2759 
2760  if (thread->th.th_team != thread->th.th_serial_team) {
2761  return;
2762  }
2763  if (thread->th.th_team->t.t_serialized > 1) {
2764  int push = 0;
2765 
2766  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2767  push = 1;
2768  } else {
2769  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2770  thread->th.th_team->t.t_serialized) {
2771  push = 1;
2772  }
2773  }
2774  if (push) { /* push a record on the serial team's stack */
2775  kmp_internal_control_t *control =
2776  (kmp_internal_control_t *)__kmp_allocate(
2777  sizeof(kmp_internal_control_t));
2778 
2779  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2780 
2781  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2782 
2783  control->next = thread->th.th_team->t.t_control_stack_top;
2784  thread->th.th_team->t.t_control_stack_top = control;
2785  }
2786  }
2787 }
2788 
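// A minimal sketch (added commentary, not part of the LLVM source): why the
// control stack is needed. An ICV change made inside a nested serialized
// region must be undone when that region ends.
//
//   #include <omp.h>
//   #pragma omp parallel num_threads(1)   // serialized region
//   #pragma omp parallel num_threads(1)   // nested serialized region
//   {
//     omp_set_num_threads(16); // record pushed on the serial team's stack
//   }                          // popped on exit; outer nthreads-var restored
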
2789 /* Changes set_nproc */
2790 void __kmp_set_num_threads(int new_nth, int gtid) {
2791  kmp_info_t *thread;
2792  kmp_root_t *root;
2793 
2794  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2795  KMP_DEBUG_ASSERT(__kmp_init_serial);
2796 
2797  if (new_nth < 1)
2798  new_nth = 1;
2799  else if (new_nth > __kmp_max_nth)
2800  new_nth = __kmp_max_nth;
2801 
2802  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2803  thread = __kmp_threads[gtid];
2804  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2805  return; // nothing to do
2806 
2807  __kmp_save_internal_controls(thread);
2808 
2809  set__nproc(thread, new_nth);
2810 
2811  // If this omp_set_num_threads() call will cause the hot team size to be
2812  // reduced (in the absence of a num_threads clause), then reduce it now,
2813  // rather than waiting for the next parallel region.
2814  root = thread->th.th_root;
2815  if (__kmp_init_parallel && (!root->r.r_active) &&
2816  (root->r.r_hot_team->t.t_nproc > new_nth) && __kmp_hot_teams_max_level &&
2817  !__kmp_hot_teams_mode) {
2818  kmp_team_t *hot_team = root->r.r_hot_team;
2819  int f;
2820 
2821  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2822 
2823  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2824  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2825  }
2826  // Release the extra threads we don't need any more.
2827  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2828  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2829  if (__kmp_tasking_mode != tskm_immediate_exec) {
2830  // When decreasing team size, threads no longer in the team should unref
2831  // task team.
2832  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2833  }
2834  __kmp_free_thread(hot_team->t.t_threads[f]);
2835  hot_team->t.t_threads[f] = NULL;
2836  }
2837  hot_team->t.t_nproc = new_nth;
2838  if (thread->th.th_hot_teams) {
2839  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2840  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2841  }
2842 
2843  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2844  hot_team->t.b->update_num_threads(new_nth);
2845  __kmp_add_threads_to_team(hot_team, new_nth);
2846  }
2847 
2848  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2849 
2850  // Update the t_nproc field in the threads that are still active.
2851  for (f = 0; f < new_nth; f++) {
2852  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2853  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2854  }
2855  // Special flag to mark an omp_set_num_threads() call
2856  hot_team->t.t_size_changed = -1;
2857  }
2858 }
2859 
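// A hedged usage sketch (added commentary): the shrink path above means that
// once the runtime's parallel init has happened, the root is idle, and hot
// teams are enabled at this level, reducing nthreads-var releases surplus
// hot-team workers immediately instead of at the next fork.
//
//   #include <omp.h>
//   #pragma omp parallel      // hot team grows to the default size
//   { /* ... */ }
//   omp_set_num_threads(2);   // hot team is trimmed to 2 threads right here
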
2860 /* Changes max_active_levels */
2861 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2862  kmp_info_t *thread;
2863 
2864  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2865  "%d = (%d)\n",
2866  gtid, max_active_levels));
2867  KMP_DEBUG_ASSERT(__kmp_init_serial);
2868 
2869  // validate max_active_levels
2870  if (max_active_levels < 0) {
2871  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2872  // We ignore this call if the user has specified a negative value.
2873  // The current setting won't be changed. The last valid setting will be
2874  // used. A warning will be issued (if warnings are allowed as controlled by
2875  // the KMP_WARNINGS env var).
2876  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2877  "max_active_levels for thread %d = (%d)\n",
2878  gtid, max_active_levels));
2879  return;
2880  }
2881  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2882  // It's OK: max_active_levels is within the valid range
2883  // [0; KMP_MAX_ACTIVE_LEVELS_LIMIT].
2884  // We allow a zero value. (implementation defined behavior)
2885  } else {
2886  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2887  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2888  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2889  // Current upper limit is MAX_INT. (implementation defined behavior)
2890  // If the input exceeds the upper limit, we correct the input to be the
2891  // upper limit. (implementation defined behavior)
2892  // Actually, the flow should never get here while the limit is MAX_INT.
2893  }
2894  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2895  "max_active_levels for thread %d = (%d)\n",
2896  gtid, max_active_levels));
2897 
2898  thread = __kmp_threads[gtid];
2899 
2900  __kmp_save_internal_controls(thread);
2901 
2902  set__max_active_levels(thread, max_active_levels);
2903 }
2904 
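// Usage sketch (added commentary): input is validated as above -- negative
// values are ignored with a warning and oversized values are clamped. This
// presumably backs omp_set_max_active_levels().
//
//   #include <omp.h>
//   omp_set_max_active_levels(-3); // ignored; previous setting kept
//   omp_set_max_active_levels(2);  // at most two nested active levels
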
2905 /* Gets max_active_levels */
2906 int __kmp_get_max_active_levels(int gtid) {
2907  kmp_info_t *thread;
2908 
2909  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2910  KMP_DEBUG_ASSERT(__kmp_init_serial);
2911 
2912  thread = __kmp_threads[gtid];
2913  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2914  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2915  "curtask_maxaclevel=%d\n",
2916  gtid, thread->th.th_current_task,
2917  thread->th.th_current_task->td_icvs.max_active_levels));
2918  return thread->th.th_current_task->td_icvs.max_active_levels;
2919 }
2920 
2921 // nteams-var per-device ICV
2922 void __kmp_set_num_teams(int num_teams) {
2923  if (num_teams > 0)
2924  __kmp_nteams = num_teams;
2925 }
2926 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2927 // teams-thread-limit-var per-device ICV
2928 void __kmp_set_teams_thread_limit(int limit) {
2929  if (limit > 0)
2930  __kmp_teams_thread_limit = limit;
2931 }
2932 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2933 
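// Usage sketch (added commentary): these per-device ICVs presumably back the
// corresponding OpenMP 5.1 API routines.
//
//   #include <omp.h>
//   omp_set_num_teams(4);           // nteams-var
//   omp_set_teams_thread_limit(16); // teams-thread-limit-var
//   #pragma omp teams               // forks up to 4 teams of <= 16 threads
//   { /* ... */ }
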
2934 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2935 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2936 
2937 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2938 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2939  kmp_info_t *thread;
2940  kmp_sched_t orig_kind;
2941  // kmp_team_t *team;
2942 
2943  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2944  gtid, (int)kind, chunk));
2945  KMP_DEBUG_ASSERT(__kmp_init_serial);
2946 
2947  // Check if the kind parameter is valid; correct it if needed.
2948  // Valid parameters should fit in one of two intervals - standard or extended:
2949  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2950  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2951  orig_kind = kind;
2952  kind = __kmp_sched_without_mods(kind);
2953 
2954  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2955  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2956  // TODO: Hint needs attention in case we change the default schedule.
2957  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2958  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2959  __kmp_msg_null);
2960  kind = kmp_sched_default;
2961  chunk = 0; // ignore chunk value in case of bad kind
2962  }
2963 
2964  thread = __kmp_threads[gtid];
2965 
2966  __kmp_save_internal_controls(thread);
2967 
2968  if (kind < kmp_sched_upper_std) {
2969  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2970  // distinguish static chunked vs. unchunked: chunk should be invalid to
2971  // indicate an unchunked schedule (which is the default)
2972  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2973  } else {
2974  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2975  __kmp_sch_map[kind - kmp_sched_lower - 1];
2976  }
2977  } else {
2978  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2979  // kmp_sched_lower - 2 ];
2980  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2981  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2982  kmp_sched_lower - 2];
2983  }
2984  __kmp_sched_apply_mods_intkind(
2985  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2986  if (kind == kmp_sched_auto || chunk < 1) {
2987  // ignore parameter chunk for schedule auto
2988  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2989  } else {
2990  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2991  }
2992 }
2993 
2994 /* Gets def_sched_var ICV values */
2995 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2996  kmp_info_t *thread;
2997  enum sched_type th_type;
2998 
2999  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3000  KMP_DEBUG_ASSERT(__kmp_init_serial);
3001 
3002  thread = __kmp_threads[gtid];
3003 
3004  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3005  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3006  case kmp_sch_static:
3007  case kmp_sch_static_greedy:
3008  case kmp_sch_static_balanced:
3009  *kind = kmp_sched_static;
3010  __kmp_sched_apply_mods_stdkind(kind, th_type);
3011  *chunk = 0; // chunk was not set, try to show this fact via zero value
3012  return;
3013  case kmp_sch_static_chunked:
3014  *kind = kmp_sched_static;
3015  break;
3016  case kmp_sch_dynamic_chunked:
3017  *kind = kmp_sched_dynamic;
3018  break;
3019  case kmp_sch_guided_chunked:
3020  case kmp_sch_guided_iterative_chunked:
3021  case kmp_sch_guided_analytical_chunked:
3022  *kind = kmp_sched_guided;
3023  break;
3024  case kmp_sch_auto:
3025  *kind = kmp_sched_auto;
3026  break;
3027  case kmp_sch_trapezoidal:
3028  *kind = kmp_sched_trapezoidal;
3029  break;
3030 #if KMP_STATIC_STEAL_ENABLED
3031  case kmp_sch_static_steal:
3032  *kind = kmp_sched_static_steal;
3033  break;
3034 #endif
3035  default:
3036  KMP_FATAL(UnknownSchedulingType, th_type);
3037  }
3038 
3039  __kmp_sched_apply_mods_stdkind(kind, th_type);
3040  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3041 }
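/* Illustrative round trip (a minimal sketch, not part of this file) through
   the pair above via the standard omp_set_schedule()/omp_get_schedule() API,
   which we assume forwards to these __kmp_* routines:

     #include <omp.h>
     int main(void) {
       omp_sched_t kind;
       int chunk;
       omp_set_schedule(omp_sched_dynamic, 4); // def-sched-var := (dynamic, 4)
       omp_get_schedule(&kind, &chunk); // kind == omp_sched_dynamic, chunk == 4
       omp_set_schedule(omp_sched_auto, 7);    // chunk is ignored for auto
       return 0;
     }

   Note that kmp_sched_static with chunk < 1 is stored as the unchunked
   kmp_sch_static, for which __kmp_get_schedule reports chunk == 0. */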
3042 
3043 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3044 
3045  int ii, dd;
3046  kmp_team_t *team;
3047  kmp_info_t *thr;
3048 
3049  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3050  KMP_DEBUG_ASSERT(__kmp_init_serial);
3051 
3052  // validate level
3053  if (level == 0)
3054  return 0;
3055  if (level < 0)
3056  return -1;
3057  thr = __kmp_threads[gtid];
3058  team = thr->th.th_team;
3059  ii = team->t.t_level;
3060  if (level > ii)
3061  return -1;
3062 
3063  if (thr->th.th_teams_microtask) {
3064  // AC: we are in a teams region where multiple nested teams share a level
3065  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3066  if (level <=
3067  tlevel) { // otherwise the usual algorithm works (will not touch the teams)
3068  KMP_DEBUG_ASSERT(ii >= tlevel);
3069  // AC: As we need to pass through the teams league, we need to
3070  // artificially increase ii
3071  if (ii == tlevel) {
3072  ii += 2; // three teams have the same level
3073  } else {
3074  ii++; // two teams have the same level
3075  }
3076  }
3077  }
3078 
3079  if (ii == level)
3080  return __kmp_tid_from_gtid(gtid);
3081 
3082  dd = team->t.t_serialized;
3083  level++;
3084  while (ii > level) {
3085  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3086  }
3087  if ((team->t.t_serialized) && (!dd)) {
3088  team = team->t.t_parent;
3089  continue;
3090  }
3091  if (ii > level) {
3092  team = team->t.t_parent;
3093  dd = team->t.t_serialized;
3094  ii--;
3095  }
3096  }
3097 
3098  return (dd > 1) ? (0) : (team->t.t_master_tid);
3099 }
3100 
3101 int __kmp_get_team_size(int gtid, int level) {
3102 
3103  int ii, dd;
3104  kmp_team_t *team;
3105  kmp_info_t *thr;
3106 
3107  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3108  KMP_DEBUG_ASSERT(__kmp_init_serial);
3109 
3110  // validate level
3111  if (level == 0)
3112  return 1;
3113  if (level < 0)
3114  return -1;
3115  thr = __kmp_threads[gtid];
3116  team = thr->th.th_team;
3117  ii = team->t.t_level;
3118  if (level > ii)
3119  return -1;
3120 
3121  if (thr->th.th_teams_microtask) {
3122  // AC: we are in a teams region where multiple nested teams share a level
3123  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3124  if (level <=
3125  tlevel) { // otherwise the usual algorithm works (will not touch the teams)
3126  KMP_DEBUG_ASSERT(ii >= tlevel);
3127  // AC: As we need to pass through the teams league, we need to
3128  // artificially increase ii
3129  if (ii == tlevel) {
3130  ii += 2; // three teams have the same level
3131  } else {
3132  ii++; // two teams have the same level
3133  }
3134  }
3135  }
3136 
3137  while (ii > level) {
3138  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3139  }
3140  if (team->t.t_serialized && (!dd)) {
3141  team = team->t.t_parent;
3142  continue;
3143  }
3144  if (ii > level) {
3145  team = team->t.t_parent;
3146  ii--;
3147  }
3148  }
3149 
3150  return team->t.t_nproc;
3151 }
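/* Illustrative usage (a minimal sketch, not part of this file): both walkers
   above serve the standard introspection calls, which we assume forward here.
   With nested parallelism enabled:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       omp_set_max_active_levels(2);
       #pragma omp parallel num_threads(2)
       #pragma omp parallel num_threads(3)
       {
         int outer_tid = omp_get_ancestor_thread_num(1); // 0 or 1
         int outer_sz = omp_get_team_size(1);            // 2
         int inner_sz = omp_get_team_size(2);            // 3
         printf("%d %d %d\n", outer_tid, outer_sz, inner_sz);
       }
       return 0;
     }

   Serialized (inactive) nested regions are skipped by consuming the
   t_serialized counter while walking up the t_parent chain. */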
3152 
3153 kmp_r_sched_t __kmp_get_schedule_global() {
3154  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3155  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3156  // independently, so the updated schedule can be obtained here.
3157 
3158  kmp_r_sched_t r_sched;
3159 
3160  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3161  // __kmp_guided. __kmp_sched should keep its original value, so that the user
3162  // can set KMP_SCHEDULE multiple times and thus have different run-time
3163  // schedules in different roots (even in OMP 2.5)
3164  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3165  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3166  if (s == kmp_sch_static) {
3167  // replace STATIC with more detailed schedule (balanced or greedy)
3168  r_sched.r_sched_type = __kmp_static;
3169  } else if (s == kmp_sch_guided_chunked) {
3170  // replace GUIDED with more detailed schedule (iterative or analytical)
3171  r_sched.r_sched_type = __kmp_guided;
3172  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3173  r_sched.r_sched_type = __kmp_sched;
3174  }
3175  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3176 
3177  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3178  // __kmp_chunk may be wrong here (if it was not ever set)
3179  r_sched.chunk = KMP_DEFAULT_CHUNK;
3180  } else {
3181  r_sched.chunk = __kmp_chunk;
3182  }
3183 
3184  return r_sched;
3185 }
3186 
3187 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3188  at least argc *t_argv entries for the requested team. */
3189 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3190 
3191  KMP_DEBUG_ASSERT(team);
3192  if (!realloc || argc > team->t.t_max_argc) {
3193 
3194  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3195  "current entries=%d\n",
3196  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3197  /* if previously allocated heap space for args, free them */
3198  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3199  __kmp_free((void *)team->t.t_argv);
3200 
3201  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3202  /* use unused space in the cache line for arguments */
3203  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3204  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3205  "argv entries\n",
3206  team->t.t_id, team->t.t_max_argc));
3207  team->t.t_argv = &team->t.t_inline_argv[0];
3208  if (__kmp_storage_map) {
3209  __kmp_print_storage_map_gtid(
3210  -1, &team->t.t_inline_argv[0],
3211  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3212  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3213  team->t.t_id);
3214  }
3215  } else {
3216  /* allocate space for arguments in the heap */
3217  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3218  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3219  : 2 * argc;
3220  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3221  "argv entries\n",
3222  team->t.t_id, team->t.t_max_argc));
3223  team->t.t_argv =
3224  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3225  if (__kmp_storage_map) {
3226  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3227  &team->t.t_argv[team->t.t_max_argc],
3228  sizeof(void *) * team->t.t_max_argc,
3229  "team_%d.t_argv", team->t.t_id);
3230  }
3231  }
3232  }
3233 }
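/* Sizing policy of the routine above, in brief (illustrative): with
   KMP_INLINE_ARGV_ENTRIES inline slots and KMP_MIN_MALLOC_ARGV_ENTRIES as the
   heap floor, the resulting t_max_argc is

     argc <= KMP_INLINE_ARGV_ENTRIES          -> inline cache-line storage
     argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2  -> KMP_MIN_MALLOC_ARGV_ENTRIES
     otherwise                                -> 2 * argc

   so heap capacity at least doubles whenever it grows, keeping reallocation
   cost amortized across repeated forks with increasing argc. */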
3234 
3235 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3236  int i;
3237  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3238  team->t.t_threads =
3239  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3240  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3241  sizeof(dispatch_shared_info_t) * num_disp_buff);
3242  team->t.t_dispatch =
3243  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3244  team->t.t_implicit_task_taskdata =
3245  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3246  team->t.t_max_nproc = max_nth;
3247 
3248  /* setup dispatch buffers */
3249  for (i = 0; i < num_disp_buff; ++i) {
3250  team->t.t_disp_buffer[i].buffer_index = i;
3251  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3252  }
3253 }
3254 
3255 static void __kmp_free_team_arrays(kmp_team_t *team) {
3256  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3257  int i;
3258  for (i = 0; i < team->t.t_max_nproc; ++i) {
3259  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3260  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3261  team->t.t_dispatch[i].th_disp_buffer = NULL;
3262  }
3263  }
3264 #if KMP_USE_HIER_SCHED
3265  __kmp_dispatch_free_hierarchies(team);
3266 #endif
3267  __kmp_free(team->t.t_threads);
3268  __kmp_free(team->t.t_disp_buffer);
3269  __kmp_free(team->t.t_dispatch);
3270  __kmp_free(team->t.t_implicit_task_taskdata);
3271  team->t.t_threads = NULL;
3272  team->t.t_disp_buffer = NULL;
3273  team->t.t_dispatch = NULL;
3274  team->t.t_implicit_task_taskdata = 0;
3275 }
3276 
3277 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3278  kmp_info_t **oldThreads = team->t.t_threads;
3279 
3280  __kmp_free(team->t.t_disp_buffer);
3281  __kmp_free(team->t.t_dispatch);
3282  __kmp_free(team->t.t_implicit_task_taskdata);
3283  __kmp_allocate_team_arrays(team, max_nth);
3284 
3285  KMP_MEMCPY(team->t.t_threads, oldThreads,
3286  team->t.t_nproc * sizeof(kmp_info_t *));
3287 
3288  __kmp_free(oldThreads);
3289 }
3290 
3291 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3292 
3293  kmp_r_sched_t r_sched =
3294  __kmp_get_schedule_global(); // get current state of scheduling globals
3295 
3296  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3297 
3298  kmp_internal_control_t g_icvs = {
3299  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3300  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3301  // adjustment of threads (per thread)
3302  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3303  // whether blocktime is explicitly set
3304  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3305 #if KMP_USE_MONITOR
3306  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3307 // intervals
3308 #endif
3309  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3310  // next parallel region (per thread)
3311  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3312  __kmp_cg_max_nth, // int thread_limit;
3313  __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3314  // on task. This is used in the case of target thread_limit
3315  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3316  // for max_active_levels
3317  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3318  // {sched,chunk} pair
3319  __kmp_nested_proc_bind.bind_types[0],
3320  __kmp_default_device,
3321  NULL // struct kmp_internal_control *next;
3322  };
3323 
3324  return g_icvs;
3325 }
3326 
3327 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3328 
3329  kmp_internal_control_t gx_icvs;
3330  gx_icvs.serial_nesting_level =
3331  0; // probably =team->t.t_serial like in __kmp_save_internal_controls
3332  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3333  gx_icvs.next = NULL;
3334 
3335  return gx_icvs;
3336 }
3337 
3338 static void __kmp_initialize_root(kmp_root_t *root) {
3339  int f;
3340  kmp_team_t *root_team;
3341  kmp_team_t *hot_team;
3342  int hot_team_max_nth;
3343  kmp_r_sched_t r_sched =
3344  __kmp_get_schedule_global(); // get current state of scheduling globals
3345  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3346  KMP_DEBUG_ASSERT(root);
3347  KMP_ASSERT(!root->r.r_begin);
3348 
3349  /* setup the root state structure */
3350  __kmp_init_lock(&root->r.r_begin_lock);
3351  root->r.r_begin = FALSE;
3352  root->r.r_active = FALSE;
3353  root->r.r_in_parallel = 0;
3354  root->r.r_blocktime = __kmp_dflt_blocktime;
3355 #if KMP_AFFINITY_SUPPORTED
3356  root->r.r_affinity_assigned = FALSE;
3357 #endif
3358 
3359  /* setup the root team for this task */
3360  /* allocate the root team structure */
3361  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3362 
3363  root_team = __kmp_allocate_team(root,
3364  1, // new_nproc
3365  1, // max_nproc
3366 #if OMPT_SUPPORT
3367  ompt_data_none, // root parallel id
3368 #endif
3369  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3370  0, // argc
3371  NULL // primary thread is unknown
3372  );
3373 #if USE_DEBUGGER
3374  // Non-NULL value should be assigned to make the debugger display the root
3375  // team.
3376  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3377 #endif
3378 
3379  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3380 
3381  root->r.r_root_team = root_team;
3382  root_team->t.t_control_stack_top = NULL;
3383 
3384  /* initialize root team */
3385  root_team->t.t_threads[0] = NULL;
3386  root_team->t.t_nproc = 1;
3387  root_team->t.t_serialized = 1;
3388  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3389  root_team->t.t_sched.sched = r_sched.sched;
3390  root_team->t.t_nested_nth = &__kmp_nested_nth;
3391  KA_TRACE(
3392  20,
3393  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3394  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3395 
3396  /* setup the hot team for this task */
3397  /* allocate the hot team structure */
3398  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3399 
3400  hot_team = __kmp_allocate_team(root,
3401  1, // new_nproc
3402  __kmp_dflt_team_nth_ub * 2, // max_nproc
3403 #if OMPT_SUPPORT
3404  ompt_data_none, // root parallel id
3405 #endif
3406  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3407  0, // argc
3408  NULL // primary thread is unknown
3409  );
3410  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3411 
3412  root->r.r_hot_team = hot_team;
3413  root_team->t.t_control_stack_top = NULL;
3414 
3415  /* first-time initialization */
3416  hot_team->t.t_parent = root_team;
3417 
3418  /* initialize hot team */
3419  hot_team_max_nth = hot_team->t.t_max_nproc;
3420  for (f = 0; f < hot_team_max_nth; ++f) {
3421  hot_team->t.t_threads[f] = NULL;
3422  }
3423  hot_team->t.t_nproc = 1;
3424  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3425  hot_team->t.t_sched.sched = r_sched.sched;
3426  hot_team->t.t_size_changed = 0;
3427  hot_team->t.t_nested_nth = &__kmp_nested_nth;
3428 }
3429 
3430 #ifdef KMP_DEBUG
3431 
3432 typedef struct kmp_team_list_item {
3433  kmp_team_p const *entry;
3434  struct kmp_team_list_item *next;
3435 } kmp_team_list_item_t;
3436 typedef kmp_team_list_item_t *kmp_team_list_t;
3437 
3438 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3439  kmp_team_list_t list, // List of teams.
3440  kmp_team_p const *team // Team to add.
3441 ) {
3442 
3443  // List must terminate with item where both entry and next are NULL.
3444  // Team is added to the list only once.
3445  // List is sorted in ascending order by team id.
3446  // Team id is *not* a key.
3447 
3448  kmp_team_list_t l;
3449 
3450  KMP_DEBUG_ASSERT(list != NULL);
3451  if (team == NULL) {
3452  return;
3453  }
3454 
3455  __kmp_print_structure_team_accum(list, team->t.t_parent);
3456  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3457 
3458  // Search list for the team.
3459  l = list;
3460  while (l->next != NULL && l->entry != team) {
3461  l = l->next;
3462  }
3463  if (l->next != NULL) {
3464  return; // Team has been added before, exit.
3465  }
3466 
3467  // Team is not found. Search list again for insertion point.
3468  l = list;
3469  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3470  l = l->next;
3471  }
3472 
3473  // Insert team.
3474  {
3475  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3476  sizeof(kmp_team_list_item_t));
3477  *item = *l;
3478  l->entry = team;
3479  l->next = item;
3480  }
3481 }
3482 
3483 static void __kmp_print_structure_team(char const *title,
3484  kmp_team_p const *team) {
3485 
3486  __kmp_printf("%s", title);
3487  if (team != NULL) {
3488  __kmp_printf("%2x %p\n", team->t.t_id, team);
3489  } else {
3490  __kmp_printf(" - (nil)\n");
3491  }
3492 }
3493 
3494 static void __kmp_print_structure_thread(char const *title,
3495  kmp_info_p const *thread) {
3496  __kmp_printf("%s", title);
3497  if (thread != NULL) {
3498  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3499  } else {
3500  __kmp_printf(" - (nil)\n");
3501  }
3502 }
3503 
3504 void __kmp_print_structure(void) {
3505 
3506  kmp_team_list_t list;
3507 
3508  // Initialize list of teams.
3509  list =
3510  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3511  list->entry = NULL;
3512  list->next = NULL;
3513 
3514  __kmp_printf("\n------------------------------\nGlobal Thread "
3515  "Table\n------------------------------\n");
3516  {
3517  int gtid;
3518  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3519  __kmp_printf("%2d", gtid);
3520  if (__kmp_threads != NULL) {
3521  __kmp_printf(" %p", __kmp_threads[gtid]);
3522  }
3523  if (__kmp_root != NULL) {
3524  __kmp_printf(" %p", __kmp_root[gtid]);
3525  }
3526  __kmp_printf("\n");
3527  }
3528  }
3529 
3530  // Print out __kmp_threads array.
3531  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3532  "----------\n");
3533  if (__kmp_threads != NULL) {
3534  int gtid;
3535  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3536  kmp_info_t const *thread = __kmp_threads[gtid];
3537  if (thread != NULL) {
3538  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3539  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3540  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3541  __kmp_print_structure_team(" Serial Team: ",
3542  thread->th.th_serial_team);
3543  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3544  __kmp_print_structure_thread(" Primary: ",
3545  thread->th.th_team_master);
3546  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3547  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3548  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3549  __kmp_print_structure_thread(" Next in pool: ",
3550  thread->th.th_next_pool);
3551  __kmp_printf("\n");
3552  __kmp_print_structure_team_accum(list, thread->th.th_team);
3553  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3554  }
3555  }
3556  } else {
3557  __kmp_printf("Threads array is not allocated.\n");
3558  }
3559 
3560  // Print out __kmp_root array.
3561  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3562  "--------\n");
3563  if (__kmp_root != NULL) {
3564  int gtid;
3565  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3566  kmp_root_t const *root = __kmp_root[gtid];
3567  if (root != NULL) {
3568  __kmp_printf("GTID %2d %p:\n", gtid, root);
3569  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3570  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3571  __kmp_print_structure_thread(" Uber Thread: ",
3572  root->r.r_uber_thread);
3573  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3574  __kmp_printf(" In Parallel: %2d\n",
3575  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3576  __kmp_printf("\n");
3577  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3578  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3579  }
3580  }
3581  } else {
3582  __kmp_printf("Ubers array is not allocated.\n");
3583  }
3584 
3585  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3586  "--------\n");
3587  while (list->next != NULL) {
3588  kmp_team_p const *team = list->entry;
3589  int i;
3590  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3591  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3592  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3593  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3594  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3595  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3596  for (i = 0; i < team->t.t_nproc; ++i) {
3597  __kmp_printf(" Thread %2d: ", i);
3598  __kmp_print_structure_thread("", team->t.t_threads[i]);
3599  }
3600  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3601  __kmp_printf("\n");
3602  list = list->next;
3603  }
3604 
3605  // Print out __kmp_thread_pool and __kmp_team_pool.
3606  __kmp_printf("\n------------------------------\nPools\n----------------------"
3607  "--------\n");
3608  __kmp_print_structure_thread("Thread pool: ",
3609  CCAST(kmp_info_t *, __kmp_thread_pool));
3610  __kmp_print_structure_team("Team pool: ",
3611  CCAST(kmp_team_t *, __kmp_team_pool));
3612  __kmp_printf("\n");
3613 
3614  // Free team list.
3615  while (list != NULL) {
3616  kmp_team_list_item_t *item = list;
3617  list = list->next;
3618  KMP_INTERNAL_FREE(item);
3619  }
3620 }
3621 
3622 #endif
3623 
3624 //---------------------------------------------------------------------------
3625 // Stuff for per-thread fast random number generator
3626 // Table of primes
3627 static const unsigned __kmp_primes[] = {
3628  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3629  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3630  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3631  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3632  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3633  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3634  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3635  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3636  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3637  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3638  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3639 
3640 //---------------------------------------------------------------------------
3641 // __kmp_get_random: Get a random number using a linear congruential method.
3642 unsigned short __kmp_get_random(kmp_info_t *thread) {
3643  unsigned x = thread->th.th_x;
3644  unsigned short r = (unsigned short)(x >> 16);
3645 
3646  thread->th.th_x = x * thread->th.th_a + 1;
3647 
3648  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3649  thread->th.th_info.ds.ds_tid, r));
3650 
3651  return r;
3652 }
3653 //--------------------------------------------------------
3654 // __kmp_init_random: Initialize a random number generator
3655 void __kmp_init_random(kmp_info_t *thread) {
3656  unsigned seed = thread->th.th_info.ds.ds_tid;
3657 
3658  thread->th.th_a =
3659  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3660  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3661  KA_TRACE(30,
3662  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3663 }
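/* The two routines above implement a per-thread 32-bit linear congruential
   generator, x_{n+1} = a * x_n + 1 (mod 2^32), emitting the high 16 bits of
   the state. A self-contained sketch of the same recurrence (illustrative,
   not part of this file):

     static unsigned x = 42;         // per-thread state (th_x)
     static unsigned a = 0x9e3779b1; // multiplier from __kmp_primes (th_a)
     unsigned short lcg_next(void) {
       unsigned short r = (unsigned short)(x >> 16); // high bits are emitted
       x = x * a + 1;                                // LCG step, increment 1
       return r;
     }

   The __kmp_primes entries are all congruent to 1 mod 4, so with increment 1
   the recurrence should attain the full 2^32 period (Hull-Dobell); seeding by
   thread id gives threads different multipliers (modulo the table size) and
   different starting states. */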
3664 
3665 #if KMP_OS_WINDOWS
3666 /* reclaim array entries for root threads that are already dead; returns the
3667  * number reclaimed */
3668 static int __kmp_reclaim_dead_roots(void) {
3669  int i, r = 0;
3670 
3671  for (i = 0; i < __kmp_threads_capacity; ++i) {
3672  if (KMP_UBER_GTID(i) &&
3673  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3674  !__kmp_root[i]
3675  ->r.r_active) { // AC: reclaim only roots died in non-active state
3676  r += __kmp_unregister_root_other_thread(i);
3677  }
3678  }
3679  return r;
3680 }
3681 #endif
3682 
3683 /* This function attempts to create free entries in __kmp_threads and
3684  __kmp_root, and returns the number of free entries generated.
3685 
3686  For Windows* OS static library, the first mechanism used is to reclaim array
3687  entries for root threads that are already dead.
3688 
3689  On all platforms, expansion is attempted on the arrays __kmp_threads and
3690  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3691  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3692  threadprivate cache array has been created. Synchronization with
3693  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3694 
3695  After any dead root reclamation, if the clipping value allows array expansion
3696  to result in the generation of a total of nNeed free slots, the function does
3697  that expansion. If not, nothing is done beyond the possible initial root
3698  thread reclamation.
3699 
3700  If any argument is negative, the behavior is undefined. */
3701 static int __kmp_expand_threads(int nNeed) {
3702  int added = 0;
3703  int minimumRequiredCapacity;
3704  int newCapacity;
3705  kmp_info_t **newThreads;
3706  kmp_root_t **newRoot;
3707 
3708  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3709  // resizing __kmp_threads does not need additional protection if foreign
3710  // threads are present
3711 
3712 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3713  /* only for Windows static library */
3714  /* reclaim array entries for root threads that are already dead */
3715  added = __kmp_reclaim_dead_roots();
3716 
3717  if (nNeed) {
3718  nNeed -= added;
3719  if (nNeed < 0)
3720  nNeed = 0;
3721  }
3722 #endif
3723  if (nNeed <= 0)
3724  return added;
3725 
3726  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3727  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3728  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3729  // > __kmp_max_nth in one of two ways:
3730  //
3731  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3732  // may not be reused by another thread, so we may need to increase
3733  // __kmp_threads_capacity to __kmp_max_nth + 1.
3734  //
3735  // 2) New foreign root(s) are encountered. We always register new foreign
3736  // roots. This may cause a smaller # of threads to be allocated at
3737  // subsequent parallel regions, but the worker threads hang around (and
3738  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3739  //
3740  // Anyway, that is the reason for moving the check to see if
3741  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3742  // instead of having it performed here. -BB
3743 
3744  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3745 
3746  /* compute expansion headroom to check if we can expand */
3747  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3748  /* possible expansion too small -- give up */
3749  return added;
3750  }
3751  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3752 
3753  newCapacity = __kmp_threads_capacity;
3754  do {
3755  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3756  : __kmp_sys_max_nth;
3757  } while (newCapacity < minimumRequiredCapacity);
3758  newThreads = (kmp_info_t **)__kmp_allocate(
3759  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3760  newRoot =
3761  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3762  KMP_MEMCPY(newThreads, __kmp_threads,
3763  __kmp_threads_capacity * sizeof(kmp_info_t *));
3764  KMP_MEMCPY(newRoot, __kmp_root,
3765  __kmp_threads_capacity * sizeof(kmp_root_t *));
3766  // Put old __kmp_threads array on a list. Any ongoing references to the old
3767  // list will be valid. This list is cleaned up at library shutdown.
3768  kmp_old_threads_list_t *node =
3769  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3770  node->threads = __kmp_threads;
3771  node->next = __kmp_old_threads_list;
3772  __kmp_old_threads_list = node;
3773 
3774  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3775  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3776  added += newCapacity - __kmp_threads_capacity;
3777  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3778 
3779  if (newCapacity > __kmp_tp_capacity) {
3780  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3781  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3782  __kmp_threadprivate_resize_cache(newCapacity);
3783  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3784  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3785  }
3786  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3787  }
3788 
3789  return added;
3790 }
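/* Growth arithmetic of the routine above (illustrative): starting from the
   current capacity, newCapacity doubles until it covers capacity + nNeed,
   clipping at __kmp_sys_max_nth. For example, with capacity 64, nNeed 100 and
   a large __kmp_sys_max_nth:

     minimumRequiredCapacity = 64 + 100 = 164
     newCapacity: 64 -> 128 -> 256   (first doubled value >= 164)

   Both new arrays come from a single allocation (threads first, roots right
   after), and the old __kmp_threads block is parked on __kmp_old_threads_list
   so that concurrent readers of the old array stay valid until shutdown. */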
3791 
3792 /* Register the current thread as a root thread and obtain our gtid. We must
3793  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3794  thread that calls from __kmp_do_serial_initialize() */
3795 int __kmp_register_root(int initial_thread) {
3796  kmp_info_t *root_thread;
3797  kmp_root_t *root;
3798  int gtid;
3799  int capacity;
3800  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3801  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3802  KMP_MB();
3803 
3804  /* 2007-03-02:
3805  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3806  is not the initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3807  condition does not work as expected -- it may be false (meaning there is at
3808  least one empty slot in the __kmp_threads array), while the only free slot
3809  may be #0, which is reserved for the initial thread and so cannot be used
3810  for this one. The following code works around this bug.
3811 
3812  However, the right solution seems to be not to reserve slot #0 for the
3813  initial thread, because:
3814  (1) there is no magic in slot #0,
3815  (2) we cannot detect the initial thread reliably (the first thread that
3816  performs serial initialization may not be a real initial thread).
3817  */
3818  capacity = __kmp_threads_capacity;
3819  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3820  --capacity;
3821  }
3822 
3823  // If it is not for initializing the hidden helper team, we need to take
3824  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3825  // in __kmp_threads_capacity.
3826  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3827  capacity -= __kmp_hidden_helper_threads_num;
3828  }
3829 
3830  /* see if there are too many threads */
3831  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3832  if (__kmp_tp_cached) {
3833  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3834  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3835  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3836  } else {
3837  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3838  __kmp_msg_null);
3839  }
3840  }
3841 
3842  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3843  // 0: initial thread, also a regular OpenMP thread.
3844  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3845  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3846  // regular OpenMP threads.
3847  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3848  // Find an available thread slot for a hidden helper thread. Hidden helper
3849  // threads occupy slots 1 through __kmp_hidden_helper_threads_num.
3850  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3851  gtid <= __kmp_hidden_helper_threads_num;
3852  gtid++)
3853  ;
3854  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3855  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3856  "hidden helper thread: T#%d\n",
3857  gtid));
3858  } else {
3859  /* find an available thread slot */
3860  // Don't reassign the zero slot since we need that to only be used by
3861  // initial thread. Slots for hidden helper threads should also be skipped.
3862  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3863  gtid = 0;
3864  } else {
3865  for (gtid = __kmp_hidden_helper_threads_num + 1;
3866  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3867  ;
3868  }
3869  KA_TRACE(
3870  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3871  KMP_ASSERT(gtid < __kmp_threads_capacity);
3872  }
3873 
3874  /* update global accounting */
3875  __kmp_all_nth++;
3876  TCW_4(__kmp_nth, __kmp_nth + 1);
3877 
3878  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3879  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3880  if (__kmp_adjust_gtid_mode) {
3881  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3882  if (TCR_4(__kmp_gtid_mode) != 2) {
3883  TCW_4(__kmp_gtid_mode, 2);
3884  }
3885  } else {
3886  if (TCR_4(__kmp_gtid_mode) != 1) {
3887  TCW_4(__kmp_gtid_mode, 1);
3888  }
3889  }
3890  }
3891 
3892 #ifdef KMP_ADJUST_BLOCKTIME
3893  /* Adjust blocktime to zero if necessary */
3894  /* Middle initialization might not have occurred yet */
3895  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3896  if (__kmp_nth > __kmp_avail_proc) {
3897  __kmp_zero_bt = TRUE;
3898  }
3899  }
3900 #endif /* KMP_ADJUST_BLOCKTIME */
3901 
3902  /* setup this new hierarchy */
3903  if (!(root = __kmp_root[gtid])) {
3904  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3905  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3906  }
3907 
3908 #if KMP_STATS_ENABLED
3909  // Initialize stats as soon as possible (right after gtid assignment).
3910  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3911  __kmp_stats_thread_ptr->startLife();
3912  KMP_SET_THREAD_STATE(SERIAL_REGION);
3913  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3914 #endif
3915  __kmp_initialize_root(root);
3916 
3917  /* setup new root thread structure */
3918  if (root->r.r_uber_thread) {
3919  root_thread = root->r.r_uber_thread;
3920  } else {
3921  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3922  if (__kmp_storage_map) {
3923  __kmp_print_thread_storage_map(root_thread, gtid);
3924  }
3925  root_thread->th.th_info.ds.ds_gtid = gtid;
3926 #if OMPT_SUPPORT
3927  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3928 #endif
3929  root_thread->th.th_root = root;
3930  if (__kmp_env_consistency_check) {
3931  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3932  }
3933 #if USE_FAST_MEMORY
3934  __kmp_initialize_fast_memory(root_thread);
3935 #endif /* USE_FAST_MEMORY */
3936 
3937 #if KMP_USE_BGET
3938  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3939  __kmp_initialize_bget(root_thread);
3940 #endif
3941  __kmp_init_random(root_thread); // Initialize random number generator
3942  }
3943 
3944  /* setup the serial team held in reserve by the root thread */
3945  if (!root_thread->th.th_serial_team) {
3946  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3947  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3948  root_thread->th.th_serial_team =
3949  __kmp_allocate_team(root, 1, 1,
3950 #if OMPT_SUPPORT
3951  ompt_data_none, // root parallel id
3952 #endif
3953  proc_bind_default, &r_icvs, 0, NULL);
3954  }
3955  KMP_ASSERT(root_thread->th.th_serial_team);
3956  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3957  root_thread->th.th_serial_team));
3958 
3959  /* drop root_thread into place */
3960  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3961 
3962  root->r.r_root_team->t.t_threads[0] = root_thread;
3963  root->r.r_hot_team->t.t_threads[0] = root_thread;
3964  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3965  // AC: the team is created in reserve, not for execution (it is unused for now).
3966  root_thread->th.th_serial_team->t.t_serialized = 0;
3967  root->r.r_uber_thread = root_thread;
3968 
3969  /* initialize the thread, get it ready to go */
3970  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3971  TCW_4(__kmp_init_gtid, TRUE);
3972 
3973  /* prepare the primary thread for get_gtid() */
3974  __kmp_gtid_set_specific(gtid);
3975 
3976 #if USE_ITT_BUILD
3977  __kmp_itt_thread_name(gtid);
3978 #endif /* USE_ITT_BUILD */
3979 
3980 #ifdef KMP_TDATA_GTID
3981  __kmp_gtid = gtid;
3982 #endif
3983  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3984  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3985 
3986  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3987  "plain=%u\n",
3988  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3989  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3990  KMP_INIT_BARRIER_STATE));
3991  { // Initialize barrier data.
3992  int b;
3993  for (b = 0; b < bs_last_barrier; ++b) {
3994  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3995 #if USE_DEBUGGER
3996  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3997 #endif
3998  }
3999  }
4000  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4001  KMP_INIT_BARRIER_STATE);
4002 
4003 #if KMP_AFFINITY_SUPPORTED
4004  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4005  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4006  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4007  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4008 #endif /* KMP_AFFINITY_SUPPORTED */
4009  root_thread->th.th_def_allocator = __kmp_def_allocator;
4010  root_thread->th.th_prev_level = 0;
4011  root_thread->th.th_prev_num_threads = 1;
4012 
4013  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4014  tmp->cg_root = root_thread;
4015  tmp->cg_thread_limit = __kmp_cg_max_nth;
4016  tmp->cg_nthreads = 1;
4017  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4018  " cg_nthreads init to 1\n",
4019  root_thread, tmp));
4020  tmp->up = NULL;
4021  root_thread->th.th_cg_roots = tmp;
4022 
4023  __kmp_root_counter++;
4024 
4025 #if OMPT_SUPPORT
4026  if (ompt_enabled.enabled) {
4027 
4028  kmp_info_t *root_thread = ompt_get_thread();
4029 
4030  ompt_set_thread_state(root_thread, ompt_state_overhead);
4031 
4032  if (ompt_enabled.ompt_callback_thread_begin) {
4033  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4034  ompt_thread_initial, __ompt_get_thread_data_internal());
4035  }
4036  ompt_data_t *task_data;
4037  ompt_data_t *parallel_data;
4038  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4039  NULL);
4040  if (ompt_enabled.ompt_callback_implicit_task) {
4041  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4042  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4043  }
4044 
4045  ompt_set_thread_state(root_thread, ompt_state_work_serial);
4046  }
4047 #endif
4048 #if OMPD_SUPPORT
4049  if (ompd_state & OMPD_ENABLE_BP)
4050  ompd_bp_thread_begin();
4051 #endif
4052 
4053  KMP_MB();
4054  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4055 
4056  return gtid;
4057 }
4058 
4059 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4060  const int max_level) {
4061  int i, n, nth;
4062  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4063  if (!hot_teams || !hot_teams[level].hot_team) {
4064  return 0;
4065  }
4066  KMP_DEBUG_ASSERT(level < max_level);
4067  kmp_team_t *team = hot_teams[level].hot_team;
4068  nth = hot_teams[level].hot_team_nth;
4069  n = nth - 1; // primary thread is not freed
4070  if (level < max_level - 1) {
4071  for (i = 0; i < nth; ++i) {
4072  kmp_info_t *th = team->t.t_threads[i];
4073  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4074  if (i > 0 && th->th.th_hot_teams) {
4075  __kmp_free(th->th.th_hot_teams);
4076  th->th.th_hot_teams = NULL;
4077  }
4078  }
4079  }
4080  __kmp_free_team(root, team, NULL);
4081  return n;
4082 }
4083 
4084 // Resets a root thread and clears its root and hot teams.
4085 // Returns the number of __kmp_threads entries directly and indirectly freed.
4086 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4087  kmp_team_t *root_team = root->r.r_root_team;
4088  kmp_team_t *hot_team = root->r.r_hot_team;
4089  int n = hot_team->t.t_nproc;
4090  int i;
4091 
4092  KMP_DEBUG_ASSERT(!root->r.r_active);
4093 
4094  root->r.r_root_team = NULL;
4095  root->r.r_hot_team = NULL;
4096  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4097  // before calling __kmp_free_team().
4098  __kmp_free_team(root, root_team, NULL);
4099  if (__kmp_hot_teams_max_level >
4100  0) { // need to free nested hot teams and their threads if any
4101  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4102  kmp_info_t *th = hot_team->t.t_threads[i];
4103  if (__kmp_hot_teams_max_level > 1) {
4104  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4105  }
4106  if (th->th.th_hot_teams) {
4107  __kmp_free(th->th.th_hot_teams);
4108  th->th.th_hot_teams = NULL;
4109  }
4110  }
4111  }
4112  __kmp_free_team(root, hot_team, NULL);
4113 
4114  // Before we can reap the thread, we need to make certain that all other
4115  // threads in the teams that had this root as ancestor have stopped trying to
4116  // steal tasks.
4117  if (__kmp_tasking_mode != tskm_immediate_exec) {
4118  __kmp_wait_to_unref_task_teams();
4119  }
4120 
4121 #if KMP_OS_WINDOWS
4122  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4123  KA_TRACE(
4124  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4125  "\n",
4126  (LPVOID) & (root->r.r_uber_thread->th),
4127  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4128  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4129 #endif /* KMP_OS_WINDOWS */
4130 
4131 #if OMPD_SUPPORT
4132  if (ompd_state & OMPD_ENABLE_BP)
4133  ompd_bp_thread_end();
4134 #endif
4135 
4136 #if OMPT_SUPPORT
4137  ompt_data_t *task_data;
4138  ompt_data_t *parallel_data;
4139  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4140  NULL);
4141  if (ompt_enabled.ompt_callback_implicit_task) {
4142  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4143  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4144  }
4145  if (ompt_enabled.ompt_callback_thread_end) {
4146  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4147  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4148  }
4149 #endif
4150 
4151  TCW_4(__kmp_nth,
4152  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4153  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4154  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4155  " to %d\n",
4156  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4157  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4158  if (i == 1) {
4159  // need to free contention group structure
4160  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4161  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4162  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4163  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4164  root->r.r_uber_thread->th.th_cg_roots = NULL;
4165  }
4166  __kmp_reap_thread(root->r.r_uber_thread, 1);
4167 
4168  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4169  // it instead of freeing it.
4170  root->r.r_uber_thread = NULL;
4171  /* mark root as no longer in use */
4172  root->r.r_begin = FALSE;
4173 
4174  return n;
4175 }
4176 
4177 void __kmp_unregister_root_current_thread(int gtid) {
4178  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4179  /* this lock should be ok, since unregister_root_current_thread is never
4180  called during an abort, only during a normal close. furthermore, if you
4181  have the forkjoin lock, you should never try to get the initz lock */
4182  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4183  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4184  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4185  "exiting T#%d\n",
4186  gtid));
4187  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4188  return;
4189  }
4190  kmp_root_t *root = __kmp_root[gtid];
4191 
4192  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4193  KMP_ASSERT(KMP_UBER_GTID(gtid));
4194  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4195  KMP_ASSERT(root->r.r_active == FALSE);
4196 
4197  KMP_MB();
4198 
4199  kmp_info_t *thread = __kmp_threads[gtid];
4200  kmp_team_t *team = thread->th.th_team;
4201  kmp_task_team_t *task_team = thread->th.th_task_team;
4202 
4203  // we need to wait for the proxy tasks before finishing the thread
4204  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4205  task_team->tt.tt_hidden_helper_task_encountered)) {
4206 #if OMPT_SUPPORT
4207  // the runtime is shutting down so we won't report any events
4208  thread->th.ompt_thread_info.state = ompt_state_undefined;
4209 #endif
4210  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4211  }
4212 
4213  __kmp_reset_root(gtid, root);
4214 
4215  KMP_MB();
4216  KC_TRACE(10,
4217  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4218 
4219  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4220 }
4221 
4222 #if KMP_OS_WINDOWS
4223 /* __kmp_forkjoin_lock must be already held
4224  Unregisters a root thread that is not the current thread. Returns the number
4225  of __kmp_threads entries freed as a result. */
4226 static int __kmp_unregister_root_other_thread(int gtid) {
4227  kmp_root_t *root = __kmp_root[gtid];
4228  int r;
4229 
4230  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4231  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4232  KMP_ASSERT(KMP_UBER_GTID(gtid));
4233  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4234  KMP_ASSERT(root->r.r_active == FALSE);
4235 
4236  r = __kmp_reset_root(gtid, root);
4237  KC_TRACE(10,
4238  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4239  return r;
4240 }
4241 #endif
4242 
4243 #if KMP_DEBUG
4244 void __kmp_task_info() {
4245 
4246  kmp_int32 gtid = __kmp_entry_gtid();
4247  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4248  kmp_info_t *this_thr = __kmp_threads[gtid];
4249  kmp_team_t *steam = this_thr->th.th_serial_team;
4250  kmp_team_t *team = this_thr->th.th_team;
4251 
4252  __kmp_printf(
4253  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4254  "ptask=%p\n",
4255  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4256  team->t.t_implicit_task_taskdata[tid].td_parent);
4257 }
4258 #endif // KMP_DEBUG
4259 
4260 /* TODO optimize with one big memclr, take out what isn't needed, split
4261  responsibility to workers as much as possible, and delay initialization of
4262  features as much as possible */
4263 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4264  int tid, int gtid) {
4265  /* this_thr->th.th_info.ds.ds_gtid is setup in
4266  kmp_allocate_thread/create_worker.
4267  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4268  KMP_DEBUG_ASSERT(this_thr != NULL);
4269  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4270  KMP_DEBUG_ASSERT(team);
4271  KMP_DEBUG_ASSERT(team->t.t_threads);
4272  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4273  kmp_info_t *master = team->t.t_threads[0];
4274  KMP_DEBUG_ASSERT(master);
4275  KMP_DEBUG_ASSERT(master->th.th_root);
4276 
4277  KMP_MB();
4278 
4279  TCW_SYNC_PTR(this_thr->th.th_team, team);
4280 
4281  this_thr->th.th_info.ds.ds_tid = tid;
4282  this_thr->th.th_set_nproc = 0;
4283  if (__kmp_tasking_mode != tskm_immediate_exec)
4284  // When tasking is possible, threads are not safe to reap until they are
4285  // done tasking; this will be set when tasking code is exited in wait
4286  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4287  else // no tasking --> always safe to reap
4288  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4289  this_thr->th.th_set_proc_bind = proc_bind_default;
4290 
4291 #if KMP_AFFINITY_SUPPORTED
4292  this_thr->th.th_new_place = this_thr->th.th_current_place;
4293 #endif
4294  this_thr->th.th_root = master->th.th_root;
4295 
4296  /* setup the thread's cache of the team structure */
4297  this_thr->th.th_team_nproc = team->t.t_nproc;
4298  this_thr->th.th_team_master = master;
4299  this_thr->th.th_team_serialized = team->t.t_serialized;
4300 
4301  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4302 
4303  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4304  tid, gtid, this_thr, this_thr->th.th_current_task));
4305 
4306  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4307  team, tid, TRUE);
4308 
4309  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4310  tid, gtid, this_thr, this_thr->th.th_current_task));
4311  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4312  // __kmp_initialize_team()?
4313 
4314  /* TODO no worksharing in speculative threads */
4315  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4316 
4317  this_thr->th.th_local.this_construct = 0;
4318 
4319  if (!this_thr->th.th_pri_common) {
4320  this_thr->th.th_pri_common =
4321  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4322  if (__kmp_storage_map) {
4323  __kmp_print_storage_map_gtid(
4324  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4325  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4326  }
4327  this_thr->th.th_pri_head = NULL;
4328  }
4329 
4330  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4331  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4332  // Make new thread's CG root same as primary thread's
4333  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4334  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4335  if (tmp) {
4336  // worker changes CG, need to check if old CG should be freed
4337  int i = tmp->cg_nthreads--;
4338  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4339  " on node %p of thread %p to %d\n",
4340  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4341  if (i == 1) {
4342  __kmp_free(tmp); // last thread left CG --> free it
4343  }
4344  }
4345  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4346  // Increment new thread's CG root's counter to add the new thread
4347  this_thr->th.th_cg_roots->cg_nthreads++;
4348  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4349  " node %p of thread %p to %d\n",
4350  this_thr, this_thr->th.th_cg_roots,
4351  this_thr->th.th_cg_roots->cg_root,
4352  this_thr->th.th_cg_roots->cg_nthreads));
4353  this_thr->th.th_current_task->td_icvs.thread_limit =
4354  this_thr->th.th_cg_roots->cg_thread_limit;
4355  }
4356 
4357  /* Initialize dynamic dispatch */
4358  {
4359  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4360  // Use team max_nproc since this will never change for the team.
4361  size_t disp_size =
4362  sizeof(dispatch_private_info_t) *
4363  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4364  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4365  team->t.t_max_nproc));
4366  KMP_ASSERT(dispatch);
4367  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4368  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4369 
4370  dispatch->th_disp_index = 0;
4371  dispatch->th_doacross_buf_idx = 0;
4372  if (!dispatch->th_disp_buffer) {
4373  dispatch->th_disp_buffer =
4374  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4375 
4376  if (__kmp_storage_map) {
4377  __kmp_print_storage_map_gtid(
4378  gtid, &dispatch->th_disp_buffer[0],
4379  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4380  ? 1
4381  : __kmp_dispatch_num_buffers],
4382  disp_size,
4383  "th_%d.th_dispatch.th_disp_buffer "
4384  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4385  gtid, team->t.t_id, gtid);
4386  }
4387  } else {
4388  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4389  }
4390 
4391  dispatch->th_dispatch_pr_current = 0;
4392  dispatch->th_dispatch_sh_current = 0;
4393 
4394  dispatch->th_deo_fcn = 0; /* ORDERED */
4395  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4396  }
4397 
4398  this_thr->th.th_next_pool = NULL;
4399 
4400  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4401  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4402 
4403  KMP_MB();
4404 }
4405 
4406 /* Allocate a new thread for the requesting team. This is only called from
4407  within a forkjoin critical section. We will first try to get an available
4408  thread from the thread pool; if none is available, we fork a new one,
4409  assuming we are able to create one. This should be assured, as the caller
4410  should have checked on this first. */
4411 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4412  int new_tid) {
4413  kmp_team_t *serial_team;
4414  kmp_info_t *new_thr;
4415  int new_gtid;
4416 
4417  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4418  KMP_DEBUG_ASSERT(root && team);
4419  KMP_MB();
4420 
4421  /* first, try to get one from the thread pool unless allocating thread is
4422  * the main hidden helper thread. The hidden helper team should always
4423  * allocate new OS threads. */
4424  if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4425  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4426  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4427  if (new_thr == __kmp_thread_pool_insert_pt) {
4428  __kmp_thread_pool_insert_pt = NULL;
4429  }
4430  TCW_4(new_thr->th.th_in_pool, FALSE);
4431  __kmp_suspend_initialize_thread(new_thr);
4432  __kmp_lock_suspend_mx(new_thr);
4433  if (new_thr->th.th_active_in_pool == TRUE) {
4434  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4435  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4436  new_thr->th.th_active_in_pool = FALSE;
4437  }
4438  __kmp_unlock_suspend_mx(new_thr);
4439 
4440  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4441  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4442  KMP_ASSERT(!new_thr->th.th_team);
4443  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4444 
4445  /* setup the thread structure */
4446  __kmp_initialize_info(new_thr, team, new_tid,
4447  new_thr->th.th_info.ds.ds_gtid);
4448  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4449 
4450  TCW_4(__kmp_nth, __kmp_nth + 1);
4451 
4452  new_thr->th.th_task_state = 0;
4453 
4454  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4455  // Make sure pool thread has transitioned to waiting on own thread struct
4456  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4457  // Thread activated in __kmp_allocate_team when increasing team size
4458  }
4459 
4460 #ifdef KMP_ADJUST_BLOCKTIME
4461  /* Adjust blocktime back to zero if necessary */
4462  /* Middle initialization might not have occurred yet */
4463  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4464  if (__kmp_nth > __kmp_avail_proc) {
4465  __kmp_zero_bt = TRUE;
4466  }
4467  }
4468 #endif /* KMP_ADJUST_BLOCKTIME */
4469 
4470 #if KMP_DEBUG
4471  // If the thread entered the pool via __kmp_free_thread, wait_flag should
4472  // not equal KMP_BARRIER_PARENT_FLAG.
4473  int b;
4474  kmp_balign_t *balign = new_thr->th.th_bar;
4475  for (b = 0; b < bs_last_barrier; ++b)
4476  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4477 #endif
4478 
4479  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4480  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4481 
4482  KMP_MB();
4483  return new_thr;
4484  }
4485 
4486  /* none available in the pool, so fork a new one */
4487  KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4488  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4489 
4490 #if KMP_USE_MONITOR
4491  // If this is the first worker thread the RTL is creating, then also
4492  // launch the monitor thread. We try to do this as early as possible.
4493  if (!TCR_4(__kmp_init_monitor)) {
4494  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4495  if (!TCR_4(__kmp_init_monitor)) {
4496  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4497  TCW_4(__kmp_init_monitor, 1);
4498  __kmp_create_monitor(&__kmp_monitor);
4499  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4500 #if KMP_OS_WINDOWS
4501  // AC: wait until the monitor has started. This is a fix for CQ232808.
4502  // If the library is loaded/unloaded in a loop with small (parallel) work
4503  // in between, there is a high probability that the monitor thread starts
4504  // only after the library has shut down. At shutdown it is too late to
4505  // cope with the problem: when the primary thread is in DllMain (process
4506  // detach) the monitor has no chance to start (it is blocked), and the
4507  // primary thread has no means of informing the monitor that the library
4508  // is gone, because all the memory the monitor could access is about to
4509  // be released/reset.
4510  while (TCR_4(__kmp_init_monitor) < 2) {
4511  KMP_YIELD(TRUE);
4512  }
4513  KF_TRACE(10, ("after monitor thread has started\n"));
4514 #endif
4515  }
4516  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4517  }
4518 #endif
4519 
4520  KMP_MB();
4521 
4522  {
4523  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4524  ? 1
4525  : __kmp_hidden_helper_threads_num + 1;
4526 
4527  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4528  ++new_gtid) {
4529  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4530  }
4531 
4532  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4533  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4534  }
4535  }
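// Note: while the hidden helper threads are being initialized, gtids are
// handed out starting at 1 and stay within [1, __kmp_hidden_helper_threads_num];
// regular worker threads always start above that range, so the two never overlap.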
4536 
4537  /* allocate space for it. */
4538  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4539 
4540  new_thr->th.th_nt_strict = false;
4541  new_thr->th.th_nt_loc = NULL;
4542  new_thr->th.th_nt_sev = severity_fatal;
4543  new_thr->th.th_nt_msg = NULL;
4544 
4545  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4546 
4547 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4548 // suppress race-condition detection on synchronization flags in debug mode;
4549 // this helps to analyze library internals by eliminating false positives
4550  __itt_suppress_mark_range(
4551  __itt_suppress_range, __itt_suppress_threading_errors,
4552  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4553  __itt_suppress_mark_range(
4554  __itt_suppress_range, __itt_suppress_threading_errors,
4555  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4556 #if KMP_OS_WINDOWS
4557  __itt_suppress_mark_range(
4558  __itt_suppress_range, __itt_suppress_threading_errors,
4559  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4560 #else
4561  __itt_suppress_mark_range(__itt_suppress_range,
4562  __itt_suppress_threading_errors,
4563  &new_thr->th.th_suspend_init_count,
4564  sizeof(new_thr->th.th_suspend_init_count));
4565 #endif
4566  // TODO: check if we need to also suppress b_arrived flags
4567  __itt_suppress_mark_range(__itt_suppress_range,
4568  __itt_suppress_threading_errors,
4569  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4570  sizeof(new_thr->th.th_bar[0].bb.b_go));
4571  __itt_suppress_mark_range(__itt_suppress_range,
4572  __itt_suppress_threading_errors,
4573  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4574  sizeof(new_thr->th.th_bar[1].bb.b_go));
4575  __itt_suppress_mark_range(__itt_suppress_range,
4576  __itt_suppress_threading_errors,
4577  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4578  sizeof(new_thr->th.th_bar[2].bb.b_go));
4579 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4580  if (__kmp_storage_map) {
4581  __kmp_print_thread_storage_map(new_thr, new_gtid);
4582  }
4583 
4584  // add the reserve serialized team, initialized from the team's primary thread
4585  {
4586  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4587  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4588  new_thr->th.th_serial_team = serial_team =
4589  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4590 #if OMPT_SUPPORT
4591  ompt_data_none, // root parallel id
4592 #endif
4593  proc_bind_default, &r_icvs, 0, NULL);
4594  }
4595  KMP_ASSERT(serial_team);
4596  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4597  // execution (it is unused for now).
4598  serial_team->t.t_threads[0] = new_thr;
4599  KF_TRACE(10,
4600  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4601  new_thr));
4602 
4603  /* setup the thread structures */
4604  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4605 
4606 #if USE_FAST_MEMORY
4607  __kmp_initialize_fast_memory(new_thr);
4608 #endif /* USE_FAST_MEMORY */
4609 
4610 #if KMP_USE_BGET
4611  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4612  __kmp_initialize_bget(new_thr);
4613 #endif
4614 
4615  __kmp_init_random(new_thr); // Initialize random number generator
4616 
4617  /* Initialize these only once when a thread is grabbed for a team allocation */
4618  KA_TRACE(20,
4619  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4620  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4621 
4622  int b;
4623  kmp_balign_t *balign = new_thr->th.th_bar;
4624  for (b = 0; b < bs_last_barrier; ++b) {
4625  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4626  balign[b].bb.team = NULL;
4627  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4628  balign[b].bb.use_oncore_barrier = 0;
4629  }
4630 
4631  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4632  new_thr->th.th_sleep_loc_type = flag_unset;
4633 
4634  new_thr->th.th_spin_here = FALSE;
4635  new_thr->th.th_next_waiting = 0;
4636 #if KMP_OS_UNIX
4637  new_thr->th.th_blocking = false;
4638 #endif
4639 
4640 #if KMP_AFFINITY_SUPPORTED
4641  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4642  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4643  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4644  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4645 #endif
4646  new_thr->th.th_def_allocator = __kmp_def_allocator;
4647  new_thr->th.th_prev_level = 0;
4648  new_thr->th.th_prev_num_threads = 1;
4649 
4650  TCW_4(new_thr->th.th_in_pool, FALSE);
4651  new_thr->th.th_active_in_pool = FALSE;
4652  TCW_4(new_thr->th.th_active, TRUE);
4653 
4654  new_thr->th.th_set_nested_nth = NULL;
4655  new_thr->th.th_set_nested_nth_sz = 0;
4656 
4657  /* adjust the global counters */
4658  __kmp_all_nth++;
4659  __kmp_nth++;
4660 
4661  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4662  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4663  if (__kmp_adjust_gtid_mode) {
4664  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4665  if (TCR_4(__kmp_gtid_mode) != 2) {
4666  TCW_4(__kmp_gtid_mode, 2);
4667  }
4668  } else {
4669  if (TCR_4(__kmp_gtid_mode) != 1) {
4670  TCW_4(__kmp_gtid_mode, 1);
4671  }
4672  }
4673  }
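/* Worked example (illustrative): with __kmp_tls_gtid_min == 20, creating the
   20th thread switches __kmp_gtid_mode to 2 (keyed TLS lookup); if the thread
   count later drops below the threshold, the same adjustment logic elsewhere
   can switch it back to 1 (stack-pointer search). */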
4674 
4675 #ifdef KMP_ADJUST_BLOCKTIME
4676  /* Adjust blocktime back to zero if necessary */
4677  /* Middle initialization might not have occurred yet */
4678  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4679  if (__kmp_nth > __kmp_avail_proc) {
4680  __kmp_zero_bt = TRUE;
4681  }
4682  }
4683 #endif /* KMP_ADJUST_BLOCKTIME */
4684 
4685 #if KMP_AFFINITY_SUPPORTED
4686  // Set the affinity and topology information for new thread
4687  __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4688 #endif
4689 
4690  /* actually fork it and create the new worker thread */
4691  KF_TRACE(
4692  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4693  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4694  KF_TRACE(10,
4695  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4696 
4697  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4698  new_gtid));
4699  KMP_MB();
4700  return new_thr;
4701 }
4702 
4703 /* Reinitialize team for reuse.
4704  The hot team code calls this routine at every fork barrier, so the EPCC
4705  barrier tests are extremely sensitive to changes in it, esp. writes to
4706  the team struct, which cause a cache invalidation in all threads.
4707  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4708 static void __kmp_reinitialize_team(kmp_team_t *team,
4709  kmp_internal_control_t *new_icvs,
4710  ident_t *loc) {
4711  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4712  team->t.t_threads[0], team));
4713  KMP_DEBUG_ASSERT(team && new_icvs);
4714  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4715  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4716 
4717  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4718  // Copy ICVs to the primary thread's implicit taskdata
4719  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4720  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4721 
4722  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4723  team->t.t_threads[0], team));
4724 }
4725 
4726 /* Initialize the team data structure.
4727  This assumes the t_threads and t_max_nproc are already set.
4728  Also, we don't touch the arguments */
4729 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4730  kmp_internal_control_t *new_icvs,
4731  ident_t *loc) {
4732  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4733 
4734  /* verify */
4735  KMP_DEBUG_ASSERT(team);
4736  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4737  KMP_DEBUG_ASSERT(team->t.t_threads);
4738  KMP_MB();
4739 
4740  team->t.t_master_tid = 0; /* not needed */
4741  /* team->t.t_master_bar; not needed */
4742  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4743  team->t.t_nproc = new_nproc;
4744 
4745  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4746  team->t.t_next_pool = NULL;
4747  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4748  * up hot team */
4749 
4750  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4751  team->t.t_invoke = NULL; /* not needed */
4752 
4753  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4754  team->t.t_sched.sched = new_icvs->sched.sched;
4755 
4756 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4757  team->t.t_fp_control_saved = FALSE; /* not needed */
4758  team->t.t_x87_fpu_control_word = 0; /* not needed */
4759  team->t.t_mxcsr = 0; /* not needed */
4760 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4761 
4762  team->t.t_construct = 0;
4763 
4764  team->t.t_ordered.dt.t_value = 0;
4765  team->t.t_master_active = FALSE;
4766 
4767 #ifdef KMP_DEBUG
4768  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4769 #endif
4770 #if KMP_OS_WINDOWS
4771  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4772 #endif
4773 
4774  team->t.t_control_stack_top = NULL;
4775 
4776  __kmp_reinitialize_team(team, new_icvs, loc);
4777 
4778  KMP_MB();
4779  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4780 }
4781 
4782 #if KMP_AFFINITY_SUPPORTED
4783 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4784  int first, int last, int newp) {
4785  th->th.th_first_place = first;
4786  th->th.th_last_place = last;
4787  th->th.th_new_place = newp;
4788  if (newp != th->th.th_current_place) {
4789  if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4790  team->t.t_display_affinity = 1;
4791  // Copy topology information associated with the new place
4792  th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4793  th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4794  }
4795 }
4796 
4797 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4798 // It calculates the worker + primary thread's partition based upon the parent
4799 // thread's partition, and binds each worker to a thread in their partition.
4800 // The primary thread's partition should already include its current binding.
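// Worked example (illustrative): primary thread bound to place 2 with
// partition [0,7], proc_bind_close, 4 threads -> workers 1..3 are assigned
// places 3, 4 and 5; had the scan reached place 7, it would wrap to place 0.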
4801 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4802  // Do not partition places for the hidden helper team
4803  if (KMP_HIDDEN_HELPER_TEAM(team))
4804  return;
4805  // Copy the primary thread's place partition to the team struct
4806  kmp_info_t *master_th = team->t.t_threads[0];
4807  KMP_DEBUG_ASSERT(master_th != NULL);
4808  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4809  int first_place = master_th->th.th_first_place;
4810  int last_place = master_th->th.th_last_place;
4811  int masters_place = master_th->th.th_current_place;
4812  int num_masks = __kmp_affinity.num_masks;
4813  team->t.t_first_place = first_place;
4814  team->t.t_last_place = last_place;
4815 
4816  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4817  "bound to place %d partition = [%d,%d]\n",
4818  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4819  team->t.t_id, masters_place, first_place, last_place));
4820 
4821  switch (proc_bind) {
4822 
4823  case proc_bind_default:
4824  // Serial teams might have the proc_bind policy set to proc_bind_default.
4825  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4826  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4827  break;
4828 
4829  case proc_bind_primary: {
4830  int f;
4831  int n_th = team->t.t_nproc;
4832  for (f = 1; f < n_th; f++) {
4833  kmp_info_t *th = team->t.t_threads[f];
4834  KMP_DEBUG_ASSERT(th != NULL);
4835  __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4836 
4837  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4838  "partition = [%d,%d]\n",
4839  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4840  f, masters_place, first_place, last_place));
4841  }
4842  } break;
4843 
4844  case proc_bind_close: {
4845  int f;
4846  int n_th = team->t.t_nproc;
4847  int n_places;
4848  if (first_place <= last_place) {
4849  n_places = last_place - first_place + 1;
4850  } else {
4851  n_places = num_masks - first_place + last_place + 1;
4852  }
4853  if (n_th <= n_places) {
4854  int place = masters_place;
4855  for (f = 1; f < n_th; f++) {
4856  kmp_info_t *th = team->t.t_threads[f];
4857  KMP_DEBUG_ASSERT(th != NULL);
4858 
4859  if (place == last_place) {
4860  place = first_place;
4861  } else if (place == (num_masks - 1)) {
4862  place = 0;
4863  } else {
4864  place++;
4865  }
4866  __kmp_set_thread_place(team, th, first_place, last_place, place);
4867 
4868  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4869  "partition = [%d,%d]\n",
4870  __kmp_gtid_from_thread(team->t.t_threads[f]),
4871  team->t.t_id, f, place, first_place, last_place));
4872  }
4873  } else {
4874  int S, rem, gap, s_count;
4875  S = n_th / n_places;
4876  s_count = 0;
4877  rem = n_th - (S * n_places);
4878  gap = rem > 0 ? n_places / rem : n_places;
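// Worked example (illustrative): n_th = 10, n_places = 4 gives S = 2,
// rem = 2, gap = 2; the loop assigns 3,2,3,2 threads to consecutive places
// (an extra thread on every gap-th place) and ends back at the primary
// thread's place.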
4879  int place = masters_place;
4880  int gap_ct = gap;
4881  for (f = 0; f < n_th; f++) {
4882  kmp_info_t *th = team->t.t_threads[f];
4883  KMP_DEBUG_ASSERT(th != NULL);
4884 
4885  __kmp_set_thread_place(team, th, first_place, last_place, place);
4886  s_count++;
4887 
4888  if ((s_count == S) && rem && (gap_ct == gap)) {
4889  // do nothing; an extra thread is added to this place on the next pass
4890  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4891  // we added an extra thread to this place; move to next place
4892  if (place == last_place) {
4893  place = first_place;
4894  } else if (place == (num_masks - 1)) {
4895  place = 0;
4896  } else {
4897  place++;
4898  }
4899  s_count = 0;
4900  gap_ct = 1;
4901  rem--;
4902  } else if (s_count == S) { // place full; don't add extra
4903  if (place == last_place) {
4904  place = first_place;
4905  } else if (place == (num_masks - 1)) {
4906  place = 0;
4907  } else {
4908  place++;
4909  }
4910  gap_ct++;
4911  s_count = 0;
4912  }
4913 
4914  KA_TRACE(100,
4915  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4916  "partition = [%d,%d]\n",
4917  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4918  th->th.th_new_place, first_place, last_place));
4919  }
4920  KMP_DEBUG_ASSERT(place == masters_place);
4921  }
4922  } break;
4923 
4924  case proc_bind_spread: {
4925  int f;
4926  int n_th = team->t.t_nproc;
4927  int n_places;
4928  int thidx;
4929  if (first_place <= last_place) {
4930  n_places = last_place - first_place + 1;
4931  } else {
4932  n_places = num_masks - first_place + last_place + 1;
4933  }
4934  if (n_th <= n_places) {
4935  int place = -1;
4936 
4937  if (n_places != num_masks) {
4938  int S = n_places / n_th;
4939  int s_count, rem, gap, gap_ct;
4940 
4941  place = masters_place;
4942  rem = n_places - n_th * S;
4943  gap = rem ? n_th / rem : 1;
4944  gap_ct = gap;
4945  thidx = n_th;
4946  if (update_master_only == 1)
4947  thidx = 1;
4948  for (f = 0; f < thidx; f++) {
4949  kmp_info_t *th = team->t.t_threads[f];
4950  KMP_DEBUG_ASSERT(th != NULL);
4951 
4952  int fplace = place, nplace = place;
4953  s_count = 1;
4954  while (s_count < S) {
4955  if (place == last_place) {
4956  place = first_place;
4957  } else if (place == (num_masks - 1)) {
4958  place = 0;
4959  } else {
4960  place++;
4961  }
4962  s_count++;
4963  }
4964  if (rem && (gap_ct == gap)) {
4965  if (place == last_place) {
4966  place = first_place;
4967  } else if (place == (num_masks - 1)) {
4968  place = 0;
4969  } else {
4970  place++;
4971  }
4972  rem--;
4973  gap_ct = 0;
4974  }
4975  __kmp_set_thread_place(team, th, fplace, place, nplace);
4976  gap_ct++;
4977 
4978  if (place == last_place) {
4979  place = first_place;
4980  } else if (place == (num_masks - 1)) {
4981  place = 0;
4982  } else {
4983  place++;
4984  }
4985 
4986  KA_TRACE(100,
4987  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4988  "partition = [%d,%d], num_masks: %u\n",
4989  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4990  f, th->th.th_new_place, th->th.th_first_place,
4991  th->th.th_last_place, num_masks));
4992  }
4993  } else {
4994  /* Having a uniform space of available computation places, we can create
4995  T partitions of round(P/T) size (T = n_th, P = n_places) and put each
4996  thread into the first place of its partition. */
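/* Worked example (illustrative): num_masks == n_places == 8, n_th == 4,
   masters_place == 0 -> spacing = 9.0/4 = 2.25, yielding partitions
   [0,1], [2,3], [4,5], [6,7], with each thread bound to the first place
   of its partition. */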
4997  double current = static_cast<double>(masters_place);
4998  double spacing =
4999  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5000  int first, last;
5001  kmp_info_t *th;
5002 
5003  thidx = n_th + 1;
5004  if (update_master_only == 1)
5005  thidx = 1;
5006  for (f = 0; f < thidx; f++) {
5007  first = static_cast<int>(current);
5008  last = static_cast<int>(current + spacing) - 1;
5009  KMP_DEBUG_ASSERT(last >= first);
5010  if (first >= n_places) {
5011  if (masters_place) {
5012  first -= n_places;
5013  last -= n_places;
5014  if (first == (masters_place + 1)) {
5015  KMP_DEBUG_ASSERT(f == n_th);
5016  first--;
5017  }
5018  if (last == masters_place) {
5019  KMP_DEBUG_ASSERT(f == (n_th - 1));
5020  last--;
5021  }
5022  } else {
5023  KMP_DEBUG_ASSERT(f == n_th);
5024  first = 0;
5025  last = 0;
5026  }
5027  }
5028  if (last >= n_places) {
5029  last = (n_places - 1);
5030  }
5031  place = first;
5032  current += spacing;
5033  if (f < n_th) {
5034  KMP_DEBUG_ASSERT(0 <= first);
5035  KMP_DEBUG_ASSERT(n_places > first);
5036  KMP_DEBUG_ASSERT(0 <= last);
5037  KMP_DEBUG_ASSERT(n_places > last);
5038  KMP_DEBUG_ASSERT(last_place >= first_place);
5039  th = team->t.t_threads[f];
5040  KMP_DEBUG_ASSERT(th);
5041  __kmp_set_thread_place(team, th, first, last, place);
5042  KA_TRACE(100,
5043  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5044  "partition = [%d,%d], spacing = %.4f\n",
5045  __kmp_gtid_from_thread(team->t.t_threads[f]),
5046  team->t.t_id, f, th->th.th_new_place,
5047  th->th.th_first_place, th->th.th_last_place, spacing));
5048  }
5049  }
5050  }
5051  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5052  } else {
5053  int S, rem, gap, s_count;
5054  S = n_th / n_places;
5055  s_count = 0;
5056  rem = n_th - (S * n_places);
5057  gap = rem > 0 ? n_places / rem : n_places;
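// Same S/rem/gap distribution as the oversubscribed proc_bind_close case
// above (see the worked example there); the only difference is that each
// thread's partition collapses to the single place it is assigned.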
5058  int place = masters_place;
5059  int gap_ct = gap;
5060  thidx = n_th;
5061  if (update_master_only == 1)
5062  thidx = 1;
5063  for (f = 0; f < thidx; f++) {
5064  kmp_info_t *th = team->t.t_threads[f];
5065  KMP_DEBUG_ASSERT(th != NULL);
5066 
5067  __kmp_set_thread_place(team, th, place, place, place);
5068  s_count++;
5069 
5070  if ((s_count == S) && rem && (gap_ct == gap)) {
5071  // do nothing; an extra thread is added to this place on the next pass
5072  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5073  // we added an extra thread to this place; move on to next place
5074  if (place == last_place) {
5075  place = first_place;
5076  } else if (place == (num_masks - 1)) {
5077  place = 0;
5078  } else {
5079  place++;
5080  }
5081  s_count = 0;
5082  gap_ct = 1;
5083  rem--;
5084  } else if (s_count == S) { // place is full; don't add extra thread
5085  if (place == last_place) {
5086  place = first_place;
5087  } else if (place == (num_masks - 1)) {
5088  place = 0;
5089  } else {
5090  place++;
5091  }
5092  gap_ct++;
5093  s_count = 0;
5094  }
5095 
5096  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5097  "partition = [%d,%d]\n",
5098  __kmp_gtid_from_thread(team->t.t_threads[f]),
5099  team->t.t_id, f, th->th.th_new_place,
5100  th->th.th_first_place, th->th.th_last_place));
5101  }
5102  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5103  }
5104  } break;
5105 
5106  default:
5107  break;
5108  }
5109 
5110  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5111 }
5112 
5113 #endif // KMP_AFFINITY_SUPPORTED
5114 
5115 /* allocate a new team data structure to use. take one off of the free pool if
5116  available */
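/* Illustrative call from a fork path with OMPT support compiled out (an
   exposition-only sketch; real callers also pass ompt_parallel_data when
   OMPT_SUPPORT is enabled):

     kmp_team_t *t = __kmp_allocate_team(root, nthreads, nthreads,
                                         proc_bind_default, &icvs,
                                         argc, master_th);
*/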
5117 kmp_team_t *__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5118 #if OMPT_SUPPORT
5119  ompt_data_t ompt_parallel_data,
5120 #endif
5121  kmp_proc_bind_t new_proc_bind,
5122  kmp_internal_control_t *new_icvs, int argc,
5123  kmp_info_t *master) {
5124  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5125  int f;
5126  kmp_team_t *team;
5127  int use_hot_team = !root->r.r_active;
5128  int level = 0;
5129  int do_place_partition = 1;
5130 
5131  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5132  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5133  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5134  KMP_MB();
5135 
5136  kmp_hot_team_ptr_t *hot_teams;
5137  if (master) {
5138  team = master->th.th_team;
5139  level = team->t.t_active_level;
5140  if (master->th.th_teams_microtask) { // in teams construct?
5141  if (master->th.th_teams_size.nteams > 1 &&
5142  ( // #teams > 1
5143  team->t.t_pkfn ==
5144  (microtask_t)__kmp_teams_master || // inner fork of the teams
5145  master->th.th_teams_level <
5146  team->t.t_level)) { // or nested parallel inside the teams
5147  ++level; // do not increment if #teams==1 or for the outer fork of the
5148  // teams; increment otherwise
5149  }
5150  // Do not perform the place partition if inner fork of the teams
5151  // Wait until nested parallel region encountered inside teams construct
5152  if ((master->th.th_teams_size.nteams == 1 &&
5153  master->th.th_teams_level >= team->t.t_level) ||
5154  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5155  do_place_partition = 0;
5156  }
5157  hot_teams = master->th.th_hot_teams;
5158  if (level < __kmp_hot_teams_max_level && hot_teams &&
5159  hot_teams[level].hot_team) {
5160  // hot team has already been allocated for given level
5161  use_hot_team = 1;
5162  } else {
5163  use_hot_team = 0;
5164  }
5165  } else {
5166  // check we won't access uninitialized hot_teams, just in case
5167  KMP_DEBUG_ASSERT(new_nproc == 1);
5168  }
5169  // Optimization to use a "hot" team
5170  if (use_hot_team && new_nproc > 1) {
5171  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5172  team = hot_teams[level].hot_team;
5173 #if KMP_DEBUG
5174  if (__kmp_tasking_mode != tskm_immediate_exec) {
5175  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5176  "task_team[1] = %p before reinit\n",
5177  team->t.t_task_team[0], team->t.t_task_team[1]));
5178  }
5179 #endif
5180 
5181  if (team->t.t_nproc != new_nproc &&
5182  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5183  // Distributed barrier may need a resize
5184  int old_nthr = team->t.t_nproc;
5185  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5186  }
5187 
5188  // If not doing the place partition, then reset the team's proc bind
5189  // to indicate that partitioning of all threads still needs to take place
5190  if (do_place_partition == 0)
5191  team->t.t_proc_bind = proc_bind_default;
5192  // Has the number of threads changed?
5193  /* Let's assume the most common case is that the number of threads is
5194  unchanged, and put that case first. */
5195  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5196  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5197  // This case can mean that omp_set_num_threads() was called and the hot
5198  // team size was already reduced, so we check the special flag
5199  if (team->t.t_size_changed == -1) {
5200  team->t.t_size_changed = 1;
5201  } else {
5202  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5203  }
5204 
5205  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5206  kmp_r_sched_t new_sched = new_icvs->sched;
5207  // set primary thread's schedule as new run-time schedule
5208  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5209 
5210  __kmp_reinitialize_team(team, new_icvs,
5211  root->r.r_uber_thread->th.th_ident);
5212 
5213  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5214  team->t.t_threads[0], team));
5215  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5216 
5217 #if KMP_AFFINITY_SUPPORTED
5218  if ((team->t.t_size_changed == 0) &&
5219  (team->t.t_proc_bind == new_proc_bind)) {
5220  if (new_proc_bind == proc_bind_spread) {
5221  if (do_place_partition) {
5222  // add flag to update only master for spread
5223  __kmp_partition_places(team, 1);
5224  }
5225  }
5226  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5227  "proc_bind = %d, partition = [%d,%d]\n",
5228  team->t.t_id, new_proc_bind, team->t.t_first_place,
5229  team->t.t_last_place));
5230  } else {
5231  if (do_place_partition) {
5232  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5233  __kmp_partition_places(team);
5234  }
5235  }
5236 #else
5237  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5238 #endif /* KMP_AFFINITY_SUPPORTED */
5239  } else if (team->t.t_nproc > new_nproc) {
5240  KA_TRACE(20,
5241  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5242  new_nproc));
5243 
5244  team->t.t_size_changed = 1;
5245  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5246  // Barrier size already reduced earlier in this function
5247  // Activate team threads via th_used_in_team
5248  __kmp_add_threads_to_team(team, new_nproc);
5249  }
5250  // When decreasing team size, threads no longer in the team should
5251  // unref task team.
5252  if (__kmp_tasking_mode != tskm_immediate_exec) {
5253  for (f = new_nproc; f < team->t.t_nproc; f++) {
5254  kmp_info_t *th = team->t.t_threads[f];
5255  KMP_DEBUG_ASSERT(th);
5256  th->th.th_task_team = NULL;
5257  }
5258  }
5259  if (__kmp_hot_teams_mode == 0) {
5260  // AC: the saved number of threads should match the team's value in this
5261  // mode; it can be bigger in mode 1, when hot team has threads in reserve
5262  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5263  hot_teams[level].hot_team_nth = new_nproc;
5264  /* release the extra threads we don't need any more */
5265  for (f = new_nproc; f < team->t.t_nproc; f++) {
5266  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5267  __kmp_free_thread(team->t.t_threads[f]);
5268  team->t.t_threads[f] = NULL;
5269  }
5270  } // (__kmp_hot_teams_mode == 0)
5271  else {
5272  // When keeping extra threads in team, switch threads to wait on own
5273  // b_go flag
5274  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5275  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5276  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5277  for (int b = 0; b < bs_last_barrier; ++b) {
5278  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5279  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5280  }
5281  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5282  }
5283  }
5284  }
5285  team->t.t_nproc = new_nproc;
5286  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5287  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5288  __kmp_reinitialize_team(team, new_icvs,
5289  root->r.r_uber_thread->th.th_ident);
5290 
5291  // Update remaining threads
5292  for (f = 0; f < new_nproc; ++f) {
5293  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5294  }
5295 
5296  // restore the current task state of the primary thread: should be the
5297  // implicit task
5298  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5299  team->t.t_threads[0], team));
5300 
5301  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5302 
5303 #ifdef KMP_DEBUG
5304  for (f = 0; f < team->t.t_nproc; f++) {
5305  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5306  team->t.t_threads[f]->th.th_team_nproc ==
5307  team->t.t_nproc);
5308  }
5309 #endif
5310 
5311  if (do_place_partition) {
5312  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5313 #if KMP_AFFINITY_SUPPORTED
5314  __kmp_partition_places(team);
5315 #endif
5316  }
5317  } else { // team->t.t_nproc < new_nproc
5318 
5319  KA_TRACE(20,
5320  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5321  new_nproc));
5322  int old_nproc = team->t.t_nproc; // saved to update only new threads below
5323  team->t.t_size_changed = 1;
5324 
5325  int avail_threads = hot_teams[level].hot_team_nth;
5326  if (new_nproc < avail_threads)
5327  avail_threads = new_nproc;
5328  kmp_info_t **other_threads = team->t.t_threads;
5329  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5330  // Adjust barrier data of reserved threads (if any) of the team
5331  // Other data will be set in __kmp_initialize_info() below.
5332  int b;
5333  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5334  for (b = 0; b < bs_last_barrier; ++b) {
5335  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5336  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5337 #if USE_DEBUGGER
5338  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5339 #endif
5340  }
5341  }
5342  if (hot_teams[level].hot_team_nth >= new_nproc) {
5343  // we have all the needed threads in reserve, no need to allocate any;
5344  // this is only possible in mode 1 -- mode 0 cannot have reserved threads
5345  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5346  team->t.t_nproc = new_nproc; // just get reserved threads involved
5347  } else {
5348  // We may have some threads in reserve, but not enough;
5349  // get reserved threads involved if any.
5350  team->t.t_nproc = hot_teams[level].hot_team_nth;
5351  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5352  if (team->t.t_max_nproc < new_nproc) {
5353  /* reallocate larger arrays */
5354  __kmp_reallocate_team_arrays(team, new_nproc);
5355  __kmp_reinitialize_team(team, new_icvs, NULL);
5356  }
5357 
5358 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5359  KMP_AFFINITY_SUPPORTED
5360  /* Temporarily set full mask for primary thread before creation of
5361  workers. The reason is that workers inherit the affinity from the
5362  primary thread, so if a lot of workers are created on a single
5363  core quickly, they don't get a chance to set their own affinity for
5364  a long time. */
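/* RAII guard: the constructor saves the current mask and applies
   __kmp_affin_fullMask; restore() below puts the saved mask back. */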
5365  kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5366 #endif
5367 
5368  /* allocate new threads for the hot team */
5369  for (f = team->t.t_nproc; f < new_nproc; f++) {
5370  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5371  KMP_DEBUG_ASSERT(new_worker);
5372  team->t.t_threads[f] = new_worker;
5373 
5374  KA_TRACE(20,
5375  ("__kmp_allocate_team: team %d init T#%d arrived: "
5376  "join=%llu, plain=%llu\n",
5377  team->t.t_id, __kmp_gtid_from_tid(f, team),
5378  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5379  team->t.t_bar[bs_plain_barrier].b_arrived));
5380 
5381  { // Initialize barrier data for new threads.
5382  int b;
5383  kmp_balign_t *balign = new_worker->th.th_bar;
5384  for (b = 0; b < bs_last_barrier; ++b) {
5385  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5386  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5387  KMP_BARRIER_PARENT_FLAG);
5388 #if USE_DEBUGGER
5389  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5390 #endif
5391  }
5392  }
5393  }
5394 
5395 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5396  KMP_AFFINITY_SUPPORTED
5397  /* Restore initial primary thread's affinity mask */
5398  new_temp_affinity.restore();
5399 #endif
5400  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5401  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5402  // Barrier size already increased earlier in this function
5403  // Activate team threads via th_used_in_team
5404  __kmp_add_threads_to_team(team, new_nproc);
5405  }
5406  /* make sure everyone is synchronized; the new threads are
5407  (re)initialized below */
5408  __kmp_initialize_team(team, new_nproc, new_icvs,
5409  root->r.r_uber_thread->th.th_ident);
5410 
5411  /* reinitialize the threads */
5412  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5413  for (f = 0; f < team->t.t_nproc; ++f)
5414  __kmp_initialize_info(team->t.t_threads[f], team, f,
5415  __kmp_gtid_from_tid(f, team));
5416 
5417  // set th_task_state for new threads in hot team to an older thread's state
5418  kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5419  for (f = old_nproc; f < team->t.t_nproc; ++f)
5420  team->t.t_threads[f]->th.th_task_state = old_state;
5421 
5422 #ifdef KMP_DEBUG
5423  for (f = 0; f < team->t.t_nproc; ++f) {
5424  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5425  team->t.t_threads[f]->th.th_team_nproc ==
5426  team->t.t_nproc);
5427  }
5428 #endif
5429 
5430  if (do_place_partition) {
5431  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5432 #if KMP_AFFINITY_SUPPORTED
5433  __kmp_partition_places(team);
5434 #endif
5435  }
5436  } // Check changes in number of threads
5437 
5438  if (master->th.th_teams_microtask) {
5439  for (f = 1; f < new_nproc; ++f) {
5440  // propagate teams construct specific info to workers
5441  kmp_info_t *thr = team->t.t_threads[f];
5442  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5443  thr->th.th_teams_level = master->th.th_teams_level;
5444  thr->th.th_teams_size = master->th.th_teams_size;
5445  }
5446  }
5447  if (level) {
5448  // Sync barrier state for nested hot teams, not needed for outermost hot
5449  // team.
5450  for (f = 1; f < new_nproc; ++f) {
5451  kmp_info_t *thr = team->t.t_threads[f];
5452  int b;
5453  kmp_balign_t *balign = thr->th.th_bar;
5454  for (b = 0; b < bs_last_barrier; ++b) {
5455  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5456  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5457 #if USE_DEBUGGER
5458  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5459 #endif
5460  }
5461  }
5462  }
5463 
5464  /* reallocate space for arguments if necessary */
5465  __kmp_alloc_argv_entries(argc, team, TRUE);
5466  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5467  // The hot team re-uses the previous task team,
5468  // if untouched during the previous release->gather phase.
5469 
5470  KF_TRACE(10, (" hot_team = %p\n", team));
5471 
5472 #if KMP_DEBUG
5473  if (__kmp_tasking_mode != tskm_immediate_exec) {
5474  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5475  "task_team[1] = %p after reinit\n",
5476  team->t.t_task_team[0], team->t.t_task_team[1]));
5477  }
5478 #endif
5479 
5480 #if OMPT_SUPPORT
5481  __ompt_team_assign_id(team, ompt_parallel_data);
5482 #endif
5483 
5484  KMP_MB();
5485 
5486  return team;
5487  }
5488 
5489  /* next, let's try to take one from the team pool */
5490  KMP_MB();
5491  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5492  /* TODO: consider resizing undersized teams instead of reaping them, now
5493  that we have a resizing mechanism */
5494  if (team->t.t_max_nproc >= max_nproc) {
5495  /* take this team from the team pool */
5496  __kmp_team_pool = team->t.t_next_pool;
5497 
5498  if (max_nproc > 1 &&
5499  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5500  if (!team->t.b) { // Allocate barrier structure
5501  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5502  }
5503  }
5504 
5505  /* setup the team for fresh use */
5506  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5507 
5508  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5509  "task_team[1] %p to NULL\n",
5510  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5511  team->t.t_task_team[0] = NULL;
5512  team->t.t_task_team[1] = NULL;
5513 
5514  /* reallocate space for arguments if necessary */
5515  __kmp_alloc_argv_entries(argc, team, TRUE);
5516  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5517 
5518  KA_TRACE(
5519  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5520  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5521  { // Initialize barrier data.
5522  int b;
5523  for (b = 0; b < bs_last_barrier; ++b) {
5524  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5525 #if USE_DEBUGGER
5526  team->t.t_bar[b].b_master_arrived = 0;
5527  team->t.t_bar[b].b_team_arrived = 0;
5528 #endif
5529  }
5530  }
5531 
5532  team->t.t_proc_bind = new_proc_bind;
5533 
5534  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5535  team->t.t_id));
5536 
5537 #if OMPT_SUPPORT
5538  __ompt_team_assign_id(team, ompt_parallel_data);
5539 #endif
5540 
5541  team->t.t_nested_nth = NULL;
5542 
5543  KMP_MB();
5544 
5545  return team;
5546  }
5547 
5548  /* reap team if it is too small, then loop back and check the next one */
5549  // not sure if this is wise, but it will be redone during the hot-teams
5550  // rewrite.
5551  /* TODO: Use technique to find the right size hot-team, don't reap them */
5552  team = __kmp_reap_team(team);
5553  __kmp_team_pool = team;
5554  }
5555 
5556  /* nothing available in the pool, no matter, make a new team! */
5557  KMP_MB();
5558  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5559 
5560  /* and set it up */
5561  team->t.t_max_nproc = max_nproc;
5562  if (max_nproc > 1 &&
5563  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5564  // Allocate barrier structure
5565  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5566  }
5567 
5568  /* NOTE well: for some reason, allocating one big buffer and dividing it up
5569  seems to hurt performance significantly on the P4, so we don't do that */
5570  __kmp_allocate_team_arrays(team, max_nproc);
5571 
5572  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5573  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5574 
5575  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5576  "%p to NULL\n",
5577  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5578  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5579  // memory, no need to duplicate
5580  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5581  // memory, no need to duplicate
5582 
5583  if (__kmp_storage_map) {
5584  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5585  }
5586 
5587  /* allocate space for arguments */
5588  __kmp_alloc_argv_entries(argc, team, FALSE);
5589  team->t.t_argc = argc;
5590 
5591  KA_TRACE(20,
5592  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5593  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5594  { // Initialize barrier data.
5595  int b;
5596  for (b = 0; b < bs_last_barrier; ++b) {
5597  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5598 #if USE_DEBUGGER
5599  team->t.t_bar[b].b_master_arrived = 0;
5600  team->t.t_bar[b].b_team_arrived = 0;
5601 #endif
5602  }
5603  }
5604 
5605  team->t.t_proc_bind = new_proc_bind;
5606 
5607 #if OMPT_SUPPORT
5608  __ompt_team_assign_id(team, ompt_parallel_data);
5609  team->t.ompt_serialized_team_info = NULL;
5610 #endif
5611 
5612  KMP_MB();
5613 
5614  team->t.t_nested_nth = NULL;
5615 
5616  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5617  team->t.t_id));
5618 
5619  return team;
5620 }
5621 
5622 /* TODO implement hot-teams at all levels */
5623 /* TODO implement lazy thread release on demand (disband request) */
5624 
5625 /* free the team. return it to the team pool. release all the threads
5626  * associated with it */
5627 void __kmp_free_team(kmp_root_t *root, kmp_team_t *team, kmp_info_t *master) {
5628  int f;
5629  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5630  team->t.t_id));
5631 
5632  /* verify state */
5633  KMP_DEBUG_ASSERT(root);
5634  KMP_DEBUG_ASSERT(team);
5635  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5636  KMP_DEBUG_ASSERT(team->t.t_threads);
5637 
5638  int use_hot_team = team == root->r.r_hot_team;
5639  int level;
5640  if (master) {
5641  level = team->t.t_active_level - 1;
5642  if (master->th.th_teams_microtask) { // in teams construct?
5643  if (master->th.th_teams_size.nteams > 1) {
5644  ++level; // level was not increased in teams construct for
5645  // team_of_masters
5646  }
5647  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5648  master->th.th_teams_level == team->t.t_level) {
5649  ++level; // level was not increased in teams construct for
5650  // team_of_workers before the parallel
5651  } // team->t.t_level will be increased inside parallel
5652  }
5653 #if KMP_DEBUG
5654  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5655 #endif
5656  if (level < __kmp_hot_teams_max_level) {
5657  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5658  use_hot_team = 1;
5659  }
5660  }
5661 
5662  /* team is done working */
5663  TCW_SYNC_PTR(team->t.t_pkfn,
5664  NULL); // Important for Debugging Support Library.
5665 #if KMP_OS_WINDOWS
5666  team->t.t_copyin_counter = 0; // init counter for possible reuse
5667 #endif
5668  // Do not reset pointer to parent team to NULL for hot teams.
5669 
5670  /* if we are non-hot team, release our threads */
5671  if (!use_hot_team) {
5672  if (__kmp_tasking_mode != tskm_immediate_exec) {
5673  // Wait for threads to reach reapable state
5674  for (f = 1; f < team->t.t_nproc; ++f) {
5675  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5676  kmp_info_t *th = team->t.t_threads[f];
5677  volatile kmp_uint32 *state = &th->th.th_reap_state;
5678  while (*state != KMP_SAFE_TO_REAP) {
5679 #if KMP_OS_WINDOWS
5680  // On Windows a thread can be killed at any time, check this
5681  DWORD ecode;
5682  if (!__kmp_is_thread_alive(th, &ecode)) {
5683  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5684  break;
5685  }
5686 #endif
5687  // first check if thread is sleeping
5688  if (th->th.th_sleep_loc)
5689  __kmp_null_resume_wrapper(th);
5690  KMP_CPU_PAUSE();
5691  }
5692  }
5693 
5694  // Delete task teams
5695  int tt_idx;
5696  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5697  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5698  if (task_team != NULL) {
5699  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5700  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5701  team->t.t_threads[f]->th.th_task_team = NULL;
5702  }
5703  KA_TRACE(
5704  20,
5705  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5706  __kmp_get_gtid(), task_team, team->t.t_id));
5707  __kmp_free_task_team(master, task_team);
5708  team->t.t_task_team[tt_idx] = NULL;
5709  }
5710  }
5711  }
5712 
5713  // Before clearing parent pointer, check if nested_nth list should be freed
5714  if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5715  team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5716  KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5717  KMP_INTERNAL_FREE(team->t.t_nested_nth);
5718  }
5719  team->t.t_nested_nth = NULL;
5720 
5721  // Reset pointer to parent team only for non-hot teams.
5722  team->t.t_parent = NULL;
5723  team->t.t_level = 0;
5724  team->t.t_active_level = 0;
5725 
5726  /* free the worker threads */
5727  for (f = 1; f < team->t.t_nproc; ++f) {
5728  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5729  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5730  (void)KMP_COMPARE_AND_STORE_ACQ32(
5731  &(team->t.t_threads[f]->th.th_used_in_team), 1, 2);
5732  }
5733  __kmp_free_thread(team->t.t_threads[f]);
5734  }
5735 
5736  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5737  if (team->t.b) {
5738  // wake up thread at old location
5739  team->t.b->go_release();
5740  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5741  for (f = 1; f < team->t.t_nproc; ++f) {
5742  if (team->t.b->sleep[f].sleep) {
5743  __kmp_atomic_resume_64(
5744  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5745  (kmp_atomic_flag_64<> *)NULL);
5746  }
5747  }
5748  }
5749  // Wait for threads to be removed from team
5750  for (int f = 1; f < team->t.t_nproc; ++f) {
5751  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5752  KMP_CPU_PAUSE();
5753  }
5754  }
5755  }
5756 
5757  for (f = 1; f < team->t.t_nproc; ++f) {
5758  team->t.t_threads[f] = NULL;
5759  }
5760 
5761  if (team->t.t_max_nproc > 1 &&
5762  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5763  distributedBarrier::deallocate(team->t.b);
5764  team->t.b = NULL;
5765  }
5766  /* put the team back in the team pool */
5767  /* TODO limit size of team pool, call reap_team if pool too large */
5768  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5769  __kmp_team_pool = (volatile kmp_team_t *)team;
5770  } else { // Check if team was created for primary threads in teams construct
5771  // See if first worker is a CG root
5772  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5773  team->t.t_threads[1]->th.th_cg_roots);
5774  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5775  // Clean up the CG root nodes on workers so that this team can be re-used
5776  for (f = 1; f < team->t.t_nproc; ++f) {
5777  kmp_info_t *thr = team->t.t_threads[f];
5778  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5779  thr->th.th_cg_roots->cg_root == thr);
5780  // Pop current CG root off list
5781  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5782  thr->th.th_cg_roots = tmp->up;
5783  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5784  " up to node %p. cg_nthreads was %d\n",
5785  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5786  int i = tmp->cg_nthreads--;
5787  if (i == 1) {
5788  __kmp_free(tmp); // free CG if we are the last thread in it
5789  }
5790  // Restore current task's thread_limit from CG root
5791  if (thr->th.th_cg_roots)
5792  thr->th.th_current_task->td_icvs.thread_limit =
5793  thr->th.th_cg_roots->cg_thread_limit;
5794  }
5795  }
5796  }
5797 
5798  KMP_MB();
5799 }
5800 
5801 /* reap the team. destroy it, reclaim all its resources and free its memory */
5802 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5803  kmp_team_t *next_pool = team->t.t_next_pool;
5804 
5805  KMP_DEBUG_ASSERT(team);
5806  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5807  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5808  KMP_DEBUG_ASSERT(team->t.t_threads);
5809  KMP_DEBUG_ASSERT(team->t.t_argv);
5810 
5811  /* TODO clean the threads that are a part of this? */
5812 
5813  /* free stuff */
5814  __kmp_free_team_arrays(team);
5815  if (team->t.t_argv != &team->t.t_inline_argv[0])
5816  __kmp_free((void *)team->t.t_argv);
5817  __kmp_free(team);
5818 
5819  KMP_MB();
5820  return next_pool;
5821 }
5822 
5823 // Free the thread. Don't reap it, just place it on the pool of available
5824 // threads.
5825 //
5826 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5827 // binding for the affinity mechanism to be useful.
5828 //
5829 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5830 // However, we want to avoid a potential performance problem by always
5831 // scanning through the list to find the correct point at which to insert
5832 // the thread (potential N**2 behavior). To do this we keep track of the
5833 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5834 // With single-level parallelism, threads will always be added to the tail
5835 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5836 // parallelism, all bets are off and we may need to scan through the entire
5837 // free list.
5838 //
5839 // This change also has a potentially large performance benefit, for some
5840 // applications. Previously, as threads were freed from the hot team, they
5841 // would be placed back on the free list in inverse order. If the hot team
5842 // grew back to its original size, then the freed threads would be placed
5843 // back on the hot team in reverse order. This could cause bad cache
5844 // locality problems on programs where the size of the hot team regularly
5845 // grew and shrunk.
5846 //
5847 // Now, for single-level parallelism, the OMP tid is always == gtid.
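// Sketch of the insertion performed below (illustrative; names abbreviated):
//
//   scan = insert_pt ? &insert_pt->next : &pool_head;
//   while (*scan && (*scan)->gtid < gtid)
//     scan = &(*scan)->next;            // usually zero iterations
//   this_th->next = *scan;
//   *scan = insert_pt = this_th;        // list stays sorted by gtid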
5848 void __kmp_free_thread(kmp_info_t *this_th) {
5849  int gtid;
5850  kmp_info_t **scan;
5851 
5852  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5853  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5854 
5855  KMP_DEBUG_ASSERT(this_th);
5856 
5857  // When moving a thread to the pool, switch it to wait on its own b_go
5858  // flag and clear its team pointer (NULL team).
5859  int b;
5860  kmp_balign_t *balign = this_th->th.th_bar;
5861  for (b = 0; b < bs_last_barrier; ++b) {
5862  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5863  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5864  balign[b].bb.team = NULL;
5865  balign[b].bb.leaf_kids = 0;
5866  }
5867  this_th->th.th_task_state = 0;
5868  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5869 
5870  /* put thread back on the free pool */
5871  TCW_PTR(this_th->th.th_team, NULL);
5872  TCW_PTR(this_th->th.th_root, NULL);
5873  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5874 
5875  while (this_th->th.th_cg_roots) {
5876  this_th->th.th_cg_roots->cg_nthreads--;
5877  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5878  " %p of thread %p to %d\n",
5879  this_th, this_th->th.th_cg_roots,
5880  this_th->th.th_cg_roots->cg_root,
5881  this_th->th.th_cg_roots->cg_nthreads));
5882  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5883  if (tmp->cg_root == this_th) { // Thread is a cg_root
5884  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5885  KA_TRACE(
5886  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5887  this_th->th.th_cg_roots = tmp->up;
5888  __kmp_free(tmp);
5889  } else { // Worker thread
5890  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5891  __kmp_free(tmp);
5892  }
5893  this_th->th.th_cg_roots = NULL;
5894  break;
5895  }
5896  }
5897 
5898  /* If the implicit task assigned to this thread can be used by other threads,
5899  * multiple threads may share the data and try to free the task at
5900  * __kmp_reap_thread at exit. This duplicate use of the task data is more
5901  * likely when the hot team is disabled, but it can occur even when the hot
5902  * team is enabled */
5903  __kmp_free_implicit_task(this_th);
5904  this_th->th.th_current_task = NULL;
5905 
5906  // If the __kmp_thread_pool_insert_pt is already past the new insert
5907  // point, then we need to re-scan the entire list.
5908  gtid = this_th->th.th_info.ds.ds_gtid;
5909  if (__kmp_thread_pool_insert_pt != NULL) {
5910  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5911  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5912  __kmp_thread_pool_insert_pt = NULL;
5913  }
5914  }
5915 
5916  // Scan down the list to find the place to insert the thread.
5917  // scan is the address of a link in the list, possibly the address of
5918  // __kmp_thread_pool itself.
5919  //
5920  // In the absence of nested parallelism, the for loop will have 0 iterations.
5921  if (__kmp_thread_pool_insert_pt != NULL) {
5922  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5923  } else {
5924  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5925  }
5926  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5927  scan = &((*scan)->th.th_next_pool))
5928  ;
5929 
5930  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5931  // to its address.
5932  TCW_PTR(this_th->th.th_next_pool, *scan);
5933  __kmp_thread_pool_insert_pt = *scan = this_th;
5934  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5935  (this_th->th.th_info.ds.ds_gtid <
5936  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5937  TCW_4(this_th->th.th_in_pool, TRUE);
5938  __kmp_suspend_initialize_thread(this_th);
5939  __kmp_lock_suspend_mx(this_th);
5940  if (this_th->th.th_active == TRUE) {
5941  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5942  this_th->th.th_active_in_pool = TRUE;
5943  }
5944 #if KMP_DEBUG
5945  else {
5946  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5947  }
5948 #endif
5949  __kmp_unlock_suspend_mx(this_th);
5950 
5951  TCW_4(__kmp_nth, __kmp_nth - 1);
5952 
5953 #ifdef KMP_ADJUST_BLOCKTIME
5954  /* Adjust blocktime back to user setting or default if necessary */
5955  /* Middle initialization might never have occurred */
5956  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5957  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5958  if (__kmp_nth <= __kmp_avail_proc) {
5959  __kmp_zero_bt = FALSE;
5960  }
5961  }
5962 #endif /* KMP_ADJUST_BLOCKTIME */
5963 
5964  KMP_MB();
5965 }
5966 
5967 /* ------------------------------------------------------------------------ */
5968 
5969 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5970 #if OMP_PROFILING_SUPPORT
5971  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5972  // TODO: add a configuration option for time granularity
5973  if (ProfileTraceFile)
5974  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5975 #endif
5976 
5977  int gtid = this_thr->th.th_info.ds.ds_gtid;
5978  /* void *stack_data;*/
5979  kmp_team_t **volatile pteam;
5980 
5981  KMP_MB();
5982  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5983 
5984  if (__kmp_env_consistency_check) {
5985  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5986  }
5987 
5988 #if OMPD_SUPPORT
5989  if (ompd_state & OMPD_ENABLE_BP)
5990  ompd_bp_thread_begin();
5991 #endif
5992 
5993 #if OMPT_SUPPORT
5994  ompt_data_t *thread_data = nullptr;
5995  if (ompt_enabled.enabled) {
5996  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5997  *thread_data = ompt_data_none;
5998 
5999  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6000  this_thr->th.ompt_thread_info.wait_id = 0;
6001  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6002  this_thr->th.ompt_thread_info.parallel_flags = 0;
6003  if (ompt_enabled.ompt_callback_thread_begin) {
6004  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6005  ompt_thread_worker, thread_data);
6006  }
6007  this_thr->th.ompt_thread_info.state = ompt_state_idle;
6008  }
6009 #endif
6010 
6011  /* This is the place where threads wait for work */
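/* Lifecycle sketch (illustrative): each worker loops
     __kmp_fork_barrier (wait for work) -> (*pteam)->t.t_invoke(gtid)
     -> __kmp_join_barrier
   until __kmp_global.g.g_done is set at shutdown. */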
6012  while (!TCR_4(__kmp_global.g.g_done)) {
6013  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6014  KMP_MB();
6015 
6016  /* wait for work to do */
6017  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6018 
6019  /* No tid yet since not part of a team */
6020  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6021 
6022 #if OMPT_SUPPORT
6023  if (ompt_enabled.enabled) {
6024  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6025  }
6026 #endif
6027 
6028  pteam = &this_thr->th.th_team;
6029 
6030  /* have we been allocated? */
6031  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6032  /* we were just woken up, so run our new task */
6033  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6034  int rc;
6035  KA_TRACE(20,
6036  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6037  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6038  (*pteam)->t.t_pkfn));
6039 
6040  updateHWFPControl(*pteam);
6041 
6042 #if OMPT_SUPPORT
6043  if (ompt_enabled.enabled) {
6044  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6045  }
6046 #endif
6047 
6048  rc = (*pteam)->t.t_invoke(gtid);
6049  KMP_ASSERT(rc);
6050 
6051  KMP_MB();
6052  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6053  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6054  (*pteam)->t.t_pkfn));
6055  }
6056 #if OMPT_SUPPORT
6057  if (ompt_enabled.enabled) {
6058  /* no frame set while outside task */
6059  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6060 
6061  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6062  }
6063 #endif
6064  /* join barrier after parallel region */
6065  __kmp_join_barrier(gtid);
6066  }
6067  }
6068 
6069 #if OMPD_SUPPORT
6070  if (ompd_state & OMPD_ENABLE_BP)
6071  ompd_bp_thread_end();
6072 #endif
6073 
6074 #if OMPT_SUPPORT
6075  if (ompt_enabled.ompt_callback_thread_end) {
6076  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6077  }
6078 #endif
6079 
6080  this_thr->th.th_task_team = NULL;
6081  /* run the destructors for the threadprivate data for this thread */
6082  __kmp_common_destroy_gtid(gtid);
6083 
6084  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6085  KMP_MB();
6086 
6087 #if OMP_PROFILING_SUPPORT
6088  llvm::timeTraceProfilerFinishThread();
6089 #endif
6090  return this_thr;
6091 }
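
// __kmp_launch_thread above is the whole life of a worker: park at the fork
// barrier, run the microtask the master published, meet the team at the join
// barrier, and repeat until global shutdown. A condensed, self-contained
// analogue of that control flow (hypothetical names, polling in place of the
// real barrier primitives; a sketch, not the runtime's implementation):
#if 0
#include <atomic>
#include <thread>

static std::atomic<bool> g_done{false};
static std::atomic<void (*)(void)> g_microtask{nullptr};

void launch_worker(void) {
  while (!g_done.load(std::memory_order_acquire)) {
    void (*task)(void) = nullptr;
    // "fork barrier": wait until work is published or shutdown is requested
    while ((task = g_microtask.load(std::memory_order_acquire)) == nullptr &&
           !g_done.load(std::memory_order_acquire))
      std::this_thread::yield();
    if (task) {
      task(); // invoke the outlined parallel-region body
      // "join barrier": publish completion so the master may continue
      g_microtask.store(nullptr, std::memory_order_release);
    }
  }
}
#endif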
6092 
6093 /* ------------------------------------------------------------------------ */
6094 
6095 void __kmp_internal_end_dest(void *specific_gtid) {
6096  // Make sure no significant bits are lost
6097  int gtid;
6098  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6099 
6100  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6101  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
6102  * this is because 0 is reserved for the nothing-stored case */
6103 
6104  __kmp_internal_end_thread(gtid);
6105 }
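
// The +1/-1 biasing above exists because thread-specific storage reads as 0
// (NULL) when nothing was ever stored, so gtid 0 must not be stored as-is.
// A sketch of the encode/decode pair; set_tls/get_tls are hypothetical
// stand-ins for the real TLS accessors:
#if 0
#include <stdint.h>

extern void set_tls(void *value);
extern void *get_tls(void);

static void store_gtid(int gtid) { set_tls((void *)(intptr_t)(gtid + 1)); }

static int load_gtid(void) {
  void *v = get_tls();
  return v ? (int)((intptr_t)v - 1) : -1; // -1: no gtid registered
}
#endif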
6106 
6107 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6108 
6109 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6110  __kmp_internal_end_atexit();
6111 }
6112 
6113 #endif
6114 
6115 /* [Windows] josh: when the atexit handler is called, there may still be more
6116  than one thread alive */
6117 void __kmp_internal_end_atexit(void) {
6118  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6119  /* [Windows]
6120  josh: ideally, we want to completely shut down the library in this atexit
6121  handler, but stat code that depends on thread specific data for gtid fails
6122  because that data becomes unavailable at some point during the shutdown, so
6123  we call __kmp_internal_end_thread instead. We should eventually remove the
6124  dependency on __kmp_get_specific_gtid in the stat code and use
6125  __kmp_internal_end_library to cleanly shut down the library.
6126 
6127  // TODO: Can some of this comment about GVS be removed?
6128  I suspect that the offending stat code is executed when the calling thread
6129  tries to clean up a dead root thread's data structures, resulting in GVS
6130  code trying to close the GVS structures for that thread, but since the stat
6131  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6132  the calling thread is cleaning up itself instead of another thread, it get
6133  confused. This happens because allowing a thread to unregister and cleanup
6134  another thread is a recent modification for addressing an issue.
6135  Based on the current design (20050722), a thread may end up
6136  trying to unregister another thread only if thread death does not trigger
6137  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6138  thread specific data destructor function to detect thread death. For
6139  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6140  is nothing. Thus, the workaround is applicable only for Windows static
6141  stat library. */
6142  __kmp_internal_end_library(-1);
6143 #if KMP_OS_WINDOWS
6144  __kmp_close_console();
6145 #endif
6146 }
6147 
6148 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6149  // It is assumed __kmp_forkjoin_lock is acquired.
6150 
6151  int gtid;
6152 
6153  KMP_DEBUG_ASSERT(thread != NULL);
6154 
6155  gtid = thread->th.th_info.ds.ds_gtid;
6156 
6157  if (!is_root) {
6158  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6159  /* Assume the threads are at the fork barrier here */
6160  KA_TRACE(
6161  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6162  gtid));
6163  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6164  while (
6165  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6166  KMP_CPU_PAUSE();
6167  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6168  } else {
6169  /* Need release fence here to prevent seg faults for tree forkjoin
6170  barrier (GEH) */
6171  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6172  thread);
6173  __kmp_release_64(&flag);
6174  }
6175  }
6176 
6177  // Terminate OS thread.
6178  __kmp_reap_worker(thread);
6179 
6180  // The thread was killed asynchronously. If it was actively
6181  // spinning in the thread pool, decrement the global count.
6182  //
6183  // There is a small timing hole here - if the worker thread was just waking
6184  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6185  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6186  // the global counter might not get updated.
6187  //
6188  // Currently, this can only happen as the library is unloaded,
6189  // so there are no harmful side effects.
6190  if (thread->th.th_active_in_pool) {
6191  thread->th.th_active_in_pool = FALSE;
6192  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6193  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6194  }
6195  }
6196 
6197  __kmp_free_implicit_task(thread);
6198 
6199 // Free the fast memory for tasking
6200 #if USE_FAST_MEMORY
6201  __kmp_free_fast_memory(thread);
6202 #endif /* USE_FAST_MEMORY */
6203 
6204  __kmp_suspend_uninitialize_thread(thread);
6205 
6206  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6207  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6208 
6209  --__kmp_all_nth;
6210  // __kmp_nth was decremented when thread is added to the pool.
6211 
6212 #ifdef KMP_ADJUST_BLOCKTIME
6213  /* Adjust blocktime back to user setting or default if necessary */
6214  /* Middle initialization might never have occurred */
6215  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6216  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6217  if (__kmp_nth <= __kmp_avail_proc) {
6218  __kmp_zero_bt = FALSE;
6219  }
6220  }
6221 #endif /* KMP_ADJUST_BLOCKTIME */
6222 
6223  /* free the memory being used */
6224  if (__kmp_env_consistency_check) {
6225  if (thread->th.th_cons) {
6226  __kmp_free_cons_stack(thread->th.th_cons);
6227  thread->th.th_cons = NULL;
6228  }
6229  }
6230 
6231  if (thread->th.th_pri_common != NULL) {
6232  __kmp_free(thread->th.th_pri_common);
6233  thread->th.th_pri_common = NULL;
6234  }
6235 
6236 #if KMP_USE_BGET
6237  if (thread->th.th_local.bget_data != NULL) {
6238  __kmp_finalize_bget(thread);
6239  }
6240 #endif
6241 
6242 #if KMP_AFFINITY_SUPPORTED
6243  if (thread->th.th_affin_mask != NULL) {
6244  KMP_CPU_FREE(thread->th.th_affin_mask);
6245  thread->th.th_affin_mask = NULL;
6246  }
6247 #endif /* KMP_AFFINITY_SUPPORTED */
6248 
6249 #if KMP_USE_HIER_SCHED
6250  if (thread->th.th_hier_bar_data != NULL) {
6251  __kmp_free(thread->th.th_hier_bar_data);
6252  thread->th.th_hier_bar_data = NULL;
6253  }
6254 #endif
6255 
6256  __kmp_reap_team(thread->th.th_serial_team);
6257  thread->th.th_serial_team = NULL;
6258  __kmp_free(thread);
6259 
6260  KMP_MB();
6261 
6262 } // __kmp_reap_thread
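
// For the distributed barrier, the reap path above spins until it can move
// th_used_in_team from 0 to 3, and only then wakes the thread. The same
// pattern in std::atomic terms (a sketch; cpu_pause() is a hypothetical
// backoff hint standing in for KMP_CPU_PAUSE):
#if 0
#include <atomic>

extern void cpu_pause(void);

void force_release(std::atomic<int> &used_in_team) {
  int expected = 0;
  while (!used_in_team.compare_exchange_strong(expected, 3)) {
    expected = 0; // compare_exchange wrote the observed value into 'expected'
    cpu_pause();
  }
}
#endif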
6263 
6264 static void __kmp_itthash_clean(kmp_info_t *th) {
6265 #if USE_ITT_NOTIFY
6266  if (__kmp_itt_region_domains.count > 0) {
6267  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6268  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6269  while (bucket) {
6270  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6271  __kmp_thread_free(th, bucket);
6272  bucket = next;
6273  }
6274  }
6275  }
6276  if (__kmp_itt_barrier_domains.count > 0) {
6277  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6278  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6279  while (bucket) {
6280  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6281  __kmp_thread_free(th, bucket);
6282  bucket = next;
6283  }
6284  }
6285  }
6286 #endif
6287 }
6288 
6289 static void __kmp_internal_end(void) {
6290  int i;
6291 
6292  /* First, unregister the library */
6293  __kmp_unregister_library();
6294 
6295 #if KMP_OS_WINDOWS
6296  /* In Win static library, we can't tell when a root actually dies, so we
6297  reclaim the data structures for any root threads that have died but not
6298  unregistered themselves, in order to shut down cleanly.
6299  In Win dynamic library we also can't tell when a thread dies. */
6300  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6301 // dead roots
6302 #endif
6303 
6304  for (i = 0; i < __kmp_threads_capacity; i++)
6305  if (__kmp_root[i])
6306  if (__kmp_root[i]->r.r_active)
6307  break;
6308  KMP_MB(); /* Flush all pending memory write invalidates. */
6309  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6310 
6311  if (i < __kmp_threads_capacity) {
6312 #if KMP_USE_MONITOR
6313  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6314  KMP_MB(); /* Flush all pending memory write invalidates. */
6315 
6316  // Need to check that monitor was initialized before reaping it. If we are
6317  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6318  // __kmp_monitor will appear to contain valid data, but it is only valid in
6319  // the parent process, not the child.
6320  // New behavior (201008): instead of keying off of the flag
6321  // __kmp_init_parallel, the monitor thread creation is keyed off
6322  // of the new flag __kmp_init_monitor.
6323  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6324  if (TCR_4(__kmp_init_monitor)) {
6325  __kmp_reap_monitor(&__kmp_monitor);
6326  TCW_4(__kmp_init_monitor, 0);
6327  }
6328  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6329  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6330 #endif // KMP_USE_MONITOR
6331  } else {
6332 /* TODO move this to cleanup code */
6333 #ifdef KMP_DEBUG
6334  /* make sure that everything has properly ended */
6335  for (i = 0; i < __kmp_threads_capacity; i++) {
6336  if (__kmp_root[i]) {
6337  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6338  // there can be uber threads alive here
6339  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6340  }
6341  }
6342 #endif
6343 
6344  KMP_MB();
6345 
6346  // Reap the worker threads.
6347  // This is valid for now, but be careful if threads are reaped sooner.
6348  while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6349  // Get the next thread from the pool.
6350  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6351  __kmp_thread_pool = thread->th.th_next_pool;
6352  // Reap it.
6353  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6354  thread->th.th_next_pool = NULL;
6355  thread->th.th_in_pool = FALSE;
6356  __kmp_reap_thread(thread, 0);
6357  }
6358  __kmp_thread_pool_insert_pt = NULL;
6359 
6360  // Reap teams.
6361  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6362  // Get the next team from the pool.
6363  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6364  __kmp_team_pool = team->t.t_next_pool;
6365  // Reap it.
6366  team->t.t_next_pool = NULL;
6367  __kmp_reap_team(team);
6368  }
6369 
6370  __kmp_reap_task_teams();
6371 
6372 #if KMP_OS_UNIX
6373  // Threads that are not reaped should not access any resources since they
6374  // are going to be deallocated soon, so the shutdown sequence should wait
6375  // until all threads either exit the final spin-waiting loop or begin
6376  // sleeping after the given blocktime.
6377  for (i = 0; i < __kmp_threads_capacity; i++) {
6378  kmp_info_t *thr = __kmp_threads[i];
6379  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6380  KMP_CPU_PAUSE();
6381  }
6382 #endif
6383 
6384  for (i = 0; i < __kmp_threads_capacity; ++i) {
6385  // TBD: Add some checking...
6386  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6387  }
6388 
6389  /* Make sure all threadprivate destructors get run by joining with all
6390  worker threads before resetting this flag */
6391  TCW_SYNC_4(__kmp_init_common, FALSE);
6392 
6393  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6394  KMP_MB();
6395 
6396 #if KMP_USE_MONITOR
6397  // See note above: One of the possible fixes for CQ138434 / CQ140126
6398  //
6399  // FIXME: push both code fragments down and CSE them?
6400  // push them into __kmp_cleanup() ?
6401  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6402  if (TCR_4(__kmp_init_monitor)) {
6403  __kmp_reap_monitor(&__kmp_monitor);
6404  TCW_4(__kmp_init_monitor, 0);
6405  }
6406  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6407  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6408 #endif
6409  } /* else !__kmp_global.t_active */
6410  TCW_4(__kmp_init_gtid, FALSE);
6411  KMP_MB(); /* Flush all pending memory write invalidates. */
6412 
6413  __kmp_cleanup();
6414 #if OMPT_SUPPORT
6415  ompt_fini();
6416 #endif
6417 }
6418 
6419 void __kmp_internal_end_library(int gtid_req) {
6420  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6421  /* this shouldn't be a race condition because __kmp_internal_end() is the
6422  only place to clear __kmp_serial_init */
6423  /* we'll check this later too, after we get the lock */
6424  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6425  // redundant, because the next check will work in any case.
6426  if (__kmp_global.g.g_abort) {
6427  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6428  /* TODO abort? */
6429  return;
6430  }
6431  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6432  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6433  return;
6434  }
6435 
6436  // If the hidden helper team has been initialized, we need to deinit it
6437  if (TCR_4(__kmp_init_hidden_helper) &&
6438  !TCR_4(__kmp_hidden_helper_team_done)) {
6439  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6440  // First release the main thread to let it continue its work
6441  __kmp_hidden_helper_main_thread_release();
6442  // Wait until the hidden helper team has been destroyed
6443  __kmp_hidden_helper_threads_deinitz_wait();
6444  }
6445 
6446  KMP_MB(); /* Flush all pending memory write invalidates. */
6447  /* find out who we are and what we should do */
6448  {
6449  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6450  KA_TRACE(
6451  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6452  if (gtid == KMP_GTID_SHUTDOWN) {
6453  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6454  "already shutdown\n"));
6455  return;
6456  } else if (gtid == KMP_GTID_MONITOR) {
6457  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6458  "registered, or system shutdown\n"));
6459  return;
6460  } else if (gtid == KMP_GTID_DNE) {
6461  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6462  "shutdown\n"));
6463  /* we don't know who we are, but we may still shut down the library */
6464  } else if (KMP_UBER_GTID(gtid)) {
6465  /* unregister ourselves as an uber thread. gtid is no longer valid */
6466  if (__kmp_root[gtid]->r.r_active) {
6467  __kmp_global.g.g_abort = -1;
6468  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6469  __kmp_unregister_library();
6470  KA_TRACE(10,
6471  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6472  gtid));
6473  return;
6474  } else {
6475  __kmp_itthash_clean(__kmp_threads[gtid]);
6476  KA_TRACE(
6477  10,
6478  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6479  __kmp_unregister_root_current_thread(gtid);
6480  }
6481  } else {
6482 /* worker threads may call this function through the atexit handler, if they
6483  * call exit() */
6484 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6485  TODO: do a thorough shutdown instead */
6486 #ifdef DUMP_DEBUG_ON_EXIT
6487  if (__kmp_debug_buf)
6488  __kmp_dump_debug_buffer();
6489 #endif
6490  // The unregister-library call was added here when we switched to shm on
6491  // Linux; without it, lots of files would be left in /dev/shm.
6492  // Clean up the shared memory file before exiting.
6493  __kmp_unregister_library();
6494  return;
6495  }
6496  }
6497  /* synchronize the termination process */
6498  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6499 
6500  /* have we already finished */
6501  if (__kmp_global.g.g_abort) {
6502  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6503  /* TODO abort? */
6504  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6505  return;
6506  }
6507  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6508  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6509  return;
6510  }
6511 
6512  /* We need this lock to enforce mutex between this reading of
6513  __kmp_threads_capacity and the writing by __kmp_register_root.
6514  Alternatively, we can use a counter of roots that is atomically updated by
6515  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6516  __kmp_internal_end_*. */
6517  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6518 
6519  /* now we can safely conduct the actual termination */
6520  __kmp_internal_end();
6521 
6522  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6523  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6524 
6525  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6526 
6527 #ifdef DUMP_DEBUG_ON_EXIT
6528  if (__kmp_debug_buf)
6529  __kmp_dump_debug_buffer();
6530 #endif
6531 
6532 #if KMP_OS_WINDOWS
6533  __kmp_close_console();
6534 #endif
6535 
6536  __kmp_fini_allocator();
6537 
6538 } // __kmp_internal_end_library
6539 
6540 void __kmp_internal_end_thread(int gtid_req) {
6541  int i;
6542 
6543  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6544  /* this shouldn't be a race condition because __kmp_internal_end() is the
6545  * only place to clear __kmp_serial_init */
6546  /* we'll check this later too, after we get the lock */
6547  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6548  // redundant, because the next check will work in any case.
6549  if (__kmp_global.g.g_abort) {
6550  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6551  /* TODO abort? */
6552  return;
6553  }
6554  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6555  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6556  return;
6557  }
6558 
6559  // If the hidden helper team has been initialized, we need to deinit it
6560  if (TCR_4(__kmp_init_hidden_helper) &&
6561  !TCR_4(__kmp_hidden_helper_team_done)) {
6562  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6563  // First release the main thread to let it continue its work
6564  __kmp_hidden_helper_main_thread_release();
6565  // Wait until the hidden helper team has been destroyed
6566  __kmp_hidden_helper_threads_deinitz_wait();
6567  }
6568 
6569  KMP_MB(); /* Flush all pending memory write invalidates. */
6570 
6571  /* find out who we are and what we should do */
6572  {
6573  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6574  KA_TRACE(10,
6575  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6576  if (gtid == KMP_GTID_SHUTDOWN) {
6577  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6578  "already shutdown\n"));
6579  return;
6580  } else if (gtid == KMP_GTID_MONITOR) {
6581  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6582  "registered, or system shutdown\n"));
6583  return;
6584  } else if (gtid == KMP_GTID_DNE) {
6585  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6586  "shutdown\n"));
6587  return;
6588  /* we don't know who we are */
6589  } else if (KMP_UBER_GTID(gtid)) {
6590  /* unregister ourselves as an uber thread. gtid is no longer valid */
6591  if (__kmp_root[gtid]->r.r_active) {
6592  __kmp_global.g.g_abort = -1;
6593  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6594  KA_TRACE(10,
6595  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6596  gtid));
6597  return;
6598  } else {
6599  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6600  gtid));
6601  __kmp_unregister_root_current_thread(gtid);
6602  }
6603  } else {
6604  /* just a worker thread, let's leave */
6605  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6606 
6607  if (gtid >= 0) {
6608  __kmp_threads[gtid]->th.th_task_team = NULL;
6609  }
6610 
6611  KA_TRACE(10,
6612  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6613  gtid));
6614  return;
6615  }
6616  }
6617 #if KMP_DYNAMIC_LIB
6618  if (__kmp_pause_status != kmp_hard_paused)
6619  // AC: let's not shut down the dynamic library at the exit of the uber
6620  // thread; it is better to shut down later, in the library destructor.
6621  {
6622  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6623  return;
6624  }
6625 #endif
6626  /* synchronize the termination process */
6627  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6628 
6629  /* have we already finished */
6630  if (__kmp_global.g.g_abort) {
6631  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6632  /* TODO abort? */
6633  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6634  return;
6635  }
6636  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6637  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6638  return;
6639  }
6640 
6641  /* We need this lock to enforce mutex between this reading of
6642  __kmp_threads_capacity and the writing by __kmp_register_root.
6643  Alternatively, we can use a counter of roots that is atomically updated by
6644  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6645  __kmp_internal_end_*. */
6646 
6647  /* should we finish the run-time? are all siblings done? */
6648  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6649 
6650  for (i = 0; i < __kmp_threads_capacity; ++i) {
6651  if (KMP_UBER_GTID(i)) {
6652  KA_TRACE(
6653  10,
6654  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6655  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6656  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6657  return;
6658  }
6659  }
6660 
6661  /* now we can safely conduct the actual termination */
6662 
6663  __kmp_internal_end();
6664 
6665  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6666  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6667 
6668  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6669 
6670 #ifdef DUMP_DEBUG_ON_EXIT
6671  if (__kmp_debug_buf)
6672  __kmp_dump_debug_buffer();
6673 #endif
6674 } // __kmp_internal_end_thread
6675 
6676 // -----------------------------------------------------------------------------
6677 // Library registration stuff.
6678 
6679 static long __kmp_registration_flag = 0;
6680 // Random value used to indicate library initialization.
6681 static char *__kmp_registration_str = NULL;
6682 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6683 
6684 static inline char *__kmp_reg_status_name() {
6685 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6686  each thread. If registration and unregistration go in different threads
6687  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6688  env var cannot be found, because the name will contain a different pid. */
6689 // macOS* complains about name being too long with additional getuid()
6690 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6691  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6692  (int)getuid());
6693 #else
6694  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6695 #endif
6696 } // __kmp_reg_status_name
6697 
6698 #if defined(KMP_USE_SHM)
6699 bool __kmp_shm_available = false;
6700 bool __kmp_tmp_available = false;
6701 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6702 char *temp_reg_status_file_name = nullptr;
6703 #endif
6704 
6705 void __kmp_register_library_startup(void) {
6706 
6707  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6708  int done = 0;
6709  union {
6710  double dtime;
6711  long ltime;
6712  } time;
6713 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6714  __kmp_initialize_system_tick();
6715 #endif
6716  __kmp_read_system_time(&time.dtime);
6717  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6718  __kmp_registration_str =
6719  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6720  __kmp_registration_flag, KMP_LIBRARY_FILE);
6721 
6722  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6723  __kmp_registration_str));
6724 
6725  while (!done) {
6726 
6727  char *value = NULL; // Actual value of the environment variable.
6728 
6729 #if defined(KMP_USE_SHM)
6730  char *shm_name = nullptr;
6731  char *data1 = nullptr;
6732  __kmp_shm_available = __kmp_detect_shm();
6733  if (__kmp_shm_available) {
6734  int fd1 = -1;
6735  shm_name = __kmp_str_format("/%s", name);
6736  int shm_preexist = 0;
6737  fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6738  if ((fd1 == -1) && (errno == EEXIST)) {
6739  // file didn't open because it already exists.
6740  // try opening existing file
6741  fd1 = shm_open(shm_name, O_RDWR, 0600);
6742  if (fd1 == -1) { // file didn't open
6743  KMP_WARNING(FunctionError, "Can't open SHM");
6744  __kmp_shm_available = false;
6745  } else { // able to open existing file
6746  shm_preexist = 1;
6747  }
6748  }
6749  if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6750  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6751  KMP_WARNING(FunctionError, "Can't set size of SHM");
6752  __kmp_shm_available = false;
6753  }
6754  }
6755  if (__kmp_shm_available) { // SHM exists, now map it
6756  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6757  fd1, 0);
6758  if (data1 == MAP_FAILED) { // failed to map shared memory
6759  KMP_WARNING(FunctionError, "Can't map SHM");
6760  __kmp_shm_available = false;
6761  }
6762  }
6763  if (__kmp_shm_available) { // SHM mapped
6764  if (shm_preexist == 0) { // set data to SHM, set value
6765  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6766  }
6767  // Read value from either what we just wrote or existing file.
6768  value = __kmp_str_format("%s", data1); // read value from SHM
6769  munmap(data1, SHM_SIZE);
6770  }
6771  if (fd1 != -1)
6772  close(fd1);
6773  }
6774  if (!__kmp_shm_available)
6775  __kmp_tmp_available = __kmp_detect_tmp();
6776  if (!__kmp_shm_available && __kmp_tmp_available) {
6777  // SHM failed to work due to an error other than that the file already
6778  // exists. Try to create a temp file under /tmp.
6779  // If /tmp isn't accessible, fall back to using environment variable.
6780  // TODO: /tmp might not always be the temporary directory. For now we will
6781  // not consider TMPDIR.
6782  int fd1 = -1;
6783  temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6784  int tmp_preexist = 0;
6785  fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6786  if ((fd1 == -1) && (errno == EEXIST)) {
6787  // file didn't open because it already exists.
6788  // try opening existing file
6789  fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6790  if (fd1 == -1) { // file didn't open
6791  KMP_WARNING(FunctionError, "Can't open TEMP");
6792  __kmp_tmp_available = false;
6793  } else {
6794  tmp_preexist = 1;
6795  }
6796  }
6797  if (__kmp_tmp_available && tmp_preexist == 0) {
6798  // we created the /tmp file, now set its size
6799  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6800  KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6801  __kmp_tmp_available = false;
6802  }
6803  }
6804  if (__kmp_tmp_available) {
6805  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6806  fd1, 0);
6807  if (data1 == MAP_FAILED) { // failed to map /tmp
6808  KMP_WARNING(FunctionError, "Can't map /tmp");
6809  __kmp_tmp_available = false;
6810  }
6811  }
6812  if (__kmp_tmp_available) {
6813  if (tmp_preexist == 0) { // set data to TMP, set value
6814  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6815  }
6816  // Read value from either what we just wrote or existing file.
6817  value = __kmp_str_format("%s", data1); // read value from /tmp
6818  munmap(data1, SHM_SIZE);
6819  }
6820  if (fd1 != -1)
6821  close(fd1);
6822  }
6823  if (!__kmp_shm_available && !__kmp_tmp_available) {
6824  // no /dev/shm and no /tmp -- fall back to environment variable
6825  // Set environment variable, but do not overwrite if it exists.
6826  __kmp_env_set(name, __kmp_registration_str, 0);
6827  // read value to see if it got set
6828  value = __kmp_env_get(name);
6829  }
6830 #else // Windows and unix with static library
6831  // Set environment variable, but do not overwrite if it exists.
6832  __kmp_env_set(name, __kmp_registration_str, 0);
6833  // read value to see if it got set
6834  value = __kmp_env_get(name);
6835 #endif
6836 
6837  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6838  done = 1; // Ok, environment variable set successfully, exit the loop.
6839  } else {
6840  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6841  // Check whether it is alive or dead.
6842  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6843  char *tail = value;
6844  char *flag_addr_str = NULL;
6845  char *flag_val_str = NULL;
6846  char const *file_name = NULL;
6847  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6848  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6849  file_name = tail;
6850  if (tail != NULL) {
6851  unsigned long *flag_addr = 0;
6852  unsigned long flag_val = 0;
6853  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6854  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6855  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6856  // First, check whether environment-encoded address is mapped into
6857  // addr space.
6858  // If so, dereference it to see if it still has the right value.
6859  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6860  neighbor = 1;
6861  } else {
6862  // If not, then we know the other copy of the library is no longer
6863  // running.
6864  neighbor = 2;
6865  }
6866  }
6867  }
6868  switch (neighbor) {
6869  case 0: // Cannot parse environment variable -- neighbor status unknown.
6870  // Assume it is the incompatible format of a future version of the
6871  // library. Assume the other library is alive.
6872  // WARN( ... ); // TODO: Issue a warning.
6873  file_name = "unknown library";
6874  KMP_FALLTHROUGH();
6875  // Attention! Falling through to the next case. That's intentional.
6876  case 1: { // Neighbor is alive.
6877  // Check it is allowed.
6878  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6879  if (!__kmp_str_match_true(duplicate_ok)) {
6880  // That's not allowed. Issue fatal error.
6881  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6882  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6883  }
6884  KMP_INTERNAL_FREE(duplicate_ok);
6885  __kmp_duplicate_library_ok = 1;
6886  done = 1; // Exit the loop.
6887  } break;
6888  case 2: { // Neighbor is dead.
6889 
6890 #if defined(KMP_USE_SHM)
6891  if (__kmp_shm_available) { // close shared memory.
6892  shm_unlink(shm_name); // this removes file in /dev/shm
6893  } else if (__kmp_tmp_available) {
6894  unlink(temp_reg_status_file_name); // this removes the temp file
6895  } else {
6896  // Clear the variable and try to register library again.
6897  __kmp_env_unset(name);
6898  }
6899 #else
6900  // Clear the variable and try to register library again.
6901  __kmp_env_unset(name);
6902 #endif
6903  } break;
6904  default: {
6905  KMP_DEBUG_ASSERT(0);
6906  } break;
6907  }
6908  }
6909  KMP_INTERNAL_FREE((void *)value);
6910 #if defined(KMP_USE_SHM)
6911  if (shm_name)
6912  KMP_INTERNAL_FREE((void *)shm_name);
6913 #endif
6914  } // while
6915  KMP_INTERNAL_FREE((void *)name);
6916 
6917 } // func __kmp_register_library_startup
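
// The registration value written above has the shape "<addr>-<flag>-<file>":
// <addr> is the address of this image's __kmp_registration_flag and <flag> is
// its random value. A neighbor counts as alive only if <addr> is still mapped
// in this process and *<addr> still equals <flag>. A parsing sketch under
// those assumptions (is_address_mapped stands in for __kmp_is_address_mapped;
// the scan format mirrors the "%p-%lx-%s" writer above):
#if 0
#include <stdio.h>

extern int is_address_mapped(void *addr);

static int neighbor_alive(const char *value) {
  void *addr = 0;
  unsigned long flag = 0;
  char file[256];
  if (sscanf(value, "%p-%lx-%255s", &addr, &flag, file) != 3)
    return 0; // unparseable: treat the neighbor's status as unknown
  unsigned long *flag_addr = (unsigned long *)addr;
  return is_address_mapped(flag_addr) && *flag_addr == flag;
}
#endif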
6918 
6919 void __kmp_unregister_library(void) {
6920 
6921  char *name = __kmp_reg_status_name();
6922  char *value = NULL;
6923 
6924 #if defined(KMP_USE_SHM)
6925  char *shm_name = nullptr;
6926  int fd1;
6927  if (__kmp_shm_available) {
6928  shm_name = __kmp_str_format("/%s", name);
6929  fd1 = shm_open(shm_name, O_RDONLY, 0600);
6930  if (fd1 != -1) { // File opened successfully
6931  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6932  if (data1 != MAP_FAILED) {
6933  value = __kmp_str_format("%s", data1); // read value from SHM
6934  munmap(data1, SHM_SIZE);
6935  }
6936  close(fd1);
6937  }
6938  } else if (__kmp_tmp_available) { // try /tmp
6939  fd1 = open(temp_reg_status_file_name, O_RDONLY);
6940  if (fd1 != -1) { // File opened successfully
6941  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6942  if (data1 != MAP_FAILED) {
6943  value = __kmp_str_format("%s", data1); // read value from /tmp
6944  munmap(data1, SHM_SIZE);
6945  }
6946  close(fd1);
6947  }
6948  } else { // fall back to environment variable
6949  value = __kmp_env_get(name);
6950  }
6951 #else
6952  value = __kmp_env_get(name);
6953 #endif
6954 
6955  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6956  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6957  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6958 // Ok, this is our variable. Delete it.
6959 #if defined(KMP_USE_SHM)
6960  if (__kmp_shm_available) {
6961  shm_unlink(shm_name); // this removes file in /dev/shm
6962  } else if (__kmp_tmp_available) {
6963  unlink(temp_reg_status_file_name); // this removes the temp file
6964  } else {
6965  __kmp_env_unset(name);
6966  }
6967 #else
6968  __kmp_env_unset(name);
6969 #endif
6970  }
6971 
6972 #if defined(KMP_USE_SHM)
6973  if (shm_name)
6974  KMP_INTERNAL_FREE(shm_name);
6975  if (temp_reg_status_file_name)
6976  KMP_INTERNAL_FREE(temp_reg_status_file_name);
6977 #endif
6978 
6979  KMP_INTERNAL_FREE(__kmp_registration_str);
6980  KMP_INTERNAL_FREE(value);
6981  KMP_INTERNAL_FREE(name);
6982 
6983  __kmp_registration_flag = 0;
6984  __kmp_registration_str = NULL;
6985 
6986 } // __kmp_unregister_library
6987 
6988 // End of Library registration stuff.
6989 // -----------------------------------------------------------------------------
6990 
6991 #if KMP_MIC_SUPPORTED
6992 
6993 static void __kmp_check_mic_type() {
6994  kmp_cpuid_t cpuid_state = {0};
6995  kmp_cpuid_t *cs_p = &cpuid_state;
6996  __kmp_x86_cpuid(1, 0, cs_p);
6997  // We don't support mic1 at the moment
6998  if ((cs_p->eax & 0xff0) == 0xB10) {
6999  __kmp_mic_type = mic2;
7000  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7001  __kmp_mic_type = mic3;
7002  } else {
7003  __kmp_mic_type = non_mic;
7004  }
7005 }
7006 
7007 #endif /* KMP_MIC_SUPPORTED */
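
// Decoding of the masks above (CPUID leaf 1, EAX): bits 3:0 stepping, 7:4
// model, 11:8 family, 19:16 extended model. (eax & 0xff0) == 0xB10 selects
// family 0xB, model 1, i.e. Knights Corner (mic2); (eax & 0xf0ff0) == 0x50670
// selects family 6, extended model 5, model 7, i.e. display model 0x57,
// Knights Landing (mic3).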
7008 
7009 #if KMP_HAVE_UMWAIT
7010 static void __kmp_user_level_mwait_init() {
7011  struct kmp_cpuid buf;
7012  __kmp_x86_cpuid(7, 0, &buf);
7013  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7014  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7015  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7016  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7017  __kmp_umwait_enabled));
7018 }
7019 #elif KMP_HAVE_MWAIT
7020 #ifndef AT_INTELPHIUSERMWAIT
7021 // Spurious, non-existent value that should always fail to return anything.
7022 // Will be replaced with the correct value once it is known.
7023 #define AT_INTELPHIUSERMWAIT 10000
7024 #endif
7025 // The getauxval() function is available in RHEL7 and SLES12. If a system with
7026 // earlier OS is used to build the RTL, we'll use the following internal
7027 // function when the entry is not found.
7028 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7029 unsigned long getauxval(unsigned long) { return 0; }
7030 
7031 static void __kmp_user_level_mwait_init() {
7032  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
7033  // available, use them to determine whether user-level mwait is enabled.
7034  // Otherwise, forcibly
7034  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7035  // KMP_USER_LEVEL_MWAIT was set to TRUE.
7036  if (__kmp_mic_type == mic3) {
7037  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7038  if ((res & 0x1) || __kmp_user_level_mwait) {
7039  __kmp_mwait_enabled = TRUE;
7040  if (__kmp_user_level_mwait) {
7041  KMP_INFORM(EnvMwaitWarn);
7042  }
7043  } else {
7044  __kmp_mwait_enabled = FALSE;
7045  }
7046  }
7047  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7048  "__kmp_mwait_enabled = %d\n",
7049  __kmp_mic_type, __kmp_mwait_enabled));
7050 }
7051 #endif /* KMP_HAVE_UMWAIT */
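
// The waitpkg probe above reads CPUID.(EAX=7,ECX=0):ECX bit 5 (WAITPKG).
// When it is set, a waiter can use the user-level monitor/wait instructions
// directly. Illustrative use of the compiler intrinsics (a sketch; needs
// -mwaitpkg and <immintrin.h>, and simplifies the flag protocol):
#if 0
#include <immintrin.h>

void wait_for_flag(volatile unsigned *flag, unsigned long long deadline_tsc) {
  while (*flag == 0) {
    _umonitor((void *)flag); // arm address monitoring on the flag's cache line
    if (*flag != 0)
      break;                 // re-check to close the monitor/test race window
    _umwait(0, deadline_tsc); // C0.2 sleep until a write, timeout, or wakeup
  }
}
#endif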
7052 
7053 static void __kmp_do_serial_initialize(void) {
7054  int i, gtid;
7055  size_t size;
7056 
7057  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7058 
7059  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7060  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7061  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7062  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7063  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7064 
7065 #if OMPT_SUPPORT
7066  ompt_pre_init();
7067 #endif
7068 #if OMPD_SUPPORT
7069  __kmp_env_dump();
7070  ompd_init();
7071 #endif
7072 
7073  __kmp_validate_locks();
7074 
7075 #if ENABLE_LIBOMPTARGET
7076  /* Initialize functions from libomptarget */
7077  __kmp_init_omptarget();
7078 #endif
7079 
7080  /* Initialize internal memory allocator */
7081  __kmp_init_allocator();
7082 
7083  /* Register the library startup via an environment variable or via mapped
7084  shared memory file and check to see whether another copy of the library is
7085  already registered. Since a forked child process is often terminated, we
7086  postpone the registration until middle initialization in the child. */
7087  if (__kmp_need_register_serial)
7088  __kmp_register_library_startup();
7089 
7090  /* TODO reinitialization of library */
7091  if (TCR_4(__kmp_global.g.g_done)) {
7092  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7093  }
7094 
7095  __kmp_global.g.g_abort = 0;
7096  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7097 
7098 /* initialize the locks */
7099 #if KMP_USE_ADAPTIVE_LOCKS
7100 #if KMP_DEBUG_ADAPTIVE_LOCKS
7101  __kmp_init_speculative_stats();
7102 #endif
7103 #endif
7104 #if KMP_STATS_ENABLED
7105  __kmp_stats_init();
7106 #endif
7107  __kmp_init_lock(&__kmp_global_lock);
7108  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7109  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7110  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7111  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7112  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7113  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7114  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7115  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7116  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7117  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7118  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7119  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7120  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7121  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7122  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7123 #if KMP_USE_MONITOR
7124  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7125 #endif
7126  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7127 
7128  /* conduct initialization and initial setup of configuration */
7129 
7130  __kmp_runtime_initialize();
7131 
7132 #if KMP_MIC_SUPPORTED
7133  __kmp_check_mic_type();
7134 #endif
7135 #if ENABLE_LIBOMPTARGET
7136  __kmp_target_init();
7137 #endif /* ENABLE_LIBOMPTARGET */
7138 
7139 // Some global variable initialization moved here from kmp_env_initialize()
7140 #ifdef KMP_DEBUG
7141  kmp_diag = 0;
7142 #endif
7143  __kmp_abort_delay = 0;
7144 
7145  // From __kmp_init_dflt_team_nth()
7146  /* assume the entire machine will be used */
7147  __kmp_dflt_team_nth_ub = __kmp_xproc;
7148  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7149  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7150  }
7151  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7152  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7153  }
7154  __kmp_max_nth = __kmp_sys_max_nth;
7155  __kmp_cg_max_nth = __kmp_sys_max_nth;
7156  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7157  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7158  __kmp_teams_max_nth = __kmp_sys_max_nth;
7159  }
7160 
7161  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7162  // part
7163  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7164 #if KMP_USE_MONITOR
7165  __kmp_monitor_wakeups =
7166  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7167  __kmp_bt_intervals =
7168  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7169 #endif
7170  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7171  __kmp_library = library_throughput;
7172  // From KMP_SCHEDULE initialization
7173  __kmp_static = kmp_sch_static_balanced;
7174 // AC: do not use analytical here, because it is non-monotonic
7175 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7176 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7177 // need to repeat assignment
7178 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7179 // bit control and barrier method control parts
7180 #if KMP_FAST_REDUCTION_BARRIER
7181 #define kmp_reduction_barrier_gather_bb ((int)1)
7182 #define kmp_reduction_barrier_release_bb ((int)1)
7183 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7184 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7185 #endif // KMP_FAST_REDUCTION_BARRIER
7186  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7187  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7188  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7189  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7190  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7191 #if KMP_FAST_REDUCTION_BARRIER
7192  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX
7193  // only (lin_64): hyper,1
7194  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7195  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7196  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7197  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7198  }
7199 #endif // KMP_FAST_REDUCTION_BARRIER
7200  }
7201 #if KMP_FAST_REDUCTION_BARRIER
7202 #undef kmp_reduction_barrier_release_pat
7203 #undef kmp_reduction_barrier_gather_pat
7204 #undef kmp_reduction_barrier_release_bb
7205 #undef kmp_reduction_barrier_gather_bb
7206 #endif // KMP_FAST_REDUCTION_BARRIER
7207 #if KMP_MIC_SUPPORTED
7208  if (__kmp_mic_type == mic2) { // KNC
7209  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7210  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7211  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7212  1; // forkjoin release
7213  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7214  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7215  }
7216 #if KMP_FAST_REDUCTION_BARRIER
7217  if (__kmp_mic_type == mic2) { // KNC
7218  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7219  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7220  }
7221 #endif // KMP_FAST_REDUCTION_BARRIER
7222 #endif // KMP_MIC_SUPPORTED
7223 
7224 // From KMP_CHECKS initialization
7225 #ifdef KMP_DEBUG
7226  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7227 #else
7228  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7229 #endif
7230 
7231  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7232  __kmp_foreign_tp = TRUE;
7233 
7234  __kmp_global.g.g_dynamic = FALSE;
7235  __kmp_global.g.g_dynamic_mode = dynamic_default;
7236 
7237  __kmp_init_nesting_mode();
7238 
7239  __kmp_env_initialize(NULL);
7240 
7241 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7242  __kmp_user_level_mwait_init();
7243 #endif
7244 // Print all messages in message catalog for testing purposes.
7245 #ifdef KMP_DEBUG
7246  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7247  if (__kmp_str_match_true(val)) {
7248  kmp_str_buf_t buffer;
7249  __kmp_str_buf_init(&buffer);
7250  __kmp_i18n_dump_catalog(&buffer);
7251  __kmp_printf("%s", buffer.str);
7252  __kmp_str_buf_free(&buffer);
7253  }
7254  __kmp_env_free(&val);
7255 #endif
7256 
7257  __kmp_threads_capacity =
7258  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7259  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7260  __kmp_tp_capacity = __kmp_default_tp_capacity(
7261  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7262 
7263  // If the library is shut down properly, both pools must be NULL. Just in
7264  // case, set them to NULL -- some memory may leak, but subsequent code will
7265  // work even if pools are not freed.
7266  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7267  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7268  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7269  __kmp_thread_pool = NULL;
7270  __kmp_thread_pool_insert_pt = NULL;
7271  __kmp_team_pool = NULL;
7272 
7273  /* Allocate all of the variable sized records */
7274  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7275  * expandable */
7276  /* Since allocation is cache-aligned, just add extra padding at the end */
7277  size =
7278  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7279  CACHE_LINE;
7280  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7281  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7282  sizeof(kmp_info_t *) * __kmp_threads_capacity);
7283 
7284  /* init thread counts */
7285  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7286  0); // Asserts fail if the library is reinitializing and
7287  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7288  __kmp_all_nth = 0;
7289  __kmp_nth = 0;
7290 
7291  /* setup the uber master thread and hierarchy */
7292  gtid = __kmp_register_root(TRUE);
7293  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7294  KMP_ASSERT(KMP_UBER_GTID(gtid));
7295  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7296 
7297  KMP_MB(); /* Flush all pending memory write invalidates. */
7298 
7299  __kmp_common_initialize();
7300 
7301 #if KMP_OS_UNIX
7302  /* invoke the child fork handler */
7303  __kmp_register_atfork();
7304 #endif
7305 
7306 #if !KMP_DYNAMIC_LIB || \
7307  ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7308  {
7309  /* Invoke the exit handler when the program finishes, only for static
7310  library and macOS* dynamic. For other dynamic libraries, we already
7311  have _fini and DllMain. */
7312  int rc = atexit(__kmp_internal_end_atexit);
7313  if (rc != 0) {
7314  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7315  __kmp_msg_null);
7316  }
7317  }
7318 #endif
7319 
7320 #if KMP_HANDLE_SIGNALS
7321 #if KMP_OS_UNIX
7322  /* NOTE: make sure that this is called before the user installs their own
7323  signal handlers so that the user handlers are called first. this way they
7324  can return false, not call our handler, avoid terminating the library, and
7325  continue execution where they left off. */
7326  __kmp_install_signals(FALSE);
7327 #endif /* KMP_OS_UNIX */
7328 #if KMP_OS_WINDOWS
7329  __kmp_install_signals(TRUE);
7330 #endif /* KMP_OS_WINDOWS */
7331 #endif
7332 
7333  /* we have finished the serial initialization */
7334  __kmp_init_counter++;
7335 
7336  __kmp_init_serial = TRUE;
7337 
7338  if (__kmp_version) {
7339  __kmp_print_version_1();
7340  }
7341 
7342  if (__kmp_settings) {
7343  __kmp_env_print();
7344  }
7345 
7346  if (__kmp_display_env || __kmp_display_env_verbose) {
7347  __kmp_env_print_2();
7348  }
7349 
7350 #if OMPT_SUPPORT
7351  ompt_post_init();
7352 #endif
7353 
7354  KMP_MB();
7355 
7356  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7357 }
7358 
7359 void __kmp_serial_initialize(void) {
7360  if (__kmp_init_serial) {
7361  return;
7362  }
7363  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7364  if (__kmp_init_serial) {
7365  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7366  return;
7367  }
7368  __kmp_do_serial_initialize();
7369  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7370 }
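
// __kmp_serial_initialize, __kmp_middle_initialize and
// __kmp_parallel_initialize all use the same double-checked pattern: a
// lock-free fast-path test, then the test repeated under __kmp_initz_lock so
// that exactly one thread runs the expensive step. Schematically (names are
// placeholders, not runtime symbols):
#if 0
void checked_initialize(void) {
  if (initialized) // fast path: already done, skip the lock entirely
    return;
  acquire(&initz_lock);
  if (initialized) { // lost the race while waiting for the lock
    release(&initz_lock);
    return;
  }
  do_initialize(); // exactly one thread reaches this
  release(&initz_lock);
}
#endif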
7371 
7372 static void __kmp_do_middle_initialize(void) {
7373  int i, j;
7374  int prev_dflt_team_nth;
7375 
7376  if (!__kmp_init_serial) {
7377  __kmp_do_serial_initialize();
7378  }
7379 
7380  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7381 
7382  if (UNLIKELY(!__kmp_need_register_serial)) {
7383  // We are in a forked child process. The registration was skipped during
7384  // serial initialization in __kmp_atfork_child handler. Do it here.
7385  __kmp_register_library_startup();
7386  }
7387 
7388  // Save the previous value for the __kmp_dflt_team_nth so that
7389  // we can avoid some reinitialization if it hasn't changed.
7390  prev_dflt_team_nth = __kmp_dflt_team_nth;
7391 
7392 #if KMP_AFFINITY_SUPPORTED
7393  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7394  // number of cores on the machine.
7395  __kmp_affinity_initialize(__kmp_affinity);
7396 
7397 #endif /* KMP_AFFINITY_SUPPORTED */
7398 
7399  KMP_ASSERT(__kmp_xproc > 0);
7400  if (__kmp_avail_proc == 0) {
7401  __kmp_avail_proc = __kmp_xproc;
7402  }
7403 
7404  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7405  // correct them now
7406  j = 0;
7407  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7408  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7409  __kmp_avail_proc;
7410  j++;
7411  }
7412 
7413  if (__kmp_dflt_team_nth == 0) {
7414 #ifdef KMP_DFLT_NTH_CORES
7415  // Default #threads = #cores
7416  __kmp_dflt_team_nth = __kmp_ncores;
7417  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7418  "__kmp_ncores (%d)\n",
7419  __kmp_dflt_team_nth));
7420 #else
7421  // Default #threads = #available OS procs
7422  __kmp_dflt_team_nth = __kmp_avail_proc;
7423  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7424  "__kmp_avail_proc(%d)\n",
7425  __kmp_dflt_team_nth));
7426 #endif /* KMP_DFLT_NTH_CORES */
7427  }
7428 
7429  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7430  __kmp_dflt_team_nth = KMP_MIN_NTH;
7431  }
7432  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7433  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7434  }
7435 
7436  if (__kmp_nesting_mode > 0)
7437  __kmp_set_nesting_mode_threads();
7438 
7439  // There's no harm in continuing if the following check fails,
7440  // but it indicates an error in the previous logic.
7441  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7442 
7443  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7444  // Run through the __kmp_threads array and set the num threads icv for each
7445  // root thread that is currently registered with the RTL (which has not
7446  // already explicitly set its nthreads-var with a call to
7447  // omp_set_num_threads()).
7448  for (i = 0; i < __kmp_threads_capacity; i++) {
7449  kmp_info_t *thread = __kmp_threads[i];
7450  if (thread == NULL)
7451  continue;
7452  if (thread->th.th_current_task->td_icvs.nproc != 0)
7453  continue;
7454 
7455  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7456  }
7457  }
7458  KA_TRACE(
7459  20,
7460  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7461  __kmp_dflt_team_nth));
7462 
7463 #ifdef KMP_ADJUST_BLOCKTIME
7464  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7465  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7466  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7467  if (__kmp_nth > __kmp_avail_proc) {
7468  __kmp_zero_bt = TRUE;
7469  }
7470  }
7471 #endif /* KMP_ADJUST_BLOCKTIME */
7472 
7473  /* we have finished middle initialization */
7474  TCW_SYNC_4(__kmp_init_middle, TRUE);
7475 
7476  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7477 }
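
// The KMP_ADJUST_BLOCKTIME logic above is the inverse of the checks in
// __kmp_free_thread and __kmp_reap_thread: once __kmp_nth exceeds
// __kmp_avail_proc (oversubscription), blocktime is forced to zero
// (__kmp_zero_bt = TRUE) so idle threads sleep instead of spin-waiting; when
// the thread count drops back to the available processor count, the user's
// blocktime setting takes effect again.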
7478 
7479 void __kmp_middle_initialize(void) {
7480  if (__kmp_init_middle) {
7481  return;
7482  }
7483  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7484  if (__kmp_init_middle) {
7485  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7486  return;
7487  }
7488  __kmp_do_middle_initialize();
7489  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7490 }
7491 
7492 void __kmp_parallel_initialize(void) {
7493  int gtid = __kmp_entry_gtid(); // this might be a new root
7494 
7495  /* synchronize parallel initialization (for sibling) */
7496  if (TCR_4(__kmp_init_parallel))
7497  return;
7498  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7499  if (TCR_4(__kmp_init_parallel)) {
7500  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7501  return;
7502  }
7503 
7504  /* TODO reinitialization after we have already shut down */
7505  if (TCR_4(__kmp_global.g.g_done)) {
7506  KA_TRACE(
7507  10,
7508  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7509  __kmp_infinite_loop();
7510  }
7511 
7512  /* jc: The lock __kmp_initz_lock is already held, so calling
7513  __kmp_serial_initialize would cause a deadlock. So we call
7514  __kmp_do_serial_initialize directly. */
7515  if (!__kmp_init_middle) {
7516  __kmp_do_middle_initialize();
7517  }
7518  __kmp_assign_root_init_mask();
7519  __kmp_resume_if_hard_paused();
7520 
7521  /* begin initialization */
7522  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7523  KMP_ASSERT(KMP_UBER_GTID(gtid));
7524 
7525 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7526  // Save the FP control regs.
7527  // Worker threads will set theirs to these values at thread startup.
7528  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7529  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7530  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7531 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7532 
7533 #if KMP_OS_UNIX
7534 #if KMP_HANDLE_SIGNALS
7535  /* must be after __kmp_serial_initialize */
7536  __kmp_install_signals(TRUE);
7537 #endif
7538 #endif
7539 
7540  __kmp_suspend_initialize();
7541 
7542 #if defined(USE_LOAD_BALANCE)
7543  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7544  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7545  }
7546 #else
7547  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7548  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7549  }
7550 #endif
7551 
7552  if (__kmp_version) {
7553  __kmp_print_version_2();
7554  }
7555 
7556  /* we have finished parallel initialization */
7557  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7558 
7559  KMP_MB();
7560  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7561 
7562  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7563 }
7564 
7565 void __kmp_hidden_helper_initialize() {
7566  if (TCR_4(__kmp_init_hidden_helper))
7567  return;
7568 
7569  // __kmp_parallel_initialize is required before we initialize hidden helper
7570  if (!TCR_4(__kmp_init_parallel))
7571  __kmp_parallel_initialize();
7572 
7573  // Double check. Note that this double check must not be placed before
7574  // __kmp_parallel_initialize, as that would cause a deadlock.
7575  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7576  if (TCR_4(__kmp_init_hidden_helper)) {
7577  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7578  return;
7579  }
7580 
7581 #if KMP_AFFINITY_SUPPORTED
7582  // Initialize hidden helper affinity settings.
7583  // The above __kmp_parallel_initialize() will initialize
7584  // regular affinity (and topology) if not already done.
7585  if (!__kmp_hh_affinity.flags.initialized)
7586  __kmp_affinity_initialize(__kmp_hh_affinity);
7587 #endif
7588 
7589  // Set the count of hidden helper tasks to be executed to zero
7590  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7591 
7592  // Set the global variable indicating that we're initializing hidden helper
7593  // team/threads
7594  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7595 
7596  // Platform independent initialization
7597  __kmp_do_initialize_hidden_helper_threads();
7598 
7599  // Wait here for the initialization of the hidden helper teams to finish
7600  __kmp_hidden_helper_threads_initz_wait();
7601 
7602  // We have finished hidden helper initialization
7603  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7604 
7605  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7606 }
7607 
7608 /* ------------------------------------------------------------------------ */
7609 
7610 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7611  kmp_team_t *team) {
7612  kmp_disp_t *dispatch;
7613 
7614  KMP_MB();
7615 
7616  /* none of the threads has encountered any constructs yet. */
7617  this_thr->th.th_local.this_construct = 0;
7618 #if KMP_CACHE_MANAGE
7619  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7620 #endif /* KMP_CACHE_MANAGE */
7621  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7622  KMP_DEBUG_ASSERT(dispatch);
7623  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7624  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7625  // this_thr->th.th_info.ds.ds_tid ] );
7626 
7627  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7628  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7629  if (__kmp_env_consistency_check)
7630  __kmp_push_parallel(gtid, team->t.t_ident);
7631 
7632  KMP_MB(); /* Flush all pending memory write invalidates. */
7633 }
7634 
7635 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7636  kmp_team_t *team) {
7637  if (__kmp_env_consistency_check)
7638  __kmp_pop_parallel(gtid, team->t.t_ident);
7639 
7640  __kmp_finish_implicit_task(this_thr);
7641 }
7642 
7643 int __kmp_invoke_task_func(int gtid) {
7644  int rc;
7645  int tid = __kmp_tid_from_gtid(gtid);
7646  kmp_info_t *this_thr = __kmp_threads[gtid];
7647  kmp_team_t *team = this_thr->th.th_team;
7648 
7649  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7650 #if USE_ITT_BUILD
7651  if (__itt_stack_caller_create_ptr) {
7652  // inform ittnotify about entering user's code
7653  if (team->t.t_stack_id != NULL) {
7654  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7655  } else {
7656  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7657  __kmp_itt_stack_callee_enter(
7658  (__itt_caller)team->t.t_parent->t.t_stack_id);
7659  }
7660  }
7661 #endif /* USE_ITT_BUILD */
7662 #if INCLUDE_SSC_MARKS
7663  SSC_MARK_INVOKING();
7664 #endif
7665 
7666 #if OMPT_SUPPORT
7667  void *dummy;
7668  void **exit_frame_p;
7669  ompt_data_t *my_task_data;
7670  ompt_data_t *my_parallel_data;
7671  int ompt_team_size;
7672 
7673  if (ompt_enabled.enabled) {
7674  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7675  .ompt_task_info.frame.exit_frame.ptr);
7676  } else {
7677  exit_frame_p = &dummy;
7678  }
7679 
7680  my_task_data =
7681  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7682  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7683  if (ompt_enabled.ompt_callback_implicit_task) {
7684  ompt_team_size = team->t.t_nproc;
7685  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7686  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7687  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7688  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7689  }
7690 #endif
7691 
7692 #if KMP_STATS_ENABLED
7693  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7694  if (previous_state == stats_state_e::TEAMS_REGION) {
7695  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7696  } else {
7697  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7698  }
7699  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7700 #endif
7701 
7702  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7703  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7704 #if OMPT_SUPPORT
7705  ,
7706  exit_frame_p
7707 #endif
7708  );
7709 #if OMPT_SUPPORT
7710  *exit_frame_p = NULL;
7711  this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7712 #endif
7713 
7714 #if KMP_STATS_ENABLED
7715  if (previous_state == stats_state_e::TEAMS_REGION) {
7716  KMP_SET_THREAD_STATE(previous_state);
7717  }
7718  KMP_POP_PARTITIONED_TIMER();
7719 #endif
7720 
7721 #if USE_ITT_BUILD
7722  if (__itt_stack_caller_create_ptr) {
7723  // inform ittnotify about leaving user's code
7724  if (team->t.t_stack_id != NULL) {
7725  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7726  } else {
7727  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7728  __kmp_itt_stack_callee_leave(
7729  (__itt_caller)team->t.t_parent->t.t_stack_id);
7730  }
7731  }
7732 #endif /* USE_ITT_BUILD */
7733  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7734 
7735  return rc;
7736 }
7737 
7738 void __kmp_teams_master(int gtid) {
7739  // This routine is called by all primary threads in a teams construct
7740  kmp_info_t *thr = __kmp_threads[gtid];
7741  kmp_team_t *team = thr->th.th_team;
7742  ident_t *loc = team->t.t_ident;
7743  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7744  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7745  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7746  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7747  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7748 
7749  // This thread is a new CG root. Set up the proper variables.
7750  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7751  tmp->cg_root = thr; // Make thr the CG root
7752  // Init to thread limit stored when league primary threads were forked
7753  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7754  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7755  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7756  " cg_nthreads to 1\n",
7757  thr, tmp));
7758  tmp->up = thr->th.th_cg_roots;
7759  thr->th.th_cg_roots = tmp;
7760 
7761 // Launch the league of teams now, but do not let workers execute
7762 // (they wait on the fork barrier until the next parallel region)
7763 #if INCLUDE_SSC_MARKS
7764  SSC_MARK_FORKING();
7765 #endif
7766  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7767  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7768  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7769 #if INCLUDE_SSC_MARKS
7770  SSC_MARK_JOINING();
7771 #endif
7772  // If the team size was reduced from the limit, set it to the new size
7773  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7774  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7775  // AC: the last parameter "1" eliminates the join barrier, which would not
7776  // work because worker threads are at the fork barrier awaiting more regions
7777  __kmp_join_call(loc, gtid
7778 #if OMPT_SUPPORT
7779  ,
7780  fork_context_intel
7781 #endif
7782  ,
7783  1);
7784 }
7785 
7786 int __kmp_invoke_teams_master(int gtid) {
7787  kmp_info_t *this_thr = __kmp_threads[gtid];
7788  kmp_team_t *team = this_thr->th.th_team;
7789 #if KMP_DEBUG
7790  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7791  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7792  (void *)__kmp_teams_master);
7793 #endif
7794  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7795 #if OMPT_SUPPORT
7796  int tid = __kmp_tid_from_gtid(gtid);
7797  ompt_data_t *task_data =
7798  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7799  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7800  if (ompt_enabled.ompt_callback_implicit_task) {
7801  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7802  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7803  ompt_task_initial);
7804  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7805  }
7806 #endif
7807  __kmp_teams_master(gtid);
7808 #if OMPT_SUPPORT
7809  this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7810 #endif
7811  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7812  return 1;
7813 }
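// Illustrative sketch (not runtime code): user source that exercises the two
// routines above. The compiler outlines the teams region; each league primary
// thread then runs __kmp_invoke_teams_master -> __kmp_teams_master, which
// forks the inner team through __kmp_fork_call:
//
//   #pragma omp teams num_teams(4)
//   {
//     #pragma omp parallel
//     do_work(omp_get_team_num(), omp_get_thread_num()); // do_work: placeholder
//   }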
7814 
7815 /* This sets the requested number of threads for the next parallel region
7816  encountered by this team. Since this should be enclosed in the fork/join
7817  critical section, it should avoid race conditions with asymmetrical nested
7818  parallelism. */
7819 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7820  kmp_info_t *thr = __kmp_threads[gtid];
7821 
7822  if (num_threads > 0)
7823  thr->th.th_set_nproc = num_threads;
7824 }
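// Illustrative sketch: this is the runtime half of the num_threads clause.
// Assuming the usual lowering through __kmpc_push_num_threads (which forwards
// here), user code such as
//
//   #pragma omp parallel num_threads(4)
//   { /* ... */ }
//
// arrives as th_set_nproc = 4 immediately before the fork.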
7825 
7826 void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7827  int *num_threads_list) {
7828  kmp_info_t *thr = __kmp_threads[gtid];
7829 
7830  KMP_DEBUG_ASSERT(list_length > 1);
7831 
7832  if (num_threads_list[0] > 0)
7833  thr->th.th_set_nproc = num_threads_list[0];
7834  thr->th.th_set_nested_nth =
7835  (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7836  for (kmp_uint32 i = 0; i < list_length; ++i)
7837  thr->th.th_set_nested_nth[i] = num_threads_list[i];
7838  thr->th.th_set_nested_nth_sz = list_length;
7839 }
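// Illustrative sketch (the originating syntax is an assumption): the list
// form corresponds to a num_threads clause carrying one value per nesting
// level, e.g.
//
//   #pragma omp parallel num_threads(4, 2)
//   { /* ... */ }
//
// Entry 0 sets th_set_nproc for the immediate region, and the whole list is
// saved in th_set_nested_nth to seed nested levels.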
7840 
7841 void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7842  const char *msg) {
7843  kmp_info_t *thr = __kmp_threads[gtid];
7844  thr->th.th_nt_strict = true;
7845  thr->th.th_nt_loc = loc;
7846  // if sev is unset make fatal
7847  if (sev == severity_warning)
7848  thr->th.th_nt_sev = sev;
7849  else
7850  thr->th.th_nt_sev = severity_fatal;
7851  // if msg is unset, use an appropriate message
7852  if (msg)
7853  thr->th.th_nt_msg = msg;
7854  else
7855  thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7856  "strict num_threads clause.";
7857 }
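// Illustrative sketch (assuming the strict modifier plus severity/message
// clauses of newer OpenMP versions are what feed sev and msg here):
//
//   #pragma omp parallel num_threads(strict : 8) severity(warning) \
//       message("could not form a team of exactly 8 threads")
//
// Under strict semantics, failure to form the requested team size reports
// th_nt_msg at th_nt_sev instead of silently shrinking the team.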
7858 
7859 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7860  int num_threads) {
7861  KMP_DEBUG_ASSERT(thr);
7862  // Remember the number of threads for inner parallel regions
7863  if (!TCR_4(__kmp_init_middle))
7864  __kmp_middle_initialize(); // get internal globals calculated
7865  __kmp_assign_root_init_mask();
7866  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7867  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7868 
7869  if (num_threads == 0) {
7870  if (__kmp_teams_thread_limit > 0) {
7871  num_threads = __kmp_teams_thread_limit;
7872  } else {
7873  num_threads = __kmp_avail_proc / num_teams;
7874  }
7875  // adjust num_threads w/o warning as it is not a user setting
7876  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7877  // no thread_limit clause specified - do not change thread-limit-var ICV
7878  if (num_threads > __kmp_dflt_team_nth) {
7879  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7880  }
7881  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7882  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7883  } // prevent the team size from exceeding thread-limit-var
7884  if (num_teams * num_threads > __kmp_teams_max_nth) {
7885  num_threads = __kmp_teams_max_nth / num_teams;
7886  }
7887  if (num_threads == 0) {
7888  num_threads = 1;
7889  }
7890  } else {
7891  if (num_threads < 0) {
7892  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7893  __kmp_msg_null);
7894  num_threads = 1;
7895  }
7896  // This thread will be the primary thread of the team of league primary threads
7897  // Store new thread limit; old limit is saved in th_cg_roots list
7898  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7899  // num_threads = min(num_threads, nthreads-var)
7900  if (num_threads > __kmp_dflt_team_nth) {
7901  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7902  }
7903  if (num_teams * num_threads > __kmp_teams_max_nth) {
7904  int new_threads = __kmp_teams_max_nth / num_teams;
7905  if (new_threads == 0) {
7906  new_threads = 1;
7907  }
7908  if (new_threads != num_threads) {
7909  if (!__kmp_reserve_warn) { // user asked for too many threads
7910  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7911  __kmp_msg(kmp_ms_warning,
7912  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7913  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7914  }
7915  }
7916  num_threads = new_threads;
7917  }
7918  }
7919  thr->th.th_teams_size.nth = num_threads;
7920 }
7921 
7922 /* This sets the requested number of teams for the teams region and/or
7923  the number of threads for the next parallel region encountered. */
7924 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7925  int num_threads) {
7926  kmp_info_t *thr = __kmp_threads[gtid];
7927  if (num_teams < 0) {
7928  // OpenMP specification requires requested values to be positive,
7929  // but people can send us any value, so we'd better check
7930  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7931  __kmp_msg_null);
7932  num_teams = 1;
7933  }
7934  if (num_teams == 0) {
7935  if (__kmp_nteams > 0) {
7936  num_teams = __kmp_nteams;
7937  } else {
7938  num_teams = 1; // default number of teams is 1.
7939  }
7940  }
7941  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7942  if (!__kmp_reserve_warn) {
7943  __kmp_reserve_warn = 1;
7944  __kmp_msg(kmp_ms_warning,
7945  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7946  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7947  }
7948  num_teams = __kmp_teams_max_nth;
7949  }
7950  // Set number of teams (number of threads in the outer "parallel" of the
7951  // teams)
7952  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7953 
7954  __kmp_push_thread_limit(thr, num_teams, num_threads);
7955 }
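// Illustrative sketch: both clauses of a teams construct funnel through this
// routine; num_teams sizes the outer league "parallel", and thread_limit is
// applied by __kmp_push_thread_limit above:
//
//   #pragma omp teams num_teams(8) thread_limit(4)
//   { /* ... */ }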
7956 
7957 /* This sets the requested number of teams for the teams region and/or
7958  the number of threads for the next parallel region encountered */
7959 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7960  int num_teams_ub, int num_threads) {
7961  kmp_info_t *thr = __kmp_threads[gtid];
7962  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7963  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7964  KMP_DEBUG_ASSERT(num_threads >= 0);
7965 
7966  if (num_teams_lb > num_teams_ub) {
7967  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7968  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7969  }
7970 
7971  int num_teams = 1; // default number of teams is 1.
7972 
7973  if (num_teams_lb == 0 && num_teams_ub > 0)
7974  num_teams_lb = num_teams_ub;
7975 
7976  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7977  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7978  if (num_teams > __kmp_teams_max_nth) {
7979  if (!__kmp_reserve_warn) {
7980  __kmp_reserve_warn = 1;
7981  __kmp_msg(kmp_ms_warning,
7982  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7983  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7984  }
7985  num_teams = __kmp_teams_max_nth;
7986  }
7987  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7988  num_teams = num_teams_ub;
7989  } else { // num_teams_lb <= num_teams <= num_teams_ub
7990  if (num_threads <= 0) {
7991  if (num_teams_ub > __kmp_teams_max_nth) {
7992  num_teams = num_teams_lb;
7993  } else {
7994  num_teams = num_teams_ub;
7995  }
7996  } else {
7997  num_teams = (num_threads > __kmp_teams_max_nth)
7998  ? num_teams
7999  : __kmp_teams_max_nth / num_threads;
8000  if (num_teams < num_teams_lb) {
8001  num_teams = num_teams_lb;
8002  } else if (num_teams > num_teams_ub) {
8003  num_teams = num_teams_ub;
8004  }
8005  }
8006  }
8007  // Set number of teams (number of threads in the outer "parallel" of the
8008  // teams)
8009  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8010 
8011  __kmp_push_thread_limit(thr, num_teams, num_threads);
8012 }
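// Illustrative sketch: the OpenMP 5.1 range form of num_teams maps onto
// num_teams_lb/num_teams_ub above, and the runtime picks a value in that
// range subject to __kmp_teams_max_nth:
//
//   #pragma omp teams num_teams(4 : 16)
//   { /* ... */ }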
8013 
8014 // Set the proc_bind var to use in the following parallel region.
8015 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8016  kmp_info_t *thr = __kmp_threads[gtid];
8017  thr->th.th_set_proc_bind = proc_bind;
8018 }
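// Illustrative sketch: the proc_bind clause is recorded here and consumed
// when the next team is formed:
//
//   #pragma omp parallel proc_bind(close) num_threads(4)
//   { /* ... */ }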
8019 
8020 /* Launch the worker threads into the microtask. */
8021 
8022 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8023  kmp_info_t *this_thr = __kmp_threads[gtid];
8024 
8025 #ifdef KMP_DEBUG
8026  int f;
8027 #endif /* KMP_DEBUG */
8028 
8029  KMP_DEBUG_ASSERT(team);
8030  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8031  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8032  KMP_MB(); /* Flush all pending memory write invalidates. */
8033 
8034  team->t.t_construct = 0; /* no single directives seen yet */
8035  team->t.t_ordered.dt.t_value =
8036  0; /* thread 0 enters the ordered section first */
8037 
8038  /* Reset the identifiers on the dispatch buffer */
8039  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8040  if (team->t.t_max_nproc > 1) {
8041  int i;
8042  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8043  team->t.t_disp_buffer[i].buffer_index = i;
8044  team->t.t_disp_buffer[i].doacross_buf_idx = i;
8045  }
8046  } else {
8047  team->t.t_disp_buffer[0].buffer_index = 0;
8048  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8049  }
8050 
8051  KMP_MB(); /* Flush all pending memory write invalidates. */
8052  KMP_ASSERT(this_thr->th.th_team == team);
8053 
8054 #ifdef KMP_DEBUG
8055  for (f = 0; f < team->t.t_nproc; f++) {
8056  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8057  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8058  }
8059 #endif /* KMP_DEBUG */
8060 
8061  /* release the worker threads so they may begin working */
8062  __kmp_fork_barrier(gtid, 0);
8063 }
8064 
8065 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8066  kmp_info_t *this_thr = __kmp_threads[gtid];
8067 
8068  KMP_DEBUG_ASSERT(team);
8069  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8070  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8071  KMP_MB(); /* Flush all pending memory write invalidates. */
8072 
8073  /* Join barrier after fork */
8074 
8075 #ifdef KMP_DEBUG
8076  if (__kmp_threads[gtid] &&
8077  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8078  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8079  __kmp_threads[gtid]);
8080  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8081  "team->t.t_nproc=%d\n",
8082  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8083  team->t.t_nproc);
8084  __kmp_print_structure();
8085  }
8086  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8087  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8088 #endif /* KMP_DEBUG */
8089 
8090  __kmp_join_barrier(gtid); /* wait for everyone */
8091 #if OMPT_SUPPORT
8092  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8093  if (ompt_enabled.enabled &&
8094  (ompt_state == ompt_state_wait_barrier_teams ||
8095  ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8096  int ds_tid = this_thr->th.th_info.ds.ds_tid;
8097  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8098  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8099 #if OMPT_OPTIONAL
8100  void *codeptr = NULL;
8101  if (KMP_MASTER_TID(ds_tid) &&
8102  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8103  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8104  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8105 
8106  ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8107  if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8108  sync_kind = ompt_sync_region_barrier_teams;
8109  if (ompt_enabled.ompt_callback_sync_region_wait) {
8110  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8111  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8112  }
8113  if (ompt_enabled.ompt_callback_sync_region) {
8114  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8115  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8116  }
8117 #endif
8118  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8119  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8120  ompt_scope_end, NULL, task_data, 0, ds_tid,
8121  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8122  }
8123  }
8124 #endif
8125 
8126  KMP_MB(); /* Flush all pending memory write invalidates. */
8127  KMP_ASSERT(this_thr->th.th_team == team);
8128 }
8129 
8130 /* ------------------------------------------------------------------------ */
8131 
8132 #ifdef USE_LOAD_BALANCE
8133 
8134 // Return the number of worker threads actively spinning in the hot team,
8135 // if we are at the outermost level of parallelism. Otherwise, return 0.
8136 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8137  int i;
8138  int retval;
8139  kmp_team_t *hot_team;
8140 
8141  if (root->r.r_active) {
8142  return 0;
8143  }
8144  hot_team = root->r.r_hot_team;
8145  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8146  return hot_team->t.t_nproc - 1; // Don't count primary thread
8147  }
8148 
8149  // Skip the primary thread - it is accounted for elsewhere.
8150  retval = 0;
8151  for (i = 1; i < hot_team->t.t_nproc; i++) {
8152  if (hot_team->t.t_threads[i]->th.th_active) {
8153  retval++;
8154  }
8155  }
8156  return retval;
8157 }
8158 
8159 // Perform an automatic adjustment to the number of
8160 // threads used by the next parallel region.
8161 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8162  int retval;
8163  int pool_active;
8164  int hot_team_active;
8165  int team_curr_active;
8166  int system_active;
8167 
8168  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8169  set_nproc));
8170  KMP_DEBUG_ASSERT(root);
8171  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8172  ->th.th_current_task->td_icvs.dynamic == TRUE);
8173  KMP_DEBUG_ASSERT(set_nproc > 1);
8174 
8175  if (set_nproc == 1) {
8176  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8177  return 1;
8178  }
8179 
8180  // Threads that are active in the thread pool, active in the hot team for this
8181  // particular root (if we are at the outer par level), and the currently
8182  // executing thread (to become the primary thread) are available to add to the
8183  // new team, but are currently contributing to the system load, and must be
8184  // accounted for.
8185  pool_active = __kmp_thread_pool_active_nth;
8186  hot_team_active = __kmp_active_hot_team_nproc(root);
8187  team_curr_active = pool_active + hot_team_active + 1;
8188 
8189  // Check the system load.
8190  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8191  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8192  "hot team active = %d\n",
8193  system_active, pool_active, hot_team_active));
8194 
8195  if (system_active < 0) {
8196  // There was an error reading the necessary info from /proc, so use the
8197  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8198  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8199  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8200  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8201 
8202  // Make this call behave like the thread limit algorithm.
8203  retval = __kmp_avail_proc - __kmp_nth +
8204  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8205  if (retval > set_nproc) {
8206  retval = set_nproc;
8207  }
8208  if (retval < KMP_MIN_NTH) {
8209  retval = KMP_MIN_NTH;
8210  }
8211 
8212  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8213  retval));
8214  return retval;
8215  }
8216 
8217  // There is a slight delay in the load balance algorithm in detecting new
8218  // running procs. The real system load at this instant should be at least as
8219  // large as the number of active OpenMP threads available to add to the team.
8220  if (system_active < team_curr_active) {
8221  system_active = team_curr_active;
8222  }
8223  retval = __kmp_avail_proc - system_active + team_curr_active;
8224  if (retval > set_nproc) {
8225  retval = set_nproc;
8226  }
8227  if (retval < KMP_MIN_NTH) {
8228  retval = KMP_MIN_NTH;
8229  }
8230 
8231  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8232  return retval;
8233 } // __kmp_load_balance_nproc()
8234 
8235 #endif /* USE_LOAD_BALANCE */
8236 
8237 /* ------------------------------------------------------------------------ */
8238 
8239 /* NOTE: this is called with the __kmp_init_lock held */
8240 void __kmp_cleanup(void) {
8241  int f;
8242 
8243  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8244 
8245  if (TCR_4(__kmp_init_parallel)) {
8246 #if KMP_HANDLE_SIGNALS
8247  __kmp_remove_signals();
8248 #endif
8249  TCW_4(__kmp_init_parallel, FALSE);
8250  }
8251 
8252  if (TCR_4(__kmp_init_middle)) {
8253 #if KMP_AFFINITY_SUPPORTED
8254  __kmp_affinity_uninitialize();
8255 #endif /* KMP_AFFINITY_SUPPORTED */
8256  __kmp_cleanup_hierarchy();
8257  TCW_4(__kmp_init_middle, FALSE);
8258  }
8259 
8260  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8261 
8262  if (__kmp_init_serial) {
8263  __kmp_runtime_destroy();
8264  __kmp_init_serial = FALSE;
8265  }
8266 
8267  __kmp_cleanup_threadprivate_caches();
8268 
8269  for (f = 0; f < __kmp_threads_capacity; f++) {
8270  if (__kmp_root[f] != NULL) {
8271  __kmp_free(__kmp_root[f]);
8272  __kmp_root[f] = NULL;
8273  }
8274  }
8275  __kmp_free(__kmp_threads);
8276  // __kmp_threads and __kmp_root were allocated as a single block, so there
8277  // is no need to free __kmp_root separately.
8278  __kmp_threads = NULL;
8279  __kmp_root = NULL;
8280  __kmp_threads_capacity = 0;
8281 
8282  // Free old __kmp_threads arrays if they exist.
8283  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8284  while (ptr) {
8285  kmp_old_threads_list_t *next = ptr->next;
8286  __kmp_free(ptr->threads);
8287  __kmp_free(ptr);
8288  ptr = next;
8289  }
8290  __kmp_old_threads_list = NULL;
8291 
8292 #if KMP_USE_DYNAMIC_LOCK
8293  __kmp_cleanup_indirect_user_locks();
8294 #else
8295  __kmp_cleanup_user_locks();
8296 #endif
8297 #if OMPD_SUPPORT
8298  if (ompd_env_block) {
8299  __kmp_free(ompd_env_block);
8300  ompd_env_block = NULL;
8301  ompd_env_block_size = 0;
8302  }
8303 #endif
8304 
8305 #if KMP_AFFINITY_SUPPORTED
8306  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8307  __kmp_cpuinfo_file = NULL;
8308 #endif /* KMP_AFFINITY_SUPPORTED */
8309 
8310 #if KMP_USE_ADAPTIVE_LOCKS
8311 #if KMP_DEBUG_ADAPTIVE_LOCKS
8312  __kmp_print_speculative_stats();
8313 #endif
8314 #endif
8315  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8316  __kmp_nested_nth.nth = NULL;
8317  __kmp_nested_nth.size = 0;
8318  __kmp_nested_nth.used = 0;
8319 
8320  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8321  __kmp_nested_proc_bind.bind_types = NULL;
8322  __kmp_nested_proc_bind.size = 0;
8323  __kmp_nested_proc_bind.used = 0;
8324  __kmp_dflt_team_nth = 0;
8325  __kmp_dflt_team_nth_ub = 0;
8326  if (__kmp_affinity_format) {
8327  KMP_INTERNAL_FREE(__kmp_affinity_format);
8328  __kmp_affinity_format = NULL;
8329  }
8330 
8331  __kmp_i18n_catclose();
8332 
8333  if (__kmp_nesting_nth_level)
8334  KMP_INTERNAL_FREE(__kmp_nesting_nth_level);
8335 
8336 #if KMP_USE_HIER_SCHED
8337  __kmp_hier_scheds.deallocate();
8338 #endif
8339 
8340 #if KMP_STATS_ENABLED
8341  __kmp_stats_fini();
8342 #endif
8343 
8344  __kmpc_destroy_allocator(KMP_GTID_SHUTDOWN, __kmp_def_allocator);
8345  __kmp_def_allocator = omp_default_mem_alloc;
8346 
8347  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8348 }
8349 
8350 /* ------------------------------------------------------------------------ */
8351 
8352 int __kmp_ignore_mppbeg(void) {
8353  char *env;
8354 
8355  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8356  if (__kmp_str_match_false(env))
8357  return FALSE;
8358  }
8359  // By default __kmpc_begin() is a no-op.
8360  return TRUE;
8361 }
8362 
8363 int __kmp_ignore_mppend(void) {
8364  char *env;
8365 
8366  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8367  if (__kmp_str_match_false(env))
8368  return FALSE;
8369  }
8370  // By default __kmpc_end() is a no-op.
8371  return TRUE;
8372 }
8373 
8374 void __kmp_internal_begin(void) {
8375  int gtid;
8376  kmp_root_t *root;
8377 
8378  /* this is a very important step as it will register new sibling threads
8379  and assign these new uber threads a new gtid */
8380  gtid = __kmp_entry_gtid();
8381  root = __kmp_threads[gtid]->th.th_root;
8382  KMP_ASSERT(KMP_UBER_GTID(gtid));
8383 
8384  if (root->r.r_begin)
8385  return;
8386  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8387  if (root->r.r_begin) {
8388  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8389  return;
8390  }
8391 
8392  root->r.r_begin = TRUE;
8393 
8394  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8395 }
8396 
8397 /* ------------------------------------------------------------------------ */
8398 
8399 void __kmp_user_set_library(enum library_type arg) {
8400  int gtid;
8401  kmp_root_t *root;
8402  kmp_info_t *thread;
8403 
8404  /* first, make sure we are initialized so we can get our gtid */
8405 
8406  gtid = __kmp_entry_gtid();
8407  thread = __kmp_threads[gtid];
8408 
8409  root = thread->th.th_root;
8410 
8411  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8412  library_serial));
8413  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8414  thread */
8415  KMP_WARNING(SetLibraryIncorrectCall);
8416  return;
8417  }
8418 
8419  switch (arg) {
8420  case library_serial:
8421  thread->th.th_set_nproc = 0;
8422  set__nproc(thread, 1);
8423  break;
8424  case library_turnaround:
8425  thread->th.th_set_nproc = 0;
8426  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8427  : __kmp_dflt_team_nth_ub);
8428  break;
8429  case library_throughput:
8430  thread->th.th_set_nproc = 0;
8431  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8432  : __kmp_dflt_team_nth_ub);
8433  break;
8434  default:
8435  KMP_FATAL(UnknownLibraryType, arg);
8436  }
8437 
8438  __kmp_aux_set_library(arg);
8439 }
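// Usage sketch (assuming the usual entry points into this routine): the mode
// can come from the KMP_LIBRARY environment variable or from the
// kmp_set_library* extension API, e.g.
//
//   kmp_set_library_throughput(); // equivalent to KMP_LIBRARY=throughput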
8440 
8441 void __kmp_aux_set_stacksize(size_t arg) {
8442  if (!__kmp_init_serial)
8443  __kmp_serial_initialize();
8444 
8445 #if KMP_OS_DARWIN
8446  if (arg & (0x1000 - 1)) {
8447  arg &= ~(0x1000 - 1);
8448  if (arg + 0x1000) /* check for overflow if we round up */
8449  arg += 0x1000;
8450  }
8451 #endif
8452  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8453 
8454  /* only change the default stacksize before the first parallel region */
8455  if (!TCR_4(__kmp_init_parallel)) {
8456  size_t value = arg; /* argument is in bytes */
8457 
8458  if (value < __kmp_sys_min_stksize)
8459  value = __kmp_sys_min_stksize;
8460  else if (value > KMP_MAX_STKSIZE)
8461  value = KMP_MAX_STKSIZE;
8462 
8463  __kmp_stksize = value;
8464 
8465  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8466  }
8467 
8468  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8469 }
8470 
8471 /* set the behaviour of the runtime library */
8472 /* TODO this can cause some odd behaviour with sibling parallelism... */
8473 void __kmp_aux_set_library(enum library_type arg) {
8474  __kmp_library = arg;
8475 
8476  switch (__kmp_library) {
8477  case library_serial: {
8478  KMP_INFORM(LibraryIsSerial);
8479  } break;
8480  case library_turnaround:
8481  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8482  __kmp_use_yield = 2; // only yield when oversubscribed
8483  break;
8484  case library_throughput:
8485  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8486  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8487  break;
8488  default:
8489  KMP_FATAL(UnknownLibraryType, arg);
8490  }
8491 }
8492 
8493 /* Getting team information common for all team API */
8494 // Returns NULL if not in teams construct
8495 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8496  kmp_info_t *thr = __kmp_entry_thread();
8497  teams_serialized = 0;
8498  if (thr->th.th_teams_microtask) {
8499  kmp_team_t *team = thr->th.th_team;
8500  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8501  int ii = team->t.t_level;
8502  teams_serialized = team->t.t_serialized;
8503  int level = tlevel + 1;
8504  KMP_DEBUG_ASSERT(ii >= tlevel);
8505  while (ii > level) {
8506  for (teams_serialized = team->t.t_serialized;
8507  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8508  }
8509  if (team->t.t_serialized && (!teams_serialized)) {
8510  team = team->t.t_parent;
8511  continue;
8512  }
8513  if (ii > level) {
8514  team = team->t.t_parent;
8515  ii--;
8516  }
8517  }
8518  return team;
8519  }
8520  return NULL;
8521 }
8522 
8523 int __kmp_aux_get_team_num() {
8524  int serialized;
8525  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8526  if (team) {
8527  if (serialized > 1) {
8528  return 0; // teams region is serialized ( 1 team of 1 thread ).
8529  } else {
8530  return team->t.t_master_tid;
8531  }
8532  }
8533  return 0;
8534 }
8535 
8536 int __kmp_aux_get_num_teams() {
8537  int serialized;
8538  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8539  if (team) {
8540  if (serialized > 1) {
8541  return 1;
8542  } else {
8543  return team->t.t_parent->t.t_nproc;
8544  }
8545  }
8546  return 1;
8547 }
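// Usage sketch: these two helpers back the user-facing queries, e.g.
//
//   #pragma omp teams num_teams(4)
//   printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
//
// In a serialized teams region they report the single-team values (0 and 1).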
8548 
8549 /* ------------------------------------------------------------------------ */
8550 
8551 /*
8552  * Affinity Format Parser
8553  *
8554  * A field has the form: %[[[0].]size]type
8555  * % and type are required (%% means print a literal '%')
8556  * type is either single char or long name surrounded by {},
8557  * e.g., N or {num_threads}
8558  * 0 => leading zeros
8559  * . => right justified when size is specified
8560  * by default output is left justified
8561  * size is the *minimum* field length
8562  * All other characters are printed as is
8563  *
8564  * Available field types (see __kmp_affinity_format_table below):
8565  * t {team_num} - omp_get_team_num()
8566  * T {num_teams} - omp_get_num_teams()
8567  * L {nesting_level} - omp_get_level()
8568  * n {thread_num} - omp_get_thread_num()
8569  * N {num_threads} - omp_get_num_threads()
8570  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8571  * H {host} - host name, P {process_id} - pid, i {native_thread_id} - tid
8572  * A {thread_affinity} - comma separated list of integers or integer ranges
8573  * (values of affinity mask)
8574  *
8575  * Implementation-specific field types can be added
8576  * If a type is unknown, print "undefined"
8577  */
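// Worked example of the grammar above (a sketch, not runtime code): with
// OMP_AFFINITY_FORMAT="host=%H pid=%P thread %0.4n binds to %A", a thread
// might print
//
//   host=node01 pid=4711 thread 0002 binds to 8-15
//
// since "%0.4n" requests a zero-padded, right-justified thread number with a
// minimum field width of 4.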
8578 
8579 // Structure holding the short name, long name, and corresponding data type
8580 // for snprintf. A table of these will represent the entire valid keyword
8581 // field types.
8582 typedef struct kmp_affinity_format_field_t {
8583  char short_name; // from spec e.g., L -> thread level
8584  const char *long_name; // from spec thread_level -> thread level
8585  char field_format; // data type for snprintf (typically 'd' or 's'
8586  // for integer or string)
8587 } kmp_affinity_format_field_t;
8588 
8589 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8590 #if KMP_AFFINITY_SUPPORTED
8591  {'A', "thread_affinity", 's'},
8592 #endif
8593  {'t', "team_num", 'd'},
8594  {'T', "num_teams", 'd'},
8595  {'L', "nesting_level", 'd'},
8596  {'n', "thread_num", 'd'},
8597  {'N', "num_threads", 'd'},
8598  {'a', "ancestor_tnum", 'd'},
8599  {'H', "host", 's'},
8600  {'P', "process_id", 'd'},
8601  {'i', "native_thread_id", 'd'}};
8602 
8603 // Return the number of characters needed to hold the field
8604 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8605  const char **ptr,
8606  kmp_str_buf_t *field_buffer) {
8607  int rc, format_index, field_value;
8608  const char *width_left, *width_right;
8609  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8610  static const int FORMAT_SIZE = 20;
8611  char format[FORMAT_SIZE] = {0};
8612  char absolute_short_name = 0;
8613 
8614  KMP_DEBUG_ASSERT(gtid >= 0);
8615  KMP_DEBUG_ASSERT(th);
8616  KMP_DEBUG_ASSERT(**ptr == '%');
8617  KMP_DEBUG_ASSERT(field_buffer);
8618 
8619  __kmp_str_buf_clear(field_buffer);
8620 
8621  // Skip the initial %
8622  (*ptr)++;
8623 
8624  // Check for %% first
8625  if (**ptr == '%') {
8626  __kmp_str_buf_cat(field_buffer, "%", 1);
8627  (*ptr)++; // skip over the second %
8628  return 1;
8629  }
8630 
8631  // Parse field modifiers if they are present
8632  pad_zeros = false;
8633  if (**ptr == '0') {
8634  pad_zeros = true;
8635  (*ptr)++; // skip over 0
8636  }
8637  right_justify = false;
8638  if (**ptr == '.') {
8639  right_justify = true;
8640  (*ptr)++; // skip over .
8641  }
8642  // Parse width of field: [width_left, width_right)
8643  width_left = width_right = NULL;
8644  if (**ptr >= '0' && **ptr <= '9') {
8645  width_left = *ptr;
8646  SKIP_DIGITS(*ptr);
8647  width_right = *ptr;
8648  }
8649 
8650  // Create the format for KMP_SNPRINTF based on flags parsed above
8651  format_index = 0;
8652  format[format_index++] = '%';
8653  if (!right_justify)
8654  format[format_index++] = '-';
8655  if (pad_zeros)
8656  format[format_index++] = '0';
8657  if (width_left && width_right) {
8658  int i = 0;
8659  // Only allow 8-digit number widths.
8660  // This also prevents overflowing the format variable
8661  while (i < 8 && width_left < width_right) {
8662  format[format_index++] = *width_left;
8663  width_left++;
8664  i++;
8665  }
8666  }
8667 
8668  // Parse a name (long or short)
8669  // Canonicalize the name into absolute_short_name
8670  found_valid_name = false;
8671  parse_long_name = (**ptr == '{');
8672  if (parse_long_name)
8673  (*ptr)++; // skip initial left brace
8674  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8675  sizeof(__kmp_affinity_format_table[0]);
8676  ++i) {
8677  char short_name = __kmp_affinity_format_table[i].short_name;
8678  const char *long_name = __kmp_affinity_format_table[i].long_name;
8679  char field_format = __kmp_affinity_format_table[i].field_format;
8680  if (parse_long_name) {
8681  size_t length = KMP_STRLEN(long_name);
8682  if (strncmp(*ptr, long_name, length) == 0) {
8683  found_valid_name = true;
8684  (*ptr) += length; // skip the long name
8685  }
8686  } else if (**ptr == short_name) {
8687  found_valid_name = true;
8688  (*ptr)++; // skip the short name
8689  }
8690  if (found_valid_name) {
8691  format[format_index++] = field_format;
8692  format[format_index++] = '\0';
8693  absolute_short_name = short_name;
8694  break;
8695  }
8696  }
8697  if (parse_long_name) {
8698  if (**ptr != '}') {
8699  absolute_short_name = 0;
8700  } else {
8701  (*ptr)++; // skip over the right brace
8702  }
8703  }
8704 
8705  // Attempt to fill the buffer with the requested
8706  // value using snprintf within __kmp_str_buf_print()
8707  switch (absolute_short_name) {
8708  case 't':
8709  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8710  break;
8711  case 'T':
8712  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8713  break;
8714  case 'L':
8715  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8716  break;
8717  case 'n':
8718  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8719  break;
8720  case 'H': {
8721  static const int BUFFER_SIZE = 256;
8722  char buf[BUFFER_SIZE];
8723  __kmp_expand_host_name(buf, BUFFER_SIZE);
8724  rc = __kmp_str_buf_print(field_buffer, format, buf);
8725  } break;
8726  case 'P':
8727  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8728  break;
8729  case 'i':
8730  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8731  break;
8732  case 'N':
8733  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8734  break;
8735  case 'a':
8736  field_value =
8737  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8738  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8739  break;
8740 #if KMP_AFFINITY_SUPPORTED
8741  case 'A': {
8742  if (th->th.th_affin_mask) {
8743  kmp_str_buf_t buf;
8744  __kmp_str_buf_init(&buf);
8745  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8746  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8747  __kmp_str_buf_free(&buf);
8748  } else {
8749  rc = __kmp_str_buf_print(field_buffer, "%s", "disabled");
8750  }
8751  } break;
8752 #endif
8753  default:
8754  // According to the spec, if an implementation does not have info for a
8755  // field type, then "undefined" is printed
8756  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8757  // Skip the field
8758  if (parse_long_name) {
8759  SKIP_TOKEN(*ptr);
8760  if (**ptr == '}')
8761  (*ptr)++;
8762  } else {
8763  (*ptr)++;
8764  }
8765  }
8766 
8767  KMP_ASSERT(format_index <= FORMAT_SIZE);
8768  return rc;
8769 }
8770 
8771 /*
8772  * Return number of characters needed to hold the affinity string
8773  * (not including null byte character)
8774  * The resultant string is printed to buffer, which the caller can then
8775  * handle afterwards
8776  */
8777 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8778  kmp_str_buf_t *buffer) {
8779  const char *parse_ptr;
8780  size_t retval;
8781  const kmp_info_t *th;
8782  kmp_str_buf_t field;
8783 
8784  KMP_DEBUG_ASSERT(buffer);
8785  KMP_DEBUG_ASSERT(gtid >= 0);
8786 
8787  __kmp_str_buf_init(&field);
8788  __kmp_str_buf_clear(buffer);
8789 
8790  th = __kmp_threads[gtid];
8791  retval = 0;
8792 
8793  // If format is NULL or a zero-length string, then we use the
8794  // affinity-format-var ICV
8795  parse_ptr = format;
8796  if (parse_ptr == NULL || *parse_ptr == '\0') {
8797  parse_ptr = __kmp_affinity_format;
8798  }
8799  KMP_DEBUG_ASSERT(parse_ptr);
8800 
8801  while (*parse_ptr != '\0') {
8802  // Parse a field
8803  if (*parse_ptr == '%') {
8804  // Put field in the buffer
8805  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8806  __kmp_str_buf_catbuf(buffer, &field);
8807  retval += rc;
8808  } else {
8809  // Put literal character in buffer
8810  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8811  retval++;
8812  parse_ptr++;
8813  }
8814  }
8815  __kmp_str_buf_free(&field);
8816  return retval;
8817 }
8818 
8819 // Displays the affinity string to stdout
8820 void __kmp_aux_display_affinity(int gtid, const char *format) {
8821  kmp_str_buf_t buf;
8822  __kmp_str_buf_init(&buf);
8823  __kmp_aux_capture_affinity(gtid, format, &buf);
8824  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8825  __kmp_str_buf_free(&buf);
8826 }
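// Usage sketch: this is the worker behind OMP_DISPLAY_AFFINITY=TRUE and the
// OpenMP 5.0 display routine:
//
//   omp_display_affinity(NULL); // NULL => use the affinity-format-var ICV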
8827 
8828 /* ------------------------------------------------------------------------ */
8829 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8830  int blocktime = arg; /* argument is in microseconds */
8831 #if KMP_USE_MONITOR
8832  int bt_intervals;
8833 #endif
8834  kmp_int8 bt_set;
8835 
8836  __kmp_save_internal_controls(thread);
8837 
8838  /* Normalize and set blocktime for the teams */
8839  if (blocktime < KMP_MIN_BLOCKTIME)
8840  blocktime = KMP_MIN_BLOCKTIME;
8841  else if (blocktime > KMP_MAX_BLOCKTIME)
8842  blocktime = KMP_MAX_BLOCKTIME;
8843 
8844  set__blocktime_team(thread->th.th_team, tid, blocktime);
8845  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8846 
8847 #if KMP_USE_MONITOR
8848  /* Calculate and set blocktime intervals for the teams */
8849  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8850 
8851  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8852  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8853 #endif
8854 
8855  /* Record that blocktime has been explicitly set */
8856  bt_set = TRUE;
8857 
8858  set__bt_set_team(thread->th.th_team, tid, bt_set);
8859  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8860 #if KMP_USE_MONITOR
8861  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8862  "bt_intervals=%d, monitor_updates=%d\n",
8863  __kmp_gtid_from_tid(tid, thread->th.th_team),
8864  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8865  __kmp_monitor_wakeups));
8866 #else
8867  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8868  __kmp_gtid_from_tid(tid, thread->th.th_team),
8869  thread->th.th_team->t.t_id, tid, blocktime));
8870 #endif
8871 }
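// Usage sketch (assuming the usual kmp_set_blocktime entry point, whose
// argument units are converted as needed before reaching this routine):
//
//   kmp_set_blocktime(0); // 0 => idle workers go to sleep immediately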
8872 
8873 void __kmp_aux_set_defaults(char const *str, size_t len) {
8874  if (!__kmp_init_serial) {
8875  __kmp_serial_initialize();
8876  }
8877  __kmp_env_initialize(str);
8878 
8879  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8880  __kmp_env_print();
8881  }
8882 } // __kmp_aux_set_defaults
8883 
8884 /* ------------------------------------------------------------------------ */
8885 /* internal fast reduction routines */
8886 
8887 PACKED_REDUCTION_METHOD_T
8888 __kmp_determine_reduction_method(
8889  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8890  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8891  kmp_critical_name *lck) {
8892 
8893  // Default reduction method: critical construct ( lck != NULL, like in current
8894  // PAROPT )
8895  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8896  // can be selected by RTL
8897  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8898  // can be selected by RTL
8899  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8900  // among generated by PAROPT.
8901 
8902  PACKED_REDUCTION_METHOD_T retval;
8903 
8904  int team_size;
8905 
8906  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8907 
8908 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8909  (loc && \
8910  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8911 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8912 
8913  retval = critical_reduce_block;
8914 
8915  // another way of getting the team size (with 1 dynamic dereference) is slower
8916  team_size = __kmp_get_team_num_threads(global_tid);
8917  if (team_size == 1) {
8918 
8919  retval = empty_reduce_block;
8920 
8921  } else {
8922 
8923  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8924 
8925 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8926  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8927  KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8928 
8929 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8930  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU || \
8931  KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8932 
8933  int teamsize_cutoff = 4;
8934 
8935 #if KMP_MIC_SUPPORTED
8936  if (__kmp_mic_type != non_mic) {
8937  teamsize_cutoff = 8;
8938  }
8939 #endif
8940  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8941  if (tree_available) {
8942  if (team_size <= teamsize_cutoff) {
8943  if (atomic_available) {
8944  retval = atomic_reduce_block;
8945  }
8946  } else {
8947  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8948  }
8949  } else if (atomic_available) {
8950  retval = atomic_reduce_block;
8951  }
8952 #else
8953 #error "Unknown or unsupported OS"
8954 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8955  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU ||
8956  // KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8957 
8958 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8959  KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC
8960 
8961 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8962  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HAIKU || KMP_OS_HURD || \
8963  KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8964 
8965  // basic tuning
8966 
8967  if (atomic_available) {
8968  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8969  retval = atomic_reduce_block;
8970  }
8971  } // otherwise: use critical section
8972 
8973 #elif KMP_OS_DARWIN
8974 
8975  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8976  if (atomic_available && (num_vars <= 3)) {
8977  retval = atomic_reduce_block;
8978  } else if (tree_available) {
8979  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8980  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8981  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8982  }
8983  } // otherwise: use critical section
8984 
8985 #else
8986 #error "Unknown or unsupported OS"
8987 #endif
8988 
8989 #else
8990 #error "Unknown or unsupported architecture"
8991 #endif
8992  }
8993 
8994  // KMP_FORCE_REDUCTION
8995 
8996  // If the team is serialized (team_size == 1), ignore the forced reduction
8997  // method and stay with the unsynchronized method (empty_reduce_block)
8998  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8999  team_size != 1) {
9000 
9001  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9002 
9003  int atomic_available, tree_available;
9004 
9005  switch ((forced_retval = __kmp_force_reduction_method)) {
9006  case critical_reduce_block:
9007  KMP_ASSERT(lck); // lck should be != 0
9008  break;
9009 
9010  case atomic_reduce_block:
9011  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9012  if (!atomic_available) {
9013  KMP_WARNING(RedMethodNotSupported, "atomic");
9014  forced_retval = critical_reduce_block;
9015  }
9016  break;
9017 
9018  case tree_reduce_block:
9019  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9020  if (!tree_available) {
9021  KMP_WARNING(RedMethodNotSupported, "tree");
9022  forced_retval = critical_reduce_block;
9023  } else {
9024 #if KMP_FAST_REDUCTION_BARRIER
9025  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9026 #endif
9027  }
9028  break;
9029 
9030  default:
9031  KMP_ASSERT(0); // "unsupported method specified"
9032  }
9033 
9034  retval = forced_retval;
9035  }
9036 
9037  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9038 
9039 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9040 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9041 
9042  return (retval);
9043 }
9044 // this function is for testing set/get/determine reduce method
9045 kmp_int32 __kmp_get_reduce_method(void) {
9046  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9047 }
9048 
9049 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9050 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9051 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9052 
9053 // Hard pause shuts down the runtime completely. Resume happens naturally when
9054 // OpenMP is used subsequently.
9055 void __kmp_hard_pause() {
9056  __kmp_pause_status = kmp_hard_paused;
9057  __kmp_internal_end_thread(-1);
9058 }
9059 
9060 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9061 void __kmp_resume_if_soft_paused() {
9062  if (__kmp_pause_status == kmp_soft_paused) {
9063  __kmp_pause_status = kmp_not_paused;
9064 
9065  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9066  kmp_info_t *thread = __kmp_threads[gtid];
9067  if (thread) { // Wake it if sleeping
9068  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9069  thread);
9070  if (fl.is_sleeping())
9071  fl.resume(gtid);
9072  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9073  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9074  } else { // thread holds the lock and may sleep soon
9075  do { // until either the thread sleeps, or we can get the lock
9076  if (fl.is_sleeping()) {
9077  fl.resume(gtid);
9078  break;
9079  } else if (__kmp_try_suspend_mx(thread)) {
9080  __kmp_unlock_suspend_mx(thread);
9081  break;
9082  }
9083  } while (1);
9084  }
9085  }
9086  }
9087  }
9088 }
9089 
9090 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9091 // TODO: add warning messages
9092 int __kmp_pause_resource(kmp_pause_status_t level) {
9093  if (level == kmp_not_paused) { // requesting resume
9094  if (__kmp_pause_status == kmp_not_paused) {
9095  // error message about runtime not being paused, so can't resume
9096  return 1;
9097  } else {
9098  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9099  __kmp_pause_status == kmp_hard_paused);
9100  __kmp_pause_status = kmp_not_paused;
9101  return 0;
9102  }
9103  } else if (level == kmp_soft_paused) { // requesting soft pause
9104  if (__kmp_pause_status != kmp_not_paused) {
9105  // error message about already being paused
9106  return 1;
9107  } else {
9108  __kmp_soft_pause();
9109  return 0;
9110  }
9111  } else if (level == kmp_hard_paused || level == kmp_stop_tool_paused) {
9112  // requesting hard pause or stop_tool pause
9113  if (__kmp_pause_status != kmp_not_paused) {
9114  // error message about already being paused
9115  return 1;
9116  } else {
9117  __kmp_hard_pause();
9118  return 0;
9119  }
9120  } else {
9121  // error message about invalid level
9122  return 1;
9123  }
9124 }
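// Usage sketch: user code reaches this through the OpenMP 5.0 pause API:
//
//   omp_pause_resource_all(omp_pause_soft); // -> __kmp_soft_pause()
//   omp_pause_resource_all(omp_pause_hard); // -> __kmp_hard_pause()
//
// A nonzero return means the request is invalid in the current pause state.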
9125 
9126 void __kmp_omp_display_env(int verbose) {
9127  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9128  if (__kmp_init_serial == 0)
9129  __kmp_do_serial_initialize();
9130  __kmp_display_env_impl(!verbose, verbose);
9131  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9132 }
9133 
9134 // The team size is changing, so the distributed barrier must be modified
9135 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9136  int new_nthreads) {
9137  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9138  bp_dist_bar);
9139  kmp_info_t **other_threads = team->t.t_threads;
9140 
9141  // We want all the workers to stop waiting on the barrier while we adjust the
9142  // size of the team.
9143  for (int f = 1; f < old_nthreads; ++f) {
9144  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9145  // Ignore threads that are already inactive or not present in the team
9146  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9147  // teams construct causes thread_limit to get passed in, and some of
9148  // those could be inactive; just ignore them
9149  continue;
9150  }
9151  // If the thread is still transitioning to the in_use state, wait for it
9152  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9153  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9154  KMP_CPU_PAUSE();
9155  }
9156  // The thread should be in_use now
9157  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9158  // Transition to unused state
9159  team->t.t_threads[f]->th.th_used_in_team.store(2);
9160  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9161  }
9162  // Release all the workers
9163  team->t.b->go_release();
9164 
9165  KMP_MFENCE();
9166 
9167  // Workers should see transition status 2 and move to 0; but may need to be
9168  // woken up first
9169  int count = old_nthreads - 1;
9170  while (count > 0) {
9171  count = old_nthreads - 1;
9172  for (int f = 1; f < old_nthreads; ++f) {
9173  if (other_threads[f]->th.th_used_in_team.load() != 0) {
9174  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9175  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9176  void *, other_threads[f]->th.th_sleep_loc);
9177  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9178  }
9179  } else {
9180  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9181  count--;
9182  }
9183  }
9184  }
9185  // Now update the barrier size
9186  team->t.b->update_num_threads(new_nthreads);
9187  team->t.b->go_reset();
9188 }
9189 
9190 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9191  // Add the threads back to the team
9192  KMP_DEBUG_ASSERT(team);
9193  // Threads were paused and pointed at th_used_in_team temporarily during a
9194  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9195  // the thread that it should transition itself back into the team. Then, if
9196  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9197  // to wake it up.
9198  for (int f = 1; f < new_nthreads; ++f) {
9199  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9200  (void)KMP_COMPARE_AND_STORE_ACQ32(
9201  &(team->t.t_threads[f]->th.th_used_in_team), 0, 3);
9202  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9203  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9204  (kmp_flag_32<false, false> *)NULL);
9205  }
9206  }
9207  // The threads should be transitioning into the team; once done, each will
9208  // have set th_used_in_team to 1. This loop forces the primary thread to wait
9209  // until all threads have moved into the team and are waiting in the barrier.
9210  int count = new_nthreads - 1;
9211  while (count > 0) {
9212  count = new_nthreads - 1;
9213  for (int f = 1; f < new_nthreads; ++f) {
9214  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9215  count--;
9216  }
9217  }
9218  }
9219 }
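
// Editor's note: the th_used_in_team values manipulated by
// __kmp_resize_dist_barrier and __kmp_add_threads_to_team above form a small
// state machine. The runtime itself uses the raw integers; the enum below is
// purely illustrative, with hypothetical names.
enum kmp_used_in_team_state {
  th_unused = 0, // thread is not a member of the team
  th_in_use = 1, // thread is an active member of the team
  th_leaving = 2, // primary stored 2; the worker observes it and stores 0
  th_joining = 3 // primary CAS'd 0 -> 3; the worker rejoins and stores 1
};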
9220 
9221 // Globals and functions for hidden helper task
9222 kmp_info_t **__kmp_hidden_helper_threads;
9223 kmp_info_t *__kmp_hidden_helper_main_thread;
9224 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9225 #if KMP_OS_LINUX
9226 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9227 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9228 #else
9229 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9230 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9231 #endif
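
// Editor's note: the defaults above are normally overridden via environment
// variables; assuming the knobs documented for the LLVM runtime
// (LIBOMP_USE_HIDDEN_HELPER_TASK and LIBOMP_NUM_HIDDEN_HELPER_THREADS):
//
//   $ LIBOMP_NUM_HIDDEN_HELPER_THREADS=4 ./a.out  # resize the helper team
//   $ LIBOMP_USE_HIDDEN_HELPER_TASK=OFF ./a.out   # disable hidden helpers
//
// Hidden helper threads mainly serve deferred offload work, e.g.:
//
//   #pragma omp target nowait
//   { /* this deferred target task may run on a hidden helper thread */ }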
9232 
9233 namespace {
9234 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9235 
9236 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9237  // This is an explicit synchronization across all hidden helper threads: a
9238  // regular thread may push a hidden helper task to a hidden helper thread
9239  // that has not yet been woken even once since the main thread released the
9240  // workers after creating the team.
9241  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9242  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9243  __kmp_hidden_helper_threads_num)
9244  ;
9245 
9246  // If main thread, then wait for signal
9247  if (__kmpc_master(nullptr, *gtid)) {
9248  // First, unset the initial state and release the initial thread
9249  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9250  __kmp_hidden_helper_initz_release();
9251  __kmp_hidden_helper_main_thread_wait();
9252  // Now wake up all worker threads
9253  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9254  __kmp_hidden_helper_worker_thread_signal();
9255  }
9256  }
9257 }
9258 } // namespace
9259 
9260 void __kmp_hidden_helper_threads_initz_routine() {
9261  // Create a new root for hidden helper team/threads
9262  const int gtid = __kmp_register_root(TRUE);
9263  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9264  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9265  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9266  __kmp_hidden_helper_threads_num;
9267 
9268  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9269 
9270  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9271 
9272  // Set the initialization flag to FALSE
9273  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9274 
9275  __kmp_hidden_helper_threads_deinitz_release();
9276 }
9277 
9278 /* Nesting Mode:
9279  Set via KMP_NESTING_MODE, which takes an integer.
9280  Note: we skip duplicate topology levels, and skip levels with only
9281  one entity.
9282  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9283  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9284  in the topology, and initializes the number of threads at each of those
9285  levels to the number of entities at each level, respectively, below the
9286  entity at the parent level.
9287  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9288  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9289  the user to turn nesting on explicitly. This is an even more experimental
9290  option within an already experimental feature, and it may change or go away
9291  in the future.
9292 */
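
// Editor's sketch of the user-visible effect of KMP_NESTING_MODE=1 described
// above. The thread counts in the comments assume a hypothetical machine with
// 2 sockets and 8 cores per socket; real output depends on the detected
// topology.
//
//   $ KMP_NESTING_MODE=1 ./a.out
//
//   #include <omp.h>
//   #include <stdio.h>
//
//   int main(void) {
//   #pragma omp parallel // level 1: e.g. 2 threads, one per socket
//     {
//   #pragma omp parallel // level 2: e.g. 8 threads, one per core
//       {
//   #pragma omp master
//         printf("level %d: %d threads\n", omp_get_level(),
//                omp_get_num_threads());
//       }
//     }
//     return 0;
//   }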
9293 
9294 // Allocate space to store nesting levels
9295 void __kmp_init_nesting_mode() {
9296  int levels = KMP_HW_LAST;
9297  __kmp_nesting_mode_nlevels = levels;
9298  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9299  for (int i = 0; i < levels; ++i)
9300  __kmp_nesting_nth_level[i] = 0;
9301  if (__kmp_nested_nth.size < levels) {
9302  __kmp_nested_nth.nth =
9303  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9304  __kmp_nested_nth.size = levels;
9305  }
9306 }
9307 
9308 // Set # threads for top levels of nesting; must be called after topology set
9309 void __kmp_set_nesting_mode_threads() {
9310  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9311 
9312  if (__kmp_nesting_mode == 1)
9313  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9314  else if (__kmp_nesting_mode > 1)
9315  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9316 
9317  if (__kmp_topology) { // use topology info
9318  int loc, hw_level;
9319  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9320  loc < __kmp_nesting_mode_nlevels;
9321  loc++, hw_level++) {
9322  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9323  if (__kmp_nesting_nth_level[loc] == 1)
9324  loc--;
9325  }
9326  // Make sure all cores are used
9327  if (__kmp_nesting_mode > 1 && loc > 1) {
9328  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9329  int num_cores = __kmp_topology->get_count(core_level);
9330  int upper_levels = 1;
9331  for (int level = 0; level < loc - 1; ++level)
9332  upper_levels *= __kmp_nesting_nth_level[level];
9333  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9334  __kmp_nesting_nth_level[loc - 1] =
9335  num_cores / __kmp_nesting_nth_level[loc - 2];
9336  }
9337  __kmp_nesting_mode_nlevels = loc;
9338  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9339  } else { // no topology info available; provide a reasonable estimate
9340  if (__kmp_avail_proc >= 4) {
9341  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9342  __kmp_nesting_nth_level[1] = 2;
9343  __kmp_nesting_mode_nlevels = 2;
9344  } else {
9345  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9346  __kmp_nesting_mode_nlevels = 1;
9347  }
9348  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9349  }
9350  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9351  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9352  }
9353  set__nproc(thread, __kmp_nesting_nth_level[0]);
9354  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9355  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9356  if (get__max_active_levels(thread) > 1) {
9357  // if max levels was set, set nesting mode levels to same
9358  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9359  }
9360  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9361  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9362 }
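
// Editor's worked example for the topology path above, assuming a
// hypothetical machine with 2 sockets, 8 cores per socket, and 2 hardware
// threads per core (topology ratios {2, 8, 2}):
//   __kmp_nesting_nth_level = {2, 8, 2}, loc ends at 3
// A level with ratio 1 (say, on a single-socket machine) is dropped by the
// loc-- above, so {1, 8, 2} collapses to {8, 2}. The "all cores used" fixup
// then computes upper_levels = 2 * 8 = 16 and tests
// 16 * __kmp_nesting_nth_level[2] = 32 < num_cores = 16, which is false, so
// the levels are left unchanged.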
9363 
9364 #if ENABLE_LIBOMPTARGET
9365 void (*kmp_target_sync_cb)(ident_t *loc_ref, int gtid, void *current_task,
9366  void *event) = NULL;
9367 void __kmp_target_init() {
9368  // Look for hooks in the libomptarget library
9369  *(void **)(&kmp_target_sync_cb) = KMP_DLSYM("__tgt_target_sync");
9370 }
9371 #endif // ENABLE_LIBOMPTARGET
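
// Editor's note: kmp_target_sync_cb stays NULL when libomptarget is not
// loaded or does not export __tgt_target_sync, so any call site must guard
// it. A hypothetical call site:
//
//   if (kmp_target_sync_cb != NULL)
//     (*kmp_target_sync_cb)(loc_ref, gtid, current_task, event);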
9372 
9373 // Empty symbols to export (see exports_so.txt) when feature is disabled
9374 extern "C" {
9375 #if !KMP_STATS_ENABLED
9376 void __kmp_reset_stats() {}
9377 #endif
9378 #if !USE_DEBUGGER
9379 int __kmp_omp_debug_struct_info = FALSE;
9380 int __kmp_debugging = FALSE;
9381 #endif
9382 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9383 void __kmp_itt_fini_ittlib() {}
9384 void __kmp_itt_init_ittlib() {}
9385 #endif
9386 }
9387 
9388 // end of file