LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files because it doesn't use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111  int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118  int i;
119  kmp_info_t **other_threads;
120  size_t stack_data;
121  char *stack_addr;
122  size_t stack_size;
123  char *stack_base;
124 
125  KA_TRACE(
126  1000,
127  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128  __kmp_nth, __kmp_all_nth));
129 
130  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131  a parallel region, this returns KMP_GTID_DNE to force serial_initialize by
132  the caller. Callers must either handle KMP_GTID_DNE at every call site or
133  guarantee __kmp_init_gtid for this to work. */
134 
135  if (!TCR_4(__kmp_init_gtid))
136  return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139  if (TCR_4(__kmp_gtid_mode) >= 3) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141  return __kmp_gtid;
142  }
143 #endif
144  if (TCR_4(__kmp_gtid_mode) >= 2) {
145  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146  return __kmp_gtid_get_specific();
147  }
148  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150  stack_addr = (char *)&stack_data;
151  other_threads = __kmp_threads;
152 
153  /* ATT: The code below is a source of potential bugs due to unsynchronized
154  access to __kmp_threads array. For example:
155  1. Current thread loads other_threads[i] to thr and checks it, it is
156  non-NULL.
157  2. Current thread is suspended by OS.
158  3. Another thread unregisters and finishes (debug versions of free()
159  may fill memory with something like 0xEF).
160  4. Current thread is resumed.
161  5. Current thread reads junk from *thr.
162  TODO: Fix it. --ln */
163 
164  for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if (!thr)
168  continue;
169 
170  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173  /* stack grows down -- search through all of the active threads */
174 
175  if (stack_addr <= stack_base) {
176  size_t stack_diff = stack_base - stack_addr;
177 
178  if (stack_diff <= stack_size) {
179  /* The only way we can be closer than the allocated */
180  /* stack size is if we are running on this thread. */
181  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182  return i;
183  }
184  }
185  }
186 
187  /* get specific to try and determine our gtid */
188  KA_TRACE(1000,
189  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190  "thread, using TLS\n"));
191  i = __kmp_gtid_get_specific();
192 
193  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194 
195  /* if we haven't been assigned a gtid, then return the code */
196  if (i < 0)
197  return i;
198 
199  /* dynamically updated stack window for uber threads to avoid get_specific
200  call */
201  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202  KMP_FATAL(StackOverflow, i);
203  }
204 
205  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206  if (stack_addr > stack_base) {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210  stack_base);
211  } else {
212  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213  stack_base - stack_addr);
214  }
215 
216  /* Reprint stack bounds for ubermaster since they have been refined */
217  if (__kmp_storage_map) {
218  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221  other_threads[i]->th.th_info.ds.ds_stacksize,
222  "th_%d stack (refinement)", i);
223  }
224  return i;
225 }
226 
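/* Editor's note -- illustrative sketch only, not part of the runtime. The
   internal algorithm above identifies the calling thread by testing whether
   the address of a local variable lies inside a registered thread's stack.
   Assuming a downward-growing stack where ds_stackbase is the highest
   address, the membership test reduces to:

     #include <cstddef>
     // true if 'addr' lies within the stack [base - size, base]
     static bool example_addr_in_stack(const char *addr, const char *base,
                                       std::size_t size) {
       // the stack grows down, so 'base' is the highest address
       return addr <= base && (std::size_t)(base - addr) <= size;
     }
*/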
227 int __kmp_get_global_thread_id_reg() {
228  int gtid;
229 
230  if (!__kmp_init_serial) {
231  gtid = KMP_GTID_DNE;
232  } else
233 #ifdef KMP_TDATA_GTID
234  if (TCR_4(__kmp_gtid_mode) >= 3) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236  gtid = __kmp_gtid;
237  } else
238 #endif
239  if (TCR_4(__kmp_gtid_mode) >= 2) {
240  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  KA_TRACE(1000,
244  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245  gtid = __kmp_get_global_thread_id();
246  }
247 
248  /* we must be a new uber master sibling thread */
249  if (gtid == KMP_GTID_DNE) {
250  KA_TRACE(10,
251  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252  "Registering a new gtid.\n"));
253  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254  if (!__kmp_init_serial) {
255  __kmp_do_serial_initialize();
256  gtid = __kmp_gtid_get_specific();
257  } else {
258  gtid = __kmp_register_root(FALSE);
259  }
260  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262  }
263 
264  KMP_DEBUG_ASSERT(gtid >= 0);
265 
266  return gtid;
267 }
268 
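/* Editor's note -- illustrative usage sketch (hypothetical caller), not part
   of the runtime. Callers that need a valid gtid use the _reg variant and
   rely on it to lazily initialize the library or register a new root:

     static void example_entry_point(void) {
       int gtid = __kmp_get_global_thread_id_reg(); // never KMP_GTID_DNE
       (void)gtid; // gtid can now index __kmp_threads, etc.
     }
*/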
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271  int f;
272  char *stack_beg = NULL;
273  char *stack_end = NULL;
274  int gtid;
275 
276  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277  if (__kmp_storage_map) {
278  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281  gtid = __kmp_gtid_from_thread(th);
282 
283  if (gtid == KMP_GTID_MONITOR) {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%s stack (%s)", "mon",
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  } else {
289  __kmp_print_storage_map_gtid(
290  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291  "th_%d stack (%s)", gtid,
292  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293  }
294  }
295 
296  /* No point in checking ubermaster threads since they use refinement and
297  * cannot overlap */
298  gtid = __kmp_gtid_from_thread(th);
299  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300  KA_TRACE(10,
301  ("__kmp_check_stack_overlap: performing extensive checking\n"));
302  if (stack_beg == NULL) {
303  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305  }
306 
307  for (f = 0; f < __kmp_threads_capacity; f++) {
308  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310  if (f_th && f_th != th) {
311  char *other_stack_end =
312  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313  char *other_stack_beg =
314  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318  /* Print the other stack values before the abort */
319  if (__kmp_storage_map)
320  __kmp_print_storage_map_gtid(
321  -1, other_stack_beg, other_stack_end,
322  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326  __kmp_msg_null);
327  }
328  }
329  }
330  }
331  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
333 
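/* Editor's note -- illustrative sketch, not part of the runtime. The overlap
   check above tests whether either end of this thread's stack falls strictly
   inside another thread's stack; the general form of the interval test is:

     // half-open ranges [b1, e1) and [b2, e2) overlap iff b1 < e2 && b2 < e1
     static bool example_ranges_overlap(const char *b1, const char *e1,
                                        const char *b2, const char *e2) {
       return b1 < e2 && b2 < e1;
     }
*/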
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337  static int done = FALSE;
338 
339  while (!done) {
340  KMP_YIELD(TRUE);
341  }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347  char const *format, ...) {
348  char buffer[MAX_MESSAGE];
349  va_list ap;
350 
351  va_start(ap, format);
352  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353  p2, (unsigned long)size, format);
354  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355  __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357  int node;
358  if (gtid >= 0) {
359  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360  if (__kmp_storage_map_verbose) {
361  node = __kmp_get_host_node(p1);
362  if (node < 0) /* doesn't work, so don't try this next time */
363  __kmp_storage_map_verbose = FALSE;
364  else {
365  char *last;
366  int lastNode;
367  int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369  const int page_size = KMP_GET_PAGE_SIZE();
370 
371  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373  if (localProc >= 0)
374  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375  localProc >> 1);
376  else
377  __kmp_printf_no_lock(" GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379  /* The more elaborate format is disabled for now because of the prctl
380  * hanging bug. */
381  do {
382  last = p1;
383  lastNode = node;
384  /* This loop collates adjacent pages with the same host node. */
385  do {
386  (char *)p1 += page_size;
387  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389  lastNode);
390  } while (p1 <= p2);
391 #else
392  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393  (char *)p1 + (page_size - 1),
394  __kmp_get_host_node(p1));
395  if (p1 < p2) {
396  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397  (char *)p2 + (page_size - 1),
398  __kmp_get_host_node(p2));
399  }
400 #endif
401  }
402  }
403  } else
404  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405  }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
409 
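/* Editor's note -- usage example taken from __kmp_print_thread_storage_map
   below; the trailing printf-style arguments fill in the caller's format:

     __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t),
                                  "th_%d", gtid);
*/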
410 void __kmp_warn(char const *format, ...) {
411  char buffer[MAX_MESSAGE];
412  va_list ap;
413 
414  if (__kmp_generate_warnings == kmp_warnings_off) {
415  return;
416  }
417 
418  va_start(ap, format);
419 
420  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422  __kmp_vprintf(kmp_err, buffer, ap);
423  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425  va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429  // Later threads may stall here, but that's ok because abort() will kill them.
430  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432  if (__kmp_debug_buf) {
433  __kmp_dump_debug_buffer();
434  }
435 
436  if (KMP_OS_WINDOWS) {
437  // Let other threads know of abnormal termination and prevent deadlock
438  // if abort happened during library initialization or shutdown
439  __kmp_global.g.g_abort = SIGABRT;
440 
441  /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
442  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443  boxes. _set_abort_behavior() works well, but this function is not
444  available in VS7 (this is not a problem for the DLL, but it is a problem
445  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
446  does not help, at least in some versions of the MS C RTL.
447 
448  It seems the following sequence is the only way to simulate abort() and
449  avoid the pop-up error box. */
450  raise(SIGABRT);
451  _exit(3); // Just in case, if signal ignored, exit anyway.
452  } else {
453  __kmp_unregister_library();
454  abort();
455  }
456 
457  __kmp_infinite_loop();
458  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463  // TODO: Eliminate g_abort global variable and this function.
464  // In case of abort just call abort(), it will kill all the threads.
465  __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469  that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473  gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479  sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481  __kmp_print_storage_map_gtid(
482  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486  &thr->th.th_bar[bs_plain_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488  gtid);
489 
490  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491  &thr->th.th_bar[bs_forkjoin_barrier + 1],
492  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493  gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497  &thr->th.th_bar[bs_reduction_barrier + 1],
498  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499  gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504  that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507  int team_id, int num_thr) {
508  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513  &team->t.t_bar[bs_last_barrier],
514  sizeof(kmp_balign_team_t) * bs_last_barrier,
515  "%s_%d.t_bar", header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518  &team->t.t_bar[bs_plain_barrier + 1],
519  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520  header, team_id);
521 
522  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523  &team->t.t_bar[bs_forkjoin_barrier + 1],
524  sizeof(kmp_balign_team_t),
525  "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529  &team->t.t_bar[bs_reduction_barrier + 1],
530  sizeof(kmp_balign_team_t),
531  "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534  __kmp_print_storage_map_gtid(
535  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538  __kmp_print_storage_map_gtid(
539  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543  &team->t.t_disp_buffer[num_disp_buff],
544  sizeof(dispatch_shared_info_t) * num_disp_buff,
545  "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549  __kmp_init_memkind();
550  __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562  switch (fdwReason) {
563 
564  case DLL_PROCESS_ATTACH:
565  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567  return TRUE;
568 
569  case DLL_PROCESS_DETACH:
570  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572  // According to Windows* documentation for DllMain entry point:
573  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574  // lpReserved == NULL when FreeLibrary() is called,
575  // lpReserved != NULL when the process is terminated.
576  // When FreeLibrary() is called, worker threads remain alive. So the
577  // runtime's state is consistent and executing proper shutdown is OK.
578  // When the process is terminated, worker threads have exited or been
579  // forcefully terminated by the OS and only the shutdown thread remains.
580  // This can leave the runtime in an inconsistent state.
581  // Hence, only attempt proper cleanup when FreeLibrary() is called.
582  // Otherwise, rely on OS to reclaim resources.
583  if (lpReserved == NULL)
584  __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586  return TRUE;
587 
588  case DLL_THREAD_ATTACH:
589  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591  /* if we want to register new siblings all the time, call
592  * __kmp_get_gtid() here; */
593  return TRUE;
594 
595  case DLL_THREAD_DETACH:
596  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598  __kmp_internal_end_thread(__kmp_gtid_get_specific());
599  return TRUE;
600  }
601 
602  return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610  int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612  kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615  if (__kmp_env_consistency_check) {
616  if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622  }
623 #ifdef BUILD_PARALLEL_ORDERED
624  if (!team->t.t_serialized) {
625  KMP_MB();
626  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627  NULL);
628  KMP_MB();
629  }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635  int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637  int tid = __kmp_tid_from_gtid(gtid);
638  kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641  if (__kmp_env_consistency_check) {
642  if (__kmp_threads[gtid]->th.th_root->r.r_active)
643  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644  }
645 #ifdef BUILD_PARALLEL_ORDERED
646  if (!team->t.t_serialized) {
647  KMP_MB(); /* Flush all pending memory write invalidates. */
648 
649  /* use the tid of the next thread in this team */
650  /* TODO replace with general release procedure */
651  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653  KMP_MB(); /* Flush all pending memory write invalidates. */
654  }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
657 
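/* Editor's note -- minimal sketch (assumed names) of the ordered handshake
   implemented by __kmp_parallel_deo/__kmp_parallel_dxo: each thread waits
   until the shared counter equals its tid, then hands it to the next tid.

     static void example_ordered_enter(volatile int *ticket, int tid) {
       while (*ticket != tid) { } // spin; the runtime uses KMP_WAIT instead
     }
     static void example_ordered_exit(volatile int *ticket, int tid, int nproc) {
       *ticket = (tid + 1) % nproc; // release the next thread in the team
     }
*/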
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662  int status;
663  kmp_info_t *th;
664  kmp_team_t *team;
665 
666  if (!TCR_4(__kmp_init_parallel))
667  __kmp_parallel_initialize();
668  __kmp_resume_if_soft_paused();
669 
670  th = __kmp_threads[gtid];
671  team = th->th.th_team;
672  status = 0;
673 
674  th->th.th_ident = id_ref;
675 
676  if (team->t.t_serialized) {
677  status = 1;
678  } else {
679  kmp_int32 old_this = th->th.th_local.this_construct;
680 
681  ++th->th.th_local.this_construct;
682  /* try to set team count to thread count--success means thread got the
683  single block */
684  /* TODO: Should this be acquire or release? */
685  if (team->t.t_construct == old_this) {
686  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687  th->th.th_local.this_construct);
688  }
689 #if USE_ITT_BUILD
690  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692  team->t.t_active_level == 1) {
693  // Only report metadata by primary thread of active team at level 1
694  __kmp_itt_metadata_single(id_ref);
695  }
696 #endif /* USE_ITT_BUILD */
697  }
698 
699  if (__kmp_env_consistency_check) {
700  if (status && push_ws) {
701  __kmp_push_workshare(gtid, ct_psingle, id_ref);
702  } else {
703  __kmp_check_workshare(gtid, ct_psingle, id_ref);
704  }
705  }
706 #if USE_ITT_BUILD
707  if (status) {
708  __kmp_itt_single_start(gtid);
709  }
710 #endif /* USE_ITT_BUILD */
711  return status;
712 }
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716  __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718  if (__kmp_env_consistency_check)
719  __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
721 
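/* Editor's note -- illustrative sketch, not the runtime's code. The single
   construct above is decided by an atomic compare-and-store on the team's
   construct counter: the one thread that advances it executes the block.

     #include <atomic>
     static bool example_enter_single(std::atomic<int> *construct, int seen) {
       int next = seen + 1;
       // only the first thread to move 'construct' from 'seen' wins
       return construct->compare_exchange_strong(seen, next,
                                                 std::memory_order_acquire);
     }
*/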
722 /* Determine whether we can go parallel or must use a serialized parallel
723  * region, and how many threads we can use.
724  * set_nthreads is the number of threads requested for the team.
725  * Returns 1 if we should serialize or only use one thread,
726  * otherwise the number of threads to use.
727  * The forkjoin lock is held by the caller. */
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729  int master_tid, int set_nthreads,
730  int enter_teams) {
731  int capacity;
732  int new_nthreads;
733  KMP_DEBUG_ASSERT(__kmp_init_serial);
734  KMP_DEBUG_ASSERT(root && parent_team);
735  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737  // If dyn-var is set, dynamically adjust the number of desired threads,
738  // according to the method specified by dynamic_mode.
739  new_nthreads = set_nthreads;
740  if (!get__dynamic_2(parent_team, master_tid)) {
741  ;
742  }
743 #ifdef USE_LOAD_BALANCE
744  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746  if (new_nthreads == 1) {
747  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748  "reservation to 1 thread\n",
749  master_tid));
750  return 1;
751  }
752  if (new_nthreads < set_nthreads) {
753  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754  "reservation to %d threads\n",
755  master_tid, new_nthreads));
756  }
757  }
758 #endif /* USE_LOAD_BALANCE */
759  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760  new_nthreads = __kmp_avail_proc - __kmp_nth +
761  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762  if (new_nthreads <= 1) {
763  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764  "reservation to 1 thread\n",
765  master_tid));
766  return 1;
767  }
768  if (new_nthreads < set_nthreads) {
769  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770  "reservation to %d threads\n",
771  master_tid, new_nthreads));
772  } else {
773  new_nthreads = set_nthreads;
774  }
775  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776  if (set_nthreads > 2) {
777  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778  new_nthreads = (new_nthreads % set_nthreads) + 1;
779  if (new_nthreads == 1) {
780  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781  "reservation to 1 thread\n",
782  master_tid));
783  return 1;
784  }
785  if (new_nthreads < set_nthreads) {
786  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787  "reservation to %d threads\n",
788  master_tid, new_nthreads));
789  }
790  }
791  } else {
792  KMP_ASSERT(0);
793  }
794 
795  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796  if (__kmp_nth + new_nthreads -
797  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798  __kmp_max_nth) {
799  int tl_nthreads = __kmp_max_nth - __kmp_nth +
800  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801  if (tl_nthreads <= 0) {
802  tl_nthreads = 1;
803  }
804 
805  // If dyn-var is false, emit a 1-time warning.
806  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807  __kmp_reserve_warn = 1;
808  __kmp_msg(kmp_ms_warning,
809  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811  }
812  if (tl_nthreads == 1) {
813  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814  "reduced reservation to 1 thread\n",
815  master_tid));
816  return 1;
817  }
818  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819  "reservation to %d threads\n",
820  master_tid, tl_nthreads));
821  new_nthreads = tl_nthreads;
822  }
823 
824  // Respect OMP_THREAD_LIMIT
825  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827  if (cg_nthreads + new_nthreads -
828  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829  max_cg_threads) {
830  int tl_nthreads = max_cg_threads - cg_nthreads +
831  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832  if (tl_nthreads <= 0) {
833  tl_nthreads = 1;
834  }
835 
836  // If dyn-var is false, emit a 1-time warning.
837  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838  __kmp_reserve_warn = 1;
839  __kmp_msg(kmp_ms_warning,
840  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842  }
843  if (tl_nthreads == 1) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845  "reduced reservation to 1 thread\n",
846  master_tid));
847  return 1;
848  }
849  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850  "reservation to %d threads\n",
851  master_tid, tl_nthreads));
852  new_nthreads = tl_nthreads;
853  }
854 
855  // Check if the threads array is large enough, or needs expanding.
856  // See comment in __kmp_register_root() about the adjustment if
857  // __kmp_threads[0] == NULL.
858  capacity = __kmp_threads_capacity;
859  if (TCR_PTR(__kmp_threads[0]) == NULL) {
860  --capacity;
861  }
862  // If it is not for initializing the hidden helper team, we need to take
863  // __kmp_hidden_helper_threads_num out of the capacity because it is included
864  // in __kmp_threads_capacity.
865  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866  capacity -= __kmp_hidden_helper_threads_num;
867  }
868  if (__kmp_nth + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870  capacity) {
871  // Expand the threads array.
872  int slotsRequired = __kmp_nth + new_nthreads -
873  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874  capacity;
875  int slotsAdded = __kmp_expand_threads(slotsRequired);
876  if (slotsAdded < slotsRequired) {
877  // The threads array was not expanded enough.
878  new_nthreads -= (slotsRequired - slotsAdded);
879  KMP_ASSERT(new_nthreads >= 1);
880 
881  // If dyn-var is false, emit a 1-time warning.
882  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883  __kmp_reserve_warn = 1;
884  if (__kmp_tp_cached) {
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889  } else {
890  __kmp_msg(kmp_ms_warning,
891  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893  }
894  }
895  }
896  }
897 
898 #ifdef KMP_DEBUG
899  if (new_nthreads == 1) {
900  KC_TRACE(10,
901  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902  "dead roots and rechecking; requested %d threads\n",
903  __kmp_get_gtid(), set_nthreads));
904  } else {
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906  " %d threads\n",
907  __kmp_get_gtid(), new_nthreads, set_nthreads));
908  }
909 #endif // KMP_DEBUG
910  return new_nthreads;
911 }
912 
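/* Editor's note -- worked example (made-up numbers) of the device thread
   limit clipping above:

     tl_nthreads = __kmp_max_nth - __kmp_nth
                   + (root active ? 1 : hot_team_nproc)

   e.g. with __kmp_max_nth = 8, __kmp_nth = 4, an active root (contributes 1)
   and set_nthreads = 8, the reservation is clipped to 8 - 4 + 1 = 5 threads,
   and a one-time warning is issued if dyn-var is false. */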
913 /* Allocate threads from the thread pool and assign them to the new team. We are
914  assured that there are enough threads available, because we checked on that
915  earlier while holding the forkjoin lock. */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917  kmp_info_t *master_th, int master_gtid,
918  int fork_teams_workers) {
919  int i;
920  int use_hot_team;
921 
922  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924  KMP_MB();
925 
926  /* first, let's set up the primary thread */
927  master_th->th.th_info.ds.ds_tid = 0;
928  master_th->th.th_team = team;
929  master_th->th.th_team_nproc = team->t.t_nproc;
930  master_th->th.th_team_master = master_th;
931  master_th->th.th_team_serialized = FALSE;
932  master_th->th.th_dispatch = &team->t.t_dispatch[0];
933 
934 /* make sure we are not the optimized hot team */
935 #if KMP_NESTED_HOT_TEAMS
936  use_hot_team = 0;
937  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938  if (hot_teams) { // hot teams array is not allocated if
939  // KMP_HOT_TEAMS_MAX_LEVEL=0
940  int level = team->t.t_active_level - 1; // index in array of hot teams
941  if (master_th->th.th_teams_microtask) { // are we inside the teams?
942  if (master_th->th.th_teams_size.nteams > 1) {
943  ++level; // level was not increased in teams construct for
944  // team_of_masters
945  }
946  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947  master_th->th.th_teams_level == team->t.t_level) {
948  ++level; // level was not increased in teams construct for
949  // team_of_workers before the parallel
950  } // team->t.t_level will be increased inside parallel
951  }
952  if (level < __kmp_hot_teams_max_level) {
953  if (hot_teams[level].hot_team) {
954  // hot team has already been allocated for given level
955  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956  use_hot_team = 1; // the team is ready to use
957  } else {
958  use_hot_team = 0; // AC: threads are not allocated yet
959  hot_teams[level].hot_team = team; // remember new hot team
960  hot_teams[level].hot_team_nth = team->t.t_nproc;
961  }
962  } else {
963  use_hot_team = 0;
964  }
965  }
966 #else
967  use_hot_team = team == root->r.r_hot_team;
968 #endif
969  if (!use_hot_team) {
970 
971  /* install the primary thread */
972  team->t.t_threads[0] = master_th;
973  __kmp_initialize_info(master_th, team, 0, master_gtid);
974 
975  /* now, install the worker threads */
976  for (i = 1; i < team->t.t_nproc; i++) {
977 
978  /* fork or reallocate a new thread and install it in team */
979  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980  team->t.t_threads[i] = thr;
981  KMP_DEBUG_ASSERT(thr);
982  KMP_DEBUG_ASSERT(thr->th.th_team == team);
983  /* align team and thread arrived states */
984  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985  "T#%d(%d:%d) join =%llu, plain=%llu\n",
986  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989  team->t.t_bar[bs_plain_barrier].b_arrived));
990  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991  thr->th.th_teams_level = master_th->th.th_teams_level;
992  thr->th.th_teams_size = master_th->th.th_teams_size;
993  { // Initialize threads' barrier data.
994  int b;
995  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996  for (b = 0; b < bs_last_barrier; ++b) {
997  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999 #if USE_DEBUGGER
1000  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001 #endif
1002  }
1003  }
1004  }
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007  // Do not partition the places list for teams construct workers who
1008  // haven't actually been forked to do real work yet. This partitioning
1009  // will take place in the parallel region nested within the teams construct.
1010  if (!fork_teams_workers) {
1011  __kmp_partition_places(team);
1012  }
1013 #endif
1014 
1015  if (team->t.t_nproc > 1 &&
1016  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1017  team->t.b->update_num_threads(team->t.t_nproc);
1018  __kmp_add_threads_to_team(team, team->t.t_nproc);
1019  }
1020  }
1021 
1022  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1023  for (i = 0; i < team->t.t_nproc; i++) {
1024  kmp_info_t *thr = team->t.t_threads[i];
1025  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1026  thr->th.th_prev_level != team->t.t_level) {
1027  team->t.t_display_affinity = 1;
1028  break;
1029  }
1030  }
1031  }
1032 
1033  KMP_MB();
1034 }
1035 
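/* Editor's note -- illustrative sketch, not part of the runtime. When a
   worker is installed into an existing team above, its per-barrier arrived
   counters are aligned with the team's so that it joins the next barrier at
   the same count:

     static void example_align_arrived(kmp_uint64 *worker_arrived,
                                       const kmp_uint64 *team_arrived,
                                       int nbar) {
       for (int b = 0; b < nbar; ++b)
         worker_arrived[b] = team_arrived[b];
     }
*/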
1036 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1037 // Propagate any changes to the floating point control registers out to the team
1038 // We try to avoid unnecessary writes to the relevant cache line in the team
1039 // structure, so we don't make changes unless they are needed.
1040 inline static void propagateFPControl(kmp_team_t *team) {
1041  if (__kmp_inherit_fp_control) {
1042  kmp_int16 x87_fpu_control_word;
1043  kmp_uint32 mxcsr;
1044 
1045  // Get primary thread's values of FPU control flags (both X87 and vector)
1046  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1047  __kmp_store_mxcsr(&mxcsr);
1048  mxcsr &= KMP_X86_MXCSR_MASK;
1049 
1050  // There is no point looking at t_fp_control_saved here.
1051  // If it is TRUE, we still have to update the values if they are different
1052  // from those we now have. If it is FALSE we didn't save anything yet, but
1053  // our objective is the same. We have to ensure that the values in the team
1054  // are the same as those we have.
1055  // So, this code achieves what we need whether or not t_fp_control_saved is
1056  // true. By checking whether the value needs updating we avoid unnecessary
1057  // writes that would put the cache-line into a written state, causing all
1058  // threads in the team to have to read it again.
1059  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1060  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1061  // Although we don't use this value, other code in the runtime wants to know
1062  // whether it should restore them. So we must ensure it is correct.
1063  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1064  } else {
1065  // Similarly here. Don't write to this cache-line in the team structure
1066  // unless we have to.
1067  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1068  }
1069 }
1070 
1071 // Do the opposite, setting the hardware registers to the updated values from
1072 // the team.
1073 inline static void updateHWFPControl(kmp_team_t *team) {
1074  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1075  // Only reset the fp control regs if they have been changed in the team
1076  // during the parallel region that we are exiting.
1077  kmp_int16 x87_fpu_control_word;
1078  kmp_uint32 mxcsr;
1079  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1080  __kmp_store_mxcsr(&mxcsr);
1081  mxcsr &= KMP_X86_MXCSR_MASK;
1082 
1083  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1084  __kmp_clear_x87_fpu_status_word();
1085  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1086  }
1087 
1088  if (team->t.t_mxcsr != mxcsr) {
1089  __kmp_load_mxcsr(&team->t.t_mxcsr);
1090  }
1091  }
1092 }
1093 #else
1094 #define propagateFPControl(x) ((void)0)
1095 #define updateHWFPControl(x) ((void)0)
1096 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1097 
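/* Editor's note -- sketch of the check-before-update idiom used via
   KMP_CHECK_UPDATE above: write a shared field only when the value actually
   changes, so an unchanged team cache line is not pulled into modified state.

     template <typename T> static inline void example_check_update(T &dst, T src) {
       if (dst != src)
         dst = src;
     }
*/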
1098 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1099  int realloc); // forward declaration
1100 
1101 /* Run a parallel region that has been serialized, so it runs only in a team
1102  of the single primary thread. */
1103 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1104  kmp_info_t *this_thr;
1105  kmp_team_t *serial_team;
1106 
1107  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1108 
1109  /* Skip all this code for autopar serialized loops since it results in
1110  unacceptable overhead */
1111  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1112  return;
1113 
1114  if (!TCR_4(__kmp_init_parallel))
1115  __kmp_parallel_initialize();
1116  __kmp_resume_if_soft_paused();
1117 
1118  this_thr = __kmp_threads[global_tid];
1119  serial_team = this_thr->th.th_serial_team;
1120 
1121  /* utilize the serialized team held by this thread */
1122  KMP_DEBUG_ASSERT(serial_team);
1123  KMP_MB();
1124 
1125  if (__kmp_tasking_mode != tskm_immediate_exec) {
1126  KMP_DEBUG_ASSERT(
1127  this_thr->th.th_task_team ==
1128  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1129  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1130  NULL);
1131  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1132  "team %p, new task_team = NULL\n",
1133  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1134  this_thr->th.th_task_team = NULL;
1135  }
1136 
1137  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1138  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1139  proc_bind = proc_bind_false;
1140  } else if (proc_bind == proc_bind_default) {
1141  // No proc_bind clause was specified, so use the current value
1142  // of proc-bind-var for this parallel region.
1143  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1144  }
1145  // Reset for next parallel region
1146  this_thr->th.th_set_proc_bind = proc_bind_default;
1147 
1148 #if OMPT_SUPPORT
1149  ompt_data_t ompt_parallel_data = ompt_data_none;
1150  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1151  if (ompt_enabled.enabled &&
1152  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1153 
1154  ompt_task_info_t *parent_task_info;
1155  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1156 
1157  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1158  if (ompt_enabled.ompt_callback_parallel_begin) {
1159  int team_size = 1;
1160 
1161  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1162  &(parent_task_info->task_data), &(parent_task_info->frame),
1163  &ompt_parallel_data, team_size,
1164  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1165  }
1166  }
1167 #endif // OMPT_SUPPORT
1168 
1169  if (this_thr->th.th_team != serial_team) {
1170  // Nested level will be an index in the nested nthreads array
1171  int level = this_thr->th.th_team->t.t_level;
1172 
1173  if (serial_team->t.t_serialized) {
1174  /* this serial team was already used
1175  TODO: increase performance by making these locks more specific */
1176  kmp_team_t *new_team;
1177 
1178  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1179 
1180  new_team =
1181  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1182 #if OMPT_SUPPORT
1183  ompt_parallel_data,
1184 #endif
1185  proc_bind, &this_thr->th.th_current_task->td_icvs,
1186  0 USE_NESTED_HOT_ARG(NULL));
1187  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1188  KMP_ASSERT(new_team);
1189 
1190  /* setup new serialized team and install it */
1191  new_team->t.t_threads[0] = this_thr;
1192  new_team->t.t_parent = this_thr->th.th_team;
1193  serial_team = new_team;
1194  this_thr->th.th_serial_team = serial_team;
1195 
1196  KF_TRACE(
1197  10,
1198  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1199  global_tid, serial_team));
1200 
1201  /* TODO the above breaks the requirement that if we run out of resources,
1202  then we can still guarantee that serialized teams are ok, since we may
1203  need to allocate a new one */
1204  } else {
1205  KF_TRACE(
1206  10,
1207  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1208  global_tid, serial_team));
1209  }
1210 
1211  /* we have to initialize this serial team */
1212  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1213  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1214  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1215  serial_team->t.t_ident = loc;
1216  serial_team->t.t_serialized = 1;
1217  serial_team->t.t_nproc = 1;
1218  serial_team->t.t_parent = this_thr->th.th_team;
1219  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1220  this_thr->th.th_team = serial_team;
1221  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1222 
1223  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1224  this_thr->th.th_current_task));
1225  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1226  this_thr->th.th_current_task->td_flags.executing = 0;
1227 
1228  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1229 
1230  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1231  implicit task for each serialized task represented by
1232  team->t.t_serialized? */
1233  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1234  &this_thr->th.th_current_task->td_parent->td_icvs);
1235 
1236  // Thread value exists in the nested nthreads array for the next nested
1237  // level
1238  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1239  this_thr->th.th_current_task->td_icvs.nproc =
1240  __kmp_nested_nth.nth[level + 1];
1241  }
1242 
1243  if (__kmp_nested_proc_bind.used &&
1244  (level + 1 < __kmp_nested_proc_bind.used)) {
1245  this_thr->th.th_current_task->td_icvs.proc_bind =
1246  __kmp_nested_proc_bind.bind_types[level + 1];
1247  }
1248 
1249 #if USE_DEBUGGER
1250  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1251 #endif
1252  this_thr->th.th_info.ds.ds_tid = 0;
1253 
1254  /* set thread cache values */
1255  this_thr->th.th_team_nproc = 1;
1256  this_thr->th.th_team_master = this_thr;
1257  this_thr->th.th_team_serialized = 1;
1258 
1259  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1260  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1261  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1262 
1263  propagateFPControl(serial_team);
1264 
1265  /* check if we need to allocate dispatch buffers stack */
1266  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1267  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1268  serial_team->t.t_dispatch->th_disp_buffer =
1269  (dispatch_private_info_t *)__kmp_allocate(
1270  sizeof(dispatch_private_info_t));
1271  }
1272  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1273 
1274  KMP_MB();
1275 
1276  } else {
1277  /* this serialized team is already being used,
1278  * that's fine, just add another nested level */
1279  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1280  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1281  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1282  ++serial_team->t.t_serialized;
1283  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1284 
1285  // Nested level will be an index in the nested nthreads array
1286  int level = this_thr->th.th_team->t.t_level;
1287  // Thread value exists in the nested nthreads array for the next nested
1288  // level
1289  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1290  this_thr->th.th_current_task->td_icvs.nproc =
1291  __kmp_nested_nth.nth[level + 1];
1292  }
1293  serial_team->t.t_level++;
1294  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1295  "of serial team %p to %d\n",
1296  global_tid, serial_team, serial_team->t.t_level));
1297 
1298  /* allocate/push dispatch buffers stack */
1299  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1300  {
1301  dispatch_private_info_t *disp_buffer =
1302  (dispatch_private_info_t *)__kmp_allocate(
1303  sizeof(dispatch_private_info_t));
1304  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1305  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1306  }
1307  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1308 
1309  KMP_MB();
1310  }
1311  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1312 
1313  // Perform the display affinity functionality for
1314  // serialized parallel regions
1315  if (__kmp_display_affinity) {
1316  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1317  this_thr->th.th_prev_num_threads != 1) {
1318  // NULL means use the affinity-format-var ICV
1319  __kmp_aux_display_affinity(global_tid, NULL);
1320  this_thr->th.th_prev_level = serial_team->t.t_level;
1321  this_thr->th.th_prev_num_threads = 1;
1322  }
1323  }
1324 
1325  if (__kmp_env_consistency_check)
1326  __kmp_push_parallel(global_tid, NULL);
1327 #if OMPT_SUPPORT
1328  serial_team->t.ompt_team_info.master_return_address = codeptr;
1329  if (ompt_enabled.enabled &&
1330  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1331  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1332  OMPT_GET_FRAME_ADDRESS(0);
1333 
1334  ompt_lw_taskteam_t lw_taskteam;
1335  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1336  &ompt_parallel_data, codeptr);
1337 
1338  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1339  // don't use lw_taskteam after linking. Content was swapped.
1340 
1341  /* OMPT implicit task begin */
1342  if (ompt_enabled.ompt_callback_implicit_task) {
1343  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1344  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1345  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1346  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1347  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1348  __kmp_tid_from_gtid(global_tid);
1349  }
1350 
1351  /* OMPT state */
1352  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1353  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1354  OMPT_GET_FRAME_ADDRESS(0);
1355  }
1356 #endif
1357 }
1358 
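/* Editor's note -- illustrative sketch, not part of the runtime. Each nested
   serialized region above pushes one private dispatch buffer; the buffers
   form a stack threaded through the 'next' pointer of th_disp_buffer:

     struct example_disp_node { example_disp_node *next; };
     static void example_push(example_disp_node **top, example_disp_node *node) {
       node->next = *top; // new buffer remembers the previous top
       *top = node;       // and becomes the current buffer
     }
*/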
1359 // Test if this fork is for a team closely nested in a teams construct
1360 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1361  microtask_t microtask, int level,
1362  int teams_level, kmp_va_list ap) {
1363  return (master_th->th.th_teams_microtask && ap &&
1364  microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1365 }
1366 
1367 // Test if this fork is for the teams construct, i.e. to form the outer league
1368 // of teams
1369 static inline bool __kmp_is_entering_teams(int active_level, int level,
1370  int teams_level, kmp_va_list ap) {
1371  return ((ap == NULL && active_level == 0) ||
1372  (ap && teams_level > 0 && teams_level == level));
1373 }
1374 
1375 // AC: This is start of parallel that is nested inside teams construct.
1376 // The team is actual (hot), all workers are ready at the fork barrier.
1377 // No lock needed to initialize the team a bit, then free workers.
1378 static inline int
1379 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1380  kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1381  enum fork_context_e call_context, microtask_t microtask,
1382  launch_t invoker, int master_set_numthreads, int level,
1383 #if OMPT_SUPPORT
1384  ompt_data_t ompt_parallel_data, void *return_address,
1385 #endif
1386  kmp_va_list ap) {
1387  void **argv;
1388  int i;
1389 
1390  parent_team->t.t_ident = loc;
1391  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1392  parent_team->t.t_argc = argc;
1393  argv = (void **)parent_team->t.t_argv;
1394  for (i = argc - 1; i >= 0; --i) {
1395  *argv++ = va_arg(kmp_va_deref(ap), void *);
1396  }
1397  // Increment our nested depth level, but do not increase the serialization
1398  if (parent_team == master_th->th.th_serial_team) {
1399  // AC: we are in serialized parallel
1400  __kmpc_serialized_parallel(loc, gtid);
1401  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1402 
1403  if (call_context == fork_context_gnu) {
1404  // AC: need to decrement t_serialized for enquiry functions to work
1405  // correctly, will restore at join time
1406  parent_team->t.t_serialized--;
1407  return TRUE;
1408  }
1409 
1410 #if OMPD_SUPPORT
1411  parent_team->t.t_pkfn = microtask;
1412 #endif
1413 
1414 #if OMPT_SUPPORT
1415  void *dummy;
1416  void **exit_frame_p;
1417  ompt_data_t *implicit_task_data;
1418  ompt_lw_taskteam_t lw_taskteam;
1419 
1420  if (ompt_enabled.enabled) {
1421  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1422  &ompt_parallel_data, return_address);
1423  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1424 
1425  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1426  // Don't use lw_taskteam after linking. Content was swapped.
1427 
1428  /* OMPT implicit task begin */
1429  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1430  if (ompt_enabled.ompt_callback_implicit_task) {
1431  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1432  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1433  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1434  1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1435  }
1436 
1437  /* OMPT state */
1438  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1439  } else {
1440  exit_frame_p = &dummy;
1441  }
1442 #endif
1443 
1444  // AC: need to decrement t_serialized for enquiry functions to work
1445  // correctly, will restore at join time
1446  parent_team->t.t_serialized--;
1447 
1448  {
1449  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1450  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1451  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1452 #if OMPT_SUPPORT
1453  ,
1454  exit_frame_p
1455 #endif
1456  );
1457  }
1458 
1459 #if OMPT_SUPPORT
1460  if (ompt_enabled.enabled) {
1461  *exit_frame_p = NULL;
1462  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1463  if (ompt_enabled.ompt_callback_implicit_task) {
1464  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1465  ompt_scope_end, NULL, implicit_task_data, 1,
1466  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1467  }
1468  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1469  __ompt_lw_taskteam_unlink(master_th);
1470  if (ompt_enabled.ompt_callback_parallel_end) {
1471  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1472  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1473  OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1474  }
1475  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1476  }
1477 #endif
1478  return TRUE;
1479  }
1480 
1481  parent_team->t.t_pkfn = microtask;
1482  parent_team->t.t_invoke = invoker;
1483  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1484  parent_team->t.t_active_level++;
1485  parent_team->t.t_level++;
1486  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1487 
1488  // If the threads allocated to the team are less than the thread limit, update
1489  // the thread limit here. th_teams_size.nth is specific to this team nested
1490  // in a teams construct, the team is fully created, and we're about to do
1491  // the actual fork. Best to do this here so that the subsequent uses below
1492  // and in the join have the correct value.
1493  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1494 
1495 #if OMPT_SUPPORT
1496  if (ompt_enabled.enabled) {
1497  ompt_lw_taskteam_t lw_taskteam;
1498  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1499  return_address);
1500  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1501  }
1502 #endif
1503 
1504  /* Change number of threads in the team if requested */
1505  if (master_set_numthreads) { // The parallel has num_threads clause
1506  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1507  // AC: can only reduce the number of threads dynamically, can't increase
1508  kmp_info_t **other_threads = parent_team->t.t_threads;
1509  // NOTE: if using distributed barrier, we need to run this code block
1510  // even when the team size appears not to have changed from the max.
1511  int old_proc = master_th->th.th_teams_size.nth;
1512  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1513  __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1514  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1515  }
1516  parent_team->t.t_nproc = master_set_numthreads;
1517  for (i = 0; i < master_set_numthreads; ++i) {
1518  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1519  }
1520  }
1521  // Keep extra threads hot in the team for possible next parallels
1522  master_th->th.th_set_nproc = 0;
1523  }
1524 
1525 #if USE_DEBUGGER
1526  if (__kmp_debugging) { // Let debugger override number of threads.
1527  int nth = __kmp_omp_num_threads(loc);
1528  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1529  master_set_numthreads = nth;
1530  }
1531  }
1532 #endif
1533 
1534  // Figure out the proc_bind policy for the nested parallel within teams
1535  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1536  // proc_bind_default means don't update
1537  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1538  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1539  proc_bind = proc_bind_false;
1540  } else {
1541  // No proc_bind clause specified; use current proc-bind-var
1542  if (proc_bind == proc_bind_default) {
1543  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1544  }
1545  /* else: The proc_bind policy was specified explicitly on parallel clause.
1546  This overrides proc-bind-var for this parallel region, but does not
1547  change proc-bind-var. */
1548  // Determine the value of proc-bind-var for the child threads.
1549  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1550  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1551  master_th->th.th_current_task->td_icvs.proc_bind)) {
1552  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1553  }
1554  }
1555  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1556  // Need to change the bind-var ICV to the correct value for each implicit task
1557  if (proc_bind_icv != proc_bind_default &&
1558  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1559  kmp_info_t **other_threads = parent_team->t.t_threads;
1560  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1561  other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1562  }
1563  }
1564  // Reset for next parallel region
1565  master_th->th.th_set_proc_bind = proc_bind_default;
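  // Resolution summary (descriptive, added for clarity): an explicit proc_bind
  // clause on the parallel wins for this region; without a clause the current
  // proc-bind-var is used; and if proc-bind-var is proc_bind_false, binding
  // stays disabled regardless of any clause.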
1566 
1567 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1568  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1569  KMP_ITT_DEBUG) &&
1570  __kmp_forkjoin_frames_mode == 3 &&
1571  parent_team->t.t_active_level == 1 // only report frames at level 1
1572  && master_th->th.th_teams_size.nteams == 1) {
1573  kmp_uint64 tmp_time = __itt_get_timestamp();
1574  master_th->th.th_frame_time = tmp_time;
1575  parent_team->t.t_region_time = tmp_time;
1576  }
1577  if (__itt_stack_caller_create_ptr) {
1578  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1579  // create new stack stitching id before entering fork barrier
1580  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1581  }
1582 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1583 #if KMP_AFFINITY_SUPPORTED
1584  __kmp_partition_places(parent_team);
1585 #endif
1586 
1587  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1588  "master_th=%p, gtid=%d\n",
1589  root, parent_team, master_th, gtid));
1590  __kmp_internal_fork(loc, gtid, parent_team);
1591  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1592  "master_th=%p, gtid=%d\n",
1593  root, parent_team, master_th, gtid));
1594 
1595  if (call_context == fork_context_gnu)
1596  return TRUE;
1597 
1598  /* Invoke microtask for PRIMARY thread */
1599  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1600  parent_team->t.t_id, parent_team->t.t_pkfn));
1601 
1602  if (!parent_team->t.t_invoke(gtid)) {
1603  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1604  }
1605  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1606  parent_team->t.t_id, parent_team->t.t_pkfn));
1607  KMP_MB(); /* Flush all pending memory write invalidates. */
1608 
1609  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1610 
1611  return TRUE;
1612 }
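// Illustrative caller-side sketch (an assumption about typical user code, not
// taken from this file): a parallel region closely nested inside a teams
// construct is what routes a fork through __kmp_fork_in_teams() above, e.g.
//
//   #pragma omp teams num_teams(2) thread_limit(8)
//   #pragma omp parallel num_threads(4) // forked via __kmp_fork_in_teams()
//   { /* work */ }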
1613 
1614 // Create a serialized parallel region
1615 static inline int
1616 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1617  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1618  kmp_info_t *master_th, kmp_team_t *parent_team,
1619 #if OMPT_SUPPORT
1620  ompt_data_t *ompt_parallel_data, void **return_address,
1621  ompt_data_t **parent_task_data,
1622 #endif
1623  kmp_va_list ap) {
1624  kmp_team_t *team;
1625  int i;
1626  void **argv;
1627 
1628 /* josh todo: hypothetical question: what do we do for OS X*? */
1629 #if KMP_OS_LINUX && \
1630  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1631  void *args[argc];
1632 #else
1633  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1634 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1635  KMP_ARCH_AARCH64) */
1636 
1637  KA_TRACE(
1638  20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1639 
1640  __kmpc_serialized_parallel(loc, gtid);
1641 
1642 #if OMPD_SUPPORT
1643  master_th->th.th_serial_team->t.t_pkfn = microtask;
1644 #endif
1645 
1646  if (call_context == fork_context_intel) {
1647  /* TODO this sucks, use the compiler itself to pass args! :) */
1648  master_th->th.th_serial_team->t.t_ident = loc;
1649  if (!ap) {
1650  // revert change made in __kmpc_serialized_parallel()
1651  master_th->th.th_serial_team->t.t_level--;
1652 // Get args from parent team for teams construct
1653 
1654 #if OMPT_SUPPORT
1655  void *dummy;
1656  void **exit_frame_p;
1657  ompt_task_info_t *task_info;
1658  ompt_lw_taskteam_t lw_taskteam;
1659 
1660  if (ompt_enabled.enabled) {
1661  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1662  ompt_parallel_data, *return_address);
1663 
1664  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1665  // don't use lw_taskteam after linking. content was swapped
1666  task_info = OMPT_CUR_TASK_INFO(master_th);
1667  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1668  if (ompt_enabled.ompt_callback_implicit_task) {
1669  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1670  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1671  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1672  &(task_info->task_data), 1,
1673  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1674  }
1675 
1676  /* OMPT state */
1677  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1678  } else {
1679  exit_frame_p = &dummy;
1680  }
1681 #endif
1682 
1683  {
1684  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1685  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1686  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1687 #if OMPT_SUPPORT
1688  ,
1689  exit_frame_p
1690 #endif
1691  );
1692  }
1693 
1694 #if OMPT_SUPPORT
1695  if (ompt_enabled.enabled) {
1696  *exit_frame_p = NULL;
1697  if (ompt_enabled.ompt_callback_implicit_task) {
1698  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1699  ompt_scope_end, NULL, &(task_info->task_data), 1,
1700  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1701  }
1702  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1703  __ompt_lw_taskteam_unlink(master_th);
1704  if (ompt_enabled.ompt_callback_parallel_end) {
1705  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1706  ompt_parallel_data, *parent_task_data,
1707  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1708  }
1709  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1710  }
1711 #endif
1712  } else if (microtask == (microtask_t)__kmp_teams_master) {
1713  KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1714  team = master_th->th.th_team;
1715  // team->t.t_pkfn = microtask;
1716  team->t.t_invoke = invoker;
1717  __kmp_alloc_argv_entries(argc, team, TRUE);
1718  team->t.t_argc = argc;
1719  argv = (void **)team->t.t_argv;
1720  if (ap) {
1721  for (i = argc - 1; i >= 0; --i)
1722  *argv++ = va_arg(kmp_va_deref(ap), void *);
1723  } else {
1724  for (i = 0; i < argc; ++i)
1725  // Get args from parent team for teams construct
1726  argv[i] = parent_team->t.t_argv[i];
1727  }
1728  // AC: revert change made in __kmpc_serialized_parallel()
1729  // because initial code in teams should have level=0
1730  team->t.t_level--;
1731  // AC: call special invoker for outer "parallel" of teams construct
1732  invoker(gtid);
1733 #if OMPT_SUPPORT
1734  if (ompt_enabled.enabled) {
1735  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1736  if (ompt_enabled.ompt_callback_implicit_task) {
1737  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1738  ompt_scope_end, NULL, &(task_info->task_data), 0,
1739  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1740  }
1741  if (ompt_enabled.ompt_callback_parallel_end) {
1742  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1743  ompt_parallel_data, *parent_task_data,
1744  OMPT_INVOKER(call_context) | ompt_parallel_league,
1745  *return_address);
1746  }
1747  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1748  }
1749 #endif
1750  } else {
1751  argv = args;
1752  for (i = argc - 1; i >= 0; --i)
1753  *argv++ = va_arg(kmp_va_deref(ap), void *);
1754  KMP_MB();
1755 
1756 #if OMPT_SUPPORT
1757  void *dummy;
1758  void **exit_frame_p;
1759  ompt_task_info_t *task_info;
1760  ompt_lw_taskteam_t lw_taskteam;
1761  ompt_data_t *implicit_task_data;
1762 
1763  if (ompt_enabled.enabled) {
1764  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1765  ompt_parallel_data, *return_address);
1766  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1767  // don't use lw_taskteam after linking. content was swapped
1768  task_info = OMPT_CUR_TASK_INFO(master_th);
1769  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1770 
1771  /* OMPT implicit task begin */
1772  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1773  if (ompt_enabled.ompt_callback_implicit_task) {
1774  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1777  ompt_task_implicit);
1778  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1779  }
1780 
1781  /* OMPT state */
1782  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783  } else {
1784  exit_frame_p = &dummy;
1785  }
1786 #endif
1787 
1788  {
1789  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1792 #if OMPT_SUPPORT
1793  ,
1794  exit_frame_p
1795 #endif
1796  );
1797  }
1798 
1799 #if OMPT_SUPPORT
1800  if (ompt_enabled.enabled) {
1801  *exit_frame_p = NULL;
1802  if (ompt_enabled.ompt_callback_implicit_task) {
1803  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1804  ompt_scope_end, NULL, &(task_info->task_data), 1,
1805  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1806  }
1807 
1808  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1809  __ompt_lw_taskteam_unlink(master_th);
1810  if (ompt_enabled.ompt_callback_parallel_end) {
1811  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1812  ompt_parallel_data, *parent_task_data,
1813  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1814  }
1815  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1816  }
1817 #endif
1818  }
1819  } else if (call_context == fork_context_gnu) {
1820 #if OMPT_SUPPORT
1821  if (ompt_enabled.enabled) {
1822  ompt_lw_taskteam_t lwt;
1823  __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1824  *return_address);
1825 
1826  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1827  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1828  }
1829 // don't use lw_taskteam after linking. content was swapped
1830 #endif
1831 
1832  // we were called from GNU native code
1833  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1834  return FALSE;
1835  } else {
1836  KMP_ASSERT2(call_context < fork_context_last,
1837  "__kmp_serial_fork_call: unknown fork_context parameter");
1838  }
1839 
1840  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1841  KMP_MB();
1842  return FALSE;
1843 }
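// Illustrative trigger sketch (an assumption about common cases, not taken from
// this file): __kmp_serial_fork_call() is reached when __kmp_fork_call() below
// reserves only one thread, e.g. a nested parallel beyond max-active-levels-var:
//
//   omp_set_max_active_levels(1);
//   #pragma omp parallel // outer region runs in parallel
//   #pragma omp parallel // inner region is serialized through this path
//   { /* work */ }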
1844 
1845 /* most of the work for a fork */
1846 /* return true if we really went parallel, false if serialized */
1847 int __kmp_fork_call(ident_t *loc, int gtid,
1848  enum fork_context_e call_context, // Intel, GNU, ...
1849  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1850  kmp_va_list ap) {
1851  void **argv;
1852  int i;
1853  int master_tid;
1854  int master_this_cons;
1855  kmp_team_t *team;
1856  kmp_team_t *parent_team;
1857  kmp_info_t *master_th;
1858  kmp_root_t *root;
1859  int nthreads;
1860  int master_active;
1861  int master_set_numthreads;
1862  int level;
1863  int active_level;
1864  int teams_level;
1865 #if KMP_NESTED_HOT_TEAMS
1866  kmp_hot_team_ptr_t **p_hot_teams;
1867 #endif
1868  { // KMP_TIME_BLOCK
1869  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1870  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1871 
1872  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1873  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1874  /* Some systems prefer the stack for the root thread(s) to start with */
1875  /* some gap from the parent stack to prevent false sharing. */
1876  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1877  /* These 2 lines below are so this does not get optimized out */
1878  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1879  __kmp_stkpadding += (short)((kmp_int64)dummy);
1880  }
1881 
1882  /* initialize if needed */
1883  KMP_DEBUG_ASSERT(
1884  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1885  if (!TCR_4(__kmp_init_parallel))
1886  __kmp_parallel_initialize();
1887  __kmp_resume_if_soft_paused();
1888 
1889  /* setup current data */
1890  // AC: potentially unsafe, not in sync with library shutdown,
1891  // __kmp_threads can be freed
1892  master_th = __kmp_threads[gtid];
1893 
1894  parent_team = master_th->th.th_team;
1895  master_tid = master_th->th.th_info.ds.ds_tid;
1896  master_this_cons = master_th->th.th_local.this_construct;
1897  root = master_th->th.th_root;
1898  master_active = root->r.r_active;
1899  master_set_numthreads = master_th->th.th_set_nproc;
1900 
1901 #if OMPT_SUPPORT
1902  ompt_data_t ompt_parallel_data = ompt_data_none;
1903  ompt_data_t *parent_task_data;
1904  ompt_frame_t *ompt_frame;
1905  void *return_address = NULL;
1906 
1907  if (ompt_enabled.enabled) {
1908  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1909  NULL, NULL);
1910  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1911  }
1912 #endif
1913 
1914  // Assign affinity to root thread if it hasn't happened yet
1915  __kmp_assign_root_init_mask();
1916 
1917  // Nested level will be an index in the nested nthreads array
1918  level = parent_team->t.t_level;
1919  // used to launch non-serial teams even if nested is not allowed
1920  active_level = parent_team->t.t_active_level;
1921  // needed to check nesting inside the teams
1922  teams_level = master_th->th.th_teams_level;
1923 #if KMP_NESTED_HOT_TEAMS
1924  p_hot_teams = &master_th->th.th_hot_teams;
1925  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1926  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1927  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1928  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1929  // this is either the actual size or not needed (when active_level > 0)
1930  (*p_hot_teams)[0].hot_team_nth = 1;
1931  }
1932 #endif
1933 
1934 #if OMPT_SUPPORT
1935  if (ompt_enabled.enabled) {
1936  if (ompt_enabled.ompt_callback_parallel_begin) {
1937  int team_size = master_set_numthreads
1938  ? master_set_numthreads
1939  : get__nproc_2(parent_team, master_tid);
1940  int flags = OMPT_INVOKER(call_context) |
1941  ((microtask == (microtask_t)__kmp_teams_master)
1942  ? ompt_parallel_league
1943  : ompt_parallel_team);
1944  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1945  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1946  return_address);
1947  }
1948  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1949  }
1950 #endif
1951 
1952  master_th->th.th_ident = loc;
1953 
1954  // Parallel closely nested in teams construct:
1955  if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1956  return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1957  call_context, microtask, invoker,
1958  master_set_numthreads, level,
1959 #if OMPT_SUPPORT
1960  ompt_parallel_data, return_address,
1961 #endif
1962  ap);
1963  } // End parallel closely nested in teams construct
1964 
1965 #if KMP_DEBUG
1966  if (__kmp_tasking_mode != tskm_immediate_exec) {
1967  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1968  parent_team->t.t_task_team[master_th->th.th_task_state]);
1969  }
1970 #endif
1971 
1972  // Need this to happen before we determine the number of threads, not while
1973  // we are allocating the team
1974  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1975 
1976  // Determine the number of threads
1977  int enter_teams =
1978  __kmp_is_entering_teams(active_level, level, teams_level, ap);
1979  if ((!enter_teams &&
1980  (parent_team->t.t_active_level >=
1981  master_th->th.th_current_task->td_icvs.max_active_levels)) ||
1982  (__kmp_library == library_serial)) {
1983  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
1984  nthreads = 1;
1985  } else {
1986  nthreads = master_set_numthreads
1987  ? master_set_numthreads
1988  // TODO: get nproc directly from current task
1989  : get__nproc_2(parent_team, master_tid);
1990  // Check whether we need to take the forkjoin lock (no need for a
1991  // serialized parallel outside of a teams construct).
1992  if (nthreads > 1) {
1993  /* determine how many new threads we can use */
1994  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1995  /* AC: If we execute teams from parallel region (on host), then teams
1996  should be created but each can only have 1 thread if nesting is
1997  disabled. If teams called from serial region, then teams and their
1998  threads should be created regardless of the nesting setting. */
1999  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2000  nthreads, enter_teams);
2001  if (nthreads == 1) {
2002  // Free lock for single thread execution here; for multi-thread
2003  // execution it will be freed later after team of threads created
2004  // and initialized
2005  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2006  }
2007  }
2008  }
2009  KMP_DEBUG_ASSERT(nthreads > 0);
2010 
2011  // If we temporarily changed the set number of threads then restore it now
2012  master_th->th.th_set_nproc = 0;
2013 
2014  if (nthreads == 1) {
2015  return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2016  invoker, master_th, parent_team,
2017 #if OMPT_SUPPORT
2018  &ompt_parallel_data, &return_address,
2019  &parent_task_data,
2020 #endif
2021  ap);
2022  } // if (nthreads == 1)
2023 
2024  // GEH: only modify the executing flag when not serialized;
2025  // the serialized case is handled in __kmpc_serialized_parallel
2026  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2027  "curtask=%p, curtask_max_aclevel=%d\n",
2028  parent_team->t.t_active_level, master_th,
2029  master_th->th.th_current_task,
2030  master_th->th.th_current_task->td_icvs.max_active_levels));
2031  // TODO: GEH - cannot do this assertion because root thread not set up as
2032  // executing
2033  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2034  master_th->th.th_current_task->td_flags.executing = 0;
2035 
2036  if (!master_th->th.th_teams_microtask || level > teams_level) {
2037  /* Increment our nested depth level */
2038  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2039  }
2040 
2041  // See if we need to make a copy of the ICVs.
2042  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2043  if ((level + 1 < __kmp_nested_nth.used) &&
2044  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2045  nthreads_icv = __kmp_nested_nth.nth[level + 1];
2046  } else {
2047  nthreads_icv = 0; // don't update
2048  }
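  // Example (hypothetical values, for illustration): with OMP_NUM_THREADS="8,2"
  // the nested-nth list yields nthreads_icv == 2 for a parallel at level 1, so
  // the new team's implicit tasks receive nproc == 2 via new_icvs.nproc below.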
2049 
2050  // Figure out the proc_bind_policy for the new team.
2051  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2052  // proc_bind_default means don't update
2053  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2054  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2055  proc_bind = proc_bind_false;
2056  } else {
2057  // No proc_bind clause specified; use current proc-bind-var for this
2058  // parallel region
2059  if (proc_bind == proc_bind_default) {
2060  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2061  }
2062  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2063  if (master_th->th.th_teams_microtask &&
2064  microtask == (microtask_t)__kmp_teams_master) {
2065  proc_bind = __kmp_teams_proc_bind;
2066  }
2067  /* else: The proc_bind policy was specified explicitly on parallel clause.
2068  This overrides proc-bind-var for this parallel region, but does not
2069  change proc-bind-var. */
2070  // Determine the value of proc-bind-var for the child threads.
2071  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2072  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2073  master_th->th.th_current_task->td_icvs.proc_bind)) {
2074  // Do not modify the proc-bind ICV for the two teams-construct forks;
2075  // they just let the proc-bind ICV pass through.
2076  if (!master_th->th.th_teams_microtask ||
2077  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2078  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2079  }
2080  }
2081 
2082  // Reset for next parallel region
2083  master_th->th.th_set_proc_bind = proc_bind_default;
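  // Note (descriptive; KMP_TEAMS_PROC_BIND is assumed to be the setting that
  // populates __kmp_teams_proc_bind): for the outer fork of a teams construct
  // the bind policy above comes from that per-device setting rather than from
  // the proc-bind-var of the encountering task.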
2084 
2085  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2086  kmp_internal_control_t new_icvs;
2087  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2088  new_icvs.next = NULL;
2089  if (nthreads_icv > 0) {
2090  new_icvs.nproc = nthreads_icv;
2091  }
2092  if (proc_bind_icv != proc_bind_default) {
2093  new_icvs.proc_bind = proc_bind_icv;
2094  }
2095 
2096  /* allocate a new parallel team */
2097  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2098  team = __kmp_allocate_team(root, nthreads, nthreads,
2099 #if OMPT_SUPPORT
2100  ompt_parallel_data,
2101 #endif
2102  proc_bind, &new_icvs,
2103  argc USE_NESTED_HOT_ARG(master_th));
2104  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2105  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2106  } else {
2107  /* allocate a new parallel team */
2108  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2109  team = __kmp_allocate_team(root, nthreads, nthreads,
2110 #if OMPT_SUPPORT
2111  ompt_parallel_data,
2112 #endif
2113  proc_bind,
2114  &master_th->th.th_current_task->td_icvs,
2115  argc USE_NESTED_HOT_ARG(master_th));
2116  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2117  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2118  &master_th->th.th_current_task->td_icvs);
2119  }
2120  KF_TRACE(
2121  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2122 
2123  /* setup the new team */
2124  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2125  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2126  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2127  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2128  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2129 #if OMPT_SUPPORT
2130  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2131  return_address);
2132 #endif
2133  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2134  // TODO: parent_team->t.t_level == INT_MAX ???
2135  if (!master_th->th.th_teams_microtask || level > teams_level) {
2136  int new_level = parent_team->t.t_level + 1;
2137  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2138  new_level = parent_team->t.t_active_level + 1;
2139  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2140  } else {
2141  // AC: Do not increase parallel level at start of the teams construct
2142  int new_level = parent_team->t.t_level;
2143  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2144  new_level = parent_team->t.t_active_level;
2145  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2146  }
2147  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2148  // set primary thread's schedule as new run-time schedule
2149  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2150 
2151  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2152  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2153 
2154  // Update the floating point rounding in the team if required.
2155  propagateFPControl(team);
2156 #if OMPD_SUPPORT
2157  if (ompd_state & OMPD_ENABLE_BP)
2158  ompd_bp_parallel_begin();
2159 #endif
2160 
2161  if (__kmp_tasking_mode != tskm_immediate_exec) {
2162  // Set the primary thread's task team to the team's task team. Unless this
2163  // is a hot team, it should be NULL.
2164  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2165  parent_team->t.t_task_team[master_th->th.th_task_state]);
2166  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2167  "%p, new task_team %p / team %p\n",
2168  __kmp_gtid_from_thread(master_th),
2169  master_th->th.th_task_team, parent_team,
2170  team->t.t_task_team[master_th->th.th_task_state], team));
2171 
2172  if (active_level || master_th->th.th_task_team) {
2173  // Take a memo of primary thread's task_state
2174  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2175  if (master_th->th.th_task_state_top >=
2176  master_th->th.th_task_state_stack_sz) { // increase size
2177  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2178  kmp_uint8 *old_stack, *new_stack;
2179  kmp_uint32 i;
2180  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2181  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2182  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2183  }
2184  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2185  ++i) { // zero-init rest of stack
2186  new_stack[i] = 0;
2187  }
2188  old_stack = master_th->th.th_task_state_memo_stack;
2189  master_th->th.th_task_state_memo_stack = new_stack;
2190  master_th->th.th_task_state_stack_sz = new_size;
2191  __kmp_free(old_stack);
2192  }
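  // Descriptive note (added for clarity): the memo stack grows by doubling, old
  // entries are copied and the new tail zero-initialized; the task_state saved
  // below is popped again in __kmp_join_call() when this region ends.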
2193  // Store primary thread's task_state on stack
2194  master_th->th
2195  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2196  master_th->th.th_task_state;
2197  master_th->th.th_task_state_top++;
2198 #if KMP_NESTED_HOT_TEAMS
2199  if (master_th->th.th_hot_teams &&
2200  active_level < __kmp_hot_teams_max_level &&
2201  team == master_th->th.th_hot_teams[active_level].hot_team) {
2202  // Restore primary thread's nested state if nested hot team
2203  master_th->th.th_task_state =
2204  master_th->th
2205  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2206  } else {
2207 #endif
2208  master_th->th.th_task_state = 0;
2209 #if KMP_NESTED_HOT_TEAMS
2210  }
2211 #endif
2212  }
2213 #if !KMP_NESTED_HOT_TEAMS
2214  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2215  (team == root->r.r_hot_team));
2216 #endif
2217  }
2218 
2219  KA_TRACE(
2220  20,
2221  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2222  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2223  team->t.t_nproc));
2224  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2225  (team->t.t_master_tid == 0 &&
2226  (team->t.t_parent == root->r.r_root_team ||
2227  team->t.t_parent->t.t_serialized)));
2228  KMP_MB();
2229 
2230  /* now, setup the arguments */
2231  argv = (void **)team->t.t_argv;
2232  if (ap) {
2233  for (i = argc - 1; i >= 0; --i) {
2234  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2235  KMP_CHECK_UPDATE(*argv, new_argv);
2236  argv++;
2237  }
2238  } else {
2239  for (i = 0; i < argc; ++i) {
2240  // Get args from parent team for teams construct
2241  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2242  }
2243  }
2244 
2245  /* now actually fork the threads */
2246  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2247  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2248  root->r.r_active = TRUE;
2249 
2250  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2251  __kmp_setup_icv_copy(team, nthreads,
2252  &master_th->th.th_current_task->td_icvs, loc);
2253 
2254 #if OMPT_SUPPORT
2255  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2256 #endif
2257 
2258  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2259 
2260 #if USE_ITT_BUILD
2261  if (team->t.t_active_level == 1 // only report frames at level 1
2262  && !master_th->th.th_teams_microtask) { // not in teams construct
2263 #if USE_ITT_NOTIFY
2264  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2265  (__kmp_forkjoin_frames_mode == 3 ||
2266  __kmp_forkjoin_frames_mode == 1)) {
2267  kmp_uint64 tmp_time = 0;
2268  if (__itt_get_timestamp_ptr)
2269  tmp_time = __itt_get_timestamp();
2270  // Internal fork - report frame begin
2271  master_th->th.th_frame_time = tmp_time;
2272  if (__kmp_forkjoin_frames_mode == 3)
2273  team->t.t_region_time = tmp_time;
2274  } else
2275 // only one notification scheme (either "submit" or "forking/joined", not both)
2276 #endif /* USE_ITT_NOTIFY */
2277  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2278  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2279  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2280  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2281  }
2282  }
2283 #endif /* USE_ITT_BUILD */
2284 
2285  /* now go on and do the work */
2286  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2287  KMP_MB();
2288  KF_TRACE(10,
2289  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2290  root, team, master_th, gtid));
2291 
2292 #if USE_ITT_BUILD
2293  if (__itt_stack_caller_create_ptr) {
2294  // create new stack stitching id before entering fork barrier
2295  if (!enter_teams) {
2296  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2297  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2298  } else if (parent_team->t.t_serialized) {
2299  // keep stack stitching id in the serialized parent_team;
2300  // current team will be used for parallel inside the teams;
2301  // if parent_team is active, then it already keeps stack stitching id
2302  // for the league of teams
2303  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2304  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2305  }
2306  }
2307 #endif /* USE_ITT_BUILD */
2308 
2309  // AC: skip __kmp_internal_fork for the teams construct; let only the
2310  // primary threads execute
2311  if (ap) {
2312  __kmp_internal_fork(loc, gtid, team);
2313  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2314  "master_th=%p, gtid=%d\n",
2315  root, team, master_th, gtid));
2316  }
2317 
2318  if (call_context == fork_context_gnu) {
2319  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2320  return TRUE;
2321  }
2322 
2323  /* Invoke microtask for PRIMARY thread */
2324  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2325  team->t.t_id, team->t.t_pkfn));
2326  } // END of timer KMP_fork_call block
2327 
2328 #if KMP_STATS_ENABLED
2329  // If beginning a teams construct, then change thread state
2330  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2331  if (!ap) {
2332  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2333  }
2334 #endif
2335 
2336  if (!team->t.t_invoke(gtid)) {
2337  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2338  }
2339 
2340 #if KMP_STATS_ENABLED
2341  // If was beginning of a teams construct, then reset thread state
2342  if (!ap) {
2343  KMP_SET_THREAD_STATE(previous_state);
2344  }
2345 #endif
2346 
2347  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2348  team->t.t_id, team->t.t_pkfn));
2349  KMP_MB(); /* Flush all pending memory write invalidates. */
2350 
2351  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2352 #if OMPT_SUPPORT
2353  if (ompt_enabled.enabled) {
2354  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2355  }
2356 #endif
2357 
2358  return TRUE;
2359 }
2360 
2361 #if OMPT_SUPPORT
2362 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2363  kmp_team_t *team) {
2364  // restore state outside the region
2365  thread->th.ompt_thread_info.state =
2366  ((team->t.t_serialized) ? ompt_state_work_serial
2367  : ompt_state_work_parallel);
2368 }
2369 
2370 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2371  kmp_team_t *team, ompt_data_t *parallel_data,
2372  int flags, void *codeptr) {
2373  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2374  if (ompt_enabled.ompt_callback_parallel_end) {
2375  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2376  parallel_data, &(task_info->task_data), flags, codeptr);
2377  }
2378 
2379  task_info->frame.enter_frame = ompt_data_none;
2380  __kmp_join_restore_state(thread, team);
2381 }
2382 #endif
2383 
2384 void __kmp_join_call(ident_t *loc, int gtid
2385 #if OMPT_SUPPORT
2386  ,
2387  enum fork_context_e fork_context
2388 #endif
2389  ,
2390  int exit_teams) {
2391  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2392  kmp_team_t *team;
2393  kmp_team_t *parent_team;
2394  kmp_info_t *master_th;
2395  kmp_root_t *root;
2396  int master_active;
2397 
2398  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2399 
2400  /* setup current data */
2401  master_th = __kmp_threads[gtid];
2402  root = master_th->th.th_root;
2403  team = master_th->th.th_team;
2404  parent_team = team->t.t_parent;
2405 
2406  master_th->th.th_ident = loc;
2407 
2408 #if OMPT_SUPPORT
2409  void *team_microtask = (void *)team->t.t_pkfn;
2410  // For GOMP interface with serialized parallel, need the
2411  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2412  // and end-parallel events.
2413  if (ompt_enabled.enabled &&
2414  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2415  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2416  }
2417 #endif
2418 
2419 #if KMP_DEBUG
2420  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2421  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2422  "th_task_team = %p\n",
2423  __kmp_gtid_from_thread(master_th), team,
2424  team->t.t_task_team[master_th->th.th_task_state],
2425  master_th->th.th_task_team));
2426  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2427  team->t.t_task_team[master_th->th.th_task_state]);
2428  }
2429 #endif
2430 
2431  if (team->t.t_serialized) {
2432  if (master_th->th.th_teams_microtask) {
2433  // We are in teams construct
2434  int level = team->t.t_level;
2435  int tlevel = master_th->th.th_teams_level;
2436  if (level == tlevel) {
2437  // AC: we haven't incremented it earlier at start of teams construct,
2438  // so do it here - at the end of teams construct
2439  team->t.t_level++;
2440  } else if (level == tlevel + 1) {
2441  // AC: we are exiting parallel inside teams, need to increment
2442  // serialization in order to restore it in the next call to
2443  // __kmpc_end_serialized_parallel
2444  team->t.t_serialized++;
2445  }
2446  }
2447  __kmpc_end_serialized_parallel(loc, gtid);
2448 
2449 #if OMPT_SUPPORT
2450  if (ompt_enabled.enabled) {
2451  if (fork_context == fork_context_gnu) {
2452  __ompt_lw_taskteam_unlink(master_th);
2453  }
2454  __kmp_join_restore_state(master_th, parent_team);
2455  }
2456 #endif
2457 
2458  return;
2459  }
2460 
2461  master_active = team->t.t_master_active;
2462 
2463  if (!exit_teams) {
2464  // AC: No barrier for internal teams at exit from teams construct.
2465  // But there is barrier for external team (league).
2466  __kmp_internal_join(loc, gtid, team);
2467 #if USE_ITT_BUILD
2468  if (__itt_stack_caller_create_ptr) {
2469  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2470  // destroy the stack stitching id after join barrier
2471  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2472  team->t.t_stack_id = NULL;
2473  }
2474 #endif
2475  } else {
2476  master_th->th.th_task_state =
2477  0; // AC: no tasking in teams (out of any parallel)
2478 #if USE_ITT_BUILD
2479  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2480  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2481  // destroy the stack stitching id on exit from the teams construct
2482  // if parent_team is active, then the id will be destroyed later on
2483  // by master of the league of teams
2484  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2485  parent_team->t.t_stack_id = NULL;
2486  }
2487 #endif
2488  }
2489 
2490  KMP_MB();
2491 
2492 #if OMPT_SUPPORT
2493  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2494  void *codeptr = team->t.ompt_team_info.master_return_address;
2495 #endif
2496 
2497 #if USE_ITT_BUILD
2498  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2499  if (team->t.t_active_level == 1 &&
2500  (!master_th->th.th_teams_microtask || /* not in teams construct */
2501  master_th->th.th_teams_size.nteams == 1)) {
2502  master_th->th.th_ident = loc;
2503  // only one notification scheme (either "submit" or "forking/joined", not
2504  // both)
2505  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2506  __kmp_forkjoin_frames_mode == 3)
2507  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2508  master_th->th.th_frame_time, 0, loc,
2509  master_th->th.th_team_nproc, 1);
2510  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2511  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2512  __kmp_itt_region_joined(gtid);
2513  } // active_level == 1
2514 #endif /* USE_ITT_BUILD */
2515 
2516 #if KMP_AFFINITY_SUPPORTED
2517  if (!exit_teams) {
2518  // Restore master thread's partition.
2519  master_th->th.th_first_place = team->t.t_first_place;
2520  master_th->th.th_last_place = team->t.t_last_place;
2521  }
2522 #endif // KMP_AFFINITY_SUPPORTED
2523 
2524  if (master_th->th.th_teams_microtask && !exit_teams &&
2525  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2526  team->t.t_level == master_th->th.th_teams_level + 1) {
2527 // AC: We need to leave the team structure intact at the end of parallel
2528 // inside the teams construct, so that at the next parallel same (hot) team
2529 // works, only adjust nesting levels
2530 #if OMPT_SUPPORT
2531  ompt_data_t ompt_parallel_data = ompt_data_none;
2532  if (ompt_enabled.enabled) {
2533  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2534  if (ompt_enabled.ompt_callback_implicit_task) {
2535  int ompt_team_size = team->t.t_nproc;
2536  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2537  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2538  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2539  }
2540  task_info->frame.exit_frame = ompt_data_none;
2541  task_info->task_data = ompt_data_none;
2542  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2543  __ompt_lw_taskteam_unlink(master_th);
2544  }
2545 #endif
2546  /* Decrement our nested depth level */
2547  team->t.t_level--;
2548  team->t.t_active_level--;
2549  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2550 
2551  // Restore number of threads in the team if needed. This code relies on
2552  // the proper adjustment of th_teams_size.nth after the fork in
2553  // __kmp_teams_master on each teams primary thread in the case that
2554  // __kmp_reserve_threads reduced it.
2555  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2556  int old_num = master_th->th.th_team_nproc;
2557  int new_num = master_th->th.th_teams_size.nth;
2558  kmp_info_t **other_threads = team->t.t_threads;
2559  team->t.t_nproc = new_num;
2560  for (int i = 0; i < old_num; ++i) {
2561  other_threads[i]->th.th_team_nproc = new_num;
2562  }
2563  // Adjust states of non-used threads of the team
2564  for (int i = old_num; i < new_num; ++i) {
2565  // Re-initialize thread's barrier data.
2566  KMP_DEBUG_ASSERT(other_threads[i]);
2567  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2568  for (int b = 0; b < bs_last_barrier; ++b) {
2569  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2570  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2571 #if USE_DEBUGGER
2572  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2573 #endif
2574  }
2575  if (__kmp_tasking_mode != tskm_immediate_exec) {
2576  // Synchronize thread's task state
2577  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2578  }
2579  }
2580  }
2581 
2582 #if OMPT_SUPPORT
2583  if (ompt_enabled.enabled) {
2584  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2585  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2586  }
2587 #endif
2588 
2589  return;
2590  }
2591 
2592  /* do cleanup and restore the parent team */
2593  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2594  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2595 
2596  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2597 
2598  /* jc: The following lock has instructions with REL and ACQ semantics,
2599  separating the parallel user code called in this parallel region
2600  from the serial user code called after this function returns. */
2601  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2602 
2603  if (!master_th->th.th_teams_microtask ||
2604  team->t.t_level > master_th->th.th_teams_level) {
2605  /* Decrement our nested depth level */
2606  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2607  }
2608  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2609 
2610 #if OMPT_SUPPORT
2611  if (ompt_enabled.enabled) {
2612  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2613  if (ompt_enabled.ompt_callback_implicit_task) {
2614  int flags = (team_microtask == (void *)__kmp_teams_master)
2615  ? ompt_task_initial
2616  : ompt_task_implicit;
2617  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2618  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2619  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2620  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2621  }
2622  task_info->frame.exit_frame = ompt_data_none;
2623  task_info->task_data = ompt_data_none;
2624  }
2625 #endif
2626 
2627  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2628  master_th, team));
2629  __kmp_pop_current_task_from_thread(master_th);
2630 
2631  master_th->th.th_def_allocator = team->t.t_def_allocator;
2632 
2633 #if OMPD_SUPPORT
2634  if (ompd_state & OMPD_ENABLE_BP)
2635  ompd_bp_parallel_end();
2636 #endif
2637  updateHWFPControl(team);
2638 
2639  if (root->r.r_active != master_active)
2640  root->r.r_active = master_active;
2641 
2642  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2643  master_th)); // this will free worker threads
2644 
2645  /* This race was fun to find. Make sure the following stays in the critical
2646  region, otherwise assertions may fail occasionally since the old team may be
2647  reallocated and the hierarchy appears inconsistent. It is actually safe to
2648  run and won't cause any bugs, but it will cause those assertion failures. It's
2649  only one deref & assign, so it might as well stay in the critical region. */
2650  master_th->th.th_team = parent_team;
2651  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2652  master_th->th.th_team_master = parent_team->t.t_threads[0];
2653  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2654 
2655  /* restore serialized team, if need be */
2656  if (parent_team->t.t_serialized &&
2657  parent_team != master_th->th.th_serial_team &&
2658  parent_team != root->r.r_root_team) {
2659  __kmp_free_team(root,
2660  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2661  master_th->th.th_serial_team = parent_team;
2662  }
2663 
2664  if (__kmp_tasking_mode != tskm_immediate_exec) {
2665  if (master_th->th.th_task_state_top >
2666  0) { // Restore task state from memo stack
2667  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2668  // Remember primary thread's state if we re-use this nested hot team
2669  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2670  master_th->th.th_task_state;
2671  --master_th->th.th_task_state_top; // pop
2672  // Now restore state at this level
2673  master_th->th.th_task_state =
2674  master_th->th
2675  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2676  } else if (team != root->r.r_hot_team) {
2677  // Reset the task state of the primary thread if this is not the hot team,
2678  // because in this case all the worker threads will be freed and their task
2679  // state reset. If the primary's state is not reset too, the task states
2680  // become inconsistent.
2681  master_th->th.th_task_state = 0;
2682  }
2683  // Copy the task team from the parent team to the primary thread
2684  master_th->th.th_task_team =
2685  parent_team->t.t_task_team[master_th->th.th_task_state];
2686  KA_TRACE(20,
2687  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2688  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2689  parent_team));
2690  }
2691 
2692  // TODO: GEH - cannot do this assertion because root thread not set up as
2693  // executing
2694  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2695  master_th->th.th_current_task->td_flags.executing = 1;
2696 
2697  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2698 
2699 #if KMP_AFFINITY_SUPPORTED
2700  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2701  __kmp_reset_root_init_mask(gtid);
2702  }
2703 #endif
2704 #if OMPT_SUPPORT
2705  int flags =
2706  OMPT_INVOKER(fork_context) |
2707  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2708  : ompt_parallel_team);
2709  if (ompt_enabled.enabled) {
2710  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2711  codeptr);
2712  }
2713 #endif
2714 
2715  KMP_MB();
2716  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2717 }
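// Pairing sketch (an assumption about the usual compiler-generated sequence,
// not asserted by this file): a __kmp_fork_call() that actually went parallel
// is matched by a __kmp_join_call() from the primary thread once the microtask
// returns, e.g. from the __kmpc_fork_call() entry point.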
2718 
2719 /* Check whether we should push an internal control record onto the
2720  serial team stack. If so, do it. */
2721 void __kmp_save_internal_controls(kmp_info_t *thread) {
2722 
2723  if (thread->th.th_team != thread->th.th_serial_team) {
2724  return;
2725  }
2726  if (thread->th.th_team->t.t_serialized > 1) {
2727  int push = 0;
2728 
2729  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2730  push = 1;
2731  } else {
2732  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2733  thread->th.th_team->t.t_serialized) {
2734  push = 1;
2735  }
2736  }
2737  if (push) { /* push a record on the serial team's stack */
2738  kmp_internal_control_t *control =
2739  (kmp_internal_control_t *)__kmp_allocate(
2740  sizeof(kmp_internal_control_t));
2741 
2742  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2743 
2744  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2745 
2746  control->next = thread->th.th_team->t.t_control_stack_top;
2747  thread->th.th_team->t.t_control_stack_top = control;
2748  }
2749  }
2750 }
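// Example of the push condition (hypothetical scenario, for illustration): in a
// region serialized twice (t_serialized == 2), the first ICV change, e.g. an
// omp_set_num_threads() call, pushes one control record; a second change at the
// same nesting level pushes nothing, because serial_nesting_level already
// matches t_serialized and the existing record suffices to restore the ICVs.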
2751 
2752 /* Changes set_nproc */
2753 void __kmp_set_num_threads(int new_nth, int gtid) {
2754  kmp_info_t *thread;
2755  kmp_root_t *root;
2756 
2757  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2758  KMP_DEBUG_ASSERT(__kmp_init_serial);
2759 
2760  if (new_nth < 1)
2761  new_nth = 1;
2762  else if (new_nth > __kmp_max_nth)
2763  new_nth = __kmp_max_nth;
2764 
2765  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2766  thread = __kmp_threads[gtid];
2767  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2768  return; // nothing to do
2769 
2770  __kmp_save_internal_controls(thread);
2771 
2772  set__nproc(thread, new_nth);
2773 
2774  // If this omp_set_num_threads() call will cause the hot team size to be
2775  // reduced (in the absence of a num_threads clause), then reduce it now,
2776  // rather than waiting for the next parallel region.
2777  root = thread->th.th_root;
2778  if (__kmp_init_parallel && (!root->r.r_active) &&
2779  (root->r.r_hot_team->t.t_nproc > new_nth)
2780 #if KMP_NESTED_HOT_TEAMS
2781  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2782 #endif
2783  ) {
2784  kmp_team_t *hot_team = root->r.r_hot_team;
2785  int f;
2786 
2787  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2788 
2789  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2790  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2791  }
2792  // Release the extra threads we don't need any more.
2793  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2794  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2795  if (__kmp_tasking_mode != tskm_immediate_exec) {
2796  // When decreasing the team size, threads no longer in the team should
2797  // unreference the task team.
2798  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2799  }
2800  __kmp_free_thread(hot_team->t.t_threads[f]);
2801  hot_team->t.t_threads[f] = NULL;
2802  }
2803  hot_team->t.t_nproc = new_nth;
2804 #if KMP_NESTED_HOT_TEAMS
2805  if (thread->th.th_hot_teams) {
2806  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2807  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2808  }
2809 #endif
2810 
2811  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2812  hot_team->t.b->update_num_threads(new_nth);
2813  __kmp_add_threads_to_team(hot_team, new_nth);
2814  }
2815 
2816  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2817 
2818  // Update the t_nproc field in the threads that are still active.
2819  for (f = 0; f < new_nth; f++) {
2820  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2821  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2822  }
2823  // Special flag to mark that the team size was changed by an omp_set_num_threads() call
2824  hot_team->t.t_size_changed = -1;
2825  }
2826 }
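// Usage sketch (an assumption about the standard user-code path, not taken from
// this file):
//
//   omp_set_num_threads(2); // reaches __kmp_set_num_threads(2, gtid)
//   #pragma omp parallel    // the next fork uses the updated nproc ICV
//   { /* work */ }
//
// The hot-team shrink above happens only while no parallel region is active
// (root->r.r_active is false); otherwise the resize waits for the next fork.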
2827 
2828 /* Changes max_active_levels */
2829 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2830  kmp_info_t *thread;
2831 
2832  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2833  "%d = (%d)\n",
2834  gtid, max_active_levels));
2835  KMP_DEBUG_ASSERT(__kmp_init_serial);
2836 
2837  // validate max_active_levels
2838  if (max_active_levels < 0) {
2839  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2840  // We ignore this call if the user has specified a negative value.
2841  // The current setting won't be changed. The last valid setting will be
2842  // used. A warning will be issued (if warnings are allowed as controlled by
2843  // the KMP_WARNINGS env var).
2844  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2845  "max_active_levels for thread %d = (%d)\n",
2846  gtid, max_active_levels));
2847  return;
2848  }
2849  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2850  // it's OK, the max_active_levels is within the valid range: [ 0;
2851  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2852  // We allow a zero value. (implementation defined behavior)
2853  } else {
2854  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2855  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2856  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2857  // Current upper limit is MAX_INT. (implementation defined behavior)
2858  // If the input exceeds the upper limit, we correct the input to be the
2859  // upper limit. (implementation defined behavior)
2860  // Actually, the flow should never get here while the upper limit is MAX_INT.
2861  }
2862  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2863  "max_active_levels for thread %d = (%d)\n",
2864  gtid, max_active_levels));
2865 
2866  thread = __kmp_threads[gtid];
2867 
2868  __kmp_save_internal_controls(thread);
2869 
2870  set__max_active_levels(thread, max_active_levels);
2871 }
2872 
2873 /* Gets max_active_levels */
2874 int __kmp_get_max_active_levels(int gtid) {
2875  kmp_info_t *thread;
2876 
2877  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2878  KMP_DEBUG_ASSERT(__kmp_init_serial);
2879 
2880  thread = __kmp_threads[gtid];
2881  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2882  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2883  "curtask_maxaclevel=%d\n",
2884  gtid, thread->th.th_current_task,
2885  thread->th.th_current_task->td_icvs.max_active_levels));
2886  return thread->th.th_current_task->td_icvs.max_active_levels;
2887 }
2888 
2889 // nteams-var per-device ICV
2890 void __kmp_set_num_teams(int num_teams) {
2891  if (num_teams > 0)
2892  __kmp_nteams = num_teams;
2893 }
2894 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2895 // teams-thread-limit-var per-device ICV
2896 void __kmp_set_teams_thread_limit(int limit) {
2897  if (limit > 0)
2898  __kmp_teams_thread_limit = limit;
2899 }
2900 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2901 
2902 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2903 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2904 
2905 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2906 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2907  kmp_info_t *thread;
2908  kmp_sched_t orig_kind;
2909  // kmp_team_t *team;
2910 
2911  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2912  gtid, (int)kind, chunk));
2913  KMP_DEBUG_ASSERT(__kmp_init_serial);
2914 
2915  // Check if the kind parameter is valid, correct if needed.
2916  // Valid parameters should fit in one of two intervals - standard or extended:
2917  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2918  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2919  orig_kind = kind;
2920  kind = __kmp_sched_without_mods(kind);
2921 
2922  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2923  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2924  // TODO: Hint needs attention in case we change the default schedule.
2925  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2926  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2927  __kmp_msg_null);
2928  kind = kmp_sched_default;
2929  chunk = 0; // ignore chunk value in case of bad kind
2930  }
2931 
2932  thread = __kmp_threads[gtid];
2933 
2934  __kmp_save_internal_controls(thread);
2935 
2936  if (kind < kmp_sched_upper_std) {
2937  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2938  // differentiate static chunked vs. unchunked: chunk should be invalid to
2939  // indicate an unchunked schedule (which is the default)
2940  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2941  } else {
2942  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2943  __kmp_sch_map[kind - kmp_sched_lower - 1];
2944  }
2945  } else {
2946  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2947  // kmp_sched_lower - 2 ];
2948  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2949  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2950  kmp_sched_lower - 2];
2951  }
2952  __kmp_sched_apply_mods_intkind(
2953  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2954  if (kind == kmp_sched_auto || chunk < 1) {
2955  // ignore parameter chunk for schedule auto
2956  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2957  } else {
2958  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2959  }
2960 }
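// Mapping sketch (an assumed typical path through the OMP API wrappers; the
// exact enum translation lives in __kmp_sch_map):
//
//   omp_set_schedule(omp_sched_dynamic, 4);
//   // -> __kmp_set_schedule(gtid, kmp_sched_dynamic, 4)
//   // -> sched ICV becomes kmp_sch_dynamic_chunked with chunk == 4, which
//   //    __kmp_get_schedule() below reports back as kmp_sched_dynamic.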
2961 
2962 /* Gets def_sched_var ICV values */
2963 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2964  kmp_info_t *thread;
2965  enum sched_type th_type;
2966 
2967  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2968  KMP_DEBUG_ASSERT(__kmp_init_serial);
2969 
2970  thread = __kmp_threads[gtid];
2971 
2972  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2973  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2974  case kmp_sch_static:
2975  case kmp_sch_static_greedy:
2976  case kmp_sch_static_balanced:
2977  *kind = kmp_sched_static;
2978  __kmp_sched_apply_mods_stdkind(kind, th_type);
2979  *chunk = 0; // chunk was not set; indicate this with a zero value
2980  return;
2981  case kmp_sch_static_chunked:
2982  *kind = kmp_sched_static;
2983  break;
2984  case kmp_sch_dynamic_chunked:
2985  *kind = kmp_sched_dynamic;
2986  break;
2987  case kmp_sch_guided_chunked:
2988  case kmp_sch_guided_iterative_chunked:
2989  case kmp_sch_guided_analytical_chunked:
2990  *kind = kmp_sched_guided;
2991  break;
2992  case kmp_sch_auto:
2993  *kind = kmp_sched_auto;
2994  break;
2995  case kmp_sch_trapezoidal:
2996  *kind = kmp_sched_trapezoidal;
2997  break;
2998 #if KMP_STATIC_STEAL_ENABLED
2999  case kmp_sch_static_steal:
3000  *kind = kmp_sched_static_steal;
3001  break;
3002 #endif
3003  default:
3004  KMP_FATAL(UnknownSchedulingType, th_type);
3005  }
3006 
3007  __kmp_sched_apply_mods_stdkind(kind, th_type);
3008  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3009 }
3010 
3011 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3012 
3013  int ii, dd;
3014  kmp_team_t *team;
3015  kmp_info_t *thr;
3016 
3017  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3018  KMP_DEBUG_ASSERT(__kmp_init_serial);
3019 
3020  // validate level
3021  if (level == 0)
3022  return 0;
3023  if (level < 0)
3024  return -1;
3025  thr = __kmp_threads[gtid];
3026  team = thr->th.th_team;
3027  ii = team->t.t_level;
3028  if (level > ii)
3029  return -1;
3030 
3031  if (thr->th.th_teams_microtask) {
3032  // AC: we are in a teams region where multiple nested teams have the same level
3033  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3034  if (level <=
3035  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3036  KMP_DEBUG_ASSERT(ii >= tlevel);
3037  // AC: As we need to pass by the teams league, we need to artificially
3038  // increase ii
3039  if (ii == tlevel) {
3040  ii += 2; // three teams have same level
3041  } else {
3042  ii++; // two teams have same level
3043  }
3044  }
3045  }
3046 
3047  if (ii == level)
3048  return __kmp_tid_from_gtid(gtid);
3049 
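  // Walk up from the current team toward the requested level: each serialized
  // nesting of a team accounts for one level, and once those are exhausted the
  // walk moves on to the parent team.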
3050  dd = team->t.t_serialized;
3051  level++;
3052  while (ii > level) {
3053  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3054  }
3055  if ((team->t.t_serialized) && (!dd)) {
3056  team = team->t.t_parent;
3057  continue;
3058  }
3059  if (ii > level) {
3060  team = team->t.t_parent;
3061  dd = team->t.t_serialized;
3062  ii--;
3063  }
3064  }
3065 
3066  return (dd > 1) ? (0) : (team->t.t_master_tid);
3067 }
3068 
3069 int __kmp_get_team_size(int gtid, int level) {
3070 
3071  int ii, dd;
3072  kmp_team_t *team;
3073  kmp_info_t *thr;
3074 
3075  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3076  KMP_DEBUG_ASSERT(__kmp_init_serial);
3077 
3078  // validate level
3079  if (level == 0)
3080  return 1;
3081  if (level < 0)
3082  return -1;
3083  thr = __kmp_threads[gtid];
3084  team = thr->th.th_team;
3085  ii = team->t.t_level;
3086  if (level > ii)
3087  return -1;
3088 
3089  if (thr->th.th_teams_microtask) {
3090  // AC: we are in a teams region where multiple nested teams have the same level
3091  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3092  if (level <=
3093  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3094  KMP_DEBUG_ASSERT(ii >= tlevel);
3095  // AC: As we need to pass by the teams league, we need to artificially
3096  // increase ii
3097  if (ii == tlevel) {
3098  ii += 2; // three teams have same level
3099  } else {
3100  ii++; // two teams have same level
3101  }
3102  }
3103  }
3104 
3105  while (ii > level) {
3106  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3107  }
3108  if (team->t.t_serialized && (!dd)) {
3109  team = team->t.t_parent;
3110  continue;
3111  }
3112  if (ii > level) {
3113  team = team->t.t_parent;
3114  ii--;
3115  }
3116  }
3117 
3118  return team->t.t_nproc;
3119 }
3120 
3121 kmp_r_sched_t __kmp_get_schedule_global() {
3122  // This routine was created because the pairs (__kmp_sched, __kmp_chunk)
3123  // and (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3124  // independently, so the updated schedule can be obtained here.
3125 
3126  kmp_r_sched_t r_sched;
3127 
3128  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3129  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3130  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3131  // different roots (even in OMP 2.5)
3132  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3133  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3134  if (s == kmp_sch_static) {
3135  // replace STATIC with more detailed schedule (balanced or greedy)
3136  r_sched.r_sched_type = __kmp_static;
3137  } else if (s == kmp_sch_guided_chunked) {
3138  // replace GUIDED with more detailed schedule (iterative or analytical)
3139  r_sched.r_sched_type = __kmp_guided;
3140  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3141  r_sched.r_sched_type = __kmp_sched;
3142  }
3143  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3144 
3145  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3146  // __kmp_chunk may be wrong here (if it was not ever set)
3147  r_sched.chunk = KMP_DEFAULT_CHUNK;
3148  } else {
3149  r_sched.chunk = __kmp_chunk;
3150  }
3151 
3152  return r_sched;
3153 }
3154 
3155 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3156  at least argc number of *t_argv entries for the requested team. */
3157 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3158 
3159  KMP_DEBUG_ASSERT(team);
3160  if (!realloc || argc > team->t.t_max_argc) {
3161 
3162  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3163  "current entries=%d\n",
3164  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3165  /* if previously allocated heap space for args, free them */
3166  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3167  __kmp_free((void *)team->t.t_argv);
3168 
3169  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3170  /* use unused space in the cache line for arguments */
3171  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3172  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3173  "argv entries\n",
3174  team->t.t_id, team->t.t_max_argc));
3175  team->t.t_argv = &team->t.t_inline_argv[0];
3176  if (__kmp_storage_map) {
3177  __kmp_print_storage_map_gtid(
3178  -1, &team->t.t_inline_argv[0],
3179  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3180  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3181  team->t.t_id);
3182  }
3183  } else {
3184  /* allocate space for arguments in the heap */
3185  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3186  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3187  : 2 * argc;
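  // Sizing example (assuming KMP_MIN_MALLOC_ARGV_ENTRIES == 100): argc == 10
  // allocates 100 entries, while argc == 80 allocates 2 * 80 == 160.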
3188  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3189  "argv entries\n",
3190  team->t.t_id, team->t.t_max_argc));
3191  team->t.t_argv =
3192  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3193  if (__kmp_storage_map) {
3194  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3195  &team->t.t_argv[team->t.t_max_argc],
3196  sizeof(void *) * team->t.t_max_argc,
3197  "team_%d.t_argv", team->t.t_id);
3198  }
3199  }
3200  }
3201 }
3202 
3203 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3204  int i;
3205  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
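  // A team with at most one thread only ever needs two dispatch buffers;
  // larger teams get the full __kmp_dispatch_num_buffers set.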
3206  team->t.t_threads =
3207  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3208  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3209  sizeof(dispatch_shared_info_t) * num_disp_buff);
3210  team->t.t_dispatch =
3211  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3212  team->t.t_implicit_task_taskdata =
3213  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3214  team->t.t_max_nproc = max_nth;
3215 
3216  /* setup dispatch buffers */
3217  for (i = 0; i < num_disp_buff; ++i) {
3218  team->t.t_disp_buffer[i].buffer_index = i;
3219  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3220  }
3221 }
3222 
3223 static void __kmp_free_team_arrays(kmp_team_t *team) {
3224  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3225  int i;
3226  for (i = 0; i < team->t.t_max_nproc; ++i) {
3227  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3228  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3229  team->t.t_dispatch[i].th_disp_buffer = NULL;
3230  }
3231  }
3232 #if KMP_USE_HIER_SCHED
3233  __kmp_dispatch_free_hierarchies(team);
3234 #endif
3235  __kmp_free(team->t.t_threads);
3236  __kmp_free(team->t.t_disp_buffer);
3237  __kmp_free(team->t.t_dispatch);
3238  __kmp_free(team->t.t_implicit_task_taskdata);
3239  team->t.t_threads = NULL;
3240  team->t.t_disp_buffer = NULL;
3241  team->t.t_dispatch = NULL;
3242  team->t.t_implicit_task_taskdata = 0;
3243 }
3244 
3245 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3246  kmp_info_t **oldThreads = team->t.t_threads;
3247 
3248  __kmp_free(team->t.t_disp_buffer);
3249  __kmp_free(team->t.t_dispatch);
3250  __kmp_free(team->t.t_implicit_task_taskdata);
3251  __kmp_allocate_team_arrays(team, max_nth);
3252 
3253  KMP_MEMCPY(team->t.t_threads, oldThreads,
3254  team->t.t_nproc * sizeof(kmp_info_t *));
3255 
3256  __kmp_free(oldThreads);
3257 }
3258 
3259 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3260 
3261  kmp_r_sched_t r_sched =
3262  __kmp_get_schedule_global(); // get current state of scheduling globals
3263 
3264  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3265 
3266  kmp_internal_control_t g_icvs = {
3267  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3268  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3269  // adjustment of threads (per thread)
3270  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3271  // whether blocktime is explicitly set
3272  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3273 #if KMP_USE_MONITOR
3274  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3275 // intervals
3276 #endif
3277  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3278  // next parallel region (per thread)
3279  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3280  __kmp_cg_max_nth, // int thread_limit;
3281  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3282  // for max_active_levels
3283  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3284  // {sched,chunk} pair
3285  __kmp_nested_proc_bind.bind_types[0],
3286  __kmp_default_device,
3287  NULL // struct kmp_internal_control *next;
3288  };
3289 
3290  return g_icvs;
3291 }
3292 
3293 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3294 
3295  kmp_internal_control_t gx_icvs;
3296  gx_icvs.serial_nesting_level =
3297  0; // probably =team->t.t_serial like in save_inter_controls
3298  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3299  gx_icvs.next = NULL;
3300 
3301  return gx_icvs;
3302 }
3303 
3304 static void __kmp_initialize_root(kmp_root_t *root) {
3305  int f;
3306  kmp_team_t *root_team;
3307  kmp_team_t *hot_team;
3308  int hot_team_max_nth;
3309  kmp_r_sched_t r_sched =
3310  __kmp_get_schedule_global(); // get current state of scheduling globals
3311  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3312  KMP_DEBUG_ASSERT(root);
3313  KMP_ASSERT(!root->r.r_begin);
3314 
3315  /* setup the root state structure */
3316  __kmp_init_lock(&root->r.r_begin_lock);
3317  root->r.r_begin = FALSE;
3318  root->r.r_active = FALSE;
3319  root->r.r_in_parallel = 0;
3320  root->r.r_blocktime = __kmp_dflt_blocktime;
3321 #if KMP_AFFINITY_SUPPORTED
3322  root->r.r_affinity_assigned = FALSE;
3323 #endif
3324 
3325  /* setup the root team for this task */
3326  /* allocate the root team structure */
3327  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3328 
3329  root_team =
3330  __kmp_allocate_team(root,
3331  1, // new_nproc
3332  1, // max_nproc
3333 #if OMPT_SUPPORT
3334  ompt_data_none, // root parallel id
3335 #endif
3336  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3337  0 // argc
3338  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3339  );
3340 #if USE_DEBUGGER
3341  // Non-NULL value should be assigned to make the debugger display the root
3342  // team.
3343  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3344 #endif
3345 
3346  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3347 
3348  root->r.r_root_team = root_team;
3349  root_team->t.t_control_stack_top = NULL;
3350 
3351  /* initialize root team */
3352  root_team->t.t_threads[0] = NULL;
3353  root_team->t.t_nproc = 1;
3354  root_team->t.t_serialized = 1;
3355  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3356  root_team->t.t_sched.sched = r_sched.sched;
3357  KA_TRACE(
3358  20,
3359  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3360  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3361 
3362  /* setup the hot team for this task */
3363  /* allocate the hot team structure */
3364  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3365 
3366  hot_team =
3367  __kmp_allocate_team(root,
3368  1, // new_nproc
3369  __kmp_dflt_team_nth_ub * 2, // max_nproc
3370 #if OMPT_SUPPORT
3371  ompt_data_none, // root parallel id
3372 #endif
3373  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3374  0 // argc
3375  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3376  );
3377  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3378 
3379  root->r.r_hot_team = hot_team;
3380  root_team->t.t_control_stack_top = NULL;
3381 
3382  /* first-time initialization */
3383  hot_team->t.t_parent = root_team;
3384 
3385  /* initialize hot team */
3386  hot_team_max_nth = hot_team->t.t_max_nproc;
3387  for (f = 0; f < hot_team_max_nth; ++f) {
3388  hot_team->t.t_threads[f] = NULL;
3389  }
3390  hot_team->t.t_nproc = 1;
3391  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3392  hot_team->t.t_sched.sched = r_sched.sched;
3393  hot_team->t.t_size_changed = 0;
3394 }
3395 
3396 #ifdef KMP_DEBUG
3397 
3398 typedef struct kmp_team_list_item {
3399  kmp_team_p const *entry;
3400  struct kmp_team_list_item *next;
3401 } kmp_team_list_item_t;
3402 typedef kmp_team_list_item_t *kmp_team_list_t;
3403 
3404 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3405  kmp_team_list_t list, // List of teams.
3406  kmp_team_p const *team // Team to add.
3407 ) {
3408 
3409  // List must terminate with item where both entry and next are NULL.
3410  // Team is added to the list only once.
3411  // List is sorted in ascending order by team id.
3412  // Team id is *not* a key.
3413 
3414  kmp_team_list_t l;
3415 
3416  KMP_DEBUG_ASSERT(list != NULL);
3417  if (team == NULL) {
3418  return;
3419  }
3420 
3421  __kmp_print_structure_team_accum(list, team->t.t_parent);
3422  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3423 
3424  // Search list for the team.
3425  l = list;
3426  while (l->next != NULL && l->entry != team) {
3427  l = l->next;
3428  }
3429  if (l->next != NULL) {
3430  return; // Team has been added before, exit.
3431  }
3432 
3433  // Team is not found. Search list again for insertion point.
3434  l = list;
3435  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3436  l = l->next;
3437  }
3438 
3439  // Insert team.
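  // Insert before node l by copying l into a fresh node and then overwriting
  // l in place; this also works when l is the NULL/NULL list terminator.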
3440  {
3441  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3442  sizeof(kmp_team_list_item_t));
3443  *item = *l;
3444  l->entry = team;
3445  l->next = item;
3446  }
3447 }
3448 
3449 static void __kmp_print_structure_team(char const *title,
3450  kmp_team_p const *team) {
3452  __kmp_printf("%s", title);
3453  if (team != NULL) {
3454  __kmp_printf("%2x %p\n", team->t.t_id, team);
3455  } else {
3456  __kmp_printf(" - (nil)\n");
3457  }
3458 }
3459 
3460 static void __kmp_print_structure_thread(char const *title,
3461  kmp_info_p const *thread) {
3462  __kmp_printf("%s", title);
3463  if (thread != NULL) {
3464  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3465  } else {
3466  __kmp_printf(" - (nil)\n");
3467  }
3468 }
3469 
3470 void __kmp_print_structure(void) {
3471 
3472  kmp_team_list_t list;
3473 
3474  // Initialize list of teams.
3475  list =
3476  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3477  list->entry = NULL;
3478  list->next = NULL;
3479 
3480  __kmp_printf("\n------------------------------\nGlobal Thread "
3481  "Table\n------------------------------\n");
3482  {
3483  int gtid;
3484  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3485  __kmp_printf("%2d", gtid);
3486  if (__kmp_threads != NULL) {
3487  __kmp_printf(" %p", __kmp_threads[gtid]);
3488  }
3489  if (__kmp_root != NULL) {
3490  __kmp_printf(" %p", __kmp_root[gtid]);
3491  }
3492  __kmp_printf("\n");
3493  }
3494  }
3495 
3496  // Print out __kmp_threads array.
3497  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3498  "----------\n");
3499  if (__kmp_threads != NULL) {
3500  int gtid;
3501  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3502  kmp_info_t const *thread = __kmp_threads[gtid];
3503  if (thread != NULL) {
3504  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3505  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3506  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3507  __kmp_print_structure_team(" Serial Team: ",
3508  thread->th.th_serial_team);
3509  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3510  __kmp_print_structure_thread(" Primary: ",
3511  thread->th.th_team_master);
3512  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3513  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3514  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3515  __kmp_print_structure_thread(" Next in pool: ",
3516  thread->th.th_next_pool);
3517  __kmp_printf("\n");
3518  __kmp_print_structure_team_accum(list, thread->th.th_team);
3519  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3520  }
3521  }
3522  } else {
3523  __kmp_printf("Threads array is not allocated.\n");
3524  }
3525 
3526  // Print out __kmp_root array.
3527  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3528  "--------\n");
3529  if (__kmp_root != NULL) {
3530  int gtid;
3531  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3532  kmp_root_t const *root = __kmp_root[gtid];
3533  if (root != NULL) {
3534  __kmp_printf("GTID %2d %p:\n", gtid, root);
3535  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3536  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3537  __kmp_print_structure_thread(" Uber Thread: ",
3538  root->r.r_uber_thread);
3539  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3540  __kmp_printf(" In Parallel: %2d\n",
3541  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3542  __kmp_printf("\n");
3543  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3544  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3545  }
3546  }
3547  } else {
3548  __kmp_printf("Ubers array is not allocated.\n");
3549  }
3550 
3551  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3552  "--------\n");
3553  while (list->next != NULL) {
3554  kmp_team_p const *team = list->entry;
3555  int i;
3556  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3557  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3558  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3559  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3560  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3561  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3562  for (i = 0; i < team->t.t_nproc; ++i) {
3563  __kmp_printf(" Thread %2d: ", i);
3564  __kmp_print_structure_thread("", team->t.t_threads[i]);
3565  }
3566  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3567  __kmp_printf("\n");
3568  list = list->next;
3569  }
3570 
3571  // Print out __kmp_thread_pool and __kmp_team_pool.
3572  __kmp_printf("\n------------------------------\nPools\n----------------------"
3573  "--------\n");
3574  __kmp_print_structure_thread("Thread pool: ",
3575  CCAST(kmp_info_t *, __kmp_thread_pool));
3576  __kmp_print_structure_team("Team pool: ",
3577  CCAST(kmp_team_t *, __kmp_team_pool));
3578  __kmp_printf("\n");
3579 
3580  // Free team list.
3581  while (list != NULL) {
3582  kmp_team_list_item_t *item = list;
3583  list = list->next;
3584  KMP_INTERNAL_FREE(item);
3585  }
3586 }
3587 
3588 #endif
3589 
3590 //---------------------------------------------------------------------------
3591 // Stuff for per-thread fast random number generator
3592 // Table of primes
3593 static const unsigned __kmp_primes[] = {
3594  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3595  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3596  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3597  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3598  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3599  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3600  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3601  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3602  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3603  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3604  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3605 
3606 //---------------------------------------------------------------------------
3607 // __kmp_get_random: Get a random number using a linear congruential method.
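// The state update is x_{n+1} = a * x_n + 1 (mod 2^32); only the high 16 bits
// of x are returned, since the low-order bits of a power-of-two-modulus LCG
// have very short periods.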
3608 unsigned short __kmp_get_random(kmp_info_t *thread) {
3609  unsigned x = thread->th.th_x;
3610  unsigned short r = (unsigned short)(x >> 16);
3611 
3612  thread->th.th_x = x * thread->th.th_a + 1;
3613 
3614  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3615  thread->th.th_info.ds.ds_tid, r));
3616 
3617  return r;
3618 }
3619 //--------------------------------------------------------
3620 // __kmp_init_random: Initialize a random number generator
3621 void __kmp_init_random(kmp_info_t *thread) {
3622  unsigned seed = thread->th.th_info.ds.ds_tid;
3623 
3624  thread->th.th_a =
3625  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3626  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
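  // Each thread draws its multiplier from the primes table (indexed by tid)
  // and starts from a distinct seed, so per-thread sequences differ.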
3627  KA_TRACE(30,
3628  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3629 }
3630 
3631 #if KMP_OS_WINDOWS
3632 /* reclaim array entries for root threads that are already dead, returns number
3633  * reclaimed */
3634 static int __kmp_reclaim_dead_roots(void) {
3635  int i, r = 0;
3636 
3637  for (i = 0; i < __kmp_threads_capacity; ++i) {
3638  if (KMP_UBER_GTID(i) &&
3639  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3640  !__kmp_root[i]
3641  ->r.r_active) { // AC: reclaim only roots died in non-active state
3642  r += __kmp_unregister_root_other_thread(i);
3643  }
3644  }
3645  return r;
3646 }
3647 #endif
3648 
3649 /* This function attempts to create free entries in __kmp_threads and
3650  __kmp_root, and returns the number of free entries generated.
3651 
3652  For Windows* OS static library, the first mechanism used is to reclaim array
3653  entries for root threads that are already dead.
3654 
3655  On all platforms, expansion is attempted on the arrays __kmp_threads and
3656  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3657  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3658  threadprivate cache array has been created. Synchronization with
3659  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3660 
3661  After any dead root reclamation, if the clipping value allows array expansion
3662  to result in the generation of a total of nNeed free slots, the function does
3663  that expansion. If not, nothing is done beyond the possible initial root
3664  thread reclamation.
3665 
3666  If any argument is negative, the behavior is undefined. */
3667 static int __kmp_expand_threads(int nNeed) {
3668  int added = 0;
3669  int minimumRequiredCapacity;
3670  int newCapacity;
3671  kmp_info_t **newThreads;
3672  kmp_root_t **newRoot;
3673 
3674  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3675  // resizing __kmp_threads does not need additional protection if foreign
3676  // threads are present
3677 
3678 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3679  /* only for Windows static library */
3680  /* reclaim array entries for root threads that are already dead */
3681  added = __kmp_reclaim_dead_roots();
3682 
3683  if (nNeed) {
3684  nNeed -= added;
3685  if (nNeed < 0)
3686  nNeed = 0;
3687  }
3688 #endif
3689  if (nNeed <= 0)
3690  return added;
3691 
3692  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3693  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3694  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3695  // > __kmp_max_nth in one of two ways:
3696  //
3697  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3698  // may not be reused by another thread, so we may need to increase
3699  // __kmp_threads_capacity to __kmp_max_nth + 1.
3700  //
3701  // 2) New foreign root(s) are encountered. We always register new foreign
3702  // roots. This may cause a smaller # of threads to be allocated at
3703  // subsequent parallel regions, but the worker threads hang around (and
3704  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3705  //
3706  // Anyway, that is the reason for moving the check to see if
3707  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3708  // instead of having it performed here. -BB
3709 
3710  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3711 
3712  /* compute expansion headroom to check if we can expand */
3713  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3714  /* possible expansion too small -- give up */
3715  return added;
3716  }
3717  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3718 
3719  newCapacity = __kmp_threads_capacity;
3720  do {
3721  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3722  : __kmp_sys_max_nth;
3723  } while (newCapacity < minimumRequiredCapacity);
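  // Example: capacity 64 with nNeed 100 gives minimumRequiredCapacity 164, so
  // newCapacity doubles 64 -> 128 -> 256 (always clipped to __kmp_sys_max_nth).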
3724  newThreads = (kmp_info_t **)__kmp_allocate(
3725  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3726  newRoot =
3727  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
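  // Both arrays share one allocation: the root-pointer block begins right
  // after the newCapacity thread pointers.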
3728  KMP_MEMCPY(newThreads, __kmp_threads,
3729  __kmp_threads_capacity * sizeof(kmp_info_t *));
3730  KMP_MEMCPY(newRoot, __kmp_root,
3731  __kmp_threads_capacity * sizeof(kmp_root_t *));
3732  // Put old __kmp_threads array on a list. Any ongoing references to the old
3733  // list will be valid. This list is cleaned up at library shutdown.
3734  kmp_old_threads_list_t *node =
3735  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3736  node->threads = __kmp_threads;
3737  node->next = __kmp_old_threads_list;
3738  __kmp_old_threads_list = node;
3739 
3740  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3741  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3742  added += newCapacity - __kmp_threads_capacity;
3743  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3744 
3745  if (newCapacity > __kmp_tp_capacity) {
3746  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3747  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3748  __kmp_threadprivate_resize_cache(newCapacity);
3749  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3750  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3751  }
3752  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3753  }
3754 
3755  return added;
3756 }
3757 
3758 /* Register the current thread as a root thread and obtain our gtid. We must
3759  have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3760  the thread that calls from __kmp_do_serial_initialize() */
3761 int __kmp_register_root(int initial_thread) {
3762  kmp_info_t *root_thread;
3763  kmp_root_t *root;
3764  int gtid;
3765  int capacity;
3766  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3767  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3768  KMP_MB();
3769 
3770  /* 2007-03-02:
3771  If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3772  initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3773  work as expected -- it may return false (that means there is at least one
3774  empty slot in __kmp_threads array), but it is possible the only free slot
3775  is #0, which is reserved for the initial thread and so cannot be used for
3776  this one. The following code works around this bug.
3777 
3778  However, the right solution seems to be not to reserve slot #0 for the
3779  initial thread, because:
3780  (1) there is no magic in slot #0,
3781  (2) we cannot detect the initial thread reliably (the first thread that
3782  performs serial initialization may not be a real initial thread).
3783  */
3784  capacity = __kmp_threads_capacity;
3785  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3786  --capacity;
3787  }
3788 
3789  // If it is not for initializing the hidden helper team, we need to take
3790  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3791  // in __kmp_threads_capacity.
3792  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3793  capacity -= __kmp_hidden_helper_threads_num;
3794  }
3795 
3796  /* see if there are too many threads */
3797  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3798  if (__kmp_tp_cached) {
3799  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3800  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3801  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3802  } else {
3803  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3804  __kmp_msg_null);
3805  }
3806  }
3807 
3808  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3809  // 0: initial thread, also a regular OpenMP thread.
3810  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3811  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3812  // regular OpenMP threads.
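  // For example, with __kmp_hidden_helper_threads_num == 8, gtids 1..8 are
  // reserved for hidden helper threads and the search for a regular root slot
  // starts at gtid 9.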
3813  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3814  // Find an available thread slot for hidden helper thread. Slots for hidden
3815  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3816  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3817  gtid <= __kmp_hidden_helper_threads_num;
3818  gtid++)
3819  ;
3820  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3821  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3822  "hidden helper thread: T#%d\n",
3823  gtid));
3824  } else {
3825  /* find an available thread slot */
3826  // Don't reassign the zero slot since we need that to only be used by
3827  // initial thread. Slots for hidden helper threads should also be skipped.
3828  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3829  gtid = 0;
3830  } else {
3831  for (gtid = __kmp_hidden_helper_threads_num + 1;
3832  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3833  ;
3834  }
3835  KA_TRACE(
3836  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3837  KMP_ASSERT(gtid < __kmp_threads_capacity);
3838  }
3839 
3840  /* update global accounting */
3841  __kmp_all_nth++;
3842  TCW_4(__kmp_nth, __kmp_nth + 1);
3843 
3844  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3845  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3846  if (__kmp_adjust_gtid_mode) {
3847  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3848  if (TCR_4(__kmp_gtid_mode) != 2) {
3849  TCW_4(__kmp_gtid_mode, 2);
3850  }
3851  } else {
3852  if (TCR_4(__kmp_gtid_mode) != 1) {
3853  TCW_4(__kmp_gtid_mode, 1);
3854  }
3855  }
3856  }
3857 
3858 #ifdef KMP_ADJUST_BLOCKTIME
3859  /* Adjust blocktime to zero if necessary */
3860  /* Middle initialization might not have occurred yet */
3861  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3862  if (__kmp_nth > __kmp_avail_proc) {
3863  __kmp_zero_bt = TRUE;
3864  }
3865  }
3866 #endif /* KMP_ADJUST_BLOCKTIME */
3867 
3868  /* setup this new hierarchy */
3869  if (!(root = __kmp_root[gtid])) {
3870  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3871  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3872  }
3873 
3874 #if KMP_STATS_ENABLED
3875  // Initialize stats as soon as possible (right after gtid assignment).
3876  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3877  __kmp_stats_thread_ptr->startLife();
3878  KMP_SET_THREAD_STATE(SERIAL_REGION);
3879  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3880 #endif
3881  __kmp_initialize_root(root);
3882 
3883  /* setup new root thread structure */
3884  if (root->r.r_uber_thread) {
3885  root_thread = root->r.r_uber_thread;
3886  } else {
3887  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3888  if (__kmp_storage_map) {
3889  __kmp_print_thread_storage_map(root_thread, gtid);
3890  }
3891  root_thread->th.th_info.ds.ds_gtid = gtid;
3892 #if OMPT_SUPPORT
3893  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3894 #endif
3895  root_thread->th.th_root = root;
3896  if (__kmp_env_consistency_check) {
3897  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3898  }
3899 #if USE_FAST_MEMORY
3900  __kmp_initialize_fast_memory(root_thread);
3901 #endif /* USE_FAST_MEMORY */
3902 
3903 #if KMP_USE_BGET
3904  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3905  __kmp_initialize_bget(root_thread);
3906 #endif
3907  __kmp_init_random(root_thread); // Initialize random number generator
3908  }
3909 
3910  /* setup the serial team held in reserve by the root thread */
3911  if (!root_thread->th.th_serial_team) {
3912  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3913  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3914  root_thread->th.th_serial_team = __kmp_allocate_team(
3915  root, 1, 1,
3916 #if OMPT_SUPPORT
3917  ompt_data_none, // root parallel id
3918 #endif
3919  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3920  }
3921  KMP_ASSERT(root_thread->th.th_serial_team);
3922  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3923  root_thread->th.th_serial_team));
3924 
3925  /* drop root_thread into place */
3926  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3927 
3928  root->r.r_root_team->t.t_threads[0] = root_thread;
3929  root->r.r_hot_team->t.t_threads[0] = root_thread;
3930  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3931  // AC: the team created in reserve, not for execution (it is unused for now).
3932  root_thread->th.th_serial_team->t.t_serialized = 0;
3933  root->r.r_uber_thread = root_thread;
3934 
3935  /* initialize the thread, get it ready to go */
3936  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3937  TCW_4(__kmp_init_gtid, TRUE);
3938 
3939  /* prepare the primary thread for get_gtid() */
3940  __kmp_gtid_set_specific(gtid);
3941 
3942 #if USE_ITT_BUILD
3943  __kmp_itt_thread_name(gtid);
3944 #endif /* USE_ITT_BUILD */
3945 
3946 #ifdef KMP_TDATA_GTID
3947  __kmp_gtid = gtid;
3948 #endif
3949  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3950  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3951 
3952  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3953  "plain=%u\n",
3954  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3955  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3956  KMP_INIT_BARRIER_STATE));
3957  { // Initialize barrier data.
3958  int b;
3959  for (b = 0; b < bs_last_barrier; ++b) {
3960  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3961 #if USE_DEBUGGER
3962  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3963 #endif
3964  }
3965  }
3966  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3967  KMP_INIT_BARRIER_STATE);
3968 
3969 #if KMP_AFFINITY_SUPPORTED
3970  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3971  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3972  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3973  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3974 #endif /* KMP_AFFINITY_SUPPORTED */
3975  root_thread->th.th_def_allocator = __kmp_def_allocator;
3976  root_thread->th.th_prev_level = 0;
3977  root_thread->th.th_prev_num_threads = 1;
3978 
3979  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3980  tmp->cg_root = root_thread;
3981  tmp->cg_thread_limit = __kmp_cg_max_nth;
3982  tmp->cg_nthreads = 1;
3983  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3984  " cg_nthreads init to 1\n",
3985  root_thread, tmp));
3986  tmp->up = NULL;
3987  root_thread->th.th_cg_roots = tmp;
3988 
3989  __kmp_root_counter++;
3990 
3991 #if OMPT_SUPPORT
3992  if (!initial_thread && ompt_enabled.enabled) {
3993 
3994  kmp_info_t *root_thread = ompt_get_thread();
3995 
3996  ompt_set_thread_state(root_thread, ompt_state_overhead);
3997 
3998  if (ompt_enabled.ompt_callback_thread_begin) {
3999  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4000  ompt_thread_initial, __ompt_get_thread_data_internal());
4001  }
4002  ompt_data_t *task_data;
4003  ompt_data_t *parallel_data;
4004  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4005  NULL);
4006  if (ompt_enabled.ompt_callback_implicit_task) {
4007  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4008  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4009  }
4010 
4011  ompt_set_thread_state(root_thread, ompt_state_work_serial);
4012  }
4013 #endif
4014 #if OMPD_SUPPORT
4015  if (ompd_state & OMPD_ENABLE_BP)
4016  ompd_bp_thread_begin();
4017 #endif
4018 
4019  KMP_MB();
4020  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4021 
4022  return gtid;
4023 }
4024 
4025 #if KMP_NESTED_HOT_TEAMS
4026 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4027  const int max_level) {
4028  int i, n, nth;
4029  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4030  if (!hot_teams || !hot_teams[level].hot_team) {
4031  return 0;
4032  }
4033  KMP_DEBUG_ASSERT(level < max_level);
4034  kmp_team_t *team = hot_teams[level].hot_team;
4035  nth = hot_teams[level].hot_team_nth;
4036  n = nth - 1; // primary thread is not freed
4037  if (level < max_level - 1) {
4038  for (i = 0; i < nth; ++i) {
4039  kmp_info_t *th = team->t.t_threads[i];
4040  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4041  if (i > 0 && th->th.th_hot_teams) {
4042  __kmp_free(th->th.th_hot_teams);
4043  th->th.th_hot_teams = NULL;
4044  }
4045  }
4046  }
4047  __kmp_free_team(root, team, NULL);
4048  return n;
4049 }
4050 #endif
4051 
4052 // Resets a root thread and clears its root and hot teams.
4053 // Returns the number of __kmp_threads entries directly and indirectly freed.
4054 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4055  kmp_team_t *root_team = root->r.r_root_team;
4056  kmp_team_t *hot_team = root->r.r_hot_team;
4057  int n = hot_team->t.t_nproc;
4058  int i;
4059 
4060  KMP_DEBUG_ASSERT(!root->r.r_active);
4061 
4062  root->r.r_root_team = NULL;
4063  root->r.r_hot_team = NULL;
4064  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4065  // before call to __kmp_free_team().
4066  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4067 #if KMP_NESTED_HOT_TEAMS
4068  if (__kmp_hot_teams_max_level >
4069  0) { // need to free nested hot teams and their threads if any
4070  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4071  kmp_info_t *th = hot_team->t.t_threads[i];
4072  if (__kmp_hot_teams_max_level > 1) {
4073  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4074  }
4075  if (th->th.th_hot_teams) {
4076  __kmp_free(th->th.th_hot_teams);
4077  th->th.th_hot_teams = NULL;
4078  }
4079  }
4080  }
4081 #endif
4082  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4083 
4084  // Before we can reap the thread, we need to make certain that all other
4085  // threads in the teams that had this root as ancestor have stopped trying to
4086  // steal tasks.
4087  if (__kmp_tasking_mode != tskm_immediate_exec) {
4088  __kmp_wait_to_unref_task_teams();
4089  }
4090 
4091 #if KMP_OS_WINDOWS
4092  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4093  KA_TRACE(
4094  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4095  "\n",
4096  (LPVOID) & (root->r.r_uber_thread->th),
4097  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4098  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4099 #endif /* KMP_OS_WINDOWS */
4100 
4101 #if OMPD_SUPPORT
4102  if (ompd_state & OMPD_ENABLE_BP)
4103  ompd_bp_thread_end();
4104 #endif
4105 
4106 #if OMPT_SUPPORT
4107  ompt_data_t *task_data;
4108  ompt_data_t *parallel_data;
4109  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4110  NULL);
4111  if (ompt_enabled.ompt_callback_implicit_task) {
4112  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4113  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4114  }
4115  if (ompt_enabled.ompt_callback_thread_end) {
4116  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4117  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4118  }
4119 #endif
4120 
4121  TCW_4(__kmp_nth,
4122  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4123  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
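  // cg_nthreads-- returns the pre-decrement count, so i == 1 below means this
  // root was the last member of its contention group.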
4124  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4125  " to %d\n",
4126  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4127  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4128  if (i == 1) {
4129  // need to free contention group structure
4130  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4131  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4132  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4133  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4134  root->r.r_uber_thread->th.th_cg_roots = NULL;
4135  }
4136  __kmp_reap_thread(root->r.r_uber_thread, 1);
4137 
4138  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4139  // it instead of freeing it.
4140  root->r.r_uber_thread = NULL;
4141  /* mark root as no longer in use */
4142  root->r.r_begin = FALSE;
4143 
4144  return n;
4145 }
4146 
4147 void __kmp_unregister_root_current_thread(int gtid) {
4148  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4149  /* this lock should be ok, since unregister_root_current_thread is never
4150  called during an abort, only during a normal close. furthermore, if you
4151  have the forkjoin lock, you should never try to get the initz lock */
4152  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4153  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4154  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4155  "exiting T#%d\n",
4156  gtid));
4157  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4158  return;
4159  }
4160  kmp_root_t *root = __kmp_root[gtid];
4161 
4162  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4163  KMP_ASSERT(KMP_UBER_GTID(gtid));
4164  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4165  KMP_ASSERT(root->r.r_active == FALSE);
4166 
4167  KMP_MB();
4168 
4169  kmp_info_t *thread = __kmp_threads[gtid];
4170  kmp_team_t *team = thread->th.th_team;
4171  kmp_task_team_t *task_team = thread->th.th_task_team;
4172 
4173  // we need to wait for the proxy tasks before finishing the thread
4174  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4175  task_team->tt.tt_hidden_helper_task_encountered)) {
4176 #if OMPT_SUPPORT
4177  // the runtime is shutting down so we won't report any events
4178  thread->th.ompt_thread_info.state = ompt_state_undefined;
4179 #endif
4180  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4181  }
4182 
4183  __kmp_reset_root(gtid, root);
4184 
4185  KMP_MB();
4186  KC_TRACE(10,
4187  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4188 
4189  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4190 }
4191 
4192 #if KMP_OS_WINDOWS
4193 /* __kmp_forkjoin_lock must be already held
4194  Unregisters a root thread that is not the current thread. Returns the number
4195  of __kmp_threads entries freed as a result. */
4196 static int __kmp_unregister_root_other_thread(int gtid) {
4197  kmp_root_t *root = __kmp_root[gtid];
4198  int r;
4199 
4200  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4201  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4202  KMP_ASSERT(KMP_UBER_GTID(gtid));
4203  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4204  KMP_ASSERT(root->r.r_active == FALSE);
4205 
4206  r = __kmp_reset_root(gtid, root);
4207  KC_TRACE(10,
4208  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4209  return r;
4210 }
4211 #endif
4212 
4213 #if KMP_DEBUG
4214 void __kmp_task_info() {
4215 
4216  kmp_int32 gtid = __kmp_entry_gtid();
4217  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4218  kmp_info_t *this_thr = __kmp_threads[gtid];
4219  kmp_team_t *steam = this_thr->th.th_serial_team;
4220  kmp_team_t *team = this_thr->th.th_team;
4221 
4222  __kmp_printf(
4223  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4224  "ptask=%p\n",
4225  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4226  team->t.t_implicit_task_taskdata[tid].td_parent);
4227 }
4228 #endif // KMP_DEBUG
4229 
4230 /* TODO optimize with one big memclr, take out what isn't needed, split
4231  responsibility to workers as much as possible, and delay initialization of
4232  features as much as possible */
4233 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4234  int tid, int gtid) {
4235  /* this_thr->th.th_info.ds.ds_gtid is setup in
4236  kmp_allocate_thread/create_worker.
4237  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4238  KMP_DEBUG_ASSERT(this_thr != NULL);
4239  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4240  KMP_DEBUG_ASSERT(team);
4241  KMP_DEBUG_ASSERT(team->t.t_threads);
4242  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4243  kmp_info_t *master = team->t.t_threads[0];
4244  KMP_DEBUG_ASSERT(master);
4245  KMP_DEBUG_ASSERT(master->th.th_root);
4246 
4247  KMP_MB();
4248 
4249  TCW_SYNC_PTR(this_thr->th.th_team, team);
4250 
4251  this_thr->th.th_info.ds.ds_tid = tid;
4252  this_thr->th.th_set_nproc = 0;
4253  if (__kmp_tasking_mode != tskm_immediate_exec)
4254  // When tasking is possible, threads are not safe to reap until they are
4255  // done tasking; this will be set when tasking code is exited in wait
4256  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4257  else // no tasking --> always safe to reap
4258  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4259  this_thr->th.th_set_proc_bind = proc_bind_default;
4260 #if KMP_AFFINITY_SUPPORTED
4261  this_thr->th.th_new_place = this_thr->th.th_current_place;
4262 #endif
4263  this_thr->th.th_root = master->th.th_root;
4264 
4265  /* setup the thread's cache of the team structure */
4266  this_thr->th.th_team_nproc = team->t.t_nproc;
4267  this_thr->th.th_team_master = master;
4268  this_thr->th.th_team_serialized = team->t.t_serialized;
4269 
4270  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4271 
4272  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4273  tid, gtid, this_thr, this_thr->th.th_current_task));
4274 
4275  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4276  team, tid, TRUE);
4277 
4278  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4279  tid, gtid, this_thr, this_thr->th.th_current_task));
4280  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4281  // __kmp_initialize_team()?
4282 
4283  /* TODO no worksharing in speculative threads */
4284  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4285 
4286  this_thr->th.th_local.this_construct = 0;
4287 
4288  if (!this_thr->th.th_pri_common) {
4289  this_thr->th.th_pri_common =
4290  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4291  if (__kmp_storage_map) {
4292  __kmp_print_storage_map_gtid(
4293  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4294  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4295  }
4296  this_thr->th.th_pri_head = NULL;
4297  }
4298 
4299  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4300  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4301  // Make new thread's CG root same as primary thread's
4302  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4303  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4304  if (tmp) {
4305  // worker changes CG, need to check if old CG should be freed
4306  int i = tmp->cg_nthreads--;
4307  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4308  " on node %p of thread %p to %d\n",
4309  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4310  if (i == 1) {
4311  __kmp_free(tmp); // last thread left CG --> free it
4312  }
4313  }
4314  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4315  // Increment new thread's CG root's counter to add the new thread
4316  this_thr->th.th_cg_roots->cg_nthreads++;
4317  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4318  " node %p of thread %p to %d\n",
4319  this_thr, this_thr->th.th_cg_roots,
4320  this_thr->th.th_cg_roots->cg_root,
4321  this_thr->th.th_cg_roots->cg_nthreads));
4322  this_thr->th.th_current_task->td_icvs.thread_limit =
4323  this_thr->th.th_cg_roots->cg_thread_limit;
4324  }
4325 
4326  /* Initialize dynamic dispatch */
4327  {
4328  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4329  // Use team max_nproc since this will never change for the team.
4330  size_t disp_size =
4331  sizeof(dispatch_private_info_t) *
4332  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4333  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4334  team->t.t_max_nproc));
4335  KMP_ASSERT(dispatch);
4336  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4337  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4338 
4339  dispatch->th_disp_index = 0;
4340  dispatch->th_doacross_buf_idx = 0;
4341  if (!dispatch->th_disp_buffer) {
4342  dispatch->th_disp_buffer =
4343  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4344 
4345  if (__kmp_storage_map) {
4346  __kmp_print_storage_map_gtid(
4347  gtid, &dispatch->th_disp_buffer[0],
4348  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4349  ? 1
4350  : __kmp_dispatch_num_buffers],
4351  disp_size,
4352  "th_%d.th_dispatch.th_disp_buffer "
4353  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4354  gtid, team->t.t_id, gtid);
4355  }
4356  } else {
4357  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4358  }
4359 
4360  dispatch->th_dispatch_pr_current = 0;
4361  dispatch->th_dispatch_sh_current = 0;
4362 
4363  dispatch->th_deo_fcn = 0; /* ORDERED */
4364  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4365  }
4366 
4367  this_thr->th.th_next_pool = NULL;
4368 
4369  if (!this_thr->th.th_task_state_memo_stack) {
4370  size_t i;
4371  this_thr->th.th_task_state_memo_stack =
4372  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4373  this_thr->th.th_task_state_top = 0;
4374  this_thr->th.th_task_state_stack_sz = 4;
4375  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4376  ++i) // zero init the stack
4377  this_thr->th.th_task_state_memo_stack[i] = 0;
4378  }
4379 
4380  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4381  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4382 
4383  KMP_MB();
4384 }
4385 
4386 /* allocate a new thread for the requesting team. this is only called from
4387  within a forkjoin critical section. we will first try to get an available
4388  thread from the thread pool. if none is available, we will fork a new one
4389  assuming we are able to create a new one. this should be assured, as the
4390  caller should check on this first. */
4391 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4392  int new_tid) {
4393  kmp_team_t *serial_team;
4394  kmp_info_t *new_thr;
4395  int new_gtid;
4396 
4397  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4398  KMP_DEBUG_ASSERT(root && team);
4399 #if !KMP_NESTED_HOT_TEAMS
4400  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4401 #endif
4402  KMP_MB();
4403 
4404  /* first, try to get one from the thread pool */
4405  if (__kmp_thread_pool) {
4406  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4407  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4408  if (new_thr == __kmp_thread_pool_insert_pt) {
4409  __kmp_thread_pool_insert_pt = NULL;
4410  }
4411  TCW_4(new_thr->th.th_in_pool, FALSE);
4412  __kmp_suspend_initialize_thread(new_thr);
4413  __kmp_lock_suspend_mx(new_thr);
4414  if (new_thr->th.th_active_in_pool == TRUE) {
4415  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4416  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4417  new_thr->th.th_active_in_pool = FALSE;
4418  }
4419  __kmp_unlock_suspend_mx(new_thr);
4420 
4421  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4422  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4423  KMP_ASSERT(!new_thr->th.th_team);
4424  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4425 
4426  /* setup the thread structure */
4427  __kmp_initialize_info(new_thr, team, new_tid,
4428  new_thr->th.th_info.ds.ds_gtid);
4429  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4430 
4431  TCW_4(__kmp_nth, __kmp_nth + 1);
4432 
4433  new_thr->th.th_task_state = 0;
4434  new_thr->th.th_task_state_top = 0;
4435  new_thr->th.th_task_state_stack_sz = 4;
4436 
4437  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4438  // Make sure pool thread has transitioned to waiting on own thread struct
4439  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4440  // Thread activated in __kmp_allocate_team when increasing team size
4441  }
4442 
4443 #ifdef KMP_ADJUST_BLOCKTIME
4444  /* Adjust blocktime back to zero if necessary */
4445  /* Middle initialization might not have occurred yet */
4446  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4447  if (__kmp_nth > __kmp_avail_proc) {
4448  __kmp_zero_bt = TRUE;
4449  }
4450  }
4451 #endif /* KMP_ADJUST_BLOCKTIME */
4452 
4453 #if KMP_DEBUG
4454  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4455  // KMP_BARRIER_PARENT_FLAG.
4456  int b;
4457  kmp_balign_t *balign = new_thr->th.th_bar;
4458  for (b = 0; b < bs_last_barrier; ++b)
4459  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4460 #endif
4461 
4462  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4463  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4464 
4465  KMP_MB();
4466  return new_thr;
4467  }
4468 
4469  /* no, we'll fork a new one */
4470  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4471  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4472 
4473 #if KMP_USE_MONITOR
4474  // If this is the first worker thread the RTL is creating, then also
4475  // launch the monitor thread. We try to do this as early as possible.
4476  if (!TCR_4(__kmp_init_monitor)) {
4477  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4478  if (!TCR_4(__kmp_init_monitor)) {
4479  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4480  TCW_4(__kmp_init_monitor, 1);
4481  __kmp_create_monitor(&__kmp_monitor);
4482  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4483 #if KMP_OS_WINDOWS
4484  // AC: wait until the monitor has started. This is a fix for CQ232808.
4485  // The reason: if the library is loaded/unloaded in a loop with small
4486  // (parallel) work in between, there is a high probability that the monitor
4487  // thread starts only after the library has shut down. At shutdown it is
4488  // too late to cope with the problem, because when the primary thread is
4489  // in DllMain (process detach) the monitor has no chance to start (it is
4490  // blocked), and the primary thread has no way to inform the monitor that
4491  // the library has gone, because all the memory the monitor can access is
4492  // going to be released/reset.
4493  while (TCR_4(__kmp_init_monitor) < 2) {
4494  KMP_YIELD(TRUE);
4495  }
4496  KF_TRACE(10, ("after monitor thread has started\n"));
4497 #endif
4498  }
4499  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4500  }
4501 #endif
4502 
4503  KMP_MB();
4504 
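  // Pick the lowest free gtid. While the hidden helper threads themselves are
  // being initialized, gtids 1..__kmp_hidden_helper_threads_num are handed out;
  // afterwards, regular workers start searching just past that range.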
4505  {
4506  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4507  ? 1
4508  : __kmp_hidden_helper_threads_num + 1;
4509 
4510  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4511  ++new_gtid) {
4512  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4513  }
4514 
4515  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4516  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4517  }
4518  }
4519 
4520  /* allocate space for it. */
4521  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4522 
4523  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4524 
4525 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4526  // suppress race conditions detection on synchronization flags in debug mode
4527  // this helps to analyze library internals eliminating false positives
4528  __itt_suppress_mark_range(
4529  __itt_suppress_range, __itt_suppress_threading_errors,
4530  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4531  __itt_suppress_mark_range(
4532  __itt_suppress_range, __itt_suppress_threading_errors,
4533  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4534 #if KMP_OS_WINDOWS
4535  __itt_suppress_mark_range(
4536  __itt_suppress_range, __itt_suppress_threading_errors,
4537  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4538 #else
4539  __itt_suppress_mark_range(__itt_suppress_range,
4540  __itt_suppress_threading_errors,
4541  &new_thr->th.th_suspend_init_count,
4542  sizeof(new_thr->th.th_suspend_init_count));
4543 #endif
4544  // TODO: check if we need to also suppress b_arrived flags
4545  __itt_suppress_mark_range(__itt_suppress_range,
4546  __itt_suppress_threading_errors,
4547  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4548  sizeof(new_thr->th.th_bar[0].bb.b_go));
4549  __itt_suppress_mark_range(__itt_suppress_range,
4550  __itt_suppress_threading_errors,
4551  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4552  sizeof(new_thr->th.th_bar[1].bb.b_go));
4553  __itt_suppress_mark_range(__itt_suppress_range,
4554  __itt_suppress_threading_errors,
4555  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4556  sizeof(new_thr->th.th_bar[2].bb.b_go));
4557 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4558  if (__kmp_storage_map) {
4559  __kmp_print_thread_storage_map(new_thr, new_gtid);
4560  }
4561 
4562  // add the reserve serialized team, initialized from the team's primary thread
4563  {
4564  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4565  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4566  new_thr->th.th_serial_team = serial_team =
4567  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4568 #if OMPT_SUPPORT
4569  ompt_data_none, // root parallel id
4570 #endif
4571  proc_bind_default, &r_icvs,
4572  0 USE_NESTED_HOT_ARG(NULL));
4573  }
4574  KMP_ASSERT(serial_team);
4575  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4576  // for execution (it is unused for now).
4577  serial_team->t.t_threads[0] = new_thr;
4578  KF_TRACE(10,
4579  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4580  new_thr));
4581 
4582  /* setup the thread structures */
4583  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4584 
4585 #if USE_FAST_MEMORY
4586  __kmp_initialize_fast_memory(new_thr);
4587 #endif /* USE_FAST_MEMORY */
4588 
4589 #if KMP_USE_BGET
4590  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4591  __kmp_initialize_bget(new_thr);
4592 #endif
4593 
4594  __kmp_init_random(new_thr); // Initialize random number generator
4595 
4596  /* Initialize these only once when thread is grabbed for a team allocation */
4597  KA_TRACE(20,
4598  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4599  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4600 
4601  int b;
4602  kmp_balign_t *balign = new_thr->th.th_bar;
4603  for (b = 0; b < bs_last_barrier; ++b) {
4604  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4605  balign[b].bb.team = NULL;
4606  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4607  balign[b].bb.use_oncore_barrier = 0;
4608  }
4609 
4610  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4611  new_thr->th.th_sleep_loc_type = flag_unset;
4612 
4613  new_thr->th.th_spin_here = FALSE;
4614  new_thr->th.th_next_waiting = 0;
4615 #if KMP_OS_UNIX
4616  new_thr->th.th_blocking = false;
4617 #endif
4618 
4619 #if KMP_AFFINITY_SUPPORTED
4620  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4621  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4622  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4623  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4624 #endif
4625  new_thr->th.th_def_allocator = __kmp_def_allocator;
4626  new_thr->th.th_prev_level = 0;
4627  new_thr->th.th_prev_num_threads = 1;
4628 
4629  TCW_4(new_thr->th.th_in_pool, FALSE);
4630  new_thr->th.th_active_in_pool = FALSE;
4631  TCW_4(new_thr->th.th_active, TRUE);
4632 
4633  /* adjust the global counters */
4634  __kmp_all_nth++;
4635  __kmp_nth++;
4636 
4637  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4638  // numbers of procs, and method #2 (keyed API call) for higher numbers.
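  // __kmp_tls_gtid_min is the thread-count threshold: once the total number of
  // threads (__kmp_all_nth) reaches it, the runtime switches to gtid mode 2
  // (keyed/TLS lookup); below it, mode 1 (stack-pointer search) is used.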
4639  if (__kmp_adjust_gtid_mode) {
4640  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4641  if (TCR_4(__kmp_gtid_mode) != 2) {
4642  TCW_4(__kmp_gtid_mode, 2);
4643  }
4644  } else {
4645  if (TCR_4(__kmp_gtid_mode) != 1) {
4646  TCW_4(__kmp_gtid_mode, 1);
4647  }
4648  }
4649  }
4650 
4651 #ifdef KMP_ADJUST_BLOCKTIME
4652  /* Adjust blocktime back to zero if necessary */
4653  /* Middle initialization might not have occurred yet */
4654  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4655  if (__kmp_nth > __kmp_avail_proc) {
4656  __kmp_zero_bt = TRUE;
4657  }
4658  }
4659 #endif /* KMP_ADJUST_BLOCKTIME */
4660 
4661  /* actually fork it and create the new worker thread */
4662  KF_TRACE(
4663  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4664  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4665  KF_TRACE(10,
4666  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4667 
4668  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4669  new_gtid));
4670  KMP_MB();
4671  return new_thr;
4672 }
4673 
4674 /* Reinitialize team for reuse.
4675  The hot team code calls this routine at every fork barrier, so the EPCC
4676  barrier tests are extremely sensitive to changes in it, esp. writes to the
4677  team struct, which cause a cache invalidation in all threads.
4678  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4679 static void __kmp_reinitialize_team(kmp_team_t *team,
4680  kmp_internal_control_t *new_icvs,
4681  ident_t *loc) {
4682  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4683  team->t.t_threads[0], team));
4684  KMP_DEBUG_ASSERT(team && new_icvs);
4685  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4686  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4687 
4688  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4689  // Copy ICVs to the primary thread's implicit taskdata
4690  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4691  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4692 
4693  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4694  team->t.t_threads[0], team));
4695 }
4696 
4697 /* Initialize the team data structure.
4698  This assumes the t_threads and t_max_nproc are already set.
4699  Also, we don't touch the arguments */
4700 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4701  kmp_internal_control_t *new_icvs,
4702  ident_t *loc) {
4703  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4704 
4705  /* verify */
4706  KMP_DEBUG_ASSERT(team);
4707  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4708  KMP_DEBUG_ASSERT(team->t.t_threads);
4709  KMP_MB();
4710 
4711  team->t.t_master_tid = 0; /* not needed */
4712  /* team->t.t_master_bar; not needed */
4713  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4714  team->t.t_nproc = new_nproc;
4715 
4716  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4717  team->t.t_next_pool = NULL;
4718  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4719  * up hot team */
4720 
4721  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4722  team->t.t_invoke = NULL; /* not needed */
4723 
4724  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4725  team->t.t_sched.sched = new_icvs->sched.sched;
4726 
4727 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4728  team->t.t_fp_control_saved = FALSE; /* not needed */
4729  team->t.t_x87_fpu_control_word = 0; /* not needed */
4730  team->t.t_mxcsr = 0; /* not needed */
4731 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4732 
4733  team->t.t_construct = 0;
4734 
4735  team->t.t_ordered.dt.t_value = 0;
4736  team->t.t_master_active = FALSE;
4737 
4738 #ifdef KMP_DEBUG
4739  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4740 #endif
4741 #if KMP_OS_WINDOWS
4742  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4743 #endif
4744 
4745  team->t.t_control_stack_top = NULL;
4746 
4747  __kmp_reinitialize_team(team, new_icvs, loc);
4748 
4749  KMP_MB();
4750  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4751 }
4752 
4753 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4754 /* Sets full mask for the thread and saves the old mask in *old_mask; no changes to structures. */
4755 static void
4756 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4757  if (KMP_AFFINITY_CAPABLE()) {
4758  int status;
4759  if (old_mask != NULL) {
4760  status = __kmp_get_system_affinity(old_mask, TRUE);
4761  int error = errno;
4762  if (status != 0) {
4763  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4764  __kmp_msg_null);
4765  }
4766  }
4767  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4768  }
4769 }
4770 #endif
4771 
4772 #if KMP_AFFINITY_SUPPORTED
4773 
4774 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4775 // It calculates the worker + primary thread's partition based upon the parent
4776 // thread's partition, and binds each worker to a thread in their partition.
4777 // The primary thread's partition should already include its current binding.
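// The switch below handles the binding policies: proc_bind_primary binds every
// worker to the primary thread's place; proc_bind_close packs workers into
// places adjacent to the primary's place; proc_bind_spread spreads workers (and
// their sub-partitions) as evenly as possible across the parent partition;
// proc_bind_default (serial teams) leaves the binding untouched.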
4778 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4779  // Do not partition places for the hidden helper team
4780  if (KMP_HIDDEN_HELPER_TEAM(team))
4781  return;
4782  // Copy the primary thread's place partition to the team struct
4783  kmp_info_t *master_th = team->t.t_threads[0];
4784  KMP_DEBUG_ASSERT(master_th != NULL);
4785  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4786  int first_place = master_th->th.th_first_place;
4787  int last_place = master_th->th.th_last_place;
4788  int masters_place = master_th->th.th_current_place;
4789  int num_masks = __kmp_affinity.num_masks;
4790  team->t.t_first_place = first_place;
4791  team->t.t_last_place = last_place;
4792 
4793  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4794  "bound to place %d partition = [%d,%d]\n",
4795  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4796  team->t.t_id, masters_place, first_place, last_place));
4797 
4798  switch (proc_bind) {
4799 
4800  case proc_bind_default:
4801  // Serial teams might have the proc_bind policy set to proc_bind_default.
4802  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4803  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4804  break;
4805 
4806  case proc_bind_primary: {
4807  int f;
4808  int n_th = team->t.t_nproc;
4809  for (f = 1; f < n_th; f++) {
4810  kmp_info_t *th = team->t.t_threads[f];
4811  KMP_DEBUG_ASSERT(th != NULL);
4812  th->th.th_first_place = first_place;
4813  th->th.th_last_place = last_place;
4814  th->th.th_new_place = masters_place;
4815  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4816  team->t.t_display_affinity != 1) {
4817  team->t.t_display_affinity = 1;
4818  }
4819 
4820  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4821  "partition = [%d,%d]\n",
4822  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4823  f, masters_place, first_place, last_place));
4824  }
4825  } break;
4826 
4827  case proc_bind_close: {
4828  int f;
4829  int n_th = team->t.t_nproc;
4830  int n_places;
4831  if (first_place <= last_place) {
4832  n_places = last_place - first_place + 1;
4833  } else {
4834  n_places = num_masks - first_place + last_place + 1;
4835  }
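  // The place partition may wrap around the end of the place list
  // (first_place > last_place), hence the modulo-style count above.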
4836  if (n_th <= n_places) {
4837  int place = masters_place;
4838  for (f = 1; f < n_th; f++) {
4839  kmp_info_t *th = team->t.t_threads[f];
4840  KMP_DEBUG_ASSERT(th != NULL);
4841 
4842  if (place == last_place) {
4843  place = first_place;
4844  } else if (place == (num_masks - 1)) {
4845  place = 0;
4846  } else {
4847  place++;
4848  }
4849  th->th.th_first_place = first_place;
4850  th->th.th_last_place = last_place;
4851  th->th.th_new_place = place;
4852  if (__kmp_display_affinity && place != th->th.th_current_place &&
4853  team->t.t_display_affinity != 1) {
4854  team->t.t_display_affinity = 1;
4855  }
4856 
4857  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4858  "partition = [%d,%d]\n",
4859  __kmp_gtid_from_thread(team->t.t_threads[f]),
4860  team->t.t_id, f, place, first_place, last_place));
4861  }
4862  } else {
4863  int S, rem, gap, s_count;
4864  S = n_th / n_places;
4865  s_count = 0;
4866  rem = n_th - (S * n_places);
4867  gap = rem > 0 ? n_places / rem : n_places;
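  // Overcommitted case: each place gets S threads, and rem places, spaced
  // roughly gap apart, get one extra. For example, n_th=10 and n_places=4
  // gives S=2, rem=2, gap=2, i.e. 3,2,3,2 threads per place.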
4868  int place = masters_place;
4869  int gap_ct = gap;
4870  for (f = 0; f < n_th; f++) {
4871  kmp_info_t *th = team->t.t_threads[f];
4872  KMP_DEBUG_ASSERT(th != NULL);
4873 
4874  th->th.th_first_place = first_place;
4875  th->th.th_last_place = last_place;
4876  th->th.th_new_place = place;
4877  if (__kmp_display_affinity && place != th->th.th_current_place &&
4878  team->t.t_display_affinity != 1) {
4879  team->t.t_display_affinity = 1;
4880  }
4881  s_count++;
4882 
4883  if ((s_count == S) && rem && (gap_ct == gap)) {
4884  // do nothing, add an extra thread to place on next iteration
4885  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4886  // we added an extra thread to this place; move to next place
4887  if (place == last_place) {
4888  place = first_place;
4889  } else if (place == (num_masks - 1)) {
4890  place = 0;
4891  } else {
4892  place++;
4893  }
4894  s_count = 0;
4895  gap_ct = 1;
4896  rem--;
4897  } else if (s_count == S) { // place full; don't add extra
4898  if (place == last_place) {
4899  place = first_place;
4900  } else if (place == (num_masks - 1)) {
4901  place = 0;
4902  } else {
4903  place++;
4904  }
4905  gap_ct++;
4906  s_count = 0;
4907  }
4908 
4909  KA_TRACE(100,
4910  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4911  "partition = [%d,%d]\n",
4912  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4913  th->th.th_new_place, first_place, last_place));
4914  }
4915  KMP_DEBUG_ASSERT(place == masters_place);
4916  }
4917  } break;
4918 
4919  case proc_bind_spread: {
4920  int f;
4921  int n_th = team->t.t_nproc;
4922  int n_places;
4923  int thidx;
4924  if (first_place <= last_place) {
4925  n_places = last_place - first_place + 1;
4926  } else {
4927  n_places = num_masks - first_place + last_place + 1;
4928  }
4929  if (n_th <= n_places) {
4930  int place = -1;
4931 
4932  if (n_places != num_masks) {
4933  int S = n_places / n_th;
4934  int s_count, rem, gap, gap_ct;
4935 
4936  place = masters_place;
4937  rem = n_places - n_th * S;
4938  gap = rem ? n_th / rem : 1;
4939  gap_ct = gap;
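  // Undercommitted case: each thread gets a contiguous sub-partition of S
  // places (rem threads get S+1) and is bound to the first place of its
  // sub-partition. For example, n_places=10 and n_th=4 gives sub-partitions
  // of 3, 2, 3 and 2 places.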
4940  thidx = n_th;
4941  if (update_master_only == 1)
4942  thidx = 1;
4943  for (f = 0; f < thidx; f++) {
4944  kmp_info_t *th = team->t.t_threads[f];
4945  KMP_DEBUG_ASSERT(th != NULL);
4946 
4947  th->th.th_first_place = place;
4948  th->th.th_new_place = place;
4949  if (__kmp_display_affinity && place != th->th.th_current_place &&
4950  team->t.t_display_affinity != 1) {
4951  team->t.t_display_affinity = 1;
4952  }
4953  s_count = 1;
4954  while (s_count < S) {
4955  if (place == last_place) {
4956  place = first_place;
4957  } else if (place == (num_masks - 1)) {
4958  place = 0;
4959  } else {
4960  place++;
4961  }
4962  s_count++;
4963  }
4964  if (rem && (gap_ct == gap)) {
4965  if (place == last_place) {
4966  place = first_place;
4967  } else if (place == (num_masks - 1)) {
4968  place = 0;
4969  } else {
4970  place++;
4971  }
4972  rem--;
4973  gap_ct = 0;
4974  }
4975  th->th.th_last_place = place;
4976  gap_ct++;
4977 
4978  if (place == last_place) {
4979  place = first_place;
4980  } else if (place == (num_masks - 1)) {
4981  place = 0;
4982  } else {
4983  place++;
4984  }
4985 
4986  KA_TRACE(100,
4987  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4988  "partition = [%d,%d], num_masks: %u\n",
4989  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4990  f, th->th.th_new_place, th->th.th_first_place,
4991  th->th.th_last_place, num_masks));
4992  }
4993  } else {
4994  /* With a uniform space of available computation places we can create
4995  T partitions of roughly P/T places each and put each thread into the
4996  first place of its partition. */
4997  double current = static_cast<double>(masters_place);
4998  double spacing =
4999  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
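  // For example, with masters_place=0, n_places=8 (== num_masks) and n_th=3,
  // spacing is 3.0 and the threads get partitions [0,2], [3,5] and [6,7]
  // (the last one clamped to n_places-1), each bound to its first place.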
5000  int first, last;
5001  kmp_info_t *th;
5002 
5003  thidx = n_th + 1;
5004  if (update_master_only == 1)
5005  thidx = 1;
5006  for (f = 0; f < thidx; f++) {
5007  first = static_cast<int>(current);
5008  last = static_cast<int>(current + spacing) - 1;
5009  KMP_DEBUG_ASSERT(last >= first);
5010  if (first >= n_places) {
5011  if (masters_place) {
5012  first -= n_places;
5013  last -= n_places;
5014  if (first == (masters_place + 1)) {
5015  KMP_DEBUG_ASSERT(f == n_th);
5016  first--;
5017  }
5018  if (last == masters_place) {
5019  KMP_DEBUG_ASSERT(f == (n_th - 1));
5020  last--;
5021  }
5022  } else {
5023  KMP_DEBUG_ASSERT(f == n_th);
5024  first = 0;
5025  last = 0;
5026  }
5027  }
5028  if (last >= n_places) {
5029  last = (n_places - 1);
5030  }
5031  place = first;
5032  current += spacing;
5033  if (f < n_th) {
5034  KMP_DEBUG_ASSERT(0 <= first);
5035  KMP_DEBUG_ASSERT(n_places > first);
5036  KMP_DEBUG_ASSERT(0 <= last);
5037  KMP_DEBUG_ASSERT(n_places > last);
5038  KMP_DEBUG_ASSERT(last_place >= first_place);
5039  th = team->t.t_threads[f];
5040  KMP_DEBUG_ASSERT(th);
5041  th->th.th_first_place = first;
5042  th->th.th_new_place = place;
5043  th->th.th_last_place = last;
5044  if (__kmp_display_affinity && place != th->th.th_current_place &&
5045  team->t.t_display_affinity != 1) {
5046  team->t.t_display_affinity = 1;
5047  }
5048  KA_TRACE(100,
5049  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5050  "partition = [%d,%d], spacing = %.4f\n",
5051  __kmp_gtid_from_thread(team->t.t_threads[f]),
5052  team->t.t_id, f, th->th.th_new_place,
5053  th->th.th_first_place, th->th.th_last_place, spacing));
5054  }
5055  }
5056  }
5057  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5058  } else {
5059  int S, rem, gap, s_count;
5060  S = n_th / n_places;
5061  s_count = 0;
5062  rem = n_th - (S * n_places);
5063  gap = rem > 0 ? n_places / rem : n_places;
5064  int place = masters_place;
5065  int gap_ct = gap;
5066  thidx = n_th;
5067  if (update_master_only == 1)
5068  thidx = 1;
5069  for (f = 0; f < thidx; f++) {
5070  kmp_info_t *th = team->t.t_threads[f];
5071  KMP_DEBUG_ASSERT(th != NULL);
5072 
5073  th->th.th_first_place = place;
5074  th->th.th_last_place = place;
5075  th->th.th_new_place = place;
5076  if (__kmp_display_affinity && place != th->th.th_current_place &&
5077  team->t.t_display_affinity != 1) {
5078  team->t.t_display_affinity = 1;
5079  }
5080  s_count++;
5081 
5082  if ((s_count == S) && rem && (gap_ct == gap)) {
5083  // do nothing, add an extra thread to place on next iteration
5084  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5085  // we added an extra thread to this place; move on to next place
5086  if (place == last_place) {
5087  place = first_place;
5088  } else if (place == (num_masks - 1)) {
5089  place = 0;
5090  } else {
5091  place++;
5092  }
5093  s_count = 0;
5094  gap_ct = 1;
5095  rem--;
5096  } else if (s_count == S) { // place is full; don't add extra thread
5097  if (place == last_place) {
5098  place = first_place;
5099  } else if (place == (num_masks - 1)) {
5100  place = 0;
5101  } else {
5102  place++;
5103  }
5104  gap_ct++;
5105  s_count = 0;
5106  }
5107 
5108  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5109  "partition = [%d,%d]\n",
5110  __kmp_gtid_from_thread(team->t.t_threads[f]),
5111  team->t.t_id, f, th->th.th_new_place,
5112  th->th.th_first_place, th->th.th_last_place));
5113  }
5114  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5115  }
5116  } break;
5117 
5118  default:
5119  break;
5120  }
5121 
5122  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5123 }
5124 
5125 #endif // KMP_AFFINITY_SUPPORTED
5126 
5127 /* allocate a new team data structure to use. take one off of the free pool if
5128  available */
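/* Three paths are tried in order: reuse the hot team (resizing it if the
   requested thread count changed), take a sufficiently large team from the
   team pool (reaping undersized ones along the way), or allocate and
   initialize a brand new team. */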
5129 kmp_team_t *
5130 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5131 #if OMPT_SUPPORT
5132  ompt_data_t ompt_parallel_data,
5133 #endif
5134  kmp_proc_bind_t new_proc_bind,
5135  kmp_internal_control_t *new_icvs,
5136  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5137  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5138  int f;
5139  kmp_team_t *team;
5140  int use_hot_team = !root->r.r_active;
5141  int level = 0;
5142  int do_place_partition = 1;
5143 
5144  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5145  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5146  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5147  KMP_MB();
5148 
5149 #if KMP_NESTED_HOT_TEAMS
5150  kmp_hot_team_ptr_t *hot_teams;
5151  if (master) {
5152  team = master->th.th_team;
5153  level = team->t.t_active_level;
5154  if (master->th.th_teams_microtask) { // in teams construct?
5155  if (master->th.th_teams_size.nteams > 1 &&
5156  ( // #teams > 1
5157  team->t.t_pkfn ==
5158  (microtask_t)__kmp_teams_master || // inner fork of the teams
5159  master->th.th_teams_level <
5160  team->t.t_level)) { // or nested parallel inside the teams
5161  ++level; // don't increment if #teams==1 or for the outer fork of the
5162  // teams construct; increment otherwise
5163  }
5164  // Do not perform the place partition if inner fork of the teams
5165  // Wait until nested parallel region encountered inside teams construct
5166  if ((master->th.th_teams_size.nteams == 1 &&
5167  master->th.th_teams_level >= team->t.t_level) ||
5168  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5169  do_place_partition = 0;
5170  }
5171  hot_teams = master->th.th_hot_teams;
5172  if (level < __kmp_hot_teams_max_level && hot_teams &&
5173  hot_teams[level].hot_team) {
5174  // hot team has already been allocated for given level
5175  use_hot_team = 1;
5176  } else {
5177  use_hot_team = 0;
5178  }
5179  } else {
5180  // check we won't access uninitialized hot_teams, just in case
5181  KMP_DEBUG_ASSERT(new_nproc == 1);
5182  }
5183 #endif
5184  // Optimization to use a "hot" team
5185  if (use_hot_team && new_nproc > 1) {
5186  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5187 #if KMP_NESTED_HOT_TEAMS
5188  team = hot_teams[level].hot_team;
5189 #else
5190  team = root->r.r_hot_team;
5191 #endif
5192 #if KMP_DEBUG
5193  if (__kmp_tasking_mode != tskm_immediate_exec) {
5194  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5195  "task_team[1] = %p before reinit\n",
5196  team->t.t_task_team[0], team->t.t_task_team[1]));
5197  }
5198 #endif
5199 
5200  if (team->t.t_nproc != new_nproc &&
5201  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5202  // Distributed barrier may need a resize
5203  int old_nthr = team->t.t_nproc;
5204  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5205  }
5206 
5207  // If not doing the place partition, then reset the team's proc bind
5208  // to indicate that partitioning of all threads still needs to take place
5209  if (do_place_partition == 0)
5210  team->t.t_proc_bind = proc_bind_default;
5211  // Has the number of threads changed?
5212  /* Let's assume the most common case is that the number of threads is
5213  unchanged, and put that case first. */
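  // Three cases follow: same size (just refresh ICVs and, if needed, the place
  // partition), shrinking (free or park the extra threads), and growing
  // (reuse reserved threads and/or allocate new ones, then reinitialize all).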
5214  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5215  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5216  // This case can mean that omp_set_num_threads() was called and the hot
5217  // team size was already reduced, so we check the special flag
5218  if (team->t.t_size_changed == -1) {
5219  team->t.t_size_changed = 1;
5220  } else {
5221  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5222  }
5223 
5224  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5225  kmp_r_sched_t new_sched = new_icvs->sched;
5226  // set primary thread's schedule as new run-time schedule
5227  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5228 
5229  __kmp_reinitialize_team(team, new_icvs,
5230  root->r.r_uber_thread->th.th_ident);
5231 
5232  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5233  team->t.t_threads[0], team));
5234  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5235 
5236 #if KMP_AFFINITY_SUPPORTED
5237  if ((team->t.t_size_changed == 0) &&
5238  (team->t.t_proc_bind == new_proc_bind)) {
5239  if (new_proc_bind == proc_bind_spread) {
5240  if (do_place_partition) {
5241  // add flag to update only master for spread
5242  __kmp_partition_places(team, 1);
5243  }
5244  }
5245  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5246  "proc_bind = %d, partition = [%d,%d]\n",
5247  team->t.t_id, new_proc_bind, team->t.t_first_place,
5248  team->t.t_last_place));
5249  } else {
5250  if (do_place_partition) {
5251  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5252  __kmp_partition_places(team);
5253  }
5254  }
5255 #else
5256  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5257 #endif /* KMP_AFFINITY_SUPPORTED */
5258  } else if (team->t.t_nproc > new_nproc) {
5259  KA_TRACE(20,
5260  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5261  new_nproc));
5262 
5263  team->t.t_size_changed = 1;
5264  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5265  // Barrier size already reduced earlier in this function
5266  // Activate team threads via th_used_in_team
5267  __kmp_add_threads_to_team(team, new_nproc);
5268  }
5269 #if KMP_NESTED_HOT_TEAMS
5270  if (__kmp_hot_teams_mode == 0) {
5271  // AC: saved number of threads should correspond to team's value in this
5272  // mode, can be bigger in mode 1, when hot team has threads in reserve
5273  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5274  hot_teams[level].hot_team_nth = new_nproc;
5275 #endif // KMP_NESTED_HOT_TEAMS
5276  /* release the extra threads we don't need any more */
5277  for (f = new_nproc; f < team->t.t_nproc; f++) {
5278  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5279  if (__kmp_tasking_mode != tskm_immediate_exec) {
5280  // When decreasing team size, threads no longer in the team should
5281  // unref task team.
5282  team->t.t_threads[f]->th.th_task_team = NULL;
5283  }
5284  __kmp_free_thread(team->t.t_threads[f]);
5285  team->t.t_threads[f] = NULL;
5286  }
5287 #if KMP_NESTED_HOT_TEAMS
5288  } // (__kmp_hot_teams_mode == 0)
5289  else {
5290  // When keeping extra threads in team, switch threads to wait on own
5291  // b_go flag
5292  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5293  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5294  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5295  for (int b = 0; b < bs_last_barrier; ++b) {
5296  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5297  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5298  }
5299  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5300  }
5301  }
5302  }
5303 #endif // KMP_NESTED_HOT_TEAMS
5304  team->t.t_nproc = new_nproc;
5305  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5306  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5307  __kmp_reinitialize_team(team, new_icvs,
5308  root->r.r_uber_thread->th.th_ident);
5309 
5310  // Update remaining threads
5311  for (f = 0; f < new_nproc; ++f) {
5312  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5313  }
5314 
5315  // restore the current task state of the primary thread: should be the
5316  // implicit task
5317  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5318  team->t.t_threads[0], team));
5319 
5320  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5321 
5322 #ifdef KMP_DEBUG
5323  for (f = 0; f < team->t.t_nproc; f++) {
5324  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5325  team->t.t_threads[f]->th.th_team_nproc ==
5326  team->t.t_nproc);
5327  }
5328 #endif
5329 
5330  if (do_place_partition) {
5331  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5332 #if KMP_AFFINITY_SUPPORTED
5333  __kmp_partition_places(team);
5334 #endif
5335  }
5336  } else { // team->t.t_nproc < new_nproc
5337 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5338  kmp_affin_mask_t *old_mask;
5339  if (KMP_AFFINITY_CAPABLE()) {
5340  KMP_CPU_ALLOC(old_mask);
5341  }
5342 #endif
5343 
5344  KA_TRACE(20,
5345  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5346  new_nproc));
5347  int old_nproc = team->t.t_nproc; // save old value and use to update only
5348  team->t.t_size_changed = 1;
5349 
5350 #if KMP_NESTED_HOT_TEAMS
5351  int avail_threads = hot_teams[level].hot_team_nth;
5352  if (new_nproc < avail_threads)
5353  avail_threads = new_nproc;
5354  kmp_info_t **other_threads = team->t.t_threads;
5355  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5356  // Adjust barrier data of reserved threads (if any) of the team
5357  // Other data will be set in __kmp_initialize_info() below.
5358  int b;
5359  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5360  for (b = 0; b < bs_last_barrier; ++b) {
5361  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5362  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5363 #if USE_DEBUGGER
5364  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5365 #endif
5366  }
5367  }
5368  if (hot_teams[level].hot_team_nth >= new_nproc) {
5369  // we have all needed threads in reserve, no need to allocate any
5370  // this only possible in mode 1, cannot have reserved threads in mode 0
5371  // this is only possible in mode 1; there are no reserved threads in mode 0
5372  team->t.t_nproc = new_nproc; // just get reserved threads involved
5373  } else {
5374  // We may have some threads in reserve, but not enough;
5375  // get reserved threads involved if any.
5376  team->t.t_nproc = hot_teams[level].hot_team_nth;
5377  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5378 #endif // KMP_NESTED_HOT_TEAMS
5379  if (team->t.t_max_nproc < new_nproc) {
5380  /* reallocate larger arrays */
5381  __kmp_reallocate_team_arrays(team, new_nproc);
5382  __kmp_reinitialize_team(team, new_icvs, NULL);
5383  }
5384 
5385 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5386  /* Temporarily set full mask for primary thread before creation of
5387  workers. The reason is that workers inherit the affinity from the
5388  primary thread, so if a lot of workers are created quickly on a single
5389  core, they don't get a chance to set their own affinity for
5390  a long time. */
5391  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5392 #endif
5393 
5394  /* allocate new threads for the hot team */
5395  for (f = team->t.t_nproc; f < new_nproc; f++) {
5396  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5397  KMP_DEBUG_ASSERT(new_worker);
5398  team->t.t_threads[f] = new_worker;
5399 
5400  KA_TRACE(20,
5401  ("__kmp_allocate_team: team %d init T#%d arrived: "
5402  "join=%llu, plain=%llu\n",
5403  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5404  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5405  team->t.t_bar[bs_plain_barrier].b_arrived));
5406 
5407  { // Initialize barrier data for new threads.
5408  int b;
5409  kmp_balign_t *balign = new_worker->th.th_bar;
5410  for (b = 0; b < bs_last_barrier; ++b) {
5411  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5412  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5413  KMP_BARRIER_PARENT_FLAG);
5414 #if USE_DEBUGGER
5415  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5416 #endif
5417  }
5418  }
5419  }
5420 
5421 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5422  if (KMP_AFFINITY_CAPABLE()) {
5423  /* Restore initial primary thread's affinity mask */
5424  __kmp_set_system_affinity(old_mask, TRUE);
5425  KMP_CPU_FREE(old_mask);
5426  }
5427 #endif
5428 #if KMP_NESTED_HOT_TEAMS
5429  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5430 #endif // KMP_NESTED_HOT_TEAMS
5431  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5432  // Barrier size already increased earlier in this function
5433  // Activate team threads via th_used_in_team
5434  __kmp_add_threads_to_team(team, new_nproc);
5435  }
5436  /* make sure everyone is synchronized */
5437  // new threads below
5438  __kmp_initialize_team(team, new_nproc, new_icvs,
5439  root->r.r_uber_thread->th.th_ident);
5440 
5441  /* reinitialize the threads */
5442  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5443  for (f = 0; f < team->t.t_nproc; ++f)
5444  __kmp_initialize_info(team->t.t_threads[f], team, f,
5445  __kmp_gtid_from_tid(f, team));
5446 
5447  // set th_task_state for new threads in hot team with older thread's state
5448  kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5449  for (f = old_nproc; f < team->t.t_nproc; ++f)
5450  team->t.t_threads[f]->th.th_task_state = old_state;
5451 
5452 #ifdef KMP_DEBUG
5453  for (f = 0; f < team->t.t_nproc; ++f) {
5454  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5455  team->t.t_threads[f]->th.th_team_nproc ==
5456  team->t.t_nproc);
5457  }
5458 #endif
5459 
5460  if (do_place_partition) {
5461  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5462 #if KMP_AFFINITY_SUPPORTED
5463  __kmp_partition_places(team);
5464 #endif
5465  }
5466  } // Check changes in number of threads
5467 
5468  kmp_info_t *master = team->t.t_threads[0];
5469  if (master->th.th_teams_microtask) {
5470  for (f = 1; f < new_nproc; ++f) {
5471  // propagate teams construct specific info to workers
5472  kmp_info_t *thr = team->t.t_threads[f];
5473  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5474  thr->th.th_teams_level = master->th.th_teams_level;
5475  thr->th.th_teams_size = master->th.th_teams_size;
5476  }
5477  }
5478 #if KMP_NESTED_HOT_TEAMS
5479  if (level) {
5480  // Sync barrier state for nested hot teams, not needed for outermost hot
5481  // team.
5482  for (f = 1; f < new_nproc; ++f) {
5483  kmp_info_t *thr = team->t.t_threads[f];
5484  int b;
5485  kmp_balign_t *balign = thr->th.th_bar;
5486  for (b = 0; b < bs_last_barrier; ++b) {
5487  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5488  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5489 #if USE_DEBUGGER
5490  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5491 #endif
5492  }
5493  }
5494  }
5495 #endif // KMP_NESTED_HOT_TEAMS
5496 
5497  /* reallocate space for arguments if necessary */
5498  __kmp_alloc_argv_entries(argc, team, TRUE);
5499  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5500  // The hot team re-uses the previous task team,
5501  // if untouched during the previous release->gather phase.
5502 
5503  KF_TRACE(10, (" hot_team = %p\n", team));
5504 
5505 #if KMP_DEBUG
5506  if (__kmp_tasking_mode != tskm_immediate_exec) {
5507  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5508  "task_team[1] = %p after reinit\n",
5509  team->t.t_task_team[0], team->t.t_task_team[1]));
5510  }
5511 #endif
5512 
5513 #if OMPT_SUPPORT
5514  __ompt_team_assign_id(team, ompt_parallel_data);
5515 #endif
5516 
5517  KMP_MB();
5518 
5519  return team;
5520  }
5521 
5522  /* next, let's try to take one from the team pool */
5523  KMP_MB();
5524  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5525  /* TODO: consider resizing undersized teams instead of reaping them, now
5526  that we have a resizing mechanism */
5527  if (team->t.t_max_nproc >= max_nproc) {
5528  /* take this team from the team pool */
5529  __kmp_team_pool = team->t.t_next_pool;
5530 
5531  if (max_nproc > 1 &&
5532  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5533  if (!team->t.b) { // Allocate barrier structure
5534  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5535  }
5536  }
5537 
5538  /* setup the team for fresh use */
5539  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5540 
5541  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5542  "task_team[1] %p to NULL\n",
5543  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5544  team->t.t_task_team[0] = NULL;
5545  team->t.t_task_team[1] = NULL;
5546 
5547  /* reallocate space for arguments if necessary */
5548  __kmp_alloc_argv_entries(argc, team, TRUE);
5549  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5550 
5551  KA_TRACE(
5552  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5553  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5554  { // Initialize barrier data.
5555  int b;
5556  for (b = 0; b < bs_last_barrier; ++b) {
5557  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5558 #if USE_DEBUGGER
5559  team->t.t_bar[b].b_master_arrived = 0;
5560  team->t.t_bar[b].b_team_arrived = 0;
5561 #endif
5562  }
5563  }
5564 
5565  team->t.t_proc_bind = new_proc_bind;
5566 
5567  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5568  team->t.t_id));
5569 
5570 #if OMPT_SUPPORT
5571  __ompt_team_assign_id(team, ompt_parallel_data);
5572 #endif
5573 
5574  KMP_MB();
5575 
5576  return team;
5577  }
5578 
5579  /* reap team if it is too small, then loop back and check the next one */
5580  // not sure if this is wise, but, will be redone during the hot-teams
5581  // not sure if this is wise, but it will be redone during the hot-teams
5582  /* TODO: Use technique to find the right size hot-team, don't reap them */
5583  team = __kmp_reap_team(team);
5584  __kmp_team_pool = team;
5585  }
5586 
5587  /* nothing available in the pool, no matter, make a new team! */
5588  KMP_MB();
5589  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5590 
5591  /* and set it up */
5592  team->t.t_max_nproc = max_nproc;
5593  if (max_nproc > 1 &&
5594  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5595  // Allocate barrier structure
5596  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5597  }
5598 
5599  /* NOTE well, for some reason allocating one big buffer and dividing it up
5600  seems to really hurt performance a lot on the P4, so, let's not use this */
5601  __kmp_allocate_team_arrays(team, max_nproc);
5602 
5603  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5604  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5605 
5606  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5607  "%p to NULL\n",
5608  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5609  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5610  // memory, no need to duplicate
5611  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5612  // memory, no need to duplicate
5613 
5614  if (__kmp_storage_map) {
5615  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5616  }
5617 
5618  /* allocate space for arguments */
5619  __kmp_alloc_argv_entries(argc, team, FALSE);
5620  team->t.t_argc = argc;
5621 
5622  KA_TRACE(20,
5623  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5624  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5625  { // Initialize barrier data.
5626  int b;
5627  for (b = 0; b < bs_last_barrier; ++b) {
5628  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5629 #if USE_DEBUGGER
5630  team->t.t_bar[b].b_master_arrived = 0;
5631  team->t.t_bar[b].b_team_arrived = 0;
5632 #endif
5633  }
5634  }
5635 
5636  team->t.t_proc_bind = new_proc_bind;
5637 
5638 #if OMPT_SUPPORT
5639  __ompt_team_assign_id(team, ompt_parallel_data);
5640  team->t.ompt_serialized_team_info = NULL;
5641 #endif
5642 
5643  KMP_MB();
5644 
5645  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5646  team->t.t_id));
5647 
5648  return team;
5649 }
5650 
5651 /* TODO implement hot-teams at all levels */
5652 /* TODO implement lazy thread release on demand (disband request) */
5653 
5654 /* free the team. return it to the team pool. release all the threads
5655  * associated with it */
5656 void __kmp_free_team(kmp_root_t *root,
5657  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5658  int f;
5659  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5660  team->t.t_id));
5661 
5662  /* verify state */
5663  KMP_DEBUG_ASSERT(root);
5664  KMP_DEBUG_ASSERT(team);
5665  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5666  KMP_DEBUG_ASSERT(team->t.t_threads);
5667 
5668  int use_hot_team = team == root->r.r_hot_team;
5669 #if KMP_NESTED_HOT_TEAMS
5670  int level;
5671  if (master) {
5672  level = team->t.t_active_level - 1;
5673  if (master->th.th_teams_microtask) { // in teams construct?
5674  if (master->th.th_teams_size.nteams > 1) {
5675  ++level; // level was not increased in teams construct for
5676  // team_of_masters
5677  }
5678  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5679  master->th.th_teams_level == team->t.t_level) {
5680  ++level; // level was not increased in teams construct for
5681  // team_of_workers before the parallel
5682  } // team->t.t_level will be increased inside parallel
5683  }
5684 #if KMP_DEBUG
5685  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5686 #endif
5687  if (level < __kmp_hot_teams_max_level) {
5688  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5689  use_hot_team = 1;
5690  }
5691  }
5692 #endif // KMP_NESTED_HOT_TEAMS
5693 
5694  /* team is done working */
5695  TCW_SYNC_PTR(team->t.t_pkfn,
5696  NULL); // Important for Debugging Support Library.
5697 #if KMP_OS_WINDOWS
5698  team->t.t_copyin_counter = 0; // init counter for possible reuse
5699 #endif
5700  // Do not reset pointer to parent team to NULL for hot teams.
5701 
5702  /* if we are non-hot team, release our threads */
5703  if (!use_hot_team) {
5704  if (__kmp_tasking_mode != tskm_immediate_exec) {
5705  // Wait for threads to reach reapable state
5706  for (f = 1; f < team->t.t_nproc; ++f) {
5707  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5708  kmp_info_t *th = team->t.t_threads[f];
5709  volatile kmp_uint32 *state = &th->th.th_reap_state;
5710  while (*state != KMP_SAFE_TO_REAP) {
5711 #if KMP_OS_WINDOWS
5712  // On Windows a thread can be killed at any time, check this
5713  DWORD ecode;
5714  if (!__kmp_is_thread_alive(th, &ecode)) {
5715  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5716  break;
5717  }
5718 #endif
5719  // first check if thread is sleeping
5720  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5721  if (fl.is_sleeping())
5722  fl.resume(__kmp_gtid_from_thread(th));
5723  KMP_CPU_PAUSE();
5724  }
5725  }
5726 
5727  // Delete task teams
5728  int tt_idx;
5729  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5730  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5731  if (task_team != NULL) {
5732  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5733  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5734  team->t.t_threads[f]->th.th_task_team = NULL;
5735  }
5736  KA_TRACE(
5737  20,
5738  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5739  __kmp_get_gtid(), task_team, team->t.t_id));
5740 #if KMP_NESTED_HOT_TEAMS
5741  __kmp_free_task_team(master, task_team);
5742 #endif
5743  team->t.t_task_team[tt_idx] = NULL;
5744  }
5745  }
5746  }
5747 
5748  // Reset pointer to parent team only for non-hot teams.
5749  team->t.t_parent = NULL;
5750  team->t.t_level = 0;
5751  team->t.t_active_level = 0;
5752 
5753  /* free the worker threads */
5754  for (f = 1; f < team->t.t_nproc; ++f) {
5755  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5756  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5757  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5758  1, 2);
5759  }
5760  __kmp_free_thread(team->t.t_threads[f]);
5761  }
5762 
5763  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5764  if (team->t.b) {
5765  // wake up thread at old location
5766  team->t.b->go_release();
5767  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5768  for (f = 1; f < team->t.t_nproc; ++f) {
5769  if (team->t.b->sleep[f].sleep) {
5770  __kmp_atomic_resume_64(
5771  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5772  (kmp_atomic_flag_64<> *)NULL);
5773  }
5774  }
5775  }
5776  // Wait for threads to be removed from team
5777  for (int f = 1; f < team->t.t_nproc; ++f) {
5778  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5779  KMP_CPU_PAUSE();
5780  }
5781  }
5782  }
5783 
5784  for (f = 1; f < team->t.t_nproc; ++f) {
5785  team->t.t_threads[f] = NULL;
5786  }
5787 
5788  if (team->t.t_max_nproc > 1 &&
5789  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5790  distributedBarrier::deallocate(team->t.b);
5791  team->t.b = NULL;
5792  }
5793  /* put the team back in the team pool */
5794  /* TODO limit size of team pool, call reap_team if pool too large */
5795  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5796  __kmp_team_pool = (volatile kmp_team_t *)team;
5797  } else { // Check if team was created for primary threads in teams construct
5798  // See if first worker is a CG root
5799  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5800  team->t.t_threads[1]->th.th_cg_roots);
5801  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5802  // Clean up the CG root nodes on workers so that this team can be re-used
5803  for (f = 1; f < team->t.t_nproc; ++f) {
5804  kmp_info_t *thr = team->t.t_threads[f];
5805  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5806  thr->th.th_cg_roots->cg_root == thr);
5807  // Pop current CG root off list
5808  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5809  thr->th.th_cg_roots = tmp->up;
5810  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5811  " up to node %p. cg_nthreads was %d\n",
5812  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5813  int i = tmp->cg_nthreads--;
5814  if (i == 1) {
5815  __kmp_free(tmp); // free CG if we are the last thread in it
5816  }
5817  // Restore current task's thread_limit from CG root
5818  if (thr->th.th_cg_roots)
5819  thr->th.th_current_task->td_icvs.thread_limit =
5820  thr->th.th_cg_roots->cg_thread_limit;
5821  }
5822  }
5823  }
5824 
5825  KMP_MB();
5826 }
5827 
5828 /* reap the team. destroy it, reclaim all its resources and free its memory */
5829 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5830  kmp_team_t *next_pool = team->t.t_next_pool;
5831 
5832  KMP_DEBUG_ASSERT(team);
5833  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5834  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5835  KMP_DEBUG_ASSERT(team->t.t_threads);
5836  KMP_DEBUG_ASSERT(team->t.t_argv);
5837 
5838  /* TODO clean the threads that are a part of this? */
5839 
5840  /* free stuff */
5841  __kmp_free_team_arrays(team);
5842  if (team->t.t_argv != &team->t.t_inline_argv[0])
5843  __kmp_free((void *)team->t.t_argv);
5844  __kmp_free(team);
5845 
5846  KMP_MB();
5847  return next_pool;
5848 }
5849 
5850 // Free the thread. Don't reap it, just place it on the pool of available
5851 // threads.
5852 //
5853 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5854 // binding for the affinity mechanism to be useful.
5855 //
5856 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5857 // However, we want to avoid a potential performance problem by always
5858 // scanning through the list to find the correct point at which to insert
5859 // the thread (potential N**2 behavior). To do this we keep track of the
5860 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5861 // With single-level parallelism, threads will always be added to the tail
5862 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5863 // parallelism, all bets are off and we may need to scan through the entire
5864 // free list.
5865 //
5866 // This change also has a potentially large performance benefit, for some
5867 // applications. Previously, as threads were freed from the hot team, they
5868 // would be placed back on the free list in inverse order. If the hot team
5869 // grew back to it's original size, then the freed thread would be placed
5870  // grew back to its original size, then the freed threads would be placed
5871 // locality problems on programs where the size of the hot team regularly
5872 // grew and shrunk.
5873 //
5874 // Now, for single-level parallelism, the OMP tid is always == gtid.
5875 void __kmp_free_thread(kmp_info_t *this_th) {
5876  int gtid;
5877  kmp_info_t **scan;
5878 
5879  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5880  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5881 
5882  KMP_DEBUG_ASSERT(this_th);
5883 
5884  // When moving a thread to the pool, switch it to waiting on its own b_go
5885  // flag and clear its barrier team pointers (NULL team).
5886  int b;
5887  kmp_balign_t *balign = this_th->th.th_bar;
5888  for (b = 0; b < bs_last_barrier; ++b) {
5889  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5890  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5891  balign[b].bb.team = NULL;
5892  balign[b].bb.leaf_kids = 0;
5893  }
5894  this_th->th.th_task_state = 0;
5895  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5896 
5897  /* put thread back on the free pool */
5898  TCW_PTR(this_th->th.th_team, NULL);
5899  TCW_PTR(this_th->th.th_root, NULL);
5900  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5901 
5902  while (this_th->th.th_cg_roots) {
5903  this_th->th.th_cg_roots->cg_nthreads--;
5904  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5905  " %p of thread %p to %d\n",
5906  this_th, this_th->th.th_cg_roots,
5907  this_th->th.th_cg_roots->cg_root,
5908  this_th->th.th_cg_roots->cg_nthreads));
5909  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5910  if (tmp->cg_root == this_th) { // Thread is a cg_root
5911  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5912  KA_TRACE(
5913  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5914  this_th->th.th_cg_roots = tmp->up;
5915  __kmp_free(tmp);
5916  } else { // Worker thread
5917  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5918  __kmp_free(tmp);
5919  }
5920  this_th->th.th_cg_roots = NULL;
5921  break;
5922  }
5923  }
5924 
5925  /* If the implicit task assigned to this thread can be used by other threads,
5926  * multiple threads can share the data and try to free the task in
5927  * __kmp_reap_thread at exit. This duplicate use of the task data is more
5928  * likely when the hot team is disabled, but can occur even when the hot
5929  * team is enabled. */
5930  __kmp_free_implicit_task(this_th);
5931  this_th->th.th_current_task = NULL;
5932 
5933  // If the __kmp_thread_pool_insert_pt is already past the new insert
5934  // point, then we need to re-scan the entire list.
5935  gtid = this_th->th.th_info.ds.ds_gtid;
5936  if (__kmp_thread_pool_insert_pt != NULL) {
5937  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5938  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5939  __kmp_thread_pool_insert_pt = NULL;
5940  }
5941  }
5942 
5943  // Scan down the list to find the place to insert the thread.
5944  // scan is the address of a link in the list, possibly the address of
5945  // __kmp_thread_pool itself.
5946  //
5947  // In the absence of nested parallelism, the for loop will have 0 iterations.
5948  if (__kmp_thread_pool_insert_pt != NULL) {
5949  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5950  } else {
5951  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5952  }
5953  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5954  scan = &((*scan)->th.th_next_pool))
5955  ;
5956 
5957  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5958  // to its address.
5959  TCW_PTR(this_th->th.th_next_pool, *scan);
5960  __kmp_thread_pool_insert_pt = *scan = this_th;
5961  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5962  (this_th->th.th_info.ds.ds_gtid <
5963  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5964  TCW_4(this_th->th.th_in_pool, TRUE);
5965  __kmp_suspend_initialize_thread(this_th);
5966  __kmp_lock_suspend_mx(this_th);
5967  if (this_th->th.th_active == TRUE) {
5968  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5969  this_th->th.th_active_in_pool = TRUE;
5970  }
5971 #if KMP_DEBUG
5972  else {
5973  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5974  }
5975 #endif
5976  __kmp_unlock_suspend_mx(this_th);
5977 
5978  TCW_4(__kmp_nth, __kmp_nth - 1);
5979 
5980 #ifdef KMP_ADJUST_BLOCKTIME
5981  /* Adjust blocktime back to user setting or default if necessary */
5982  /* Middle initialization might never have occurred */
5983  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5984  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5985  if (__kmp_nth <= __kmp_avail_proc) {
5986  __kmp_zero_bt = FALSE;
5987  }
5988  }
5989 #endif /* KMP_ADJUST_BLOCKTIME */
5990 
5991  KMP_MB();
5992 }
5993 
5994 /* ------------------------------------------------------------------------ */
5995 
5996 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5997 #if OMP_PROFILING_SUPPORT
5998  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5999  // TODO: add a configuration option for time granularity
6000  if (ProfileTraceFile)
6001  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6002 #endif
6003 
6004  int gtid = this_thr->th.th_info.ds.ds_gtid;
6005  /* void *stack_data;*/
6006  kmp_team_t **volatile pteam;
6007 
6008  KMP_MB();
6009  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6010 
6011  if (__kmp_env_consistency_check) {
6012  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6013  }
6014 
6015 #if OMPD_SUPPORT
6016  if (ompd_state & OMPD_ENABLE_BP)
6017  ompd_bp_thread_begin();
6018 #endif
6019 
6020 #if OMPT_SUPPORT
6021  ompt_data_t *thread_data = nullptr;
6022  if (ompt_enabled.enabled) {
6023  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6024  *thread_data = ompt_data_none;
6025 
6026  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6027  this_thr->th.ompt_thread_info.wait_id = 0;
6028  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6029  this_thr->th.ompt_thread_info.parallel_flags = 0;
6030  if (ompt_enabled.ompt_callback_thread_begin) {
6031  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6032  ompt_thread_worker, thread_data);
6033  }
6034  this_thr->th.ompt_thread_info.state = ompt_state_idle;
6035  }
6036 #endif
6037 
6038  /* This is the place where threads wait for work */
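  /* Worker main loop: sleep in the fork barrier until the primary thread hands
     this thread a team, run the team's microtask via t_invoke, pass through the
     join barrier, and repeat until library shutdown sets g_done. */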
6039  while (!TCR_4(__kmp_global.g.g_done)) {
6040  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6041  KMP_MB();
6042 
6043  /* wait for work to do */
6044  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6045 
6046  /* No tid yet since not part of a team */
6047  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6048 
6049 #if OMPT_SUPPORT
6050  if (ompt_enabled.enabled) {
6051  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6052  }
6053 #endif
6054 
6055  pteam = &this_thr->th.th_team;
6056 
6057  /* have we been allocated? */
6058  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6059  /* we were just woken up, so run our new task */
6060  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6061  int rc;
6062  KA_TRACE(20,
6063  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6064  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6065  (*pteam)->t.t_pkfn));
6066 
6067  updateHWFPControl(*pteam);
6068 
6069 #if OMPT_SUPPORT
6070  if (ompt_enabled.enabled) {
6071  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6072  }
6073 #endif
6074 
6075  rc = (*pteam)->t.t_invoke(gtid);
6076  KMP_ASSERT(rc);
6077 
6078  KMP_MB();
6079  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6080  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6081  (*pteam)->t.t_pkfn));
6082  }
6083 #if OMPT_SUPPORT
6084  if (ompt_enabled.enabled) {
6085  /* no frame set while outside task */
6086  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6087 
6088  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6089  }
6090 #endif
6091  /* join barrier after parallel region */
6092  __kmp_join_barrier(gtid);
6093  }
6094  }
6095  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6096 
6097 #if OMPD_SUPPORT
6098  if (ompd_state & OMPD_ENABLE_BP)
6099  ompd_bp_thread_end();
6100 #endif
6101 
6102 #if OMPT_SUPPORT
6103  if (ompt_enabled.ompt_callback_thread_end) {
6104  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6105  }
6106 #endif
6107 
6108  this_thr->th.th_task_team = NULL;
6109  /* run the destructors for the threadprivate data for this thread */
6110  __kmp_common_destroy_gtid(gtid);
6111 
6112  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6113  KMP_MB();
6114 
6115 #if OMP_PROFILING_SUPPORT
6116  llvm::timeTraceProfilerFinishThread();
6117 #endif
6118  return this_thr;
6119 }
6120 
6121 /* ------------------------------------------------------------------------ */
6122 
6123 void __kmp_internal_end_dest(void *specific_gtid) {
6124  // Make sure no significant bits are lost
6125  int gtid;
6126  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6127 
6128  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
 6129  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6130  * this is because 0 is reserved for the nothing-stored case */
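  /* Example: a stored value of 5 therefore denotes gtid 4, while a stored
     value of 0 means "nothing stored". */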
6131 
6132  __kmp_internal_end_thread(gtid);
6133 }
6134 
6135 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6136 
6137 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6138  __kmp_internal_end_atexit();
6139 }
6140 
6141 #endif
6142 
6143 /* [Windows] josh: when the atexit handler is called, there may still be more
6144  than one thread alive */
6145 void __kmp_internal_end_atexit(void) {
6146  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6147  /* [Windows]
6148  josh: ideally, we want to completely shutdown the library in this atexit
6149  handler, but stat code that depends on thread specific data for gtid fails
6150  because that data becomes unavailable at some point during the shutdown, so
6151  we call __kmp_internal_end_thread instead. We should eventually remove the
6152  dependency on __kmp_get_specific_gtid in the stat code and use
6153  __kmp_internal_end_library to cleanly shutdown the library.
6154 
6155  // TODO: Can some of this comment about GVS be removed?
6156  I suspect that the offending stat code is executed when the calling thread
6157  tries to clean up a dead root thread's data structures, resulting in GVS
6158  code trying to close the GVS structures for that thread, but since the stat
6159  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6160  the calling thread is cleaning up itself instead of another thread, it get
6161  confused. This happens because allowing a thread to unregister and cleanup
6162  another thread is a recent modification for addressing an issue.
6163  Based on the current design (20050722), a thread may end up
6164  trying to unregister another thread only if thread death does not trigger
6165  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6166  thread specific data destructor function to detect thread death. For
6167  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6168  is nothing. Thus, the workaround is applicable only for Windows static
6169  stat library. */
6170  __kmp_internal_end_library(-1);
6171 #if KMP_OS_WINDOWS
6172  __kmp_close_console();
6173 #endif
6174 }
6175 
6176 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6177  // It is assumed __kmp_forkjoin_lock is acquired.
6178 
6179  int gtid;
6180 
6181  KMP_DEBUG_ASSERT(thread != NULL);
6182 
6183  gtid = thread->th.th_info.ds.ds_gtid;
6184 
6185  if (!is_root) {
6186  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6187  /* Assume the threads are at the fork barrier here */
6188  KA_TRACE(
6189  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6190  gtid));
6191  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6192  while (
6193  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6194  KMP_CPU_PAUSE();
6195  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6196  } else {
6197  /* Need release fence here to prevent seg faults for tree forkjoin
6198  barrier (GEH) */
6199  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6200  thread);
6201  __kmp_release_64(&flag);
6202  }
6203  }
6204 
6205  // Terminate OS thread.
6206  __kmp_reap_worker(thread);
6207 
6208  // The thread was killed asynchronously. If it was actively
6209  // spinning in the thread pool, decrement the global count.
6210  //
6211  // There is a small timing hole here - if the worker thread was just waking
 6212  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6213  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6214  // the global counter might not get updated.
6215  //
6216  // Currently, this can only happen as the library is unloaded,
6217  // so there are no harmful side effects.
6218  if (thread->th.th_active_in_pool) {
6219  thread->th.th_active_in_pool = FALSE;
6220  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6221  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6222  }
6223  }
6224 
6225  __kmp_free_implicit_task(thread);
6226 
6227 // Free the fast memory for tasking
6228 #if USE_FAST_MEMORY
6229  __kmp_free_fast_memory(thread);
6230 #endif /* USE_FAST_MEMORY */
6231 
6232  __kmp_suspend_uninitialize_thread(thread);
6233 
6234  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6235  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6236 
6237  --__kmp_all_nth;
6238  // __kmp_nth was decremented when thread is added to the pool.
6239 
6240 #ifdef KMP_ADJUST_BLOCKTIME
6241  /* Adjust blocktime back to user setting or default if necessary */
6242  /* Middle initialization might never have occurred */
6243  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6244  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6245  if (__kmp_nth <= __kmp_avail_proc) {
6246  __kmp_zero_bt = FALSE;
6247  }
6248  }
6249 #endif /* KMP_ADJUST_BLOCKTIME */
6250 
6251  /* free the memory being used */
6252  if (__kmp_env_consistency_check) {
6253  if (thread->th.th_cons) {
6254  __kmp_free_cons_stack(thread->th.th_cons);
6255  thread->th.th_cons = NULL;
6256  }
6257  }
6258 
6259  if (thread->th.th_pri_common != NULL) {
6260  __kmp_free(thread->th.th_pri_common);
6261  thread->th.th_pri_common = NULL;
6262  }
6263 
6264  if (thread->th.th_task_state_memo_stack != NULL) {
6265  __kmp_free(thread->th.th_task_state_memo_stack);
6266  thread->th.th_task_state_memo_stack = NULL;
6267  }
6268 
6269 #if KMP_USE_BGET
6270  if (thread->th.th_local.bget_data != NULL) {
6271  __kmp_finalize_bget(thread);
6272  }
6273 #endif
6274 
6275 #if KMP_AFFINITY_SUPPORTED
6276  if (thread->th.th_affin_mask != NULL) {
6277  KMP_CPU_FREE(thread->th.th_affin_mask);
6278  thread->th.th_affin_mask = NULL;
6279  }
6280 #endif /* KMP_AFFINITY_SUPPORTED */
6281 
6282 #if KMP_USE_HIER_SCHED
6283  if (thread->th.th_hier_bar_data != NULL) {
6284  __kmp_free(thread->th.th_hier_bar_data);
6285  thread->th.th_hier_bar_data = NULL;
6286  }
6287 #endif
6288 
6289  __kmp_reap_team(thread->th.th_serial_team);
6290  thread->th.th_serial_team = NULL;
6291  __kmp_free(thread);
6292 
6293  KMP_MB();
6294 
6295 } // __kmp_reap_thread
6296 
6297 static void __kmp_itthash_clean(kmp_info_t *th) {
6298 #if USE_ITT_NOTIFY
6299  if (__kmp_itt_region_domains.count > 0) {
6300  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6301  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6302  while (bucket) {
6303  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6304  __kmp_thread_free(th, bucket);
6305  bucket = next;
6306  }
6307  }
6308  }
6309  if (__kmp_itt_barrier_domains.count > 0) {
6310  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6311  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6312  while (bucket) {
6313  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6314  __kmp_thread_free(th, bucket);
6315  bucket = next;
6316  }
6317  }
6318  }
6319 #endif
6320 }
6321 
6322 static void __kmp_internal_end(void) {
6323  int i;
6324 
6325  /* First, unregister the library */
6326  __kmp_unregister_library();
6327 
6328 #if KMP_OS_WINDOWS
6329  /* In Win static library, we can't tell when a root actually dies, so we
6330  reclaim the data structures for any root threads that have died but not
6331  unregistered themselves, in order to shut down cleanly.
6332  In Win dynamic library we also can't tell when a thread dies. */
6333  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6334 // dead roots
6335 #endif
6336 
6337  for (i = 0; i < __kmp_threads_capacity; i++)
6338  if (__kmp_root[i])
6339  if (__kmp_root[i]->r.r_active)
6340  break;
6341  KMP_MB(); /* Flush all pending memory write invalidates. */
6342  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6343 
6344  if (i < __kmp_threads_capacity) {
6345 #if KMP_USE_MONITOR
6346  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6347  KMP_MB(); /* Flush all pending memory write invalidates. */
6348 
6349  // Need to check that monitor was initialized before reaping it. If we are
 6350  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6351  // __kmp_monitor will appear to contain valid data, but it is only valid in
6352  // the parent process, not the child.
6353  // New behavior (201008): instead of keying off of the flag
6354  // __kmp_init_parallel, the monitor thread creation is keyed off
6355  // of the new flag __kmp_init_monitor.
6356  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6357  if (TCR_4(__kmp_init_monitor)) {
6358  __kmp_reap_monitor(&__kmp_monitor);
6359  TCW_4(__kmp_init_monitor, 0);
6360  }
6361  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6362  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6363 #endif // KMP_USE_MONITOR
6364  } else {
6365 /* TODO move this to cleanup code */
6366 #ifdef KMP_DEBUG
6367  /* make sure that everything has properly ended */
6368  for (i = 0; i < __kmp_threads_capacity; i++) {
6369  if (__kmp_root[i]) {
6370  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6371  // there can be uber threads alive here
6372  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6373  }
6374  }
6375 #endif
6376 
6377  KMP_MB();
6378 
6379  // Reap the worker threads.
6380  // This is valid for now, but be careful if threads are reaped sooner.
 6381  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6382  // Get the next thread from the pool.
6383  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6384  __kmp_thread_pool = thread->th.th_next_pool;
6385  // Reap it.
6386  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6387  thread->th.th_next_pool = NULL;
6388  thread->th.th_in_pool = FALSE;
6389  __kmp_reap_thread(thread, 0);
6390  }
6391  __kmp_thread_pool_insert_pt = NULL;
6392 
6393  // Reap teams.
6394  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6395  // Get the next team from the pool.
6396  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6397  __kmp_team_pool = team->t.t_next_pool;
6398  // Reap it.
6399  team->t.t_next_pool = NULL;
6400  __kmp_reap_team(team);
6401  }
6402 
6403  __kmp_reap_task_teams();
6404 
6405 #if KMP_OS_UNIX
6406  // Threads that are not reaped should not access any resources since they
6407  // are going to be deallocated soon, so the shutdown sequence should wait
6408  // until all threads either exit the final spin-waiting loop or begin
6409  // sleeping after the given blocktime.
6410  for (i = 0; i < __kmp_threads_capacity; i++) {
6411  kmp_info_t *thr = __kmp_threads[i];
6412  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6413  KMP_CPU_PAUSE();
6414  }
6415 #endif
6416 
6417  for (i = 0; i < __kmp_threads_capacity; ++i) {
6418  // TBD: Add some checking...
6419  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6420  }
6421 
6422  /* Make sure all threadprivate destructors get run by joining with all
6423  worker threads before resetting this flag */
6424  TCW_SYNC_4(__kmp_init_common, FALSE);
6425 
6426  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6427  KMP_MB();
6428 
6429 #if KMP_USE_MONITOR
6430  // See note above: One of the possible fixes for CQ138434 / CQ140126
6431  //
6432  // FIXME: push both code fragments down and CSE them?
6433  // push them into __kmp_cleanup() ?
6434  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6435  if (TCR_4(__kmp_init_monitor)) {
6436  __kmp_reap_monitor(&__kmp_monitor);
6437  TCW_4(__kmp_init_monitor, 0);
6438  }
6439  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6440  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6441 #endif
6442  } /* else !__kmp_global.t_active */
6443  TCW_4(__kmp_init_gtid, FALSE);
6444  KMP_MB(); /* Flush all pending memory write invalidates. */
6445 
6446  __kmp_cleanup();
6447 #if OMPT_SUPPORT
6448  ompt_fini();
6449 #endif
6450 }
6451 
6452 void __kmp_internal_end_library(int gtid_req) {
6453  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6454  /* this shouldn't be a race condition because __kmp_internal_end() is the
6455  only place to clear __kmp_serial_init */
6456  /* we'll check this later too, after we get the lock */
6457  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6458  // redundant, because the next check will work in any case.
6459  if (__kmp_global.g.g_abort) {
6460  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6461  /* TODO abort? */
6462  return;
6463  }
6464  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6465  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6466  return;
6467  }
6468 
6469  // If hidden helper team has been initialized, we need to deinit it
6470  if (TCR_4(__kmp_init_hidden_helper) &&
6471  !TCR_4(__kmp_hidden_helper_team_done)) {
6472  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6473  // First release the main thread to let it continue its work
6474  __kmp_hidden_helper_main_thread_release();
6475  // Wait until the hidden helper team has been destroyed
6476  __kmp_hidden_helper_threads_deinitz_wait();
6477  }
6478 
6479  KMP_MB(); /* Flush all pending memory write invalidates. */
6480  /* find out who we are and what we should do */
6481  {
6482  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6483  KA_TRACE(
6484  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6485  if (gtid == KMP_GTID_SHUTDOWN) {
6486  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6487  "already shutdown\n"));
6488  return;
6489  } else if (gtid == KMP_GTID_MONITOR) {
6490  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6491  "registered, or system shutdown\n"));
6492  return;
6493  } else if (gtid == KMP_GTID_DNE) {
6494  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6495  "shutdown\n"));
6496  /* we don't know who we are, but we may still shutdown the library */
6497  } else if (KMP_UBER_GTID(gtid)) {
6498  /* unregister ourselves as an uber thread. gtid is no longer valid */
6499  if (__kmp_root[gtid]->r.r_active) {
6500  __kmp_global.g.g_abort = -1;
6501  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6502  __kmp_unregister_library();
6503  KA_TRACE(10,
6504  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6505  gtid));
6506  return;
6507  } else {
6508  __kmp_itthash_clean(__kmp_threads[gtid]);
6509  KA_TRACE(
6510  10,
6511  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6512  __kmp_unregister_root_current_thread(gtid);
6513  }
6514  } else {
6515 /* worker threads may call this function through the atexit handler, if they
6516  * call exit() */
6517 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6518  TODO: do a thorough shutdown instead */
6519 #ifdef DUMP_DEBUG_ON_EXIT
6520  if (__kmp_debug_buf)
6521  __kmp_dump_debug_buffer();
6522 #endif
 6523  // The unregister-library call was added here for the shared-memory (Linux)
 6524  // path: without it, stale registration files pile up in /dev/shm.
 6525  // Clean up the shared memory file before exiting.
6526  __kmp_unregister_library();
6527  return;
6528  }
6529  }
6530  /* synchronize the termination process */
6531  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6532 
6533  /* have we already finished */
6534  if (__kmp_global.g.g_abort) {
6535  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6536  /* TODO abort? */
6537  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6538  return;
6539  }
6540  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6541  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6542  return;
6543  }
6544 
6545  /* We need this lock to enforce mutex between this reading of
6546  __kmp_threads_capacity and the writing by __kmp_register_root.
6547  Alternatively, we can use a counter of roots that is atomically updated by
6548  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6549  __kmp_internal_end_*. */
6550  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6551 
6552  /* now we can safely conduct the actual termination */
6553  __kmp_internal_end();
6554 
6555  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6556  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6557 
6558  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6559 
6560 #ifdef DUMP_DEBUG_ON_EXIT
6561  if (__kmp_debug_buf)
6562  __kmp_dump_debug_buffer();
6563 #endif
6564 
6565 #if KMP_OS_WINDOWS
6566  __kmp_close_console();
6567 #endif
6568 
6569  __kmp_fini_allocator();
6570 
6571 } // __kmp_internal_end_library
6572 
6573 void __kmp_internal_end_thread(int gtid_req) {
6574  int i;
6575 
6576  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6577  /* this shouldn't be a race condition because __kmp_internal_end() is the
6578  * only place to clear __kmp_serial_init */
6579  /* we'll check this later too, after we get the lock */
6580  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6581  // redundant, because the next check will work in any case.
6582  if (__kmp_global.g.g_abort) {
6583  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6584  /* TODO abort? */
6585  return;
6586  }
6587  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6588  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6589  return;
6590  }
6591 
6592  // If hidden helper team has been initialized, we need to deinit it
6593  if (TCR_4(__kmp_init_hidden_helper) &&
6594  !TCR_4(__kmp_hidden_helper_team_done)) {
6595  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6596  // First release the main thread to let it continue its work
6597  __kmp_hidden_helper_main_thread_release();
6598  // Wait until the hidden helper team has been destroyed
6599  __kmp_hidden_helper_threads_deinitz_wait();
6600  }
6601 
6602  KMP_MB(); /* Flush all pending memory write invalidates. */
6603 
6604  /* find out who we are and what we should do */
6605  {
6606  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6607  KA_TRACE(10,
6608  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6609  if (gtid == KMP_GTID_SHUTDOWN) {
6610  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6611  "already shutdown\n"));
6612  return;
6613  } else if (gtid == KMP_GTID_MONITOR) {
6614  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6615  "registered, or system shutdown\n"));
6616  return;
6617  } else if (gtid == KMP_GTID_DNE) {
6618  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6619  "shutdown\n"));
6620  return;
6621  /* we don't know who we are */
6622  } else if (KMP_UBER_GTID(gtid)) {
6623  /* unregister ourselves as an uber thread. gtid is no longer valid */
6624  if (__kmp_root[gtid]->r.r_active) {
6625  __kmp_global.g.g_abort = -1;
6626  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6627  KA_TRACE(10,
6628  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6629  gtid));
6630  return;
6631  } else {
6632  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6633  gtid));
6634  __kmp_unregister_root_current_thread(gtid);
6635  }
6636  } else {
6637  /* just a worker thread, let's leave */
6638  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6639 
6640  if (gtid >= 0) {
6641  __kmp_threads[gtid]->th.th_task_team = NULL;
6642  }
6643 
6644  KA_TRACE(10,
6645  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6646  gtid));
6647  return;
6648  }
6649  }
6650 #if KMP_DYNAMIC_LIB
6651  if (__kmp_pause_status != kmp_hard_paused)
 6652  // AC: let's not shut down the dynamic library at the exit of an uber thread;
 6653  // it is better to shut down later, in the library destructor.
6654  {
6655  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6656  return;
6657  }
6658 #endif
6659  /* synchronize the termination process */
6660  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6661 
6662  /* have we already finished */
6663  if (__kmp_global.g.g_abort) {
6664  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6665  /* TODO abort? */
6666  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6667  return;
6668  }
6669  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6670  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6671  return;
6672  }
6673 
6674  /* We need this lock to enforce mutex between this reading of
6675  __kmp_threads_capacity and the writing by __kmp_register_root.
6676  Alternatively, we can use a counter of roots that is atomically updated by
6677  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6678  __kmp_internal_end_*. */
6679 
6680  /* should we finish the run-time? are all siblings done? */
6681  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6682 
6683  for (i = 0; i < __kmp_threads_capacity; ++i) {
6684  if (KMP_UBER_GTID(i)) {
6685  KA_TRACE(
6686  10,
6687  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6688  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6689  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6690  return;
6691  }
6692  }
6693 
6694  /* now we can safely conduct the actual termination */
6695 
6696  __kmp_internal_end();
6697 
6698  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6699  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6700 
6701  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6702 
6703 #ifdef DUMP_DEBUG_ON_EXIT
6704  if (__kmp_debug_buf)
6705  __kmp_dump_debug_buffer();
6706 #endif
6707 } // __kmp_internal_end_thread
6708 
6709 // -----------------------------------------------------------------------------
6710 // Library registration stuff.
6711 
6712 static long __kmp_registration_flag = 0;
6713 // Random value used to indicate library initialization.
6714 static char *__kmp_registration_str = NULL;
6715 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6716 
6717 static inline char *__kmp_reg_status_name() {
6718 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6719  each thread. If registration and unregistration go in different threads
6720  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
 6721  env var cannot be found, because the name will contain a different pid. */
6722 // macOS* complains about name being too long with additional getuid()
6723 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6724  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6725  (int)getuid());
6726 #else
6727  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6728 #endif
 6729 } // __kmp_reg_status_name
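// For illustration only: on a non-macOS Unix dynamic-library build this yields
// a name such as __KMP_REGISTERED_LIB_12345_1000 (pid 12345, uid 1000); the
// other builds omit the uid suffix.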
6730 
6731 #if defined(KMP_USE_SHM)
6732 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6733 char *temp_reg_status_file_name = nullptr;
6734 #endif
6735 
6736 void __kmp_register_library_startup(void) {
6737 
6738  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6739  int done = 0;
6740  union {
6741  double dtime;
6742  long ltime;
6743  } time;
6744 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6745  __kmp_initialize_system_tick();
6746 #endif
6747  __kmp_read_system_time(&time.dtime);
6748  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6749  __kmp_registration_str =
6750  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6751  __kmp_registration_flag, KMP_LIBRARY_FILE);
6752 
6753  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6754  __kmp_registration_str));
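  // Registration protocol sketch (inferred from the loop below): publish the
  // string "<address of __kmp_registration_flag>-<flag value>-<library file>",
  // e.g. "0x7f12deadbeef-cafe1234-libomp.so" (illustrative values only), via
  // shared memory or an environment variable. If a different value is already
  // present, parse it to decide whether the other runtime copy is still alive
  // (its flag address is mapped and holds the expected value) or dead; in the
  // dead case the stale entry is removed and registration is retried.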
6755 
6756  while (!done) {
6757 
6758  char *value = NULL; // Actual value of the environment variable.
6759 
6760 #if defined(KMP_USE_SHM)
6761  char *shm_name = __kmp_str_format("/%s", name);
6762  int shm_preexist = 0;
6763  char *data1;
6764  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6765  if ((fd1 == -1) && (errno == EEXIST)) {
6766  // file didn't open because it already exists.
6767  // try opening existing file
6768  fd1 = shm_open(shm_name, O_RDWR, 0666);
6769  if (fd1 == -1) { // file didn't open
6770  // error out here
6771  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6772  __kmp_msg_null);
6773  } else {
6774  // able to open existing file
6775  shm_preexist = 1;
6776  }
6777  } else if (fd1 == -1) {
 6778  // SHM didn't open due to an error other than "already exists". Try to
 6779  // create a temp file under /tmp.
6780  // TODO: /tmp might not always be the temporary directory. For now we will
6781  // not consider TMPDIR. If /tmp is not accessible, we simply error out.
6782  char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
6783  fd1 = mkstemp(temp_file_name);
6784  if (fd1 == -1) {
6785  // error out here.
6786  __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
6787  __kmp_msg_null);
6788  }
6789  temp_reg_status_file_name = temp_file_name;
6790  }
6791  if (shm_preexist == 0) {
 6792  // we created the SHM; now set its size
6793  if (ftruncate(fd1, SHM_SIZE) == -1) {
 6794  // error occurred setting size
6795  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6796  KMP_ERR(errno), __kmp_msg_null);
6797  }
6798  }
6799  data1 =
6800  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6801  if (data1 == MAP_FAILED) {
6802  // failed to map shared memory
6803  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6804  __kmp_msg_null);
6805  }
6806  if (shm_preexist == 0) { // set data to SHM, set value
6807  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6808  }
6809  // Read value from either what we just wrote or existing file.
6810  value = __kmp_str_format("%s", data1); // read value from SHM
6811  munmap(data1, SHM_SIZE);
6812  close(fd1);
6813 #else // Windows and unix with static library
 6814  // Set the environment variable, but do not overwrite it if it already exists.
6815  __kmp_env_set(name, __kmp_registration_str, 0);
6816  // read value to see if it got set
6817  value = __kmp_env_get(name);
6818 #endif
6819 
6820  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6821  done = 1; // Ok, environment variable set successfully, exit the loop.
6822  } else {
6823  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
 6824  // Check whether it is alive or dead.
6825  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6826  char *tail = value;
6827  char *flag_addr_str = NULL;
6828  char *flag_val_str = NULL;
6829  char const *file_name = NULL;
6830  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6831  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6832  file_name = tail;
6833  if (tail != NULL) {
6834  unsigned long *flag_addr = 0;
6835  unsigned long flag_val = 0;
6836  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6837  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
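  // These two scans recover the flag address and flag value that the other
  // runtime copy encoded with the same "%p-%lx-%s" format used above.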
6838  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6839  // First, check whether environment-encoded address is mapped into
6840  // addr space.
6841  // If so, dereference it to see if it still has the right value.
6842  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6843  neighbor = 1;
6844  } else {
6845  // If not, then we know the other copy of the library is no longer
6846  // running.
6847  neighbor = 2;
6848  }
6849  }
6850  }
6851  switch (neighbor) {
6852  case 0: // Cannot parse environment variable -- neighbor status unknown.
 6853  // Assume it is an incompatible format from a future version of the
6854  // library. Assume the other library is alive.
6855  // WARN( ... ); // TODO: Issue a warning.
6856  file_name = "unknown library";
6857  KMP_FALLTHROUGH();
 6858  // Attention! Falling through to the next case. That's intentional.
6859  case 1: { // Neighbor is alive.
6860  // Check it is allowed.
6861  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6862  if (!__kmp_str_match_true(duplicate_ok)) {
6863  // That's not allowed. Issue fatal error.
6864  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6865  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6866  }
6867  KMP_INTERNAL_FREE(duplicate_ok);
6868  __kmp_duplicate_library_ok = 1;
6869  done = 1; // Exit the loop.
6870  } break;
6871  case 2: { // Neighbor is dead.
6872 
6873 #if defined(KMP_USE_SHM)
6874  // close shared memory.
6875  shm_unlink(shm_name); // this removes file in /dev/shm
6876 #else
6877  // Clear the variable and try to register library again.
6878  __kmp_env_unset(name);
6879 #endif
6880  } break;
6881  default: {
6882  KMP_DEBUG_ASSERT(0);
6883  } break;
6884  }
6885  }
6886  KMP_INTERNAL_FREE((void *)value);
6887 #if defined(KMP_USE_SHM)
6888  KMP_INTERNAL_FREE((void *)shm_name);
6889 #endif
6890  } // while
6891  KMP_INTERNAL_FREE((void *)name);
6892 
6893 } // func __kmp_register_library_startup
6894 
6895 void __kmp_unregister_library(void) {
6896 
6897  char *name = __kmp_reg_status_name();
6898  char *value = NULL;
6899 
6900 #if defined(KMP_USE_SHM)
6901  bool use_shm = true;
6902  char *shm_name = __kmp_str_format("/%s", name);
6903  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6904  if (fd1 == -1) {
6905  // File did not open. Try the temporary file.
6906  use_shm = false;
6907  KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6908  fd1 = open(temp_reg_status_file_name, O_RDONLY);
6909  if (fd1 == -1) {
 6910  // give up now.
6911  return;
6912  }
6913  }
6914  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6915  if (data1 != MAP_FAILED) {
6916  value = __kmp_str_format("%s", data1); // read value from SHM
6917  munmap(data1, SHM_SIZE);
6918  }
6919  close(fd1);
6920 #else
6921  value = __kmp_env_get(name);
6922 #endif
6923 
6924  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6925  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6926  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6927 // Ok, this is our variable. Delete it.
6928 #if defined(KMP_USE_SHM)
6929  if (use_shm) {
6930  shm_unlink(shm_name); // this removes file in /dev/shm
6931  } else {
6932  KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6933  unlink(temp_reg_status_file_name); // this removes the temp file
6934  }
6935 #else
6936  __kmp_env_unset(name);
6937 #endif
6938  }
6939 
6940 #if defined(KMP_USE_SHM)
6941  KMP_INTERNAL_FREE(shm_name);
6942  if (!use_shm) {
6943  KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6944  KMP_INTERNAL_FREE(temp_reg_status_file_name);
6945  }
6946 #endif
6947 
6948  KMP_INTERNAL_FREE(__kmp_registration_str);
6949  KMP_INTERNAL_FREE(value);
6950  KMP_INTERNAL_FREE(name);
6951 
6952  __kmp_registration_flag = 0;
6953  __kmp_registration_str = NULL;
6954 
6955 } // __kmp_unregister_library
6956 
6957 // End of Library registration stuff.
6958 // -----------------------------------------------------------------------------
6959 
6960 #if KMP_MIC_SUPPORTED
6961 
6962 static void __kmp_check_mic_type() {
6963  kmp_cpuid_t cpuid_state = {0};
6964  kmp_cpuid_t *cs_p = &cpuid_state;
6965  __kmp_x86_cpuid(1, 0, cs_p);
6966  // We don't support mic1 at the moment
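  // Assumption: CPUID leaf 1 EAX carries the family/model encoding; the masks
  // below are used to tell KNC (mic2) apart from the later KNL-class parts
  // (mic3).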
6967  if ((cs_p->eax & 0xff0) == 0xB10) {
6968  __kmp_mic_type = mic2;
6969  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6970  __kmp_mic_type = mic3;
6971  } else {
6972  __kmp_mic_type = non_mic;
6973  }
6974 }
6975 
6976 #endif /* KMP_MIC_SUPPORTED */
6977 
6978 #if KMP_HAVE_UMWAIT
6979 static void __kmp_user_level_mwait_init() {
6980  struct kmp_cpuid buf;
6981  __kmp_x86_cpuid(7, 0, &buf);
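  // CPUID.(EAX=7,ECX=0):ECX bit 5 reports the WAITPKG feature
  // (umonitor/umwait/tpause), which is what the extraction below tests.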
6982  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6983  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6984  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6985  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6986  __kmp_umwait_enabled));
6987 }
6988 #elif KMP_HAVE_MWAIT
6989 #ifndef AT_INTELPHIUSERMWAIT
6990 // Spurious, non-existent value that should always fail to return anything.
6991 // Will be replaced with the correct value when we know that.
6992 #define AT_INTELPHIUSERMWAIT 10000
6993 #endif
 6994 // The getauxval() function is available in RHEL7 and SLES12. If the RTL is
 6995 // built on a system with an earlier OS, the following internal function is
 6996 // used as a fallback when the entry is not found.
6997 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6998 unsigned long getauxval(unsigned long) { return 0; }
6999 
7000 static void __kmp_user_level_mwait_init() {
7001  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7002  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7003  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7004  // KMP_USER_LEVEL_MWAIT was set to TRUE.
7005  if (__kmp_mic_type == mic3) {
7006  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7007  if ((res & 0x1) || __kmp_user_level_mwait) {
7008  __kmp_mwait_enabled = TRUE;
7009  if (__kmp_user_level_mwait) {
7010  KMP_INFORM(EnvMwaitWarn);
7011  }
7012  } else {
7013  __kmp_mwait_enabled = FALSE;
7014  }
7015  }
7016  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7017  "__kmp_mwait_enabled = %d\n",
7018  __kmp_mic_type, __kmp_mwait_enabled));
7019 }
7020 #endif /* KMP_HAVE_UMWAIT */
7021 
7022 static void __kmp_do_serial_initialize(void) {
7023  int i, gtid;
7024  size_t size;
7025 
7026  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7027 
7028  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7029  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7030  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7031  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7032  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7033 
7034 #if OMPT_SUPPORT
7035  ompt_pre_init();
7036 #endif
7037 #if OMPD_SUPPORT
7038  __kmp_env_dump();
7039  ompd_init();
7040 #endif
7041 
7042  __kmp_validate_locks();
7043 
7044  /* Initialize internal memory allocator */
7045  __kmp_init_allocator();
7046 
7047  /* Register the library startup via an environment variable or via mapped
7048  shared memory file and check to see whether another copy of the library is
 7049  already registered. Since a forked child process is often terminated, we
 7050  postpone the registration until middle initialization in the child. */
7051  if (__kmp_need_register_serial)
7052  __kmp_register_library_startup();
7053 
7054  /* TODO reinitialization of library */
7055  if (TCR_4(__kmp_global.g.g_done)) {
7056  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7057  }
7058 
7059  __kmp_global.g.g_abort = 0;
7060  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7061 
7062 /* initialize the locks */
7063 #if KMP_USE_ADAPTIVE_LOCKS
7064 #if KMP_DEBUG_ADAPTIVE_LOCKS
7065  __kmp_init_speculative_stats();
7066 #endif
7067 #endif
7068 #if KMP_STATS_ENABLED
7069  __kmp_stats_init();
7070 #endif
7071  __kmp_init_lock(&__kmp_global_lock);
7072  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7073  __kmp_init_lock(&__kmp_debug_lock);
7074  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7075  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7076  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7077  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7078  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7079  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7080  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7081  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7082  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7083  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7084  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7085  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7086  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7087  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7088  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7089 #if KMP_USE_MONITOR
7090  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7091 #endif
7092  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7093 
7094  /* conduct initialization and initial setup of configuration */
7095 
7096  __kmp_runtime_initialize();
7097 
7098 #if KMP_MIC_SUPPORTED
7099  __kmp_check_mic_type();
7100 #endif
7101 
7102 // Some global variable initialization moved here from kmp_env_initialize()
7103 #ifdef KMP_DEBUG
7104  kmp_diag = 0;
7105 #endif
7106  __kmp_abort_delay = 0;
7107 
7108  // From __kmp_init_dflt_team_nth()
7109  /* assume the entire machine will be used */
7110  __kmp_dflt_team_nth_ub = __kmp_xproc;
7111  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7112  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7113  }
7114  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7115  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7116  }
7117  __kmp_max_nth = __kmp_sys_max_nth;
7118  __kmp_cg_max_nth = __kmp_sys_max_nth;
7119  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7120  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7121  __kmp_teams_max_nth = __kmp_sys_max_nth;
7122  }
7123 
7124  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7125  // part
7126  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7127 #if KMP_USE_MONITOR
7128  __kmp_monitor_wakeups =
7129  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7130  __kmp_bt_intervals =
7131  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7132 #endif
7133  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7134  __kmp_library = library_throughput;
7135  // From KMP_SCHEDULE initialization
7136  __kmp_static = kmp_sch_static_balanced;
 7137 // AC: do not use analytical here, because it is non-monotonic
7138 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7139 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7140 // need to repeat assignment
 7141 // Barrier initialization. Moved here from the barrier branch-bit control and
 7142 // barrier-method control parts of __kmp_env_initialize().
7143 #if KMP_FAST_REDUCTION_BARRIER
7144 #define kmp_reduction_barrier_gather_bb ((int)1)
7145 #define kmp_reduction_barrier_release_bb ((int)1)
7146 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7147 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7148 #endif // KMP_FAST_REDUCTION_BARRIER
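// With branch bits of 1 the gather/release trees of the reduction barrier use
// a branching factor of 2 (assuming the usual 2^branch_bits interpretation of
// these settings).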
7149  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7150  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7151  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7152  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7153  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7154 #if KMP_FAST_REDUCTION_BARRIER
7155  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7156  // lin_64 ): hyper,1
7157  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7158  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7159  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7160  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7161  }
7162 #endif // KMP_FAST_REDUCTION_BARRIER
7163  }
7164 #if KMP_FAST_REDUCTION_BARRIER
7165 #undef kmp_reduction_barrier_release_pat
7166 #undef kmp_reduction_barrier_gather_pat
7167 #undef kmp_reduction_barrier_release_bb
7168 #undef kmp_reduction_barrier_gather_bb
7169 #endif // KMP_FAST_REDUCTION_BARRIER
7170 #if KMP_MIC_SUPPORTED
7171  if (__kmp_mic_type == mic2) { // KNC
7172  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7173  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7174  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7175  1; // forkjoin release
7176  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7177  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7178  }
7179 #if KMP_FAST_REDUCTION_BARRIER
7180  if (__kmp_mic_type == mic2) { // KNC
7181  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7182  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7183  }
7184 #endif // KMP_FAST_REDUCTION_BARRIER
7185 #endif // KMP_MIC_SUPPORTED
7186 
7187 // From KMP_CHECKS initialization
7188 #ifdef KMP_DEBUG
7189  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7190 #else
7191  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7192 #endif
7193 
7194  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7195  __kmp_foreign_tp = TRUE;
7196 
7197  __kmp_global.g.g_dynamic = FALSE;
7198  __kmp_global.g.g_dynamic_mode = dynamic_default;
7199 
7200  __kmp_init_nesting_mode();
7201 
7202  __kmp_env_initialize(NULL);
7203 
7204 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7205  __kmp_user_level_mwait_init();
7206 #endif
7207 // Print all messages in message catalog for testing purposes.
7208 #ifdef KMP_DEBUG
7209  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7210  if (__kmp_str_match_true(val)) {
7211  kmp_str_buf_t buffer;
7212  __kmp_str_buf_init(&buffer);
7213  __kmp_i18n_dump_catalog(&buffer);
7214  __kmp_printf("%s", buffer.str);
7215  __kmp_str_buf_free(&buffer);
7216  }
7217  __kmp_env_free(&val);
7218 #endif
7219 
7220  __kmp_threads_capacity =
7221  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7222  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7223  __kmp_tp_capacity = __kmp_default_tp_capacity(
7224  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7225 
7226  // If the library is shut down properly, both pools must be NULL. Just in
7227  // case, set them to NULL -- some memory may leak, but subsequent code will
7228  // work even if pools are not freed.
7229  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7230  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7231  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7232  __kmp_thread_pool = NULL;
7233  __kmp_thread_pool_insert_pt = NULL;
7234  __kmp_team_pool = NULL;
7235 
7236  /* Allocate all of the variable sized records */
7237  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7238  * expandable */
7239  /* Since allocation is cache-aligned, just add extra padding at the end */
7240  size =
7241  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7242  CACHE_LINE;
7243  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7244  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7245  sizeof(kmp_info_t *) * __kmp_threads_capacity);
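  // Layout of the single allocation above: __kmp_threads_capacity kmp_info_t*
  // entries, immediately followed by __kmp_threads_capacity kmp_root_t*
  // entries (which is where __kmp_root points), plus the extra CACHE_LINE
  // bytes of padding noted above.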
7246 
7247  /* init thread counts */
7248  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7249  0); // Asserts fail if the library is reinitializing and
7250  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7251  __kmp_all_nth = 0;
7252  __kmp_nth = 0;
7253 
7254  /* setup the uber master thread and hierarchy */
7255  gtid = __kmp_register_root(TRUE);
7256  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7257  KMP_ASSERT(KMP_UBER_GTID(gtid));
7258  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7259 
7260  KMP_MB(); /* Flush all pending memory write invalidates. */
7261 
7262  __kmp_common_initialize();
7263 
7264 #if KMP_OS_UNIX
7265  /* invoke the child fork handler */
7266  __kmp_register_atfork();
7267 #endif
7268 
7269 #if !KMP_DYNAMIC_LIB || \
7270  ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7271  {
7272  /* Invoke the exit handler when the program finishes, only for static
7273  library and macOS* dynamic. For other dynamic libraries, we already
7274  have _fini and DllMain. */
7275  int rc = atexit(__kmp_internal_end_atexit);
7276  if (rc != 0) {
7277  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7278  __kmp_msg_null);
7279  }
7280  }
7281 #endif
7282 
7283 #if KMP_HANDLE_SIGNALS
7284 #if KMP_OS_UNIX
7285  /* NOTE: make sure that this is called before the user installs their own
7286  signal handlers so that the user handlers are called first. this way they
7287  can return false, not call our handler, avoid terminating the library, and
7288  continue execution where they left off. */
7289  __kmp_install_signals(FALSE);
7290 #endif /* KMP_OS_UNIX */
7291 #if KMP_OS_WINDOWS
7292  __kmp_install_signals(TRUE);
7293 #endif /* KMP_OS_WINDOWS */
7294 #endif
7295 
7296  /* we have finished the serial initialization */
7297  __kmp_init_counter++;
7298 
7299  __kmp_init_serial = TRUE;
7300 
7301  if (__kmp_settings) {
7302  __kmp_env_print();
7303  }
7304 
7305  if (__kmp_display_env || __kmp_display_env_verbose) {
7306  __kmp_env_print_2();
7307  }
7308 
7309 #if OMPT_SUPPORT
7310  ompt_post_init();
7311 #endif
7312 
7313  KMP_MB();
7314 
7315  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7316 }
7317 
7318 void __kmp_serial_initialize(void) {
7319  if (__kmp_init_serial) {
7320  return;
7321  }
7322  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7323  if (__kmp_init_serial) {
7324  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7325  return;
7326  }
7327  __kmp_do_serial_initialize();
7328  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7329 }
7330 
7331 static void __kmp_do_middle_initialize(void) {
7332  int i, j;
7333  int prev_dflt_team_nth;
7334 
7335  if (!__kmp_init_serial) {
7336  __kmp_do_serial_initialize();
7337  }
7338 
7339  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7340 
7341  if (UNLIKELY(!__kmp_need_register_serial)) {
7342  // We are in a forked child process. The registration was skipped during
7343  // serial initialization in __kmp_atfork_child handler. Do it here.
7344  __kmp_register_library_startup();
7345  }
7346 
7347  // Save the previous value for the __kmp_dflt_team_nth so that
7348  // we can avoid some reinitialization if it hasn't changed.
7349  prev_dflt_team_nth = __kmp_dflt_team_nth;
7350 
7351 #if KMP_AFFINITY_SUPPORTED
7352  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7353  // number of cores on the machine.
7354  __kmp_affinity_initialize(__kmp_affinity);
7355 
7356 #endif /* KMP_AFFINITY_SUPPORTED */
7357 
7358  KMP_ASSERT(__kmp_xproc > 0);
7359  if (__kmp_avail_proc == 0) {
7360  __kmp_avail_proc = __kmp_xproc;
7361  }
7362 
7363  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7364  // correct them now
7365  j = 0;
7366  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7367  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7368  __kmp_avail_proc;
7369  j++;
7370  }
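  // Example: with OMP_NUM_THREADS=,,2,3 the two leading entries arrive as 0
  // and have just been overwritten with __kmp_avail_proc above; the explicit
  // values 2 and 3 later in the list are left untouched.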
7371 
7372  if (__kmp_dflt_team_nth == 0) {
7373 #ifdef KMP_DFLT_NTH_CORES
7374  // Default #threads = #cores
7375  __kmp_dflt_team_nth = __kmp_ncores;
7376  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7377  "__kmp_ncores (%d)\n",
7378  __kmp_dflt_team_nth));
7379 #else
7380  // Default #threads = #available OS procs
7381  __kmp_dflt_team_nth = __kmp_avail_proc;
7382  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7383  "__kmp_avail_proc(%d)\n",
7384  __kmp_dflt_team_nth));
7385 #endif /* KMP_DFLT_NTH_CORES */
7386  }
7387 
7388  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7389  __kmp_dflt_team_nth = KMP_MIN_NTH;
7390  }
7391  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7392  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7393  }
7394 
7395  if (__kmp_nesting_mode > 0)
7396  __kmp_set_nesting_mode_threads();
7397 
7398  // There's no harm in continuing if the following check fails,
7399  // but it indicates an error in the previous logic.
7400  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7401 
7402  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7403  // Run through the __kmp_threads array and set the num threads icv for each
7404  // root thread that is currently registered with the RTL (which has not
7405  // already explicitly set its nthreads-var with a call to
7406  // omp_set_num_threads()).
7407  for (i = 0; i < __kmp_threads_capacity; i++) {
7408  kmp_info_t *thread = __kmp_threads[i];
7409  if (thread == NULL)
7410  continue;
7411  if (thread->th.th_current_task->td_icvs.nproc != 0)
7412  continue;
7413 
7414  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7415  }
7416  }
7417  KA_TRACE(
7418  20,
7419  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7420  __kmp_dflt_team_nth));
7421 
7422 #ifdef KMP_ADJUST_BLOCKTIME
7423  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7424  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7425  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7426  if (__kmp_nth > __kmp_avail_proc) {
7427  __kmp_zero_bt = TRUE;
7428  }
7429  }
7430 #endif /* KMP_ADJUST_BLOCKTIME */
7431 
7432  /* we have finished middle initialization */
7433  TCW_SYNC_4(__kmp_init_middle, TRUE);
7434 
7435  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7436 }
7437 
7438 void __kmp_middle_initialize(void) {
7439  if (__kmp_init_middle) {
7440  return;
7441  }
7442  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7443  if (__kmp_init_middle) {
7444  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7445  return;
7446  }
7447  __kmp_do_middle_initialize();
7448  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7449 }
7450 
7451 void __kmp_parallel_initialize(void) {
7452  int gtid = __kmp_entry_gtid(); // this might be a new root
7453 
7454  /* synchronize parallel initialization (for sibling) */
7455  if (TCR_4(__kmp_init_parallel))
7456  return;
7457  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7458  if (TCR_4(__kmp_init_parallel)) {
7459  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7460  return;
7461  }
7462 
7463  /* TODO reinitialization after we have already shut down */
7464  if (TCR_4(__kmp_global.g.g_done)) {
7465  KA_TRACE(
7466  10,
7467  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7468  __kmp_infinite_loop();
7469  }
7470 
7471  /* jc: The lock __kmp_initz_lock is already held, so calling
7472  __kmp_serial_initialize would cause a deadlock. So we call
7473  __kmp_do_serial_initialize directly. */
7474  if (!__kmp_init_middle) {
7475  __kmp_do_middle_initialize();
7476  }
7477  __kmp_assign_root_init_mask();
7478  __kmp_resume_if_hard_paused();
7479 
7480  /* begin initialization */
7481  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7482  KMP_ASSERT(KMP_UBER_GTID(gtid));
7483 
7484 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7485  // Save the FP control regs.
7486  // Worker threads will set theirs to these values at thread startup.
7487  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7488  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7489  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7490 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7491 
7492 #if KMP_OS_UNIX
7493 #if KMP_HANDLE_SIGNALS
7494  /* must be after __kmp_serial_initialize */
7495  __kmp_install_signals(TRUE);
7496 #endif
7497 #endif
7498 
7499  __kmp_suspend_initialize();
7500 
7501 #if defined(USE_LOAD_BALANCE)
7502  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7503  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7504  }
7505 #else
7506  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7507  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7508  }
7509 #endif
7510 
7511  if (__kmp_version) {
7512  __kmp_print_version_2();
7513  }
7514 
7515  /* we have finished parallel initialization */
7516  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7517 
7518  KMP_MB();
7519  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7520 
7521  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7522 }
7523 
7524 void __kmp_hidden_helper_initialize() {
7525  if (TCR_4(__kmp_init_hidden_helper))
7526  return;
7527 
7528  // __kmp_parallel_initialize is required before we initialize hidden helper
7529  if (!TCR_4(__kmp_init_parallel))
7530  __kmp_parallel_initialize();
7531 
7532  // Double check. Note that this double check should not be placed before
 7533  // __kmp_parallel_initialize as it would cause a deadlock.
7534  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7535  if (TCR_4(__kmp_init_hidden_helper)) {
7536  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7537  return;
7538  }
7539 
7540 #if KMP_AFFINITY_SUPPORTED
7541  // Initialize hidden helper affinity settings.
7542  // The above __kmp_parallel_initialize() will initialize
7543  // regular affinity (and topology) if not already done.
7544  if (!__kmp_hh_affinity.flags.initialized)
7545  __kmp_affinity_initialize(__kmp_hh_affinity);
7546 #endif
7547 
7548  // Set the count of hidden helper tasks to be executed to zero
7549  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7550 
7551  // Set the global variable indicating that we're initializing hidden helper
7552  // team/threads
7553  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7554 
7555  // Platform independent initialization
7556  __kmp_do_initialize_hidden_helper_threads();
7557 
7558  // Wait here for the finish of initialization of hidden helper teams
7559  __kmp_hidden_helper_threads_initz_wait();
7560 
7561  // We have finished hidden helper initialization
7562  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7563 
7564  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7565 }
7566 
7567 /* ------------------------------------------------------------------------ */
7568 
7569 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7570  kmp_team_t *team) {
7571  kmp_disp_t *dispatch;
7572 
7573  KMP_MB();
7574 
7575  /* none of the threads have encountered any constructs, yet. */
7576  this_thr->th.th_local.this_construct = 0;
7577 #if KMP_CACHE_MANAGE
7578  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7579 #endif /* KMP_CACHE_MANAGE */
7580  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7581  KMP_DEBUG_ASSERT(dispatch);
7582  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7583  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7584  // this_thr->th.th_info.ds.ds_tid ] );
7585 
7586  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7587  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7588  if (__kmp_env_consistency_check)
7589  __kmp_push_parallel(gtid, team->t.t_ident);
7590 
7591  KMP_MB(); /* Flush all pending memory write invalidates. */
7592 }
7593 
7594 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7595  kmp_team_t *team) {
7596  if (__kmp_env_consistency_check)
7597  __kmp_pop_parallel(gtid, team->t.t_ident);
7598 
7599  __kmp_finish_implicit_task(this_thr);
7600 }
7601 
7602 int __kmp_invoke_task_func(int gtid) {
7603  int rc;
7604  int tid = __kmp_tid_from_gtid(gtid);
7605  kmp_info_t *this_thr = __kmp_threads[gtid];
7606  kmp_team_t *team = this_thr->th.th_team;
7607 
7608  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7609 #if USE_ITT_BUILD
7610  if (__itt_stack_caller_create_ptr) {
7611  // inform ittnotify about entering user's code
7612  if (team->t.t_stack_id != NULL) {
7613  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7614  } else {
7615  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7616  __kmp_itt_stack_callee_enter(
7617  (__itt_caller)team->t.t_parent->t.t_stack_id);
7618  }
7619  }
7620 #endif /* USE_ITT_BUILD */
7621 #if INCLUDE_SSC_MARKS
7622  SSC_MARK_INVOKING();
7623 #endif
7624 
7625 #if OMPT_SUPPORT
7626  void *dummy;
7627  void **exit_frame_p;
7628  ompt_data_t *my_task_data;
7629  ompt_data_t *my_parallel_data;
7630  int ompt_team_size;
7631 
7632  if (ompt_enabled.enabled) {
7633  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7634  .ompt_task_info.frame.exit_frame.ptr);
7635  } else {
7636  exit_frame_p = &dummy;
7637  }
7638 
7639  my_task_data =
7640  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7641  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7642  if (ompt_enabled.ompt_callback_implicit_task) {
7643  ompt_team_size = team->t.t_nproc;
7644  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7645  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7646  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7647  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7648  }
7649 #endif
7650 
7651 #if KMP_STATS_ENABLED
7652  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7653  if (previous_state == stats_state_e::TEAMS_REGION) {
7654  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7655  } else {
7656  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7657  }
7658  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7659 #endif
7660 
7661  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7662  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7663 #if OMPT_SUPPORT
7664  ,
7665  exit_frame_p
7666 #endif
7667  );
7668 #if OMPT_SUPPORT
7669  *exit_frame_p = NULL;
7670  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7671 #endif
7672 
7673 #if KMP_STATS_ENABLED
7674  if (previous_state == stats_state_e::TEAMS_REGION) {
7675  KMP_SET_THREAD_STATE(previous_state);
7676  }
7677  KMP_POP_PARTITIONED_TIMER();
7678 #endif
7679 
7680 #if USE_ITT_BUILD
7681  if (__itt_stack_caller_create_ptr) {
7682  // inform ittnotify about leaving user's code
7683  if (team->t.t_stack_id != NULL) {
7684  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7685  } else {
7686  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7687  __kmp_itt_stack_callee_leave(
7688  (__itt_caller)team->t.t_parent->t.t_stack_id);
7689  }
7690  }
7691 #endif /* USE_ITT_BUILD */
7692  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7693 
7694  return rc;
7695 }
7696 
7697 void __kmp_teams_master(int gtid) {
7698  // This routine is called by all primary threads in teams construct
7699  kmp_info_t *thr = __kmp_threads[gtid];
7700  kmp_team_t *team = thr->th.th_team;
7701  ident_t *loc = team->t.t_ident;
7702  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7703  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7704  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7705  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7706  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7707 
7708  // This thread is a new CG root. Set up the proper variables.
7709  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7710  tmp->cg_root = thr; // Make thr the CG root
7711  // Init to thread limit stored when league primary threads were forked
7712  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7713  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7714  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7715  " cg_nthreads to 1\n",
7716  thr, tmp));
7717  tmp->up = thr->th.th_cg_roots;
7718  thr->th.th_cg_roots = tmp;
7719 
7720 // Launch the league of teams now, but do not let workers execute
7721 // (they hang on the fork barrier until the next parallel region)
7722 #if INCLUDE_SSC_MARKS
7723  SSC_MARK_FORKING();
7724 #endif
7725  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7726  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7727  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7728 #if INCLUDE_SSC_MARKS
7729  SSC_MARK_JOINING();
7730 #endif
7731  // If the team size was reduced from the limit, set it to the new size
7732  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7733  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7734  // AC: last parameter "1" eliminates join barrier which won't work because
7735  // worker threads are in a fork barrier waiting for more parallel regions
7736  __kmp_join_call(loc, gtid
7737 #if OMPT_SUPPORT
7738  ,
7739  fork_context_intel
7740 #endif
7741  ,
7742  1);
7743 }
7744 
7745 int __kmp_invoke_teams_master(int gtid) {
7746  kmp_info_t *this_thr = __kmp_threads[gtid];
7747  kmp_team_t *team = this_thr->th.th_team;
7748 #if KMP_DEBUG
7749  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7750  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7751  (void *)__kmp_teams_master);
7752 #endif
7753  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7754 #if OMPT_SUPPORT
7755  int tid = __kmp_tid_from_gtid(gtid);
7756  ompt_data_t *task_data =
7757  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7758  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7759  if (ompt_enabled.ompt_callback_implicit_task) {
7760  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7761  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7762  ompt_task_initial);
7763  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7764  }
7765 #endif
7766  __kmp_teams_master(gtid);
7767 #if OMPT_SUPPORT
7768  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7769 #endif
7770  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7771  return 1;
7772 }
7773 
7774 /* this sets the requested number of threads for the next parallel region
7775  encountered by this team. since this should be enclosed in the forkjoin
7776  critical section it should avoid race conditions with asymmetrical nested
7777  parallelism */
7778 
7779 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7780  kmp_info_t *thr = __kmp_threads[gtid];
7781 
7782  if (num_threads > 0)
7783  thr->th.th_set_nproc = num_threads;
7784 }
7785 
7786 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7787  int num_threads) {
7788  KMP_DEBUG_ASSERT(thr);
7789  // Remember the number of threads for inner parallel regions
7790  if (!TCR_4(__kmp_init_middle))
7791  __kmp_middle_initialize(); // get internal globals calculated
7792  __kmp_assign_root_init_mask();
7793  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7794  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7795 
7796  if (num_threads == 0) {
7797  if (__kmp_teams_thread_limit > 0) {
7798  num_threads = __kmp_teams_thread_limit;
7799  } else {
7800  num_threads = __kmp_avail_proc / num_teams;
7801  }
7802  // adjust num_threads w/o warning as it is not a user setting
7803  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7804  // no thread_limit clause specified - do not change thread-limit-var ICV
7805  if (num_threads > __kmp_dflt_team_nth) {
7806  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7807  }
7808  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7809  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7810  } // prevent team size from exceeding thread-limit-var
7811  if (num_teams * num_threads > __kmp_teams_max_nth) {
7812  num_threads = __kmp_teams_max_nth / num_teams;
7813  }
7814  if (num_threads == 0) {
7815  num_threads = 1;
7816  }
7817  } else {
7818  if (num_threads < 0) {
7819  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7820  __kmp_msg_null);
7821  num_threads = 1;
7822  }
7823  // This thread will be the primary thread of the league's primary threads
7824  // Store new thread limit; old limit is saved in th_cg_roots list
7825  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7826  // num_threads = min(num_threads, nthreads-var)
7827  if (num_threads > __kmp_dflt_team_nth) {
7828  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7829  }
7830  if (num_teams * num_threads > __kmp_teams_max_nth) {
7831  int new_threads = __kmp_teams_max_nth / num_teams;
7832  if (new_threads == 0) {
7833  new_threads = 1;
7834  }
7835  if (new_threads != num_threads) {
7836  if (!__kmp_reserve_warn) { // user asked for too many threads
7837  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7838  __kmp_msg(kmp_ms_warning,
7839  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7840  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7841  }
7842  }
7843  num_threads = new_threads;
7844  }
7845  }
7846  thr->th.th_teams_size.nth = num_threads;
7847 }
7848 
7849 /* this sets the requested number of teams for the teams region and/or
7850  the number of threads for the next parallel region encountered */
7851 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7852  int num_threads) {
7853  kmp_info_t *thr = __kmp_threads[gtid];
7854  if (num_teams < 0) {
7855  // OpenMP specification requires requested values to be positive,
7856  // but people can send us any value, so we'd better check
7857  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7858  __kmp_msg_null);
7859  num_teams = 1;
7860  }
7861  if (num_teams == 0) {
7862  if (__kmp_nteams > 0) {
7863  num_teams = __kmp_nteams;
7864  } else {
7865  num_teams = 1; // default number of teams is 1.
7866  }
7867  }
7868  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7869  if (!__kmp_reserve_warn) {
7870  __kmp_reserve_warn = 1;
7871  __kmp_msg(kmp_ms_warning,
7872  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7873  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7874  }
7875  num_teams = __kmp_teams_max_nth;
7876  }
7877  // Set number of teams (number of threads in the outer "parallel" of the
7878  // teams)
7879  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7880 
7881  __kmp_push_thread_limit(thr, num_teams, num_threads);
7882 }
7883 
7884 /* This sets the requested number of teams for the teams region and/or
7885  the number of threads for the next parallel region encountered */
7886 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7887  int num_teams_ub, int num_threads) {
7888  kmp_info_t *thr = __kmp_threads[gtid];
7889  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7890  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7891  KMP_DEBUG_ASSERT(num_threads >= 0);
7892 
7893  if (num_teams_lb > num_teams_ub) {
7894  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7895  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7896  }
7897 
7898  int num_teams = 1; // default number of teams is 1.
7899 
7900  if (num_teams_lb == 0 && num_teams_ub > 0)
7901  num_teams_lb = num_teams_ub;
7902 
7903  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7904  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7905  if (num_teams > __kmp_teams_max_nth) {
7906  if (!__kmp_reserve_warn) {
7907  __kmp_reserve_warn = 1;
7908  __kmp_msg(kmp_ms_warning,
7909  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7910  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7911  }
7912  num_teams = __kmp_teams_max_nth;
7913  }
7914  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7915  num_teams = num_teams_ub;
7916  } else { // num_teams_lb <= num_teams <= num_teams_ub
7917  if (num_threads <= 0) {
7918  if (num_teams_ub > __kmp_teams_max_nth) {
7919  num_teams = num_teams_lb;
7920  } else {
7921  num_teams = num_teams_ub;
7922  }
7923  } else {
7924  num_teams = (num_threads > __kmp_teams_max_nth)
7925  ? num_teams
7926  : __kmp_teams_max_nth / num_threads;
7927  if (num_teams < num_teams_lb) {
7928  num_teams = num_teams_lb;
7929  } else if (num_teams > num_teams_ub) {
7930  num_teams = num_teams_ub;
7931  }
7932  }
7933  }
7934  // Set number of teams (number of threads in the outer "parallel" of the
7935  // teams)
7936  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7937 
7938  __kmp_push_thread_limit(thr, num_teams, num_threads);
7939 }
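
/* A worked example of the bounds handling above, with hypothetical values:
   for an OpenMP 5.1 num_teams(2:16) clause combined with thread_limit(4) and
   __kmp_teams_max_nth = 32, the final branch computes 32 / 4 = 8 teams, which
   already lies inside [2, 16]; with no thread_limit clause and
   16 <= __kmp_teams_max_nth, the upper bound 16 is chosen instead. */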
7940 
7941 // Set the proc_bind var to use in the following parallel region.
7942 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7943  kmp_info_t *thr = __kmp_threads[gtid];
7944  thr->th.th_set_proc_bind = proc_bind;
7945 }
7946 
7947 /* Launch the worker threads into the microtask. */
7948 
7949 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7950  kmp_info_t *this_thr = __kmp_threads[gtid];
7951 
7952 #ifdef KMP_DEBUG
7953  int f;
7954 #endif /* KMP_DEBUG */
7955 
7956  KMP_DEBUG_ASSERT(team);
7957  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7958  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7959  KMP_MB(); /* Flush all pending memory write invalidates. */
7960 
7961  team->t.t_construct = 0; /* no single directives seen yet */
7962  team->t.t_ordered.dt.t_value =
7963  0; /* thread 0 enters the ordered section first */
7964 
7965  /* Reset the identifiers on the dispatch buffer */
7966  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7967  if (team->t.t_max_nproc > 1) {
7968  int i;
7969  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7970  team->t.t_disp_buffer[i].buffer_index = i;
7971  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7972  }
7973  } else {
7974  team->t.t_disp_buffer[0].buffer_index = 0;
7975  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7976  }
7977 
7978  KMP_MB(); /* Flush all pending memory write invalidates. */
7979  KMP_ASSERT(this_thr->th.th_team == team);
7980 
7981 #ifdef KMP_DEBUG
7982  for (f = 0; f < team->t.t_nproc; f++) {
7983  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7984  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7985  }
7986 #endif /* KMP_DEBUG */
7987 
7988  /* release the worker threads so they may begin working */
7989  __kmp_fork_barrier(gtid, 0);
7990 }
7991 
7992 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7993  kmp_info_t *this_thr = __kmp_threads[gtid];
7994 
7995  KMP_DEBUG_ASSERT(team);
7996  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7997  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7998  KMP_MB(); /* Flush all pending memory write invalidates. */
7999 
8000  /* Join barrier after fork */
8001 
8002 #ifdef KMP_DEBUG
8003  if (__kmp_threads[gtid] &&
8004  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8005  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8006  __kmp_threads[gtid]);
8007  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8008  "team->t.t_nproc=%d\n",
8009  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8010  team->t.t_nproc);
8011  __kmp_print_structure();
8012  }
8013  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8014  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8015 #endif /* KMP_DEBUG */
8016 
8017  __kmp_join_barrier(gtid); /* wait for everyone */
8018 #if OMPT_SUPPORT
8019  if (ompt_enabled.enabled &&
8020  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8021  int ds_tid = this_thr->th.th_info.ds.ds_tid;
8022  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8023  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8024 #if OMPT_OPTIONAL
8025  void *codeptr = NULL;
8026  if (KMP_MASTER_TID(ds_tid) &&
8027  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8028  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8029  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8030 
8031  if (ompt_enabled.ompt_callback_sync_region_wait) {
8032  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8033  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8034  codeptr);
8035  }
8036  if (ompt_enabled.ompt_callback_sync_region) {
8037  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8038  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8039  codeptr);
8040  }
8041 #endif
8042  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8043  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8044  ompt_scope_end, NULL, task_data, 0, ds_tid,
8045  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8046  }
8047  }
8048 #endif
8049 
8050  KMP_MB(); /* Flush all pending memory write invalidates. */
8051  KMP_ASSERT(this_thr->th.th_team == team);
8052 }
8053 
8054 /* ------------------------------------------------------------------------ */
8055 
8056 #ifdef USE_LOAD_BALANCE
8057 
8058 // Return the number of worker threads actively spinning in the hot team,
8059 // if we are at the outermost level of parallelism. Otherwise, return 0.
8060 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8061  int i;
8062  int retval;
8063  kmp_team_t *hot_team;
8064 
8065  if (root->r.r_active) {
8066  return 0;
8067  }
8068  hot_team = root->r.r_hot_team;
8069  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8070  return hot_team->t.t_nproc - 1; // Don't count primary thread
8071  }
8072 
8073  // Skip the primary thread - it is accounted for elsewhere.
8074  retval = 0;
8075  for (i = 1; i < hot_team->t.t_nproc; i++) {
8076  if (hot_team->t.t_threads[i]->th.th_active) {
8077  retval++;
8078  }
8079  }
8080  return retval;
8081 }
8082 
8083 // Perform an automatic adjustment to the number of
8084 // threads used by the next parallel region.
8085 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8086  int retval;
8087  int pool_active;
8088  int hot_team_active;
8089  int team_curr_active;
8090  int system_active;
8091 
8092  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8093  set_nproc));
8094  KMP_DEBUG_ASSERT(root);
8095  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8096  ->th.th_current_task->td_icvs.dynamic == TRUE);
8097  KMP_DEBUG_ASSERT(set_nproc > 1);
8098 
8099  if (set_nproc == 1) {
8100  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8101  return 1;
8102  }
8103 
8104  // Threads that are active in the thread pool, active in the hot team for this
8105  // particular root (if we are at the outer par level), and the currently
8106  // executing thread (to become the primary thread) are available to add to the
8107  // new team, but are currently contributing to the system load, and must be
8108  // accounted for.
8109  pool_active = __kmp_thread_pool_active_nth;
8110  hot_team_active = __kmp_active_hot_team_nproc(root);
8111  team_curr_active = pool_active + hot_team_active + 1;
8112 
8113  // Check the system load.
8114  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8115  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8116  "hot team active = %d\n",
8117  system_active, pool_active, hot_team_active));
8118 
8119  if (system_active < 0) {
8120  // There was an error reading the necessary info from /proc, so use the
8121  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8122  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8123  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8124  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8125 
8126  // Make this call behave like the thread limit algorithm.
8127  retval = __kmp_avail_proc - __kmp_nth +
8128  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8129  if (retval > set_nproc) {
8130  retval = set_nproc;
8131  }
8132  if (retval < KMP_MIN_NTH) {
8133  retval = KMP_MIN_NTH;
8134  }
8135 
8136  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8137  retval));
8138  return retval;
8139  }
8140 
8141  // There is a slight delay in the load balance algorithm in detecting new
8142  // running procs. The real system load at this instant should be at least as
8143 // large as the number of active OMP threads available to add to the team.
8144  if (system_active < team_curr_active) {
8145  system_active = team_curr_active;
8146  }
8147  retval = __kmp_avail_proc - system_active + team_curr_active;
8148  if (retval > set_nproc) {
8149  retval = set_nproc;
8150  }
8151  if (retval < KMP_MIN_NTH) {
8152  retval = KMP_MIN_NTH;
8153  }
8154 
8155  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8156  return retval;
8157 } // __kmp_load_balance_nproc()
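
/* A worked example of the formula above, with hypothetical numbers: for
   __kmp_avail_proc = 16, two active pool threads and three active hot-team
   workers (team_curr_active = 2 + 3 + 1 = 6), and a measured system_active
   of 10, the adjustment yields 16 - 10 + 6 = 12 threads; a request of
   set_nproc = 8 is then left at 8, with KMP_MIN_NTH as the lower bound. */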
8158 
8159 #endif /* USE_LOAD_BALANCE */
8160 
8161 /* ------------------------------------------------------------------------ */
8162 
8163 /* NOTE: this is called with the __kmp_init_lock held */
8164 void __kmp_cleanup(void) {
8165  int f;
8166 
8167  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8168 
8169  if (TCR_4(__kmp_init_parallel)) {
8170 #if KMP_HANDLE_SIGNALS
8171  __kmp_remove_signals();
8172 #endif
8173  TCW_4(__kmp_init_parallel, FALSE);
8174  }
8175 
8176  if (TCR_4(__kmp_init_middle)) {
8177 #if KMP_AFFINITY_SUPPORTED
8178  __kmp_affinity_uninitialize();
8179 #endif /* KMP_AFFINITY_SUPPORTED */
8180  __kmp_cleanup_hierarchy();
8181  TCW_4(__kmp_init_middle, FALSE);
8182  }
8183 
8184  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8185 
8186  if (__kmp_init_serial) {
8187  __kmp_runtime_destroy();
8188  __kmp_init_serial = FALSE;
8189  }
8190 
8191  __kmp_cleanup_threadprivate_caches();
8192 
8193  for (f = 0; f < __kmp_threads_capacity; f++) {
8194  if (__kmp_root[f] != NULL) {
8195  __kmp_free(__kmp_root[f]);
8196  __kmp_root[f] = NULL;
8197  }
8198  }
8199  __kmp_free(__kmp_threads);
8200 // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8201 // there is no need to free __kmp_root separately.
8202  __kmp_threads = NULL;
8203  __kmp_root = NULL;
8204  __kmp_threads_capacity = 0;
8205 
8206  // Free old __kmp_threads arrays if they exist.
8207  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8208  while (ptr) {
8209  kmp_old_threads_list_t *next = ptr->next;
8210  __kmp_free(ptr->threads);
8211  __kmp_free(ptr);
8212  ptr = next;
8213  }
8214 
8215 #if KMP_USE_DYNAMIC_LOCK
8216  __kmp_cleanup_indirect_user_locks();
8217 #else
8218  __kmp_cleanup_user_locks();
8219 #endif
8220 #if OMPD_SUPPORT
8221  if (ompd_state) {
8222  __kmp_free(ompd_env_block);
8223  ompd_env_block = NULL;
8224  ompd_env_block_size = 0;
8225  }
8226 #endif
8227 
8228 #if KMP_AFFINITY_SUPPORTED
8229  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8230  __kmp_cpuinfo_file = NULL;
8231 #endif /* KMP_AFFINITY_SUPPORTED */
8232 
8233 #if KMP_USE_ADAPTIVE_LOCKS
8234 #if KMP_DEBUG_ADAPTIVE_LOCKS
8235  __kmp_print_speculative_stats();
8236 #endif
8237 #endif
8238  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8239  __kmp_nested_nth.nth = NULL;
8240  __kmp_nested_nth.size = 0;
8241  __kmp_nested_nth.used = 0;
8242  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8243  __kmp_nested_proc_bind.bind_types = NULL;
8244  __kmp_nested_proc_bind.size = 0;
8245  __kmp_nested_proc_bind.used = 0;
8246  if (__kmp_affinity_format) {
8247  KMP_INTERNAL_FREE(__kmp_affinity_format);
8248  __kmp_affinity_format = NULL;
8249  }
8250 
8251  __kmp_i18n_catclose();
8252 
8253 #if KMP_USE_HIER_SCHED
8254  __kmp_hier_scheds.deallocate();
8255 #endif
8256 
8257 #if KMP_STATS_ENABLED
8258  __kmp_stats_fini();
8259 #endif
8260 
8261  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8262 }
8263 
8264 /* ------------------------------------------------------------------------ */
8265 
8266 int __kmp_ignore_mppbeg(void) {
8267  char *env;
8268 
8269  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8270  if (__kmp_str_match_false(env))
8271  return FALSE;
8272  }
8273  // By default __kmpc_begin() is no-op.
8274  return TRUE;
8275 }
8276 
8277 int __kmp_ignore_mppend(void) {
8278  char *env;
8279 
8280  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8281  if (__kmp_str_match_false(env))
8282  return FALSE;
8283  }
8284  // By default __kmpc_end() is no-op.
8285  return TRUE;
8286 }
8287 
8288 void __kmp_internal_begin(void) {
8289  int gtid;
8290  kmp_root_t *root;
8291 
8292  /* this is a very important step as it will register new sibling threads
8293  and assign these new uber threads a new gtid */
8294  gtid = __kmp_entry_gtid();
8295  root = __kmp_threads[gtid]->th.th_root;
8296  KMP_ASSERT(KMP_UBER_GTID(gtid));
8297 
8298  if (root->r.r_begin)
8299  return;
8300  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8301  if (root->r.r_begin) {
8302  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8303  return;
8304  }
8305 
8306  root->r.r_begin = TRUE;
8307 
8308  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8309 }
8310 
8311 /* ------------------------------------------------------------------------ */
8312 
8313 void __kmp_user_set_library(enum library_type arg) {
8314  int gtid;
8315  kmp_root_t *root;
8316  kmp_info_t *thread;
8317 
8318  /* first, make sure we are initialized so we can get our gtid */
8319 
8320  gtid = __kmp_entry_gtid();
8321  thread = __kmp_threads[gtid];
8322 
8323  root = thread->th.th_root;
8324 
8325  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8326  library_serial));
8327  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8328  thread */
8329  KMP_WARNING(SetLibraryIncorrectCall);
8330  return;
8331  }
8332 
8333  switch (arg) {
8334  case library_serial:
8335  thread->th.th_set_nproc = 0;
8336  set__nproc(thread, 1);
8337  break;
8338  case library_turnaround:
8339  thread->th.th_set_nproc = 0;
8340  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8341  : __kmp_dflt_team_nth_ub);
8342  break;
8343  case library_throughput:
8344  thread->th.th_set_nproc = 0;
8345  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8346  : __kmp_dflt_team_nth_ub);
8347  break;
8348  default:
8349  KMP_FATAL(UnknownLibraryType, arg);
8350  }
8351 
8352  __kmp_aux_set_library(arg);
8353 }
8354 
8355 void __kmp_aux_set_stacksize(size_t arg) {
8356  if (!__kmp_init_serial)
8357  __kmp_serial_initialize();
8358 
8359 #if KMP_OS_DARWIN
8360  if (arg & (0x1000 - 1)) {
8361  arg &= ~(0x1000 - 1);
8362  if (arg + 0x1000) /* check for overflow if we round up */
8363  arg += 0x1000;
8364  }
8365 #endif
8366  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8367 
8368  /* only change the default stacksize before the first parallel region */
8369  if (!TCR_4(__kmp_init_parallel)) {
8370  size_t value = arg; /* argument is in bytes */
8371 
8372  if (value < __kmp_sys_min_stksize)
8373  value = __kmp_sys_min_stksize;
8374  else if (value > KMP_MAX_STKSIZE)
8375  value = KMP_MAX_STKSIZE;
8376 
8377  __kmp_stksize = value;
8378 
8379  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8380  }
8381 
8382  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8383 }
8384 
8385 /* set the behaviour of the runtime library */
8386 /* TODO this can cause some odd behaviour with sibling parallelism... */
8387 void __kmp_aux_set_library(enum library_type arg) {
8388  __kmp_library = arg;
8389 
8390  switch (__kmp_library) {
8391  case library_serial: {
8392  KMP_INFORM(LibraryIsSerial);
8393  } break;
8394  case library_turnaround:
8395  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8396  __kmp_use_yield = 2; // only yield when oversubscribed
8397  break;
8398  case library_throughput:
8399  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8400  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8401  break;
8402  default:
8403  KMP_FATAL(UnknownLibraryType, arg);
8404  }
8405 }
8406 
8407 /* Getting team information common for all team API */
8408 // Returns NULL if not in teams construct
8409 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8410  kmp_info_t *thr = __kmp_entry_thread();
8411  teams_serialized = 0;
8412  if (thr->th.th_teams_microtask) {
8413  kmp_team_t *team = thr->th.th_team;
8414  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8415  int ii = team->t.t_level;
8416  teams_serialized = team->t.t_serialized;
8417  int level = tlevel + 1;
8418  KMP_DEBUG_ASSERT(ii >= tlevel);
8419  while (ii > level) {
8420  for (teams_serialized = team->t.t_serialized;
8421  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8422  }
8423  if (team->t.t_serialized && (!teams_serialized)) {
8424  team = team->t.t_parent;
8425  continue;
8426  }
8427  if (ii > level) {
8428  team = team->t.t_parent;
8429  ii--;
8430  }
8431  }
8432  return team;
8433  }
8434  return NULL;
8435 }
8436 
8437 int __kmp_aux_get_team_num() {
8438  int serialized;
8439  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8440  if (team) {
8441  if (serialized > 1) {
8442  return 0; // teams region is serialized ( 1 team of 1 thread ).
8443  } else {
8444  return team->t.t_master_tid;
8445  }
8446  }
8447  return 0;
8448 }
8449 
8450 int __kmp_aux_get_num_teams() {
8451  int serialized;
8452  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8453  if (team) {
8454  if (serialized > 1) {
8455  return 1;
8456  } else {
8457  return team->t.t_parent->t.t_nproc;
8458  }
8459  }
8460  return 1;
8461 }
8462 
8463 /* ------------------------------------------------------------------------ */
8464 
8465 /*
8466  * Affinity Format Parser
8467  *
8468  * Field is in form of: %[[[0].]size]type
8469  * % and type are required (%% means print a literal '%')
8470  * type is either single char or long name surrounded by {},
8471  * e.g., N or {num_threads}
8472  * 0 => leading zeros
8473  * . => right justified when size is specified
8474  * by default output is left justified
8475  * size is the *minimum* field length
8476  * All other characters are printed as is
8477  *
8478  * Available field types:
8479  * L {thread_level} - omp_get_level()
8480  * n {thread_num} - omp_get_thread_num()
8481  * h {host} - name of host machine
8482  * P {process_id} - process id (integer)
8483  * T {thread_identifier} - native thread identifier (integer)
8484  * N {num_threads} - omp_get_num_threads()
8485  * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8486  * a {thread_affinity} - comma separated list of integers or integer ranges
8487  * (values of affinity mask)
8488  *
8489  * Implementation-specific field types can be added
8490  * If a type is unknown, print "undefined"
8491  */
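
/* An illustrative expansion, with hypothetical host and pid values: the
   format "host=%H pid=%P thread=%0.4n affinity=%A" could print
     host=node01 pid=12345 thread=0002 affinity=4-7
   -- %0.4n zero-pads the thread number to a minimum field width of 4, and
   the long-name form %{thread_num} is equivalent to %n. */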
8492 
8493 // Structure holding the short name, long name, and corresponding data type
8494 // for snprintf. A table of these will represent the entire valid keyword
8495 // field types.
8496 typedef struct kmp_affinity_format_field_t {
8497  char short_name; // from spec e.g., L -> thread level
8498  const char *long_name; // from spec thread_level -> thread level
8499  char field_format; // data type for snprintf (typically 'd' or 's'
8500  // for integer or string)
8501 } kmp_affinity_format_field_t;
8502 
8503 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8504 #if KMP_AFFINITY_SUPPORTED
8505  {'A', "thread_affinity", 's'},
8506 #endif
8507  {'t', "team_num", 'd'},
8508  {'T', "num_teams", 'd'},
8509  {'L', "nesting_level", 'd'},
8510  {'n', "thread_num", 'd'},
8511  {'N', "num_threads", 'd'},
8512  {'a', "ancestor_tnum", 'd'},
8513  {'H', "host", 's'},
8514  {'P', "process_id", 'd'},
8515  {'i', "native_thread_id", 'd'}};
8516 
8517 // Return the number of characters it takes to hold the field
8518 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8519  const char **ptr,
8520  kmp_str_buf_t *field_buffer) {
8521  int rc, format_index, field_value;
8522  const char *width_left, *width_right;
8523  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8524  static const int FORMAT_SIZE = 20;
8525  char format[FORMAT_SIZE] = {0};
8526  char absolute_short_name = 0;
8527 
8528  KMP_DEBUG_ASSERT(gtid >= 0);
8529  KMP_DEBUG_ASSERT(th);
8530  KMP_DEBUG_ASSERT(**ptr == '%');
8531  KMP_DEBUG_ASSERT(field_buffer);
8532 
8533  __kmp_str_buf_clear(field_buffer);
8534 
8535  // Skip the initial %
8536  (*ptr)++;
8537 
8538  // Check for %% first
8539  if (**ptr == '%') {
8540  __kmp_str_buf_cat(field_buffer, "%", 1);
8541  (*ptr)++; // skip over the second %
8542  return 1;
8543  }
8544 
8545  // Parse field modifiers if they are present
8546  pad_zeros = false;
8547  if (**ptr == '0') {
8548  pad_zeros = true;
8549  (*ptr)++; // skip over 0
8550  }
8551  right_justify = false;
8552  if (**ptr == '.') {
8553  right_justify = true;
8554  (*ptr)++; // skip over .
8555  }
8556  // Parse width of field: [width_left, width_right)
8557  width_left = width_right = NULL;
8558  if (**ptr >= '0' && **ptr <= '9') {
8559  width_left = *ptr;
8560  SKIP_DIGITS(*ptr);
8561  width_right = *ptr;
8562  }
8563 
8564  // Create the format for KMP_SNPRINTF based on flags parsed above
8565  format_index = 0;
8566  format[format_index++] = '%';
8567  if (!right_justify)
8568  format[format_index++] = '-';
8569  if (pad_zeros)
8570  format[format_index++] = '0';
8571  if (width_left && width_right) {
8572  int i = 0;
8573  // Only allow 8 digit number widths.
8574  // This also prevents overflowing format variable
8575  while (i < 8 && width_left < width_right) {
8576  format[format_index++] = *width_left;
8577  width_left++;
8578  i++;
8579  }
8580  }
8581 
8582  // Parse a name (long or short)
8583  // Canonicalize the name into absolute_short_name
8584  found_valid_name = false;
8585  parse_long_name = (**ptr == '{');
8586  if (parse_long_name)
8587  (*ptr)++; // skip initial left brace
8588  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8589  sizeof(__kmp_affinity_format_table[0]);
8590  ++i) {
8591  char short_name = __kmp_affinity_format_table[i].short_name;
8592  const char *long_name = __kmp_affinity_format_table[i].long_name;
8593  char field_format = __kmp_affinity_format_table[i].field_format;
8594  if (parse_long_name) {
8595  size_t length = KMP_STRLEN(long_name);
8596  if (strncmp(*ptr, long_name, length) == 0) {
8597  found_valid_name = true;
8598  (*ptr) += length; // skip the long name
8599  }
8600  } else if (**ptr == short_name) {
8601  found_valid_name = true;
8602  (*ptr)++; // skip the short name
8603  }
8604  if (found_valid_name) {
8605  format[format_index++] = field_format;
8606  format[format_index++] = '\0';
8607  absolute_short_name = short_name;
8608  break;
8609  }
8610  }
8611  if (parse_long_name) {
8612  if (**ptr != '}') {
8613  absolute_short_name = 0;
8614  } else {
8615  (*ptr)++; // skip over the right brace
8616  }
8617  }
8618 
8619  // Attempt to fill the buffer with the requested
8620  // value using snprintf within __kmp_str_buf_print()
8621  switch (absolute_short_name) {
8622  case 't':
8623  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8624  break;
8625  case 'T':
8626  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8627  break;
8628  case 'L':
8629  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8630  break;
8631  case 'n':
8632  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8633  break;
8634  case 'H': {
8635  static const int BUFFER_SIZE = 256;
8636  char buf[BUFFER_SIZE];
8637  __kmp_expand_host_name(buf, BUFFER_SIZE);
8638  rc = __kmp_str_buf_print(field_buffer, format, buf);
8639  } break;
8640  case 'P':
8641  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8642  break;
8643  case 'i':
8644  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8645  break;
8646  case 'N':
8647  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8648  break;
8649  case 'a':
8650  field_value =
8651  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8652  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8653  break;
8654 #if KMP_AFFINITY_SUPPORTED
8655  case 'A': {
8656  kmp_str_buf_t buf;
8657  __kmp_str_buf_init(&buf);
8658  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8659  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8660  __kmp_str_buf_free(&buf);
8661  } break;
8662 #endif
8663  default:
8664  // According to the spec, if an implementation does not have info for the
8665  // field type, then "undefined" is printed
8666  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8667  // Skip the field
8668  if (parse_long_name) {
8669  SKIP_TOKEN(*ptr);
8670  if (**ptr == '}')
8671  (*ptr)++;
8672  } else {
8673  (*ptr)++;
8674  }
8675  }
8676 
8677  KMP_ASSERT(format_index <= FORMAT_SIZE);
8678  return rc;
8679 }
8680 
8681 /*
8682  * Return number of characters needed to hold the affinity string
8683  * (not including null byte character)
8684  * The resultant string is printed to buffer, which the caller can then
8685  * handle afterwards
8686  */
8687 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8688  kmp_str_buf_t *buffer) {
8689  const char *parse_ptr;
8690  size_t retval;
8691  const kmp_info_t *th;
8692  kmp_str_buf_t field;
8693 
8694  KMP_DEBUG_ASSERT(buffer);
8695  KMP_DEBUG_ASSERT(gtid >= 0);
8696 
8697  __kmp_str_buf_init(&field);
8698  __kmp_str_buf_clear(buffer);
8699 
8700  th = __kmp_threads[gtid];
8701  retval = 0;
8702 
8703  // If format is NULL or zero-length string, then we use
8704  // affinity-format-var ICV
8705  parse_ptr = format;
8706  if (parse_ptr == NULL || *parse_ptr == '\0') {
8707  parse_ptr = __kmp_affinity_format;
8708  }
8709  KMP_DEBUG_ASSERT(parse_ptr);
8710 
8711  while (*parse_ptr != '\0') {
8712  // Parse a field
8713  if (*parse_ptr == '%') {
8714  // Put field in the buffer
8715  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8716  __kmp_str_buf_catbuf(buffer, &field);
8717  retval += rc;
8718  } else {
8719  // Put literal character in buffer
8720  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8721  retval++;
8722  parse_ptr++;
8723  }
8724  }
8725  __kmp_str_buf_free(&field);
8726  return retval;
8727 }
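
/* A minimal usage sketch of the user-facing API that this routine backs
   (assuming the standard OpenMP 5.0 omp_capture_affinity entry point):

     char buf[64];
     size_t needed = omp_capture_affinity(buf, sizeof(buf), "tid=%n aff=%A");
     if (needed >= sizeof(buf)) {
       // buf was truncated; 'needed' characters (excluding the trailing NUL)
       // are required for the fully expanded string
     }

   The returned count corresponds to the retval accumulated above. */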
8728 
8729 // Displays the affinity string to stdout
8730 void __kmp_aux_display_affinity(int gtid, const char *format) {
8731  kmp_str_buf_t buf;
8732  __kmp_str_buf_init(&buf);
8733  __kmp_aux_capture_affinity(gtid, format, &buf);
8734  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8735  __kmp_str_buf_free(&buf);
8736 }
8737 
8738 /* ------------------------------------------------------------------------ */
8739 
8740 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8741  int blocktime = arg; /* argument is in milliseconds */
8742 #if KMP_USE_MONITOR
8743  int bt_intervals;
8744 #endif
8745  kmp_int8 bt_set;
8746 
8747  __kmp_save_internal_controls(thread);
8748 
8749  /* Normalize and set blocktime for the teams */
8750  if (blocktime < KMP_MIN_BLOCKTIME)
8751  blocktime = KMP_MIN_BLOCKTIME;
8752  else if (blocktime > KMP_MAX_BLOCKTIME)
8753  blocktime = KMP_MAX_BLOCKTIME;
8754 
8755  set__blocktime_team(thread->th.th_team, tid, blocktime);
8756  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8757 
8758 #if KMP_USE_MONITOR
8759  /* Calculate and set blocktime intervals for the teams */
8760  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8761 
8762  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8763  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8764 #endif
8765 
8766  /* Set whether blocktime has been set to "TRUE" */
8767  bt_set = TRUE;
8768 
8769  set__bt_set_team(thread->th.th_team, tid, bt_set);
8770  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8771 #if KMP_USE_MONITOR
8772  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8773  "bt_intervals=%d, monitor_updates=%d\n",
8774  __kmp_gtid_from_tid(tid, thread->th.th_team),
8775  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8776  __kmp_monitor_wakeups));
8777 #else
8778  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8779  __kmp_gtid_from_tid(tid, thread->th.th_team),
8780  thread->th.th_team->t.t_id, tid, blocktime));
8781 #endif
8782 }
8783 
8784 void __kmp_aux_set_defaults(char const *str, size_t len) {
8785  if (!__kmp_init_serial) {
8786  __kmp_serial_initialize();
8787  }
8788  __kmp_env_initialize(str);
8789 
8790  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8791  __kmp_env_print();
8792  }
8793 } // __kmp_aux_set_defaults
8794 
8795 /* ------------------------------------------------------------------------ */
8796 /* internal fast reduction routines */
8797 
8798 PACKED_REDUCTION_METHOD_T
8799 __kmp_determine_reduction_method(
8800  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8801  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8802  kmp_critical_name *lck) {
8803 
8804  // Default reduction method: critical construct ( lck != NULL, like in current
8805  // PAROPT )
8806  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8807  // can be selected by RTL
8808  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8809  // can be selected by RTL
8810  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8811  // among generated by PAROPT.
8812 
8813  PACKED_REDUCTION_METHOD_T retval;
8814 
8815  int team_size;
8816 
8817  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8818  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8819 
8820 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8821  (loc && \
8822  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8823 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8824 
8825  retval = critical_reduce_block;
8826 
8827  // another choice of getting the team size (with 1 dynamic dereference) is slower
8828  team_size = __kmp_get_team_num_threads(global_tid);
8829  if (team_size == 1) {
8830 
8831  retval = empty_reduce_block;
8832 
8833  } else {
8834 
8835  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8836 
8837 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8838  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
8839 
8840 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8841  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8842 
8843  int teamsize_cutoff = 4;
8844 
8845 #if KMP_MIC_SUPPORTED
8846  if (__kmp_mic_type != non_mic) {
8847  teamsize_cutoff = 8;
8848  }
8849 #endif
8850  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8851  if (tree_available) {
8852  if (team_size <= teamsize_cutoff) {
8853  if (atomic_available) {
8854  retval = atomic_reduce_block;
8855  }
8856  } else {
8857  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8858  }
8859  } else if (atomic_available) {
8860  retval = atomic_reduce_block;
8861  }
8862 #else
8863 #error "Unknown or unsupported OS"
8864 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8865  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8866 
8867 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8868 
8869 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8870 
8871  // basic tuning
8872 
8873  if (atomic_available) {
8874  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8875  retval = atomic_reduce_block;
8876  }
8877  } // otherwise: use critical section
8878 
8879 #elif KMP_OS_DARWIN
8880 
8881  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8882  if (atomic_available && (num_vars <= 3)) {
8883  retval = atomic_reduce_block;
8884  } else if (tree_available) {
8885  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8886  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8887  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8888  }
8889  } // otherwise: use critical section
8890 
8891 #else
8892 #error "Unknown or unsupported OS"
8893 #endif
8894 
8895 #else
8896 #error "Unknown or unsupported architecture"
8897 #endif
8898  }
8899 
8900  // KMP_FORCE_REDUCTION
8901 
8902  // If the team is serialized (team_size == 1), ignore the forced reduction
8903  // method and stay with the unsynchronized method (empty_reduce_block)
8904  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8905  team_size != 1) {
8906 
8907  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8908 
8909  int atomic_available, tree_available;
8910 
8911  switch ((forced_retval = __kmp_force_reduction_method)) {
8912  case critical_reduce_block:
8913  KMP_ASSERT(lck); // lck should be != 0
8914  break;
8915 
8916  case atomic_reduce_block:
8917  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8918  if (!atomic_available) {
8919  KMP_WARNING(RedMethodNotSupported, "atomic");
8920  forced_retval = critical_reduce_block;
8921  }
8922  break;
8923 
8924  case tree_reduce_block:
8925  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8926  if (!tree_available) {
8927  KMP_WARNING(RedMethodNotSupported, "tree");
8928  forced_retval = critical_reduce_block;
8929  } else {
8930 #if KMP_FAST_REDUCTION_BARRIER
8931  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8932 #endif
8933  }
8934  break;
8935 
8936  default:
8937  KMP_ASSERT(0); // "unsupported method specified"
8938  }
8939 
8940  retval = forced_retval;
8941  }
8942 
8943  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8944 
8945 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8946 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8947 
8948  return (retval);
8949 }
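
/* An illustrative selection walk-through, with hypothetical inputs and
   KMP_FORCE_REDUCTION unset: on x86_64 Linux, a team of 16 threads with both
   reduce_data and reduce_func supplied has the tree method available and
   16 > teamsize_cutoff (4 on non-MIC hardware), so
   TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER is chosen; a team of 2 whose loc
   carries KMP_IDENT_ATOMIC_REDUCE gets atomic_reduce_block; and a serialized
   team (team_size == 1) always gets empty_reduce_block. */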
8950 // this function is for testing set/get/determine reduce method
8951 kmp_int32 __kmp_get_reduce_method(void) {
8952  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8953 }
8954 
8955 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8956 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8957 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8958 
8959 // Hard pause shuts down the runtime completely. Resume happens naturally when
8960 // OpenMP is used subsequently.
8961 void __kmp_hard_pause() {
8962  __kmp_pause_status = kmp_hard_paused;
8963  __kmp_internal_end_thread(-1);
8964 }
8965 
8966 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8967 void __kmp_resume_if_soft_paused() {
8968  if (__kmp_pause_status == kmp_soft_paused) {
8969  __kmp_pause_status = kmp_not_paused;
8970 
8971  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8972  kmp_info_t *thread = __kmp_threads[gtid];
8973  if (thread) { // Wake it if sleeping
8974  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8975  thread);
8976  if (fl.is_sleeping())
8977  fl.resume(gtid);
8978  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8979  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8980  } else { // thread holds the lock and may sleep soon
8981  do { // until either the thread sleeps, or we can get the lock
8982  if (fl.is_sleeping()) {
8983  fl.resume(gtid);
8984  break;
8985  } else if (__kmp_try_suspend_mx(thread)) {
8986  __kmp_unlock_suspend_mx(thread);
8987  break;
8988  }
8989  } while (1);
8990  }
8991  }
8992  }
8993  }
8994 }
8995 
8996 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8997 // TODO: add warning messages
8998 int __kmp_pause_resource(kmp_pause_status_t level) {
8999  if (level == kmp_not_paused) { // requesting resume
9000  if (__kmp_pause_status == kmp_not_paused) {
9001  // error message about runtime not being paused, so can't resume
9002  return 1;
9003  } else {
9004  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9005  __kmp_pause_status == kmp_hard_paused);
9006  __kmp_pause_status = kmp_not_paused;
9007  return 0;
9008  }
9009  } else if (level == kmp_soft_paused) { // requesting soft pause
9010  if (__kmp_pause_status != kmp_not_paused) {
9011  // error message about already being paused
9012  return 1;
9013  } else {
9014  __kmp_soft_pause();
9015  return 0;
9016  }
9017  } else if (level == kmp_hard_paused) { // requesting hard pause
9018  if (__kmp_pause_status != kmp_not_paused) {
9019  // error message about already being paused
9020  return 1;
9021  } else {
9022  __kmp_hard_pause();
9023  return 0;
9024  }
9025  } else {
9026  // error message about invalid level
9027  return 1;
9028  }
9029 }
9030 
9031 void __kmp_omp_display_env(int verbose) {
9032  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9033  if (__kmp_init_serial == 0)
9034  __kmp_do_serial_initialize();
9035  __kmp_display_env_impl(!verbose, verbose);
9036  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9037 }
9038 
9039 // The team size is changing, so distributed barrier must be modified
9040 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9041  int new_nthreads) {
9042  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9043  bp_dist_bar);
9044  kmp_info_t **other_threads = team->t.t_threads;
9045 
9046  // We want all the workers to stop waiting on the barrier while we adjust the
9047  // size of the team.
9048  for (int f = 1; f < old_nthreads; ++f) {
9049  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9050  // Ignore threads that are already inactive or not present in the team
9051  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9052  // teams construct causes thread_limit to get passed in, and some of
9053  // those could be inactive; just ignore them
9054  continue;
9055  }
9056  // If thread is transitioning still to in_use state, wait for it
9057  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9058  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9059  KMP_CPU_PAUSE();
9060  }
9061  // The thread should be in_use now
9062  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9063  // Transition to unused state
9064  team->t.t_threads[f]->th.th_used_in_team.store(2);
9065  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9066  }
9067  // Release all the workers
9068  team->t.b->go_release();
9069 
9070  KMP_MFENCE();
9071 
9072  // Workers should see transition status 2 and move to 0; but may need to be
9073  // woken up first
9074  int count = old_nthreads - 1;
9075  while (count > 0) {
9076  count = old_nthreads - 1;
9077  for (int f = 1; f < old_nthreads; ++f) {
9078  if (other_threads[f]->th.th_used_in_team.load() != 0) {
9079  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9080  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9081  void *, other_threads[f]->th.th_sleep_loc);
9082  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9083  }
9084  } else {
9085  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9086  count--;
9087  }
9088  }
9089  }
9090  // Now update the barrier size
9091  team->t.b->update_num_threads(new_nthreads);
9092  team->t.b->go_reset();
9093 }
9094 
9095 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9096  // Add the threads back to the team
9097  KMP_DEBUG_ASSERT(team);
9098  // Threads were paused and pointed at th_used_in_team temporarily during a
9099  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9100  // the thread that it should transition itself back into the team. Then, if
9101  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9102  // to wake it up.
9103  for (int f = 1; f < new_nthreads; ++f) {
9104  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9105  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9106  3);
9107  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9108  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9109  (kmp_flag_32<false, false> *)NULL);
9110  }
9111  }
9112  // The threads should be transitioning to the team; when they are done, they
9113  // should have set th_used_in_team to 1. This loop forces the primary thread
9114  // to wait until all threads have moved into the team and are in the barrier.
9115  int count = new_nthreads - 1;
9116  while (count > 0) {
9117  count = new_nthreads - 1;
9118  for (int f = 1; f < new_nthreads; ++f) {
9119  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9120  count--;
9121  }
9122  }
9123  }
9124 }
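
/* Summary of the th_used_in_team protocol implemented by the two routines
   above for the distributed barrier: 1 = participating in the team,
   2 = told by __kmp_resize_dist_barrier to leave, 0 = out of the team,
   3 = told by __kmp_add_threads_to_team to rejoin. Workers move 1 -> 2 -> 0
   while the team shrinks and 0 -> 3 -> 1 when it grows back, with resumes
   sent whenever blocktime is finite and a worker may be asleep. */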
9125 
9126 // Globals and functions for hidden helper task
9127 kmp_info_t **__kmp_hidden_helper_threads;
9128 kmp_info_t *__kmp_hidden_helper_main_thread;
9129 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9130 #if KMP_OS_LINUX
9131 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9132 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9133 #else
9134 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9135 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9136 #endif
9137 
9138 namespace {
9139 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9140 
9141 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9142  // This is an explicit synchronization of all hidden helper threads, in case
9143  // a regular thread pushes a hidden helper task to a hidden helper thread
9144  // that has not yet been awakened since being released by the main thread
9145  // after creating the team.
9146  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9147  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9148  __kmp_hidden_helper_threads_num)
9149  ;
9150 
9151  // If main thread, then wait for signal
9152  if (__kmpc_master(nullptr, *gtid)) {
9153  // First, unset the initial state and release the initial thread
9154  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9155  __kmp_hidden_helper_initz_release();
9156  __kmp_hidden_helper_main_thread_wait();
9157  // Now wake up all worker threads
9158  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9159  __kmp_hidden_helper_worker_thread_signal();
9160  }
9161  }
9162 }
9163 } // namespace
9164 
9165 void __kmp_hidden_helper_threads_initz_routine() {
9166  // Create a new root for hidden helper team/threads
9167  const int gtid = __kmp_register_root(TRUE);
9168  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9169  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9170  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9171  __kmp_hidden_helper_threads_num;
9172 
9173  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9174 
9175  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9176 
9177  // Set the initialization flag to FALSE
9178  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9179 
9180  __kmp_hidden_helper_threads_deinitz_release();
9181 }
9182 
9183 /* Nesting Mode:
9184  Set via KMP_NESTING_MODE, which takes an integer.
9185  Note: we skip duplicate topology levels, and skip levels with only
9186  one entity.
9187  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9188  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9189  in the topology, and initializes the number of threads at each of those
9190  levels to the number of entities at each level, respectively, below the
9191  entity at the parent level.
9192  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9193  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9194  the user to turn nesting on explicitly. This is an even more experimental
9195  option to this experimental feature, and may change or go away in the
9196  future.
9197 */
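
/* An illustrative configuration, on a hypothetical 2-socket machine with
   16 cores per socket and 2 hardware threads per core: KMP_NESTING_MODE=1
   enables nesting and seeds three nesting levels with 2, 16 and 2 threads
   respectively (sockets, cores per socket, threads per core), matching the
   topology ratios gathered by __kmp_set_nesting_mode_threads() below. */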
9198 
9199 // Allocate space to store nesting levels
9200 void __kmp_init_nesting_mode() {
9201  int levels = KMP_HW_LAST;
9202  __kmp_nesting_mode_nlevels = levels;
9203  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9204  for (int i = 0; i < levels; ++i)
9205  __kmp_nesting_nth_level[i] = 0;
9206  if (__kmp_nested_nth.size < levels) {
9207  __kmp_nested_nth.nth =
9208  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9209  __kmp_nested_nth.size = levels;
9210  }
9211 }
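
// A minimal sketch (not runtime code) of the grow-if-too-small idiom used in
// __kmp_init_nesting_mode() above, written with plain realloc instead of the
// runtime's KMP_INTERNAL_REALLOC wrapper; the struct and names are
// illustrative only, and error checking is omitted just as in the original.
#if 0
#include <stdlib.h>

struct nth_list {
  int *nth;
  int size;
};

// Ensure `list` can hold at least `levels` entries, growing it if needed.
static void ensure_capacity(struct nth_list *list, int levels) {
  if (list->size < levels) {
    list->nth = (int *)realloc(list->nth, levels * sizeof(int));
    list->size = levels;
  }
}
#endif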
9212 
9213 // Set # threads for top levels of nesting; must be called after topology set
9214 void __kmp_set_nesting_mode_threads() {
9215  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9216 
9217  if (__kmp_nesting_mode == 1)
9218  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9219  else if (__kmp_nesting_mode > 1)
9220  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9221 
9222  if (__kmp_topology) { // use topology info
9223  int loc, hw_level;
9224  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9225  loc < __kmp_nesting_mode_nlevels;
9226  loc++, hw_level++) {
9227  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9228  if (__kmp_nesting_nth_level[loc] == 1)
9229  loc--;
9230  }
9231  // Make sure all cores are used
9232  if (__kmp_nesting_mode > 1 && loc > 1) {
9233  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9234  int num_cores = __kmp_topology->get_count(core_level);
9235  int upper_levels = 1;
9236  for (int level = 0; level < loc - 1; ++level)
9237  upper_levels *= __kmp_nesting_nth_level[level];
9238  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9239  __kmp_nesting_nth_level[loc - 1] =
9240  num_cores / __kmp_nesting_nth_level[loc - 2];
9241  }
9242  __kmp_nesting_mode_nlevels = loc;
9243  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9244  } else { // no topology info available; provide a reasonable guesstimation
9245  if (__kmp_avail_proc >= 4) {
9246  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9247  __kmp_nesting_nth_level[1] = 2;
9248  __kmp_nesting_mode_nlevels = 2;
9249  } else {
9250  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9251  __kmp_nesting_mode_nlevels = 1;
9252  }
9253  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9254  }
9255  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9256  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9257  }
9258  set__nproc(thread, __kmp_nesting_nth_level[0]);
9259  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9260  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9261  if (get__max_active_levels(thread) > 1) {
9262  // if max levels was set, set nesting mode levels to same
9263  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9264  }
9265  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9266  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9267 }
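
// Worked example for the routine above (hypothetical machine, not a
// guaranteed configuration): suppose the detected topology has socket, core,
// and HW-thread levels with ratios {2, 8, 2}, i.e. 2 sockets, 8 cores per
// socket, and 2 hardware threads per core (32 procs). No ratio is 1, so no
// level is skipped and __kmp_nesting_nth_level becomes {2, 8, 2} with three
// nesting levels; the outermost parallel region then defaults to 2 threads.
// If instead there is a single socket, its ratio of 1 causes that level to be
// skipped and the counts collapse to {8, 2}. With no topology information and
// __kmp_avail_proc == 16, the fallback path yields {8, 2}: half the available
// procs at the outer level and 2 at the inner level.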
9268 
9269 // Empty symbols to export (see exports_so.txt) when feature is disabled
9270 extern "C" {
9271 #if !KMP_STATS_ENABLED
9272 void __kmp_reset_stats() {}
9273 #endif
9274 #if !USE_DEBUGGER
9275 int __kmp_omp_debug_struct_info = FALSE;
9276 int __kmp_debugging = FALSE;
9277 #endif
9278 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9279 void __kmp_itt_fini_ittlib() {}
9280 void __kmp_itt_init_ittlib() {}
9281 #endif
9282 }
9283 
9284 // end of file