LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files because it doesn't use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111  int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get unique identifier of executing
116  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118  int i;
119  kmp_info_t **other_threads;
120  size_t stack_data;
121  char *stack_addr;
122  size_t stack_size;
123  char *stack_base;
124 
125  KA_TRACE(
126  1000,
127  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128  __kmp_nth, __kmp_all_nth));
129 
130  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
131  to a parallel region, this was made to return KMP_GTID_DNE to force
132  serial_initialize by the caller. We had to handle KMP_GTID_DNE at all
133  call-sites, or else guarantee __kmp_init_gtid for this to work. */
134 
135  if (!TCR_4(__kmp_init_gtid))
136  return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139  if (TCR_4(__kmp_gtid_mode) >= 3) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141  return __kmp_gtid;
142  }
143 #endif
144  if (TCR_4(__kmp_gtid_mode) >= 2) {
145  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146  return __kmp_gtid_get_specific();
147  }
148  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150  stack_addr = (char *)&stack_data;
151  other_threads = __kmp_threads;
152 
153  /* ATT: The code below is a source of potential bugs due to unsynchronized
154  access to __kmp_threads array. For example:
155  1. Current thread loads other_threads[i] to thr and checks it, it is
156  non-NULL.
157  2. Current thread is suspended by OS.
158  3. Another thread unregisters and finishes (debug versions of free()
159  may fill memory with something like 0xEF).
160  4. Current thread is resumed.
161  5. Current thread reads junk from *thr.
162  TODO: Fix it. --ln */
163 
164  for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if (!thr)
168  continue;
169 
170  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173  /* stack grows down -- search through all of the active threads */
174 
175  if (stack_addr <= stack_base) {
176  size_t stack_diff = stack_base - stack_addr;
177 
178  if (stack_diff <= stack_size) {
179  /* The only way we can be closer than the allocated */
180  /* stack size is if we are running on this thread. */
181  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182  return i;
183  }
184  }
185  }
186 
187  /* use the thread-specific (TLS) value to try to determine our gtid */
188  KA_TRACE(1000,
189  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190  "thread, using TLS\n"));
191  i = __kmp_gtid_get_specific();
192 
193  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194 
195  /* if we haven't been assigned a gtid, then return the error code */
196  if (i < 0)
197  return i;
198 
199  /* dynamically updated stack window for uber threads to avoid get_specific
200  call */
201  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202  KMP_FATAL(StackOverflow, i);
203  }
204 
205  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206  if (stack_addr > stack_base) {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210  stack_base);
211  } else {
212  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213  stack_base - stack_addr);
214  }
215 
216  /* Reprint stack bounds for ubermaster since they have been refined */
217  if (__kmp_storage_map) {
218  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221  other_threads[i]->th.th_info.ds.ds_stacksize,
222  "th_%d stack (refinement)", i);
223  }
224  return i;
225 }
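// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// The internal algorithm above identifies the calling thread by testing
// whether the address of a local variable falls inside a registered thread's
// stack window (stacks grow down, so the window is
// [stackbase - stacksize, stackbase]).  A minimal, self-contained version of
// that containment test, with a hypothetical stack_desc record standing in
// for th_info.ds:
#if 0
#include <stddef.h>

struct stack_desc {          /* hypothetical stand-in for the ds_* fields */
  char *stackbase;           /* highest address of the stack              */
  size_t stacksize;          /* registered size of the stack              */
};

static int find_owner(struct stack_desc *descs, int n, void *local_addr) {
  char *addr = (char *)local_addr;
  for (int i = 0; i < n; ++i) {
    char *base = descs[i].stackbase;
    if (addr <= base && (size_t)(base - addr) <= descs[i].stacksize)
      return i;              /* local_addr lies inside thread i's stack */
  }
  return -1;                 /* analogous to KMP_GTID_DNE */
}
#endif
// ----------------------------------------------------------------------------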
226 
227 int __kmp_get_global_thread_id_reg() {
228  int gtid;
229 
230  if (!__kmp_init_serial) {
231  gtid = KMP_GTID_DNE;
232  } else
233 #ifdef KMP_TDATA_GTID
234  if (TCR_4(__kmp_gtid_mode) >= 3) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236  gtid = __kmp_gtid;
237  } else
238 #endif
239  if (TCR_4(__kmp_gtid_mode) >= 2) {
240  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  KA_TRACE(1000,
244  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245  gtid = __kmp_get_global_thread_id();
246  }
247 
248  /* we must be a new uber master sibling thread */
249  if (gtid == KMP_GTID_DNE) {
250  KA_TRACE(10,
251  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252  "Registering a new gtid.\n"));
253  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254  if (!__kmp_init_serial) {
255  __kmp_do_serial_initialize();
256  gtid = __kmp_gtid_get_specific();
257  } else {
258  gtid = __kmp_register_root(FALSE);
259  }
260  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262  }
263 
264  KMP_DEBUG_ASSERT(gtid >= 0);
265 
266  return gtid;
267 }
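// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// The lookup above falls back through three mechanisms depending on
// __kmp_gtid_mode: a __thread/TDATA variable (fastest), keyed TLS
// (pthread_getspecific/TlsGetValue), and finally the stack-search algorithm.
// A hedged sketch of that priority chain, with hypothetical helper names:
#if 0
extern __thread int my_gtid_tdata;          /* mode >= 3 (assumption)         */
extern int my_gtid_from_keyed_tls(void);    /* mode >= 2 (assumption)         */
extern int my_gtid_from_stack_search(void); /* mode  < 2 (assumption)         */
extern int my_gtid_mode;

static int lookup_gtid(void) {
  if (my_gtid_mode >= 3)
    return my_gtid_tdata;                 /* direct TLS read, no call          */
  if (my_gtid_mode >= 2)
    return my_gtid_from_keyed_tls();      /* one library call                  */
  return my_gtid_from_stack_search();     /* slowest, but needs no TLS at all  */
}
#endif
// ----------------------------------------------------------------------------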
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271  int f;
272  char *stack_beg = NULL;
273  char *stack_end = NULL;
274  int gtid;
275 
276  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277  if (__kmp_storage_map) {
278  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281  gtid = __kmp_gtid_from_thread(th);
282 
283  if (gtid == KMP_GTID_MONITOR) {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%s stack (%s)", "mon",
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  } else {
289  __kmp_print_storage_map_gtid(
290  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291  "th_%d stack (%s)", gtid,
292  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293  }
294  }
295 
296  /* No point in checking ubermaster threads since they use refinement and
297  * cannot overlap */
298  gtid = __kmp_gtid_from_thread(th);
299  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300  KA_TRACE(10,
301  ("__kmp_check_stack_overlap: performing extensive checking\n"));
302  if (stack_beg == NULL) {
303  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305  }
306 
307  for (f = 0; f < __kmp_threads_capacity; f++) {
308  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310  if (f_th && f_th != th) {
311  char *other_stack_end =
312  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313  char *other_stack_beg =
314  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318  /* Print the other stack values before the abort */
319  if (__kmp_storage_map)
320  __kmp_print_storage_map_gtid(
321  -1, other_stack_beg, other_stack_end,
322  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326  __kmp_msg_null);
327  }
328  }
329  }
330  }
331  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
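// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// The overlap test above flags a stack whose begin or end address lands
// strictly inside another thread's [beg, end) range.  The same idea as a
// tiny standalone predicate (names are illustrative only):
#if 0
static int ranges_overlap(const char *beg1, const char *end1,
                          const char *beg2, const char *end2) {
  /* endpoints that only touch are not reported, matching the strict
     comparisons used in __kmp_check_stack_overlap */
  return (beg1 > beg2 && beg1 < end2) || (end1 > beg2 && end1 < end2);
}
#endif
// ----------------------------------------------------------------------------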
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337  static int done = FALSE;
338 
339  while (!done) {
340  KMP_YIELD(TRUE);
341  }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347  char const *format, ...) {
348  char buffer[MAX_MESSAGE];
349  va_list ap;
350 
351  va_start(ap, format);
352  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353  p2, (unsigned long)size, format);
354  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355  __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357  int node;
358  if (gtid >= 0) {
359  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360  if (__kmp_storage_map_verbose) {
361  node = __kmp_get_host_node(p1);
362  if (node < 0) /* doesn't work, so don't try this next time */
363  __kmp_storage_map_verbose = FALSE;
364  else {
365  char *last;
366  int lastNode;
367  int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369  const int page_size = KMP_GET_PAGE_SIZE();
370 
371  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373  if (localProc >= 0)
374  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375  localProc >> 1);
376  else
377  __kmp_printf_no_lock(" GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379  /* The more elaborate format is disabled for now because of the prctl
380  * hanging bug. */
381  do {
382  last = p1;
383  lastNode = node;
384  /* This loop collates adjacent pages with the same host node. */
385  do {
386  (char *)p1 += page_size;
387  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389  lastNode);
390  } while (p1 <= p2);
391 #else
392  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393  (char *)p1 + (page_size - 1),
394  __kmp_get_host_node(p1));
395  if (p1 < p2) {
396  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397  (char *)p2 + (page_size - 1),
398  __kmp_get_host_node(p2));
399  }
400 #endif
401  }
402  }
403  } else
404  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405  }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 
409  va_end(ap);
410 }
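// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// Note the two-stage formatting used above: the caller's format string is
// first spliced into a fixed prefix with KMP_SNPRINTF, and only then are the
// caller's variadic arguments expanded with __kmp_vprintf.  The general
// pattern, in plain C with standard stdio:
#if 0
#include <stdarg.h>
#include <stdio.h>

static void print_with_prefix(void *p1, void *p2, size_t size,
                              const char *format, ...) {
  char buffer[512];
  va_list ap;
  va_start(ap, format);
  /* stage 1: build "prefix + caller format" (format itself is not expanded) */
  snprintf(buffer, sizeof(buffer), "storage map: %p %p %8lu %s\n", p1, p2,
           (unsigned long)size, format);
  /* stage 2: expand the caller's arguments against the combined string */
  vfprintf(stderr, buffer, ap);
  va_end(ap);
}
#endif
// ----------------------------------------------------------------------------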
411 
412 void __kmp_warn(char const *format, ...) {
413  char buffer[MAX_MESSAGE];
414  va_list ap;
415 
416  if (__kmp_generate_warnings == kmp_warnings_off) {
417  return;
418  }
419 
420  va_start(ap, format);
421 
422  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
423  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
424  __kmp_vprintf(kmp_err, buffer, ap);
425  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
426 
427  va_end(ap);
428 }
429 
430 void __kmp_abort_process() {
431  // Later threads may stall here, but that's ok because abort() will kill them.
432  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
433 
434  if (__kmp_debug_buf) {
435  __kmp_dump_debug_buffer();
436  }
437 
438  if (KMP_OS_WINDOWS) {
439  // Let other threads know of abnormal termination and prevent deadlock
440  // if abort happened during library initialization or shutdown
441  __kmp_global.g.g_abort = SIGABRT;
442 
443  /* On Windows* OS by default abort() causes pop-up error box, which stalls
444  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
445  boxes. _set_abort_behavior() works well, but this function is not
446  available in VS7 (this is not a problem for a DLL, but it is a problem for
447  static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
448  help, at least in some versions of MS C RTL.
449 
450  It seems the following sequence is the only way to simulate abort() and
451  avoid pop-up error box. */
452  raise(SIGABRT);
453  _exit(3); // Just in case, if signal ignored, exit anyway.
454  } else {
455  __kmp_unregister_library();
456  abort();
457  }
458 
459  __kmp_infinite_loop();
460  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
461 
462 } // __kmp_abort_process
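// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// On Windows the routine above simulates abort() without the pop-up error box
// by raising SIGABRT and then forcing a hard exit in case the signal is
// ignored.  The same two-step sequence in isolation:
#if 0
#include <signal.h>
#include <stdlib.h>   /* _exit() is declared here on Windows; <unistd.h> on POSIX */

static void abort_without_popup(void) {
  raise(SIGABRT); /* run the SIGABRT disposition without the abort() dialog */
  _exit(3);       /* if the signal was ignored, still terminate the process */
}
#endif
// ----------------------------------------------------------------------------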
463 
464 void __kmp_abort_thread(void) {
465  // TODO: Eliminate g_abort global variable and this function.
466  // In case of abort just call abort(), it will kill all the threads.
467  __kmp_infinite_loop();
468 } // __kmp_abort_thread
469 
470 /* Print out the storage map for the major kmp_info_t thread data structures
471  that are allocated together. */
472 
473 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
474  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
475  gtid);
476 
477  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
478  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
481  sizeof(kmp_local_t), "th_%d.th_local", gtid);
482 
483  __kmp_print_storage_map_gtid(
484  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
485  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
486 
487  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
488  &thr->th.th_bar[bs_plain_barrier + 1],
489  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
490  gtid);
491 
492  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
493  &thr->th.th_bar[bs_forkjoin_barrier + 1],
494  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
495  gtid);
496 
497 #if KMP_FAST_REDUCTION_BARRIER
498  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
499  &thr->th.th_bar[bs_reduction_barrier + 1],
500  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
501  gtid);
502 #endif // KMP_FAST_REDUCTION_BARRIER
503 }
504 
505 /* Print out the storage map for the major kmp_team_t team data structures
506  that are allocated together. */
507 
508 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
509  int team_id, int num_thr) {
510  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
511  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
512  header, team_id);
513 
514  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
515  &team->t.t_bar[bs_last_barrier],
516  sizeof(kmp_balign_team_t) * bs_last_barrier,
517  "%s_%d.t_bar", header, team_id);
518 
519  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
520  &team->t.t_bar[bs_plain_barrier + 1],
521  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
522  header, team_id);
523 
524  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
525  &team->t.t_bar[bs_forkjoin_barrier + 1],
526  sizeof(kmp_balign_team_t),
527  "%s_%d.t_bar[forkjoin]", header, team_id);
528 
529 #if KMP_FAST_REDUCTION_BARRIER
530  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
531  &team->t.t_bar[bs_reduction_barrier + 1],
532  sizeof(kmp_balign_team_t),
533  "%s_%d.t_bar[reduction]", header, team_id);
534 #endif // KMP_FAST_REDUCTION_BARRIER
535 
536  __kmp_print_storage_map_gtid(
537  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
538  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
539 
540  __kmp_print_storage_map_gtid(
541  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
542  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
543 
544  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
545  &team->t.t_disp_buffer[num_disp_buff],
546  sizeof(dispatch_shared_info_t) * num_disp_buff,
547  "%s_%d.t_disp_buffer", header, team_id);
548 }
549 
550 static void __kmp_init_allocator() {
551  __kmp_init_memkind();
552  __kmp_init_target_mem();
553 }
554 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
555 
556 /* ------------------------------------------------------------------------ */
557 
558 #if ENABLE_LIBOMPTARGET
559 static void __kmp_init_omptarget() {
560  __kmp_init_target_task();
561 }
562 #endif
563 
564 /* ------------------------------------------------------------------------ */
565 
566 #if KMP_DYNAMIC_LIB
567 #if KMP_OS_WINDOWS
568 
569 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
570  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
571 
572  switch (fdwReason) {
573 
574  case DLL_PROCESS_ATTACH:
575  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
576 
577  return TRUE;
578 
579  case DLL_PROCESS_DETACH:
580  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
581 
582  // According to Windows* documentation for DllMain entry point:
583  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
584  // lpReserved == NULL when FreeLibrary() is called,
585  // lpReserved != NULL when the process is terminated.
586  // When FreeLibrary() is called, worker threads remain alive. So the
587  // runtime's state is consistent and executing proper shutdown is OK.
588  // When the process is terminated, worker threads have exited or been
589  // forcefully terminated by the OS and only the shutdown thread remains.
590  // This can leave the runtime in an inconsistent state.
591  // Hence, only attempt proper cleanup when FreeLibrary() is called.
592  // Otherwise, rely on OS to reclaim resources.
593  if (lpReserved == NULL)
594  __kmp_internal_end_library(__kmp_gtid_get_specific());
595 
596  return TRUE;
597 
598  case DLL_THREAD_ATTACH:
599  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
600 
601  /* if we want to register new sibling threads every time, call
602  * __kmp_get_gtid() here */
603  return TRUE;
604 
605  case DLL_THREAD_DETACH:
606  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
607 
608  __kmp_internal_end_thread(__kmp_gtid_get_specific());
609  return TRUE;
610  }
611 
612  return TRUE;
613 }
614 
615 #endif /* KMP_OS_WINDOWS */
616 #endif /* KMP_DYNAMIC_LIB */
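// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// The DLL_PROCESS_DETACH handling above relies on the documented meaning of
// lpReserved: NULL for FreeLibrary(), non-NULL for process termination.  The
// bare decision pattern, outside of the OpenMP runtime:
#if 0
#include <windows.h>

BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
  if (reason == DLL_PROCESS_DETACH) {
    if (reserved == NULL) {
      /* FreeLibrary(): worker threads still exist, safe to run full cleanup */
    } else {
      /* process termination: threads are already gone, let the OS reclaim */
    }
  }
  return TRUE;
}
#endif
// ----------------------------------------------------------------------------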
617 
618 /* __kmp_parallel_deo -- Wait until it's our turn. */
619 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
620  int gtid = *gtid_ref;
621 #ifdef BUILD_PARALLEL_ORDERED
622  kmp_team_t *team = __kmp_team_from_gtid(gtid);
623 #endif /* BUILD_PARALLEL_ORDERED */
624 
625  if (__kmp_env_consistency_check) {
626  if (__kmp_threads[gtid]->th.th_root->r.r_active)
627 #if KMP_USE_DYNAMIC_LOCK
628  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
629 #else
630  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
631 #endif
632  }
633 #ifdef BUILD_PARALLEL_ORDERED
634  if (!team->t.t_serialized) {
635  KMP_MB();
636  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
637  NULL);
638  KMP_MB();
639  }
640 #endif /* BUILD_PARALLEL_ORDERED */
641 }
642 
643 /* __kmp_parallel_dxo -- Signal the next task. */
644 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
645  int gtid = *gtid_ref;
646 #ifdef BUILD_PARALLEL_ORDERED
647  int tid = __kmp_tid_from_gtid(gtid);
648  kmp_team_t *team = __kmp_team_from_gtid(gtid);
649 #endif /* BUILD_PARALLEL_ORDERED */
650 
651  if (__kmp_env_consistency_check) {
652  if (__kmp_threads[gtid]->th.th_root->r.r_active)
653  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
654  }
655 #ifdef BUILD_PARALLEL_ORDERED
656  if (!team->t.t_serialized) {
657  KMP_MB(); /* Flush all pending memory write invalidates. */
658 
659  /* use the tid of the next thread in this team */
660  /* TODO replace with general release procedure */
661  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
662 
663  KMP_MB(); /* Flush all pending memory write invalidates. */
664  }
665 #endif /* BUILD_PARALLEL_ORDERED */
666 }
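// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// Together, __kmp_parallel_deo/__kmp_parallel_dxo implement a simple
// turn-taking protocol: each thread waits until the shared counter equals its
// team-local id, and on exit hands the turn to (tid + 1) % nproc.  A minimal
// C11-atomics version of the same idea:
#if 0
#include <stdatomic.h>

static _Atomic int turn;        /* plays the role of t_ordered.dt.t_value */

static void ordered_enter(int tid) {
  while (atomic_load_explicit(&turn, memory_order_acquire) != tid)
    ;                           /* a real runtime would yield/pause here */
}

static void ordered_exit(int tid, int nproc) {
  atomic_store_explicit(&turn, (tid + 1) % nproc, memory_order_release);
}
#endif
// ----------------------------------------------------------------------------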
667 
668 /* ------------------------------------------------------------------------ */
669 /* The BARRIER for a SINGLE process section is always explicit */
670 
671 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
672  int status;
673  kmp_info_t *th;
674  kmp_team_t *team;
675 
676  if (!TCR_4(__kmp_init_parallel))
677  __kmp_parallel_initialize();
678  __kmp_resume_if_soft_paused();
679 
680  th = __kmp_threads[gtid];
681  team = th->th.th_team;
682  status = 0;
683 
684  th->th.th_ident = id_ref;
685 
686  if (team->t.t_serialized) {
687  status = 1;
688  } else {
689  kmp_int32 old_this = th->th.th_local.this_construct;
690 
691  ++th->th.th_local.this_construct;
692  /* try to set team count to thread count--success means thread got the
693  single block */
694  /* TODO: Should this be acquire or release? */
695  if (team->t.t_construct == old_this) {
696  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
697  th->th.th_local.this_construct);
698  }
699 #if USE_ITT_BUILD
700  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
701  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
702  team->t.t_active_level == 1) {
703  // Only report metadata by primary thread of active team at level 1
704  __kmp_itt_metadata_single(id_ref);
705  }
706 #endif /* USE_ITT_BUILD */
707  }
708 
709  if (__kmp_env_consistency_check) {
710  if (status && push_ws) {
711  __kmp_push_workshare(gtid, ct_psingle, id_ref);
712  } else {
713  __kmp_check_workshare(gtid, ct_psingle, id_ref);
714  }
715  }
716 #if USE_ITT_BUILD
717  if (status) {
718  __kmp_itt_single_start(gtid);
719  }
720 #endif /* USE_ITT_BUILD */
721  return status;
722 }
723 
724 void __kmp_exit_single(int gtid) {
725 #if USE_ITT_BUILD
726  __kmp_itt_single_end(gtid);
727 #endif /* USE_ITT_BUILD */
728  if (__kmp_env_consistency_check)
729  __kmp_pop_workshare(gtid, ct_psingle, NULL);
730 }
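// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// __kmp_enter_single decides the winner of a SINGLE construct with one
// compare-and-swap on the team's construct counter: every thread increments
// its private this_construct count, and the thread whose CAS advances the
// shared counter executes the block.  The core race, reduced to C11 atomics:
#if 0
#include <stdatomic.h>
#include <stdbool.h>

static _Atomic int team_construct;   /* shared, like team->t.t_construct */

static bool enter_single(int *my_construct) {
  int old = (*my_construct)++;       /* like th_local.this_construct      */
  /* only one thread succeeds in advancing the shared counter from 'old' */
  return atomic_compare_exchange_strong(&team_construct, &old,
                                        *my_construct);
}
#endif
// ----------------------------------------------------------------------------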
731 
732 /* determine if we can go parallel or must use a serialized parallel region and
733  * how many threads we can use
734  * set_nthreads is the number of threads requested for the team
735  * returns 0 if we should serialize or only use one thread,
736  * otherwise the number of threads to use
737  * The forkjoin lock is held by the caller. */
738 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
739  int master_tid, int set_nthreads,
740  int enter_teams) {
741  int capacity;
742  int new_nthreads;
743  KMP_DEBUG_ASSERT(__kmp_init_serial);
744  KMP_DEBUG_ASSERT(root && parent_team);
745  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
746 
747  // If dyn-var is set, dynamically adjust the number of desired threads,
748  // according to the method specified by dynamic_mode.
749  new_nthreads = set_nthreads;
750  if (!get__dynamic_2(parent_team, master_tid)) {
751  ;
752  }
753 #ifdef USE_LOAD_BALANCE
754  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
755  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
756  if (new_nthreads == 1) {
757  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
758  "reservation to 1 thread\n",
759  master_tid));
760  return 1;
761  }
762  if (new_nthreads < set_nthreads) {
763  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
764  "reservation to %d threads\n",
765  master_tid, new_nthreads));
766  }
767  }
768 #endif /* USE_LOAD_BALANCE */
769  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
770  new_nthreads = __kmp_avail_proc - __kmp_nth +
771  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
772  if (new_nthreads <= 1) {
773  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
774  "reservation to 1 thread\n",
775  master_tid));
776  return 1;
777  }
778  if (new_nthreads < set_nthreads) {
779  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
780  "reservation to %d threads\n",
781  master_tid, new_nthreads));
782  } else {
783  new_nthreads = set_nthreads;
784  }
785  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
786  if (set_nthreads > 2) {
787  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
788  new_nthreads = (new_nthreads % set_nthreads) + 1;
789  if (new_nthreads == 1) {
790  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
791  "reservation to 1 thread\n",
792  master_tid));
793  return 1;
794  }
795  if (new_nthreads < set_nthreads) {
796  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
797  "reservation to %d threads\n",
798  master_tid, new_nthreads));
799  }
800  }
801  } else {
802  KMP_ASSERT(0);
803  }
804 
805  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
806  if (__kmp_nth + new_nthreads -
807  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
808  __kmp_max_nth) {
809  int tl_nthreads = __kmp_max_nth - __kmp_nth +
810  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
811  if (tl_nthreads <= 0) {
812  tl_nthreads = 1;
813  }
814 
815  // If dyn-var is false, emit a 1-time warning.
816  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
817  __kmp_reserve_warn = 1;
818  __kmp_msg(kmp_ms_warning,
819  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
820  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
821  }
822  if (tl_nthreads == 1) {
823  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
824  "reduced reservation to 1 thread\n",
825  master_tid));
826  return 1;
827  }
828  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
829  "reservation to %d threads\n",
830  master_tid, tl_nthreads));
831  new_nthreads = tl_nthreads;
832  }
833 
834  // Respect OMP_THREAD_LIMIT
835  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
836  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
837  if (cg_nthreads + new_nthreads -
838  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
839  max_cg_threads) {
840  int tl_nthreads = max_cg_threads - cg_nthreads +
841  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
842  if (tl_nthreads <= 0) {
843  tl_nthreads = 1;
844  }
845 
846  // If dyn-var is false, emit a 1-time warning.
847  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
848  __kmp_reserve_warn = 1;
849  __kmp_msg(kmp_ms_warning,
850  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
851  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
852  }
853  if (tl_nthreads == 1) {
854  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
855  "reduced reservation to 1 thread\n",
856  master_tid));
857  return 1;
858  }
859  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
860  "reservation to %d threads\n",
861  master_tid, tl_nthreads));
862  new_nthreads = tl_nthreads;
863  }
864 
865  // Check if the threads array is large enough, or needs expanding.
866  // See comment in __kmp_register_root() about the adjustment if
867  // __kmp_threads[0] == NULL.
868  capacity = __kmp_threads_capacity;
869  if (TCR_PTR(__kmp_threads[0]) == NULL) {
870  --capacity;
871  }
872  // If it is not for initializing the hidden helper team, we need to take
873  // __kmp_hidden_helper_threads_num out of the capacity because it is included
874  // in __kmp_threads_capacity.
875  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
876  capacity -= __kmp_hidden_helper_threads_num;
877  }
878  if (__kmp_nth + new_nthreads -
879  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
880  capacity) {
881  // Expand the threads array.
882  int slotsRequired = __kmp_nth + new_nthreads -
883  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
884  capacity;
885  int slotsAdded = __kmp_expand_threads(slotsRequired);
886  if (slotsAdded < slotsRequired) {
887  // The threads array was not expanded enough.
888  new_nthreads -= (slotsRequired - slotsAdded);
889  KMP_ASSERT(new_nthreads >= 1);
890 
891  // If dyn-var is false, emit a 1-time warning.
892  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
893  __kmp_reserve_warn = 1;
894  if (__kmp_tp_cached) {
895  __kmp_msg(kmp_ms_warning,
896  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
897  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
898  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
899  } else {
900  __kmp_msg(kmp_ms_warning,
901  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
902  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
903  }
904  }
905  }
906  }
907 
908 #ifdef KMP_DEBUG
909  if (new_nthreads == 1) {
910  KC_TRACE(10,
911  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
912  "dead roots and rechecking; requested %d threads\n",
913  __kmp_get_gtid(), set_nthreads));
914  } else {
915  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
916  " %d threads\n",
917  __kmp_get_gtid(), new_nthreads, set_nthreads));
918  }
919 #endif // KMP_DEBUG
920  return new_nthreads;
921 }
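// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// Stripped of warnings and tracing, __kmp_reserve_threads clamps the requested
// team size by a series of independent caps: the dynamic-mode adjustment,
// KMP_DEVICE_THREAD_LIMIT (__kmp_max_nth), the contention-group limit
// (OMP_THREAD_LIMIT), and the capacity of the threads array.  The shape of
// that computation, with hypothetical headroom inputs:
#if 0
static int min_int(int a, int b) { return a < b ? a : b; }

static int reserve_threads_sketch(int requested, int dyn_adjusted,
                                  int device_headroom, int cg_headroom,
                                  int array_headroom) {
  int n = min_int(requested, dyn_adjusted); /* dyn-var adjustment        */
  n = min_int(n, device_headroom);          /* KMP_DEVICE_THREAD_LIMIT   */
  n = min_int(n, cg_headroom);              /* OMP_THREAD_LIMIT (group)  */
  n = min_int(n, array_headroom);           /* __kmp_threads capacity    */
  return n < 1 ? 1 : n;                     /* never below one thread    */
}
#endif
// ----------------------------------------------------------------------------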
922 
923 /* Allocate threads from the thread pool and assign them to the new team. We are
924  assured that there are enough threads available, because we checked on that
925  earlier while holding the forkjoin lock */
926 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
927  kmp_info_t *master_th, int master_gtid,
928  int fork_teams_workers) {
929  int i;
930  int use_hot_team;
931 
932  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
933  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
934  KMP_MB();
935 
936  /* first, let's setup the primary thread */
937  master_th->th.th_info.ds.ds_tid = 0;
938  master_th->th.th_team = team;
939  master_th->th.th_team_nproc = team->t.t_nproc;
940  master_th->th.th_team_master = master_th;
941  master_th->th.th_team_serialized = FALSE;
942  master_th->th.th_dispatch = &team->t.t_dispatch[0];
943 
944 /* make sure we are not the optimized hot team */
945 #if KMP_NESTED_HOT_TEAMS
946  use_hot_team = 0;
947  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
948  if (hot_teams) { // hot teams array is not allocated if
949  // KMP_HOT_TEAMS_MAX_LEVEL=0
950  int level = team->t.t_active_level - 1; // index in array of hot teams
951  if (master_th->th.th_teams_microtask) { // are we inside the teams?
952  if (master_th->th.th_teams_size.nteams > 1) {
953  ++level; // level was not increased in teams construct for
954  // team_of_masters
955  }
956  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
957  master_th->th.th_teams_level == team->t.t_level) {
958  ++level; // level was not increased in teams construct for
959  // team_of_workers before the parallel
960  } // team->t.t_level will be increased inside parallel
961  }
962  if (level < __kmp_hot_teams_max_level) {
963  if (hot_teams[level].hot_team) {
964  // hot team has already been allocated for given level
965  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
966  use_hot_team = 1; // the team is ready to use
967  } else {
968  use_hot_team = 0; // AC: threads are not allocated yet
969  hot_teams[level].hot_team = team; // remember new hot team
970  hot_teams[level].hot_team_nth = team->t.t_nproc;
971  }
972  } else {
973  use_hot_team = 0;
974  }
975  }
976 #else
977  use_hot_team = team == root->r.r_hot_team;
978 #endif
979  if (!use_hot_team) {
980 
981  /* install the primary thread */
982  team->t.t_threads[0] = master_th;
983  __kmp_initialize_info(master_th, team, 0, master_gtid);
984 
985  /* now, install the worker threads */
986  for (i = 1; i < team->t.t_nproc; i++) {
987 
988  /* fork or reallocate a new thread and install it in team */
989  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
990  team->t.t_threads[i] = thr;
991  KMP_DEBUG_ASSERT(thr);
992  KMP_DEBUG_ASSERT(thr->th.th_team == team);
993  /* align team and thread arrived states */
994  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
995  "T#%d(%d:%d) join =%llu, plain=%llu\n",
996  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
997  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
998  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
999  team->t.t_bar[bs_plain_barrier].b_arrived));
1000  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1001  thr->th.th_teams_level = master_th->th.th_teams_level;
1002  thr->th.th_teams_size = master_th->th.th_teams_size;
1003  { // Initialize threads' barrier data.
1004  int b;
1005  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1006  for (b = 0; b < bs_last_barrier; ++b) {
1007  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1008  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1009 #if USE_DEBUGGER
1010  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1011 #endif
1012  }
1013  }
1014  }
1015 
1016 #if KMP_AFFINITY_SUPPORTED
1017  // Do not partition the places list for teams construct workers who
1018  // haven't actually been forked to do real work yet. This partitioning
1019  // will take place in the parallel region nested within the teams construct.
1020  if (!fork_teams_workers) {
1021  __kmp_partition_places(team);
1022  }
1023 #endif
1024 
1025  if (team->t.t_nproc > 1 &&
1026  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1027  team->t.b->update_num_threads(team->t.t_nproc);
1028  __kmp_add_threads_to_team(team, team->t.t_nproc);
1029  }
1030  }
1031 
1032  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1033  for (i = 0; i < team->t.t_nproc; i++) {
1034  kmp_info_t *thr = team->t.t_threads[i];
1035  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1036  thr->th.th_prev_level != team->t.t_level) {
1037  team->t.t_display_affinity = 1;
1038  break;
1039  }
1040  }
1041  }
1042 
1043  KMP_MB();
1044 }
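// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// The "hot team" check above is a cache: if the team being forked is the one
// kept warm from the previous parallel region, the worker install loop is
// skipped entirely because the workers are already wired to it.  As a generic
// pattern (names hypothetical):
#if 0
struct team;                         /* opaque for this sketch */
extern struct team *cached_hot_team;
extern void install_workers(struct team *t);

static void fork_team(struct team *t) {
  if (t != cached_hot_team)
    install_workers(t);              /* allocate/recycle worker threads     */
  /* else: reuse the warm team; only primary-thread bookkeeping is refreshed */
}
#endif
// ----------------------------------------------------------------------------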
1045 
1046 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1047 // Propagate any changes to the floating point control registers out to the team
1048 // We try to avoid unnecessary writes to the relevant cache line in the team
1049 // structure, so we don't make changes unless they are needed.
1050 inline static void propagateFPControl(kmp_team_t *team) {
1051  if (__kmp_inherit_fp_control) {
1052  kmp_int16 x87_fpu_control_word;
1053  kmp_uint32 mxcsr;
1054 
1055  // Get primary thread's values of FPU control flags (both X87 and vector)
1056  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1057  __kmp_store_mxcsr(&mxcsr);
1058  mxcsr &= KMP_X86_MXCSR_MASK;
1059 
1060  // There is no point looking at t_fp_control_saved here.
1061  // If it is TRUE, we still have to update the values if they are different
1062  // from those we now have. If it is FALSE we didn't save anything yet, but
1063  // our objective is the same. We have to ensure that the values in the team
1064  // are the same as those we have.
1065  // So, this code achieves what we need whether or not t_fp_control_saved is
1066  // true. By checking whether the value needs updating we avoid unnecessary
1067  // writes that would put the cache-line into a written state, causing all
1068  // threads in the team to have to read it again.
1069  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1070  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1071  // Although we don't use this value, other code in the runtime wants to know
1072  // whether it should restore them. So we must ensure it is correct.
1073  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1074  } else {
1075  // Similarly here. Don't write to this cache-line in the team structure
1076  // unless we have to.
1077  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1078  }
1079 }
1080 
1081 // Do the opposite, setting the hardware registers to the updated values from
1082 // the team.
1083 inline static void updateHWFPControl(kmp_team_t *team) {
1084  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1085  // Only reset the fp control regs if they have been changed in the team
1086  // during the parallel region that we are exiting.
1087  kmp_int16 x87_fpu_control_word;
1088  kmp_uint32 mxcsr;
1089  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1090  __kmp_store_mxcsr(&mxcsr);
1091  mxcsr &= KMP_X86_MXCSR_MASK;
1092 
1093  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1094  __kmp_clear_x87_fpu_status_word();
1095  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1096  }
1097 
1098  if (team->t.t_mxcsr != mxcsr) {
1099  __kmp_load_mxcsr(&team->t.t_mxcsr);
1100  }
1101  }
1102 }
1103 #else
1104 #define propagateFPControl(x) ((void)0)
1105 #define updateHWFPControl(x) ((void)0)
1106 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
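// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// propagateFPControl/updateHWFPControl capture the primary thread's x87
// control word and MXCSR in the team and restore them in the workers, so the
// whole team computes with the same rounding/exception settings.  A portable
// analogue of "save the FP environment, later restore it only if it changed"
// can be written with the standard <fenv.h> interface:
#if 0
#include <fenv.h>
#include <string.h>

static fenv_t team_fp_env;       /* plays the role of t_x87_fpu_control_word/t_mxcsr */

static void save_team_fp_env(void) { fegetenv(&team_fp_env); }

static void restore_team_fp_env_if_needed(void) {
  fenv_t cur;
  fegetenv(&cur);
  if (memcmp(&cur, &team_fp_env, sizeof(fenv_t)) != 0)
    fesetenv(&team_fp_env);      /* only touch the registers when they differ */
}
#endif
// ----------------------------------------------------------------------------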
1107 
1108 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1109  int realloc); // forward declaration
1110 
1111 /* Run a parallel region that has been serialized, so it runs only in a team
1112  of the single primary thread. */
1113 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1114  kmp_info_t *this_thr;
1115  kmp_team_t *serial_team;
1116 
1117  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1118 
1119  /* Skip all this code for autopar serialized loops since it results in
1120  unacceptable overhead */
1121  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1122  return;
1123 
1124  if (!TCR_4(__kmp_init_parallel))
1125  __kmp_parallel_initialize();
1126  __kmp_resume_if_soft_paused();
1127 
1128  this_thr = __kmp_threads[global_tid];
1129  serial_team = this_thr->th.th_serial_team;
1130 
1131  /* utilize the serialized team held by this thread */
1132  KMP_DEBUG_ASSERT(serial_team);
1133  KMP_MB();
1134 
1135  if (__kmp_tasking_mode != tskm_immediate_exec) {
1136  KMP_DEBUG_ASSERT(
1137  this_thr->th.th_task_team ==
1138  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1139  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1140  NULL);
1141  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1142  "team %p, new task_team = NULL\n",
1143  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1144  this_thr->th.th_task_team = NULL;
1145  }
1146 
1147  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1148  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1149  proc_bind = proc_bind_false;
1150  } else if (proc_bind == proc_bind_default) {
1151  // No proc_bind clause was specified, so use the current value
1152  // of proc-bind-var for this parallel region.
1153  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1154  }
1155  // Reset for next parallel region
1156  this_thr->th.th_set_proc_bind = proc_bind_default;
1157 
1158  // Reset num_threads for next parallel region
1159  this_thr->th.th_set_nproc = 0;
1160 
1161 #if OMPT_SUPPORT
1162  ompt_data_t ompt_parallel_data = ompt_data_none;
1163  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1164  if (ompt_enabled.enabled &&
1165  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1166 
1167  ompt_task_info_t *parent_task_info;
1168  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1169 
1170  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1171  if (ompt_enabled.ompt_callback_parallel_begin) {
1172  int team_size = 1;
1173 
1174  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1175  &(parent_task_info->task_data), &(parent_task_info->frame),
1176  &ompt_parallel_data, team_size,
1177  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1178  }
1179  }
1180 #endif // OMPT_SUPPORT
1181 
1182  if (this_thr->th.th_team != serial_team) {
1183  // Nested level will be an index in the nested nthreads array
1184  int level = this_thr->th.th_team->t.t_level;
1185 
1186  if (serial_team->t.t_serialized) {
1187  /* this serial team was already used
1188  TODO: increase performance by making these locks more specific */
1189  kmp_team_t *new_team;
1190 
1191  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1192 
1193  new_team =
1194  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1195 #if OMPT_SUPPORT
1196  ompt_parallel_data,
1197 #endif
1198  proc_bind, &this_thr->th.th_current_task->td_icvs,
1199  0 USE_NESTED_HOT_ARG(NULL));
1200  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1201  KMP_ASSERT(new_team);
1202 
1203  /* setup new serialized team and install it */
1204  new_team->t.t_threads[0] = this_thr;
1205  new_team->t.t_parent = this_thr->th.th_team;
1206  serial_team = new_team;
1207  this_thr->th.th_serial_team = serial_team;
1208 
1209  KF_TRACE(
1210  10,
1211  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1212  global_tid, serial_team));
1213 
1214  /* TODO the above breaks the requirement that if we run out of resources,
1215  then we can still guarantee that serialized teams are ok, since we may
1216  need to allocate a new one */
1217  } else {
1218  KF_TRACE(
1219  10,
1220  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1221  global_tid, serial_team));
1222  }
1223 
1224  /* we have to initialize this serial team */
1225  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1226  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1227  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1228  serial_team->t.t_ident = loc;
1229  serial_team->t.t_serialized = 1;
1230  serial_team->t.t_nproc = 1;
1231  serial_team->t.t_parent = this_thr->th.th_team;
1232  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1233  this_thr->th.th_team = serial_team;
1234  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1235 
1236  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1237  this_thr->th.th_current_task));
1238  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1239  this_thr->th.th_current_task->td_flags.executing = 0;
1240 
1241  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1242 
1243  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1244  implicit task for each serialized task represented by
1245  team->t.t_serialized? */
1246  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1247  &this_thr->th.th_current_task->td_parent->td_icvs);
1248 
1249  // Thread value exists in the nested nthreads array for the next nested
1250  // level
1251  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1252  this_thr->th.th_current_task->td_icvs.nproc =
1253  __kmp_nested_nth.nth[level + 1];
1254  }
1255 
1256  if (__kmp_nested_proc_bind.used &&
1257  (level + 1 < __kmp_nested_proc_bind.used)) {
1258  this_thr->th.th_current_task->td_icvs.proc_bind =
1259  __kmp_nested_proc_bind.bind_types[level + 1];
1260  }
1261 
1262 #if USE_DEBUGGER
1263  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1264 #endif
1265  this_thr->th.th_info.ds.ds_tid = 0;
1266 
1267  /* set thread cache values */
1268  this_thr->th.th_team_nproc = 1;
1269  this_thr->th.th_team_master = this_thr;
1270  this_thr->th.th_team_serialized = 1;
1271 
1272  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1273  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1274  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1275 
1276  propagateFPControl(serial_team);
1277 
1278  /* check if we need to allocate dispatch buffers stack */
1279  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1280  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1281  serial_team->t.t_dispatch->th_disp_buffer =
1282  (dispatch_private_info_t *)__kmp_allocate(
1283  sizeof(dispatch_private_info_t));
1284  }
1285  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1286 
1287  KMP_MB();
1288 
1289  } else {
1290  /* this serialized team is already being used,
1291  * that's fine, just add another nested level */
1292  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1293  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1294  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1295  ++serial_team->t.t_serialized;
1296  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1297 
1298  // Nested level will be an index in the nested nthreads array
1299  int level = this_thr->th.th_team->t.t_level;
1300  // Thread value exists in the nested nthreads array for the next nested
1301  // level
1302  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1303  this_thr->th.th_current_task->td_icvs.nproc =
1304  __kmp_nested_nth.nth[level + 1];
1305  }
1306  serial_team->t.t_level++;
1307  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1308  "of serial team %p to %d\n",
1309  global_tid, serial_team, serial_team->t.t_level));
1310 
1311  /* allocate/push dispatch buffers stack */
1312  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1313  {
1314  dispatch_private_info_t *disp_buffer =
1315  (dispatch_private_info_t *)__kmp_allocate(
1316  sizeof(dispatch_private_info_t));
1317  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1318  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1319  }
1320  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1321 
1322  KMP_MB();
1323  }
1324  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1325 
1326  // Perform the display affinity functionality for
1327  // serialized parallel regions
1328  if (__kmp_display_affinity) {
1329  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1330  this_thr->th.th_prev_num_threads != 1) {
1331  // NULL means use the affinity-format-var ICV
1332  __kmp_aux_display_affinity(global_tid, NULL);
1333  this_thr->th.th_prev_level = serial_team->t.t_level;
1334  this_thr->th.th_prev_num_threads = 1;
1335  }
1336  }
1337 
1338  if (__kmp_env_consistency_check)
1339  __kmp_push_parallel(global_tid, NULL);
1340 #if OMPT_SUPPORT
1341  serial_team->t.ompt_team_info.master_return_address = codeptr;
1342  if (ompt_enabled.enabled &&
1343  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1344  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1345  OMPT_GET_FRAME_ADDRESS(0);
1346 
1347  ompt_lw_taskteam_t lw_taskteam;
1348  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1349  &ompt_parallel_data, codeptr);
1350 
1351  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1352  // Don't use lw_taskteam after linking. Content was swapped.
1353 
1354  /* OMPT implicit task begin */
1355  if (ompt_enabled.ompt_callback_implicit_task) {
1356  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1357  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1358  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1359  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1360  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1361  __kmp_tid_from_gtid(global_tid);
1362  }
1363 
1364  /* OMPT state */
1365  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1366  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1367  OMPT_GET_FRAME_ADDRESS(0);
1368  }
1369 #endif
1370 }
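// --- Illustrative sketch (not part of kmp_runtime.cpp) ----------------------
// For a serialized parallel region the routine above does not fork anything:
// it bumps the serial team's t_serialized nesting count and pushes one
// dispatch buffer per nesting level onto a singly linked stack, so the loop
// scheduling state of inner serialized regions never clobbers the outer one.
// The buffer-stack discipline in isolation (names hypothetical):
#if 0
#include <stdlib.h>

struct disp_buf {
  struct disp_buf *next;       /* like th_disp_buffer->next */
  /* ...per-region loop scheduling state... */
};

static void push_disp_buf(struct disp_buf **top) {
  struct disp_buf *b = (struct disp_buf *)calloc(1, sizeof(*b));
  b->next = *top;              /* outer region's buffer is preserved */
  *top = b;
}

static void pop_disp_buf(struct disp_buf **top) {
  struct disp_buf *b = *top;
  *top = b->next;              /* restore the enclosing region's state */
  free(b);
}
#endif
// ----------------------------------------------------------------------------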
1371 
1372 // Test if this fork is for a team closely nested in a teams construct
1373 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1374  microtask_t microtask, int level,
1375  int teams_level, kmp_va_list ap) {
1376  return (master_th->th.th_teams_microtask && ap &&
1377  microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1378 }
1379 
1380 // Test if this fork is for the teams construct, i.e. to form the outer league
1381 // of teams
1382 static inline bool __kmp_is_entering_teams(int active_level, int level,
1383  int teams_level, kmp_va_list ap) {
1384  return ((ap == NULL && active_level == 0) ||
1385  (ap && teams_level > 0 && teams_level == level));
1386 }
1387 
1388 // AC: This is start of parallel that is nested inside teams construct.
1389 // The team is actual (hot), all workers are ready at the fork barrier.
1390 // No lock needed to initialize the team a bit, then free workers.
1391 static inline int
1392 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1393  kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1394  enum fork_context_e call_context, microtask_t microtask,
1395  launch_t invoker, int master_set_numthreads, int level,
1396 #if OMPT_SUPPORT
1397  ompt_data_t ompt_parallel_data, void *return_address,
1398 #endif
1399  kmp_va_list ap) {
1400  void **argv;
1401  int i;
1402 
1403  parent_team->t.t_ident = loc;
1404  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1405  parent_team->t.t_argc = argc;
1406  argv = (void **)parent_team->t.t_argv;
1407  for (i = argc - 1; i >= 0; --i) {
1408  *argv++ = va_arg(kmp_va_deref(ap), void *);
1409  }
1410  // Increment our nested depth level, but do not increase the serialization
1411  if (parent_team == master_th->th.th_serial_team) {
1412  // AC: we are in serialized parallel
1413  __kmpc_serialized_parallel(loc, gtid);
1414  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1415 
1416  if (call_context == fork_context_gnu) {
1417  // AC: need to decrement t_serialized for enquiry functions to work
1418  // correctly, will restore at join time
1419  parent_team->t.t_serialized--;
1420  return TRUE;
1421  }
1422 
1423 #if OMPD_SUPPORT
1424  parent_team->t.t_pkfn = microtask;
1425 #endif
1426 
1427 #if OMPT_SUPPORT
1428  void *dummy;
1429  void **exit_frame_p;
1430  ompt_data_t *implicit_task_data;
1431  ompt_lw_taskteam_t lw_taskteam;
1432 
1433  if (ompt_enabled.enabled) {
1434  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1435  &ompt_parallel_data, return_address);
1436  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1437 
1438  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1439  // Don't use lw_taskteam after linking. Content was swapped.
1440 
1441  /* OMPT implicit task begin */
1442  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1443  if (ompt_enabled.ompt_callback_implicit_task) {
1444  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1445  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1446  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1447  1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1448  }
1449 
1450  /* OMPT state */
1451  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1452  } else {
1453  exit_frame_p = &dummy;
1454  }
1455 #endif
1456 
1457  // AC: need to decrement t_serialized for enquiry functions to work
1458  // correctly, will restore at join time
1459  parent_team->t.t_serialized--;
1460 
1461  {
1462  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1463  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1464  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1465 #if OMPT_SUPPORT
1466  ,
1467  exit_frame_p
1468 #endif
1469  );
1470  }
1471 
1472 #if OMPT_SUPPORT
1473  if (ompt_enabled.enabled) {
1474  *exit_frame_p = NULL;
1475  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1476  if (ompt_enabled.ompt_callback_implicit_task) {
1477  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1478  ompt_scope_end, NULL, implicit_task_data, 1,
1479  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1480  }
1481  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1482  __ompt_lw_taskteam_unlink(master_th);
1483  if (ompt_enabled.ompt_callback_parallel_end) {
1484  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1485  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1486  OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1487  }
1488  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1489  }
1490 #endif
1491  return TRUE;
1492  }
1493 
1494  parent_team->t.t_pkfn = microtask;
1495  parent_team->t.t_invoke = invoker;
1496  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1497  parent_team->t.t_active_level++;
1498  parent_team->t.t_level++;
1499  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1500 
1501  // If the threads allocated to the team are less than the thread limit, update
1502  // the thread limit here. th_teams_size.nth is specific to this team nested
1503  // in a teams construct, the team is fully created, and we're about to do
1504  // the actual fork. Best to do this here so that the subsequent uses below
1505  // and in the join have the correct value.
1506  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1507 
1508 #if OMPT_SUPPORT
1509  if (ompt_enabled.enabled) {
1510  ompt_lw_taskteam_t lw_taskteam;
1511  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1512  return_address);
1513  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1514  }
1515 #endif
1516 
1517  /* Change number of threads in the team if requested */
1518  if (master_set_numthreads) { // The parallel has num_threads clause
1519  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1520  // AC: only can reduce number of threads dynamically, can't increase
1521  kmp_info_t **other_threads = parent_team->t.t_threads;
1522  // NOTE: if using distributed barrier, we need to run this code block
1523  // even when the team size appears not to have changed from the max.
1524  int old_proc = master_th->th.th_teams_size.nth;
1525  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1526  __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1527  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1528  }
1529  parent_team->t.t_nproc = master_set_numthreads;
1530  for (i = 0; i < master_set_numthreads; ++i) {
1531  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1532  }
1533  }
1534  // Keep extra threads hot in the team for possible next parallels
1535  master_th->th.th_set_nproc = 0;
1536  }
1537 
1538 #if USE_DEBUGGER
1539  if (__kmp_debugging) { // Let debugger override number of threads.
1540  int nth = __kmp_omp_num_threads(loc);
1541  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1542  master_set_numthreads = nth;
1543  }
1544  }
1545 #endif
1546 
1547  // Figure out the proc_bind policy for the nested parallel within teams
1548  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1549  // proc_bind_default means don't update
1550  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1551  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1552  proc_bind = proc_bind_false;
1553  } else {
1554  // No proc_bind clause specified; use current proc-bind-var
1555  if (proc_bind == proc_bind_default) {
1556  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1557  }
1558  /* else: The proc_bind policy was specified explicitly on the parallel
1559  directive's proc_bind clause. This overrides proc-bind-var for this
1560  parallel region, but does not change proc-bind-var. */
1561  // Figure the value of proc-bind-var for the child threads.
1562  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1563  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1564  master_th->th.th_current_task->td_icvs.proc_bind)) {
1565  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1566  }
1567  }
1568  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1569  // Need to change the bind-var ICV to correct value for each implicit task
1570  if (proc_bind_icv != proc_bind_default &&
1571  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1572  kmp_info_t **other_threads = parent_team->t.t_threads;
1573  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1574  other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1575  }
1576  }
1577  // Reset for next parallel region
1578  master_th->th.th_set_proc_bind = proc_bind_default;
1579 
1580 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1581  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1582  KMP_ITT_DEBUG) &&
1583  __kmp_forkjoin_frames_mode == 3 &&
1584  parent_team->t.t_active_level == 1 // only report frames at level 1
1585  && master_th->th.th_teams_size.nteams == 1) {
1586  kmp_uint64 tmp_time = __itt_get_timestamp();
1587  master_th->th.th_frame_time = tmp_time;
1588  parent_team->t.t_region_time = tmp_time;
1589  }
1590  if (__itt_stack_caller_create_ptr) {
1591  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1592  // create new stack stitching id before entering fork barrier
1593  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1594  }
1595 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1596 #if KMP_AFFINITY_SUPPORTED
1597  __kmp_partition_places(parent_team);
1598 #endif
1599 
1600  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1601  "master_th=%p, gtid=%d\n",
1602  root, parent_team, master_th, gtid));
1603  __kmp_internal_fork(loc, gtid, parent_team);
1604  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1605  "master_th=%p, gtid=%d\n",
1606  root, parent_team, master_th, gtid));
1607 
1608  if (call_context == fork_context_gnu)
1609  return TRUE;
1610 
1611  /* Invoke microtask for PRIMARY thread */
1612  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1613  parent_team->t.t_id, parent_team->t.t_pkfn));
1614 
1615  if (!parent_team->t.t_invoke(gtid)) {
1616  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1617  }
1618  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1619  parent_team->t.t_id, parent_team->t.t_pkfn));
1620  KMP_MB(); /* Flush all pending memory write invalidates. */
1621 
1622  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1623 
1624  return TRUE;
1625 }
1626 
1627 // Create a serialized parallel region
1628 static inline int
1629 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1630  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1631  kmp_info_t *master_th, kmp_team_t *parent_team,
1632 #if OMPT_SUPPORT
1633  ompt_data_t *ompt_parallel_data, void **return_address,
1634  ompt_data_t **parent_task_data,
1635 #endif
1636  kmp_va_list ap) {
1637  kmp_team_t *team;
1638  int i;
1639  void **argv;
1640 
1641 /* josh todo: hypothetical question: what do we do for OS X*? */
1642 #if KMP_OS_LINUX && \
1643  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1644  void *args[argc];
1645 #else
1646  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1647 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1648  KMP_ARCH_AARCH64) */
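 /* Note: on the Linux targets listed above a VLA keeps the argument copy on
    the stack; elsewhere KMP_ALLOCA serves the same purpose, so in either case
    `args` lives only for the duration of this serialized fork call. */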
1649 
1650  KA_TRACE(
1651  20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1652 
1653  __kmpc_serialized_parallel(loc, gtid);
1654 
1655 #if OMPD_SUPPORT
1656  master_th->th.th_serial_team->t.t_pkfn = microtask;
1657 #endif
1658 
1659  if (call_context == fork_context_intel) {
1660  /* TODO: this is clumsy; have the compiler pass the args directly. */
1661  master_th->th.th_serial_team->t.t_ident = loc;
1662  if (!ap) {
1663  // revert change made in __kmpc_serialized_parallel()
1664  master_th->th.th_serial_team->t.t_level--;
1665 // Get args from parent team for teams construct
1666 
1667 #if OMPT_SUPPORT
1668  void *dummy;
1669  void **exit_frame_p;
1670  ompt_task_info_t *task_info;
1671  ompt_lw_taskteam_t lw_taskteam;
1672 
1673  if (ompt_enabled.enabled) {
1674  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1675  ompt_parallel_data, *return_address);
1676 
1677  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1678  // Do not use lw_taskteam after linking; its contents were swapped.
1679  task_info = OMPT_CUR_TASK_INFO(master_th);
1680  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1681  if (ompt_enabled.ompt_callback_implicit_task) {
1682  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1683  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1684  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1685  &(task_info->task_data), 1,
1686  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1687  }
1688 
1689  /* OMPT state */
1690  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1691  } else {
1692  exit_frame_p = &dummy;
1693  }
1694 #endif
1695 
1696  {
1697  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1698  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1699  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1700 #if OMPT_SUPPORT
1701  ,
1702  exit_frame_p
1703 #endif
1704  );
1705  }
1706 
1707 #if OMPT_SUPPORT
1708  if (ompt_enabled.enabled) {
1709  *exit_frame_p = NULL;
1710  if (ompt_enabled.ompt_callback_implicit_task) {
1711  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1712  ompt_scope_end, NULL, &(task_info->task_data), 1,
1713  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1714  }
1715  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1716  __ompt_lw_taskteam_unlink(master_th);
1717  if (ompt_enabled.ompt_callback_parallel_end) {
1718  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1719  ompt_parallel_data, *parent_task_data,
1720  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1721  }
1722  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1723  }
1724 #endif
1725  } else if (microtask == (microtask_t)__kmp_teams_master) {
1726  KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1727  team = master_th->th.th_team;
1728  // team->t.t_pkfn = microtask;
1729  team->t.t_invoke = invoker;
1730  __kmp_alloc_argv_entries(argc, team, TRUE);
1731  team->t.t_argc = argc;
1732  argv = (void **)team->t.t_argv;
1733  if (ap) {
1734  for (i = argc - 1; i >= 0; --i)
1735  *argv++ = va_arg(kmp_va_deref(ap), void *);
1736  } else {
1737  for (i = 0; i < argc; ++i)
1738  // Get args from parent team for teams construct
1739  argv[i] = parent_team->t.t_argv[i];
1740  }
1741  // AC: revert change made in __kmpc_serialized_parallel()
1742  // because initial code in teams should have level=0
1743  team->t.t_level--;
1744  // AC: call special invoker for outer "parallel" of teams construct
1745  invoker(gtid);
1746 #if OMPT_SUPPORT
1747  if (ompt_enabled.enabled) {
1748  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1749  if (ompt_enabled.ompt_callback_implicit_task) {
1750  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1751  ompt_scope_end, NULL, &(task_info->task_data), 0,
1752  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1753  }
1754  if (ompt_enabled.ompt_callback_parallel_end) {
1755  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1756  ompt_parallel_data, *parent_task_data,
1757  OMPT_INVOKER(call_context) | ompt_parallel_league,
1758  *return_address);
1759  }
1760  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1761  }
1762 #endif
1763  } else {
1764  argv = args;
1765  for (i = argc - 1; i >= 0; --i)
1766  *argv++ = va_arg(kmp_va_deref(ap), void *);
1767  KMP_MB();
1768 
1769 #if OMPT_SUPPORT
1770  void *dummy;
1771  void **exit_frame_p;
1772  ompt_task_info_t *task_info;
1773  ompt_lw_taskteam_t lw_taskteam;
1774  ompt_data_t *implicit_task_data;
1775 
1776  if (ompt_enabled.enabled) {
1777  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1778  ompt_parallel_data, *return_address);
1779  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1780  // Do not use lw_taskteam after linking; its contents were swapped.
1781  task_info = OMPT_CUR_TASK_INFO(master_th);
1782  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1783 
1784  /* OMPT implicit task begin */
1785  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1786  if (ompt_enabled.ompt_callback_implicit_task) {
1787  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1788  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1789  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1790  ompt_task_implicit);
1791  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1792  }
1793 
1794  /* OMPT state */
1795  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1796  } else {
1797  exit_frame_p = &dummy;
1798  }
1799 #endif
1800 
1801  {
1802  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1803  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1804  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1805 #if OMPT_SUPPORT
1806  ,
1807  exit_frame_p
1808 #endif
1809  );
1810  }
1811 
1812 #if OMPT_SUPPORT
1813  if (ompt_enabled.enabled) {
1814  *exit_frame_p = NULL;
1815  if (ompt_enabled.ompt_callback_implicit_task) {
1816  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1817  ompt_scope_end, NULL, &(task_info->task_data), 1,
1818  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1819  }
1820 
1821  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1822  __ompt_lw_taskteam_unlink(master_th);
1823  if (ompt_enabled.ompt_callback_parallel_end) {
1824  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1825  ompt_parallel_data, *parent_task_data,
1826  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1827  }
1828  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1829  }
1830 #endif
1831  }
1832  } else if (call_context == fork_context_gnu) {
1833 #if OMPT_SUPPORT
1834  if (ompt_enabled.enabled) {
1835  ompt_lw_taskteam_t lwt;
1836  __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1837  *return_address);
1838 
1839  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1840  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1841  }
1842 // Do not use lw_taskteam after linking; its contents were swapped.
1843 #endif
1844 
1845  // we were called from GNU native code
1846  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1847  return FALSE;
1848  } else {
1849  KMP_ASSERT2(call_context < fork_context_last,
1850  "__kmp_serial_fork_call: unknown fork_context parameter");
1851  }
1852 
1853  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1854  KMP_MB();
1855  return FALSE;
1856 }
1857 
1858 /* most of the work for a fork */
1859 /* return true if we really went parallel, false if serialized */
1860 int __kmp_fork_call(ident_t *loc, int gtid,
1861  enum fork_context_e call_context, // Intel, GNU, ...
1862  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1863  kmp_va_list ap) {
1864  void **argv;
1865  int i;
1866  int master_tid;
1867  int master_this_cons;
1868  kmp_team_t *team;
1869  kmp_team_t *parent_team;
1870  kmp_info_t *master_th;
1871  kmp_root_t *root;
1872  int nthreads;
1873  int master_active;
1874  int master_set_numthreads;
1875  int level;
1876  int active_level;
1877  int teams_level;
1878 #if KMP_NESTED_HOT_TEAMS
1879  kmp_hot_team_ptr_t **p_hot_teams;
1880 #endif
1881  { // KMP_TIME_BLOCK
1882  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1883  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1884 
1885  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1886  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1887  /* Some systems prefer the stack for the root thread(s) to start with */
1888  /* some gap from the parent stack to prevent false sharing. */
1889  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1890  /* These 2 lines below are so this does not get optimized out */
1891  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1892  __kmp_stkpadding += (short)((kmp_int64)dummy);
1893  }
1894 
1895  /* initialize if needed */
1896  KMP_DEBUG_ASSERT(
1897  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1898  if (!TCR_4(__kmp_init_parallel))
1899  __kmp_parallel_initialize();
1900  __kmp_resume_if_soft_paused();
1901 
1902  /* setup current data */
1903  // AC: potentially unsafe, not in sync with library shutdown,
1904  // __kmp_threads can be freed
1905  master_th = __kmp_threads[gtid];
1906 
1907  parent_team = master_th->th.th_team;
1908  master_tid = master_th->th.th_info.ds.ds_tid;
1909  master_this_cons = master_th->th.th_local.this_construct;
1910  root = master_th->th.th_root;
1911  master_active = root->r.r_active;
1912  master_set_numthreads = master_th->th.th_set_nproc;
1913 
1914 #if OMPT_SUPPORT
1915  ompt_data_t ompt_parallel_data = ompt_data_none;
1916  ompt_data_t *parent_task_data;
1917  ompt_frame_t *ompt_frame;
1918  void *return_address = NULL;
1919 
1920  if (ompt_enabled.enabled) {
1921  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1922  NULL, NULL);
1923  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1924  }
1925 #endif
1926 
1927  // Assign affinity to root thread if it hasn't happened yet
1928  __kmp_assign_root_init_mask();
1929 
1930  // Nested level will be an index in the nested nthreads array
1931  level = parent_team->t.t_level;
1932  // used to launch non-serial teams even if nested is not allowed
1933  active_level = parent_team->t.t_active_level;
1934  // needed to check nesting inside the teams
1935  teams_level = master_th->th.th_teams_level;
1936 #if KMP_NESTED_HOT_TEAMS
1937  p_hot_teams = &master_th->th.th_hot_teams;
1938  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1939  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1940  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1941  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1942  // the value 1 is either correct or not needed (when active_level > 0)
1943  (*p_hot_teams)[0].hot_team_nth = 1;
1944  }
1945 #endif
1946 
1947 #if OMPT_SUPPORT
1948  if (ompt_enabled.enabled) {
1949  if (ompt_enabled.ompt_callback_parallel_begin) {
1950  int team_size = master_set_numthreads
1951  ? master_set_numthreads
1952  : get__nproc_2(parent_team, master_tid);
1953  int flags = OMPT_INVOKER(call_context) |
1954  ((microtask == (microtask_t)__kmp_teams_master)
1955  ? ompt_parallel_league
1956  : ompt_parallel_team);
1957  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1958  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1959  return_address);
1960  }
1961  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1962  }
1963 #endif
1964 
1965  master_th->th.th_ident = loc;
1966 
1967  // Parallel closely nested in teams construct:
1968  if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1969  return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1970  call_context, microtask, invoker,
1971  master_set_numthreads, level,
1972 #if OMPT_SUPPORT
1973  ompt_parallel_data, return_address,
1974 #endif
1975  ap);
1976  } // End parallel closely nested in teams construct
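  // For reference, a user-level pattern that reaches __kmp_fork_in_teams()
  // above (illustrative sketch, clause values are arbitrary):
  //
  //   #pragma omp teams num_teams(4) thread_limit(8)
  //   #pragma omp parallel
  //   { /* work shared by the threads of each team */ }
  //
  // Any parallel closely nested in teams is forked through that path instead
  // of the general team allocation below.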
1977 
1978 #if KMP_DEBUG
1979  if (__kmp_tasking_mode != tskm_immediate_exec) {
1980  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1981  parent_team->t.t_task_team[master_th->th.th_task_state]);
1982  }
1983 #endif
1984 
1985  // Need this to happen before we determine the number of threads, not while
1986  // we are allocating the team
1987  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1988 
1989  // Determine the number of threads
1990  int enter_teams =
1991  __kmp_is_entering_teams(active_level, level, teams_level, ap);
1992  if ((!enter_teams &&
1993  (parent_team->t.t_active_level >=
1994  master_th->th.th_current_task->td_icvs.max_active_levels)) ||
1995  (__kmp_library == library_serial)) {
1996  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
1997  nthreads = 1;
1998  } else {
1999  nthreads = master_set_numthreads
2000  ? master_set_numthreads
2001  // TODO: get nproc directly from current task
2002  : get__nproc_2(parent_team, master_tid);
2003  // Do we need to take the forkjoin lock? (There is no need for a serialized
2004  // parallel region outside of a teams construct.)
2005  if (nthreads > 1) {
2006  /* determine how many new threads we can use */
2007  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2008  /* AC: If we execute teams from a parallel region (on the host), then the
2009  teams should be created, but each can have only 1 thread if nesting is
2010  disabled. If teams is called from a serial region, then the teams and
2011  their threads should be created regardless of the nesting setting. */
2012  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2013  nthreads, enter_teams);
2014  if (nthreads == 1) {
2015  // Free lock for single thread execution here; for multi-thread
2016  // execution it will be freed later after team of threads created
2017  // and initialized
2018  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2019  }
2020  }
2021  }
2022  KMP_DEBUG_ASSERT(nthreads > 0);
2023 
2024  // If we temporarily changed the set number of threads then restore it now
2025  master_th->th.th_set_nproc = 0;
2026 
2027  if (nthreads == 1) {
2028  return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2029  invoker, master_th, parent_team,
2030 #if OMPT_SUPPORT
2031  &ompt_parallel_data, &return_address,
2032  &parent_task_data,
2033 #endif
2034  ap);
2035  } // if (nthreads == 1)
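  // Illustrative note: the serialized path above is taken, for example, when
  // the max-active-levels limit has already been reached:
  //
  //   omp_set_max_active_levels(1);
  //   #pragma omp parallel
  //   {
  //     #pragma omp parallel  // nthreads == 1 here, forwarded to
  //     { /* ... */ }         // __kmp_serial_fork_call()
  //   }
  //
  // or when KMP_LIBRARY=serial selects __kmp_library == library_serial.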
2036 
2037  // GEH: only modify the executing flag in the non-serialized case; the
2038  // serialized case is handled in __kmpc_serialized_parallel
2039  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2040  "curtask=%p, curtask_max_aclevel=%d\n",
2041  parent_team->t.t_active_level, master_th,
2042  master_th->th.th_current_task,
2043  master_th->th.th_current_task->td_icvs.max_active_levels));
2044  // TODO: GEH - cannot do this assertion because root thread not set up as
2045  // executing
2046  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2047  master_th->th.th_current_task->td_flags.executing = 0;
2048 
2049  if (!master_th->th.th_teams_microtask || level > teams_level) {
2050  /* Increment our nested depth level */
2051  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2052  }
2053 
2054  // See if we need to make a copy of the ICVs.
2055  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2056  if ((level + 1 < __kmp_nested_nth.used) &&
2057  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2058  nthreads_icv = __kmp_nested_nth.nth[level + 1];
2059  } else {
2060  nthreads_icv = 0; // don't update
2061  }
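  // Sketch of where __kmp_nested_nth typically comes from (settings parsing is
  // assumed to live in kmp_settings.cpp): a list-valued OMP_NUM_THREADS such as
  //
  //   OMP_NUM_THREADS=8,4,2
  //
  // provides one nproc value per nesting level, so a fork at level 0 would
  // pick up nthreads_icv == 4 here for the new team's implicit tasks.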
2062 
2063  // Figure out the proc_bind_policy for the new team.
2064  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2065  // proc_bind_default means don't update
2066  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2067  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2068  proc_bind = proc_bind_false;
2069  } else {
2070  // No proc_bind clause specified; use current proc-bind-var for this
2071  // parallel region
2072  if (proc_bind == proc_bind_default) {
2073  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2074  }
2075  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2076  if (master_th->th.th_teams_microtask &&
2077  microtask == (microtask_t)__kmp_teams_master) {
2078  proc_bind = __kmp_teams_proc_bind;
2079  }
2080  /* else: The proc_bind policy was specified explicitly on the parallel
2081  directive's proc_bind clause. This overrides proc-bind-var for this
2082  parallel region, but does not change proc-bind-var. */
2083  // Figure the value of proc-bind-var for the child threads.
2084  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2085  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2086  master_th->th.th_current_task->td_icvs.proc_bind)) {
2087  // Do not modify the proc_bind ICV for the two teams-construct forks;
2088  // they just let the proc_bind ICV pass through.
2089  if (!master_th->th.th_teams_microtask ||
2090  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2091  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2092  }
2093  }
2094 
2095  // Reset for next parallel region
2096  master_th->th.th_set_proc_bind = proc_bind_default;
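  // For reference (environment parsing is assumed to live elsewhere in the
  // runtime): a list-valued OMP_PROC_BIND such as
  //
  //   OMP_PROC_BIND=spread,close
  //
  // fills __kmp_nested_proc_bind.bind_types per nesting level, which is what
  // the logic above consults; an explicit proc_bind clause overrides
  // proc-bind-var for this region only, as noted in the comments.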
2097 
2098  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2099  kmp_internal_control_t new_icvs;
2100  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2101  new_icvs.next = NULL;
2102  if (nthreads_icv > 0) {
2103  new_icvs.nproc = nthreads_icv;
2104  }
2105  if (proc_bind_icv != proc_bind_default) {
2106  new_icvs.proc_bind = proc_bind_icv;
2107  }
2108 
2109  /* allocate a new parallel team */
2110  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2111  team = __kmp_allocate_team(root, nthreads, nthreads,
2112 #if OMPT_SUPPORT
2113  ompt_parallel_data,
2114 #endif
2115  proc_bind, &new_icvs,
2116  argc USE_NESTED_HOT_ARG(master_th));
2117  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2118  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2119  } else {
2120  /* allocate a new parallel team */
2121  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2122  team = __kmp_allocate_team(root, nthreads, nthreads,
2123 #if OMPT_SUPPORT
2124  ompt_parallel_data,
2125 #endif
2126  proc_bind,
2127  &master_th->th.th_current_task->td_icvs,
2128  argc USE_NESTED_HOT_ARG(master_th));
2129  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2130  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2131  &master_th->th.th_current_task->td_icvs);
2132  }
2133  KF_TRACE(
2134  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2135 
2136  /* setup the new team */
2137  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2138  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2139  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2140  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2141  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2142 #if OMPT_SUPPORT
2143  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2144  return_address);
2145 #endif
2146  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2147  // TODO: parent_team->t.t_level == INT_MAX ???
2148  if (!master_th->th.th_teams_microtask || level > teams_level) {
2149  int new_level = parent_team->t.t_level + 1;
2150  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2151  new_level = parent_team->t.t_active_level + 1;
2152  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2153  } else {
2154  // AC: Do not increase parallel level at start of the teams construct
2155  int new_level = parent_team->t.t_level;
2156  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2157  new_level = parent_team->t.t_active_level;
2158  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2159  }
2160  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2161  // set primary thread's schedule as new run-time schedule
2162  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2163 
2164  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2165  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2166 
2167  // Update the floating point rounding in the team if required.
2168  propagateFPControl(team);
2169 #if OMPD_SUPPORT
2170  if (ompd_state & OMPD_ENABLE_BP)
2171  ompd_bp_parallel_begin();
2172 #endif
2173 
2174  if (__kmp_tasking_mode != tskm_immediate_exec) {
2175  // Set primary thread's task team to team's task team. Unless this is hot
2176  // team, it should be NULL.
2177  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2178  parent_team->t.t_task_team[master_th->th.th_task_state]);
2179  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2180  "%p, new task_team %p / team %p\n",
2181  __kmp_gtid_from_thread(master_th),
2182  master_th->th.th_task_team, parent_team,
2183  team->t.t_task_team[master_th->th.th_task_state], team));
2184 
2185  if (active_level || master_th->th.th_task_team) {
2186  // Take a memo of primary thread's task_state
2187  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2188  if (master_th->th.th_task_state_top >=
2189  master_th->th.th_task_state_stack_sz) { // increase size
2190  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2191  kmp_uint8 *old_stack, *new_stack;
2192  kmp_uint32 i;
2193  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2194  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2195  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2196  }
2197  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2198  ++i) { // zero-init rest of stack
2199  new_stack[i] = 0;
2200  }
2201  old_stack = master_th->th.th_task_state_memo_stack;
2202  master_th->th.th_task_state_memo_stack = new_stack;
2203  master_th->th.th_task_state_stack_sz = new_size;
2204  __kmp_free(old_stack);
2205  }
2206  // Store primary thread's task_state on stack
2207  master_th->th
2208  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2209  master_th->th.th_task_state;
2210  master_th->th.th_task_state_top++;
2211 #if KMP_NESTED_HOT_TEAMS
2212  if (master_th->th.th_hot_teams &&
2213  active_level < __kmp_hot_teams_max_level &&
2214  team == master_th->th.th_hot_teams[active_level].hot_team) {
2215  // Restore primary thread's nested state if nested hot team
2216  master_th->th.th_task_state =
2217  master_th->th
2218  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2219  } else {
2220 #endif
2221  master_th->th.th_task_state = 0;
2222 #if KMP_NESTED_HOT_TEAMS
2223  }
2224 #endif
2225  }
2226 #if !KMP_NESTED_HOT_TEAMS
2227  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2228  (team == root->r.r_hot_team));
2229 #endif
2230  }
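  // The memo stack handled above is a small byte-per-level LIFO of the primary
  // thread's th_task_state that doubles when full. A minimal sketch of the same
  // idea (illustrative only, not the runtime's data structures):
  //
  //   if (top >= size) { /* allocate 2 * size, copy, zero the tail, free old */ }
  //   stack[top++] = state;   // push at fork (done above)
  //   state = stack[--top];   // pop at join (see __kmp_join_call)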
2231 
2232  KA_TRACE(
2233  20,
2234  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2235  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2236  team->t.t_nproc));
2237  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2238  (team->t.t_master_tid == 0 &&
2239  (team->t.t_parent == root->r.r_root_team ||
2240  team->t.t_parent->t.t_serialized)));
2241  KMP_MB();
2242 
2243  /* now, setup the arguments */
2244  argv = (void **)team->t.t_argv;
2245  if (ap) {
2246  for (i = argc - 1; i >= 0; --i) {
2247  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2248  KMP_CHECK_UPDATE(*argv, new_argv);
2249  argv++;
2250  }
2251  } else {
2252  for (i = 0; i < argc; ++i) {
2253  // Get args from parent team for teams construct
2254  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2255  }
2256  }
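    // The argv block prepared above is what the outlined microtask receives; a
    // hedged sketch of the call shape (the real dispatch is in
    // __kmp_invoke_microtask):
    //
    //   (*pkfn)(&gtid, &tid, argv[0], argv[1], ..., argv[argc - 1]);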
2257 
2258  /* now actually fork the threads */
2259  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2260  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2261  root->r.r_active = TRUE;
2262 
2263  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2264  __kmp_setup_icv_copy(team, nthreads,
2265  &master_th->th.th_current_task->td_icvs, loc);
2266 
2267 #if OMPT_SUPPORT
2268  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2269 #endif
2270 
2271  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2272 
2273 #if USE_ITT_BUILD
2274  if (team->t.t_active_level == 1 // only report frames at level 1
2275  && !master_th->th.th_teams_microtask) { // not in teams construct
2276 #if USE_ITT_NOTIFY
2277  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2278  (__kmp_forkjoin_frames_mode == 3 ||
2279  __kmp_forkjoin_frames_mode == 1)) {
2280  kmp_uint64 tmp_time = 0;
2281  if (__itt_get_timestamp_ptr)
2282  tmp_time = __itt_get_timestamp();
2283  // Internal fork - report frame begin
2284  master_th->th.th_frame_time = tmp_time;
2285  if (__kmp_forkjoin_frames_mode == 3)
2286  team->t.t_region_time = tmp_time;
2287  } else
2288 // only one notification scheme (either "submit" or "forking/joined", not both)
2289 #endif /* USE_ITT_NOTIFY */
2290  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2291  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2292  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2293  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2294  }
2295  }
2296 #endif /* USE_ITT_BUILD */
2297 
2298  /* now go on and do the work */
2299  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2300  KMP_MB();
2301  KF_TRACE(10,
2302  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2303  root, team, master_th, gtid));
2304 
2305 #if USE_ITT_BUILD
2306  if (__itt_stack_caller_create_ptr) {
2307  // create new stack stitching id before entering fork barrier
2308  if (!enter_teams) {
2309  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2310  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2311  } else if (parent_team->t.t_serialized) {
2312  // keep stack stitching id in the serialized parent_team;
2313  // current team will be used for parallel inside the teams;
2314  // if parent_team is active, then it already keeps stack stitching id
2315  // for the league of teams
2316  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2317  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2318  }
2319  }
2320 #endif /* USE_ITT_BUILD */
2321 
2322  // AC: skip __kmp_internal_fork at teams construct, let only primary
2323  // threads execute
2324  if (ap) {
2325  __kmp_internal_fork(loc, gtid, team);
2326  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2327  "master_th=%p, gtid=%d\n",
2328  root, team, master_th, gtid));
2329  }
2330 
2331  if (call_context == fork_context_gnu) {
2332  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2333  return TRUE;
2334  }
2335 
2336  /* Invoke microtask for PRIMARY thread */
2337  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2338  team->t.t_id, team->t.t_pkfn));
2339  } // END of timer KMP_fork_call block
2340 
2341 #if KMP_STATS_ENABLED
2342  // If beginning a teams construct, then change thread state
2343  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2344  if (!ap) {
2345  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2346  }
2347 #endif
2348 
2349  if (!team->t.t_invoke(gtid)) {
2350  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2351  }
2352 
2353 #if KMP_STATS_ENABLED
2354  // If was beginning of a teams construct, then reset thread state
2355  if (!ap) {
2356  KMP_SET_THREAD_STATE(previous_state);
2357  }
2358 #endif
2359 
2360  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2361  team->t.t_id, team->t.t_pkfn));
2362  KMP_MB(); /* Flush all pending memory write invalidates. */
2363 
2364  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2365 #if OMPT_SUPPORT
2366  if (ompt_enabled.enabled) {
2367  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2368  }
2369 #endif
2370 
2371  return TRUE;
2372 }
2373 
2374 #if OMPT_SUPPORT
2375 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2376  kmp_team_t *team) {
2377  // restore state outside the region
2378  thread->th.ompt_thread_info.state =
2379  ((team->t.t_serialized) ? ompt_state_work_serial
2380  : ompt_state_work_parallel);
2381 }
2382 
2383 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2384  kmp_team_t *team, ompt_data_t *parallel_data,
2385  int flags, void *codeptr) {
2386  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2387  if (ompt_enabled.ompt_callback_parallel_end) {
2388  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2389  parallel_data, &(task_info->task_data), flags, codeptr);
2390  }
2391 
2392  task_info->frame.enter_frame = ompt_data_none;
2393  __kmp_join_restore_state(thread, team);
2394 }
2395 #endif
2396 
2397 void __kmp_join_call(ident_t *loc, int gtid
2398 #if OMPT_SUPPORT
2399  ,
2400  enum fork_context_e fork_context
2401 #endif
2402  ,
2403  int exit_teams) {
2404  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2405  kmp_team_t *team;
2406  kmp_team_t *parent_team;
2407  kmp_info_t *master_th;
2408  kmp_root_t *root;
2409  int master_active;
2410 
2411  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2412 
2413  /* setup current data */
2414  master_th = __kmp_threads[gtid];
2415  root = master_th->th.th_root;
2416  team = master_th->th.th_team;
2417  parent_team = team->t.t_parent;
2418 
2419  master_th->th.th_ident = loc;
2420 
2421 #if OMPT_SUPPORT
2422  void *team_microtask = (void *)team->t.t_pkfn;
2423  // For GOMP interface with serialized parallel, need the
2424  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2425  // and end-parallel events.
2426  if (ompt_enabled.enabled &&
2427  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2428  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2429  }
2430 #endif
2431 
2432 #if KMP_DEBUG
2433  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2434  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2435  "th_task_team = %p\n",
2436  __kmp_gtid_from_thread(master_th), team,
2437  team->t.t_task_team[master_th->th.th_task_state],
2438  master_th->th.th_task_team));
2439  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2440  team->t.t_task_team[master_th->th.th_task_state]);
2441  }
2442 #endif
2443 
2444  if (team->t.t_serialized) {
2445  if (master_th->th.th_teams_microtask) {
2446  // We are in teams construct
2447  int level = team->t.t_level;
2448  int tlevel = master_th->th.th_teams_level;
2449  if (level == tlevel) {
2450  // AC: we haven't incremented it earlier at start of teams construct,
2451  // so do it here - at the end of teams construct
2452  team->t.t_level++;
2453  } else if (level == tlevel + 1) {
2454  // AC: we are exiting parallel inside teams, need to increment
2455  // serialization in order to restore it in the next call to
2456  // __kmpc_end_serialized_parallel
2457  team->t.t_serialized++;
2458  }
2459  }
2460  __kmpc_end_serialized_parallel(loc, gtid);
2461 
2462 #if OMPT_SUPPORT
2463  if (ompt_enabled.enabled) {
2464  if (fork_context == fork_context_gnu) {
2465  __ompt_lw_taskteam_unlink(master_th);
2466  }
2467  __kmp_join_restore_state(master_th, parent_team);
2468  }
2469 #endif
2470 
2471  return;
2472  }
2473 
2474  master_active = team->t.t_master_active;
2475 
2476  if (!exit_teams) {
2477  // AC: No barrier for internal teams at exit from teams construct.
2478  // But there is barrier for external team (league).
2479  __kmp_internal_join(loc, gtid, team);
2480 #if USE_ITT_BUILD
2481  if (__itt_stack_caller_create_ptr) {
2482  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2483  // destroy the stack stitching id after join barrier
2484  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2485  team->t.t_stack_id = NULL;
2486  }
2487 #endif
2488  } else {
2489  master_th->th.th_task_state =
2490  0; // AC: no tasking in teams (out of any parallel)
2491 #if USE_ITT_BUILD
2492  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2493  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2494  // destroy the stack stitching id on exit from the teams construct
2495  // if parent_team is active, then the id will be destroyed later on
2496  // by master of the league of teams
2497  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2498  parent_team->t.t_stack_id = NULL;
2499  }
2500 #endif
2501  }
2502 
2503  KMP_MB();
2504 
2505 #if OMPT_SUPPORT
2506  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2507  void *codeptr = team->t.ompt_team_info.master_return_address;
2508 #endif
2509 
2510 #if USE_ITT_BUILD
2511  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2512  if (team->t.t_active_level == 1 &&
2513  (!master_th->th.th_teams_microtask || /* not in teams construct */
2514  master_th->th.th_teams_size.nteams == 1)) {
2515  master_th->th.th_ident = loc;
2516  // only one notification scheme (either "submit" or "forking/joined", not
2517  // both)
2518  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2519  __kmp_forkjoin_frames_mode == 3)
2520  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2521  master_th->th.th_frame_time, 0, loc,
2522  master_th->th.th_team_nproc, 1);
2523  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2524  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2525  __kmp_itt_region_joined(gtid);
2526  } // active_level == 1
2527 #endif /* USE_ITT_BUILD */
2528 
2529 #if KMP_AFFINITY_SUPPORTED
2530  if (!exit_teams) {
2531  // Restore master thread's partition.
2532  master_th->th.th_first_place = team->t.t_first_place;
2533  master_th->th.th_last_place = team->t.t_last_place;
2534  }
2535 #endif // KMP_AFFINITY_SUPPORTED
2536 
2537  if (master_th->th.th_teams_microtask && !exit_teams &&
2538  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2539  team->t.t_level == master_th->th.th_teams_level + 1) {
2540 // AC: We need to leave the team structure intact at the end of a parallel
2541 // inside the teams construct, so that the next parallel reuses the same (hot)
2542 // team; only the nesting levels are adjusted.
2543 #if OMPT_SUPPORT
2544  ompt_data_t ompt_parallel_data = ompt_data_none;
2545  if (ompt_enabled.enabled) {
2546  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2547  if (ompt_enabled.ompt_callback_implicit_task) {
2548  int ompt_team_size = team->t.t_nproc;
2549  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2550  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2551  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2552  }
2553  task_info->frame.exit_frame = ompt_data_none;
2554  task_info->task_data = ompt_data_none;
2555  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2556  __ompt_lw_taskteam_unlink(master_th);
2557  }
2558 #endif
2559  /* Decrement our nested depth level */
2560  team->t.t_level--;
2561  team->t.t_active_level--;
2562  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2563 
2564  // Restore number of threads in the team if needed. This code relies on
2565  // the proper adjustment of th_teams_size.nth after the fork in
2566  // __kmp_teams_master on each teams primary thread in the case that
2567  // __kmp_reserve_threads reduced it.
2568  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2569  int old_num = master_th->th.th_team_nproc;
2570  int new_num = master_th->th.th_teams_size.nth;
2571  kmp_info_t **other_threads = team->t.t_threads;
2572  team->t.t_nproc = new_num;
2573  for (int i = 0; i < old_num; ++i) {
2574  other_threads[i]->th.th_team_nproc = new_num;
2575  }
2576  // Adjust states of non-used threads of the team
2577  for (int i = old_num; i < new_num; ++i) {
2578  // Re-initialize thread's barrier data.
2579  KMP_DEBUG_ASSERT(other_threads[i]);
2580  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2581  for (int b = 0; b < bs_last_barrier; ++b) {
2582  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2583  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2584 #if USE_DEBUGGER
2585  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2586 #endif
2587  }
2588  if (__kmp_tasking_mode != tskm_immediate_exec) {
2589  // Synchronize thread's task state
2590  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2591  }
2592  }
2593  }
2594 
2595 #if OMPT_SUPPORT
2596  if (ompt_enabled.enabled) {
2597  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2598  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2599  }
2600 #endif
2601 
2602  return;
2603  }
2604 
2605  /* do cleanup and restore the parent team */
2606  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2607  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2608 
2609  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2610 
2611  /* jc: The following lock has instructions with REL and ACQ semantics,
2612  separating the parallel user code called in this parallel region
2613  from the serial user code called after this function returns. */
2614  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2615 
2616  if (!master_th->th.th_teams_microtask ||
2617  team->t.t_level > master_th->th.th_teams_level) {
2618  /* Decrement our nested depth level */
2619  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2620  }
2621  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2622 
2623 #if OMPT_SUPPORT
2624  if (ompt_enabled.enabled) {
2625  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2626  if (ompt_enabled.ompt_callback_implicit_task) {
2627  int flags = (team_microtask == (void *)__kmp_teams_master)
2628  ? ompt_task_initial
2629  : ompt_task_implicit;
2630  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2631  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2632  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2633  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2634  }
2635  task_info->frame.exit_frame = ompt_data_none;
2636  task_info->task_data = ompt_data_none;
2637  }
2638 #endif
2639 
2640  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2641  master_th, team));
2642  __kmp_pop_current_task_from_thread(master_th);
2643 
2644  master_th->th.th_def_allocator = team->t.t_def_allocator;
2645 
2646 #if OMPD_SUPPORT
2647  if (ompd_state & OMPD_ENABLE_BP)
2648  ompd_bp_parallel_end();
2649 #endif
2650  updateHWFPControl(team);
2651 
2652  if (root->r.r_active != master_active)
2653  root->r.r_active = master_active;
2654 
2655  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2656  master_th)); // this will free worker threads
2657 
2658  /* This race was fun to find. Keep the following inside the critical region;
2659  otherwise assertions may fail occasionally because the old team may be
2660  reallocated and the hierarchy would appear inconsistent. It is actually safe
2661  to run and causes no bugs, only those assertion failures. It is a single
2662  dereference and assignment, so it might as well stay in the critical region. */
2663  master_th->th.th_team = parent_team;
2664  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2665  master_th->th.th_team_master = parent_team->t.t_threads[0];
2666  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2667 
2668  /* restore serialized team, if need be */
2669  if (parent_team->t.t_serialized &&
2670  parent_team != master_th->th.th_serial_team &&
2671  parent_team != root->r.r_root_team) {
2672  __kmp_free_team(root,
2673  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2674  master_th->th.th_serial_team = parent_team;
2675  }
2676 
2677  if (__kmp_tasking_mode != tskm_immediate_exec) {
2678  if (master_th->th.th_task_state_top >
2679  0) { // Restore task state from memo stack
2680  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2681  // Remember primary thread's state if we re-use this nested hot team
2682  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2683  master_th->th.th_task_state;
2684  --master_th->th.th_task_state_top; // pop
2685  // Now restore state at this level
2686  master_th->th.th_task_state =
2687  master_th->th
2688  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2689  } else if (team != root->r.r_hot_team) {
2690  // Reset the primary thread's task state if this is not the hot team: in
2691  // that case all the worker threads will be freed and their task state
2692  // reset. If the primary thread's state were not reset as well, the task
2693  // state would become inconsistent.
2694  master_th->th.th_task_state = 0;
2695  }
2696  // Copy the task team from the parent team to the primary thread
2697  master_th->th.th_task_team =
2698  parent_team->t.t_task_team[master_th->th.th_task_state];
2699  KA_TRACE(20,
2700  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2701  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2702  parent_team));
2703  }
2704 
2705  // TODO: GEH - cannot do this assertion because root thread not set up as
2706  // executing
2707  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2708  master_th->th.th_current_task->td_flags.executing = 1;
2709 
2710  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2711 
2712 #if KMP_AFFINITY_SUPPORTED
2713  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2714  __kmp_reset_root_init_mask(gtid);
2715  }
2716 #endif
2717 #if OMPT_SUPPORT
2718  int flags =
2719  OMPT_INVOKER(fork_context) |
2720  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2721  : ompt_parallel_team);
2722  if (ompt_enabled.enabled) {
2723  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2724  codeptr);
2725  }
2726 #endif
2727 
2728  KMP_MB();
2729  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2730 }
2731 
2732 /* Check whether we should push an internal control record onto the
2733  serial team stack. If so, do it. */
2734 void __kmp_save_internal_controls(kmp_info_t *thread) {
2735 
2736  if (thread->th.th_team != thread->th.th_serial_team) {
2737  return;
2738  }
2739  if (thread->th.th_team->t.t_serialized > 1) {
2740  int push = 0;
2741 
2742  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2743  push = 1;
2744  } else {
2745  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2746  thread->th.th_team->t.t_serialized) {
2747  push = 1;
2748  }
2749  }
2750  if (push) { /* push a record on the serial team's stack */
2751  kmp_internal_control_t *control =
2752  (kmp_internal_control_t *)__kmp_allocate(
2753  sizeof(kmp_internal_control_t));
2754 
2755  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2756 
2757  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2758 
2759  control->next = thread->th.th_team->t.t_control_stack_top;
2760  thread->th.th_team->t.t_control_stack_top = control;
2761  }
2762  }
2763 }
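/* Note: each ICV setter below (__kmp_set_num_threads,
   __kmp_set_max_active_levels, __kmp_set_schedule, ...) calls
   __kmp_save_internal_controls() before writing, so when the caller is running
   inside a serialized nested parallel region the previous ICV values are
   snapshotted, keyed by serial_nesting_level, and can be restored when that
   serialization level unwinds. */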
2764 
2765 /* Changes set_nproc */
2766 void __kmp_set_num_threads(int new_nth, int gtid) {
2767  kmp_info_t *thread;
2768  kmp_root_t *root;
2769 
2770  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2771  KMP_DEBUG_ASSERT(__kmp_init_serial);
2772 
2773  if (new_nth < 1)
2774  new_nth = 1;
2775  else if (new_nth > __kmp_max_nth)
2776  new_nth = __kmp_max_nth;
2777 
2778  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2779  thread = __kmp_threads[gtid];
2780  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2781  return; // nothing to do
2782 
2783  __kmp_save_internal_controls(thread);
2784 
2785  set__nproc(thread, new_nth);
2786 
2787  // If this omp_set_num_threads() call will cause the hot team size to be
2788  // reduced (in the absence of a num_threads clause), then reduce it now,
2789  // rather than waiting for the next parallel region.
2790  root = thread->th.th_root;
2791  if (__kmp_init_parallel && (!root->r.r_active) &&
2792  (root->r.r_hot_team->t.t_nproc > new_nth)
2793 #if KMP_NESTED_HOT_TEAMS
2794  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2795 #endif
2796  ) {
2797  kmp_team_t *hot_team = root->r.r_hot_team;
2798  int f;
2799 
2800  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2801 
2802  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2803  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2804  }
2805  // Release the extra threads we don't need any more.
2806  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2807  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2808  if (__kmp_tasking_mode != tskm_immediate_exec) {
2809  // When decreasing team size, threads no longer in the team should unref
2810  // task team.
2811  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2812  }
2813  __kmp_free_thread(hot_team->t.t_threads[f]);
2814  hot_team->t.t_threads[f] = NULL;
2815  }
2816  hot_team->t.t_nproc = new_nth;
2817 #if KMP_NESTED_HOT_TEAMS
2818  if (thread->th.th_hot_teams) {
2819  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2820  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2821  }
2822 #endif
2823 
2824  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2825  hot_team->t.b->update_num_threads(new_nth);
2826  __kmp_add_threads_to_team(hot_team, new_nth);
2827  }
2828 
2829  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2830 
2831  // Update the t_nproc field in the threads that are still active.
2832  for (f = 0; f < new_nth; f++) {
2833  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2834  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2835  }
2836  // Special flag indicating an omp_set_num_threads() call
2837  hot_team->t.t_size_changed = -1;
2838  }
2839 }
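/* Illustrative usage (the mapping from the user API is an assumption here; the
   OpenMP entry points live elsewhere): a host call such as

     omp_set_num_threads(2);

   ends up in __kmp_set_num_threads(2, gtid). If the hot team currently holds,
   say, 8 threads and no parallel region is active, the loop above releases the
   extra workers immediately rather than waiting for the next fork. */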
2840 
2841 /* Changes max_active_levels */
2842 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2843  kmp_info_t *thread;
2844 
2845  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2846  "%d = (%d)\n",
2847  gtid, max_active_levels));
2848  KMP_DEBUG_ASSERT(__kmp_init_serial);
2849 
2850  // validate max_active_levels
2851  if (max_active_levels < 0) {
2852  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2853  // We ignore this call if the user has specified a negative value.
2854  // The current setting won't be changed. The last valid setting will be
2855  // used. A warning will be issued (if warnings are allowed as controlled by
2856  // the KMP_WARNINGS env var).
2857  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2858  "max_active_levels for thread %d = (%d)\n",
2859  gtid, max_active_levels));
2860  return;
2861  }
2862  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2863  // it's OK, the max_active_levels is within the valid range: [ 0;
2864  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2865  // We allow a zero value. (implementation defined behavior)
2866  } else {
2867  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2868  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2869  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2870  // Current upper limit is MAX_INT. (implementation defined behavior)
2871  // If the input exceeds the upper limit, we correct the input to be the
2872  // upper limit. (implementation defined behavior)
2873  // In practice control should never reach here while the limit is MAX_INT.
2874  }
2875  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2876  "max_active_levels for thread %d = (%d)\n",
2877  gtid, max_active_levels));
2878 
2879  thread = __kmp_threads[gtid];
2880 
2881  __kmp_save_internal_controls(thread);
2882 
2883  set__max_active_levels(thread, max_active_levels);
2884 }
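/* For example (user-level sketch; entry-point mapping assumed):

     omp_set_max_active_levels(-1); // ignored with a warning; last valid kept
     omp_set_max_active_levels(2);  // accepted; ICV updated via the code above

   Values above KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped to that limit. */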
2885 
2886 /* Gets max_active_levels */
2887 int __kmp_get_max_active_levels(int gtid) {
2888  kmp_info_t *thread;
2889 
2890  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2891  KMP_DEBUG_ASSERT(__kmp_init_serial);
2892 
2893  thread = __kmp_threads[gtid];
2894  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2895  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2896  "curtask_maxaclevel=%d\n",
2897  gtid, thread->th.th_current_task,
2898  thread->th.th_current_task->td_icvs.max_active_levels));
2899  return thread->th.th_current_task->td_icvs.max_active_levels;
2900 }
2901 
2902 // nteams-var per-device ICV
2903 void __kmp_set_num_teams(int num_teams) {
2904  if (num_teams > 0)
2905  __kmp_nteams = num_teams;
2906 }
2907 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2908 // teams-thread-limit-var per-device ICV
2909 void __kmp_set_teams_thread_limit(int limit) {
2910  if (limit > 0)
2911  __kmp_teams_thread_limit = limit;
2912 }
2913 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
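// Illustrative note (environment parsing assumed to live in kmp_settings.cpp):
// these two per-device ICVs back the OpenMP 5.1 controls, e.g.
//   OMP_NUM_TEAMS=4          or  omp_set_num_teams(4)
//   OMP_TEAMS_THREAD_LIMIT=8 or  omp_set_teams_thread_limit(8)
// Non-positive arguments are ignored by the setters above.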
2914 
2915 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2916 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2917 
2918 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2919 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2920  kmp_info_t *thread;
2921  kmp_sched_t orig_kind;
2922  // kmp_team_t *team;
2923 
2924  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2925  gtid, (int)kind, chunk));
2926  KMP_DEBUG_ASSERT(__kmp_init_serial);
2927 
2928  // Check if the kind parameter is valid, correct if needed.
2929  // Valid parameters should fit in one of two intervals - standard or extended:
2930  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2931  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2932  orig_kind = kind;
2933  kind = __kmp_sched_without_mods(kind);
2934 
2935  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2936  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2937  // TODO: Hint needs attention in case we change the default schedule.
2938  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2939  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2940  __kmp_msg_null);
2941  kind = kmp_sched_default;
2942  chunk = 0; // ignore chunk value in case of bad kind
2943  }
2944 
2945  thread = __kmp_threads[gtid];
2946 
2947  __kmp_save_internal_controls(thread);
2948 
2949  if (kind < kmp_sched_upper_std) {
2950  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2951  // Differentiate static chunked vs. unchunked: the chunk should be invalid
2952  // to indicate an unchunked schedule (which is the default).
2953  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2954  } else {
2955  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2956  __kmp_sch_map[kind - kmp_sched_lower - 1];
2957  }
2958  } else {
2959  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2960  // kmp_sched_lower - 2 ];
2961  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2962  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2963  kmp_sched_lower - 2];
2964  }
2965  __kmp_sched_apply_mods_intkind(
2966  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2967  if (kind == kmp_sched_auto || chunk < 1) {
2968  // ignore parameter chunk for schedule auto
2969  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2970  } else {
2971  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2972  }
2973 }
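/* For example (user-level sketch; the omp_sched_t -> kmp_sched_t mapping in
   the entry point is assumed):

     omp_set_schedule(omp_sched_dynamic, 4); // dynamic schedule, chunk 4
     omp_set_schedule(omp_sched_static, 0);  // chunk < 1 -> unchunked static
     omp_set_schedule(omp_sched_auto, 7);    // chunk ignored for auto

   matching the kind and chunk validation performed above. */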
2974 
2975 /* Gets def_sched_var ICV values */
2976 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2977  kmp_info_t *thread;
2978  enum sched_type th_type;
2979 
2980  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2981  KMP_DEBUG_ASSERT(__kmp_init_serial);
2982 
2983  thread = __kmp_threads[gtid];
2984 
2985  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2986  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2987  case kmp_sch_static:
2988  case kmp_sch_static_greedy:
2989  case kmp_sch_static_balanced:
2990  *kind = kmp_sched_static;
2991  __kmp_sched_apply_mods_stdkind(kind, th_type);
2992  *chunk = 0; // chunk was not set; indicate that with a zero value
2993  return;
2994  case kmp_sch_static_chunked:
2995  *kind = kmp_sched_static;
2996  break;
2997  case kmp_sch_dynamic_chunked:
2998  *kind = kmp_sched_dynamic;
2999  break;
3000  case kmp_sch_guided_chunked:
3001  case kmp_sch_guided_iterative_chunked:
3002  case kmp_sch_guided_analytical_chunked:
3003  *kind = kmp_sched_guided;
3004  break;
3005  case kmp_sch_auto:
3006  *kind = kmp_sched_auto;
3007  break;
3008  case kmp_sch_trapezoidal:
3009  *kind = kmp_sched_trapezoidal;
3010  break;
3011 #if KMP_STATIC_STEAL_ENABLED
3012  case kmp_sch_static_steal:
3013  *kind = kmp_sched_static_steal;
3014  break;
3015 #endif
3016  default:
3017  KMP_FATAL(UnknownSchedulingType, th_type);
3018  }
3019 
3020  __kmp_sched_apply_mods_stdkind(kind, th_type);
3021  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3022 }
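// Caller-side sketch (an illustration; it assumes the standard entry points
// omp_set_schedule()/omp_get_schedule() are routed to the two routines above,
// as they are in this runtime's C/Fortran entry layer):
//
//   omp_set_schedule(omp_sched_dynamic, 4); // stored in the def_sched_var ICV
//   omp_sched_t k;
//   int c;
//   omp_get_schedule(&k, &c); // k == omp_sched_dynamic, c == 4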
3023 
3024 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3025 
3026  int ii, dd;
3027  kmp_team_t *team;
3028  kmp_info_t *thr;
3029 
3030  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3031  KMP_DEBUG_ASSERT(__kmp_init_serial);
3032 
3033  // validate level
3034  if (level == 0)
3035  return 0;
3036  if (level < 0)
3037  return -1;
3038  thr = __kmp_threads[gtid];
3039  team = thr->th.th_team;
3040  ii = team->t.t_level;
3041  if (level > ii)
3042  return -1;
3043 
3044  if (thr->th.th_teams_microtask) {
3045  // AC: we are in teams region where multiple nested teams have same level
3046  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3047  if (level <=
3048  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3049  KMP_DEBUG_ASSERT(ii >= tlevel);
3050  // AC: As we need to pass by the teams league, we need to artificially
3051  // increase ii
3052  if (ii == tlevel) {
3053  ii += 2; // three teams have same level
3054  } else {
3055  ii++; // two teams have same level
3056  }
3057  }
3058  }
3059 
3060  if (ii == level)
3061  return __kmp_tid_from_gtid(gtid);
3062 
3063  dd = team->t.t_serialized;
3064  level++;
3065  while (ii > level) {
3066  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3067  }
3068  if ((team->t.t_serialized) && (!dd)) {
3069  team = team->t.t_parent;
3070  continue;
3071  }
3072  if (ii > level) {
3073  team = team->t.t_parent;
3074  dd = team->t.t_serialized;
3075  ii--;
3076  }
3077  }
3078 
3079  return (dd > 1) ? (0) : (team->t.t_master_tid);
3080 }
3081 
3082 int __kmp_get_team_size(int gtid, int level) {
3083 
3084  int ii, dd;
3085  kmp_team_t *team;
3086  kmp_info_t *thr;
3087 
3088  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3089  KMP_DEBUG_ASSERT(__kmp_init_serial);
3090 
3091  // validate level
3092  if (level == 0)
3093  return 1;
3094  if (level < 0)
3095  return -1;
3096  thr = __kmp_threads[gtid];
3097  team = thr->th.th_team;
3098  ii = team->t.t_level;
3099  if (level > ii)
3100  return -1;
3101 
3102  if (thr->th.th_teams_microtask) {
3103  // AC: we are in teams region where multiple nested teams have same level
3104  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3105  if (level <=
3106  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3107  KMP_DEBUG_ASSERT(ii >= tlevel);
3108  // AC: As we need to pass by the teams league, we need to artificially
3109  // increase ii
3110  if (ii == tlevel) {
3111  ii += 2; // three teams have same level
3112  } else {
3113  ii++; // two teams have same level
3114  }
3115  }
3116  }
3117 
3118  while (ii > level) {
3119  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3120  }
3121  if (team->t.t_serialized && (!dd)) {
3122  team = team->t.t_parent;
3123  continue;
3124  }
3125  if (ii > level) {
3126  team = team->t.t_parent;
3127  ii--;
3128  }
3129  }
3130 
3131  return team->t.t_nproc;
3132 }
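// Illustration (assumes omp_get_ancestor_thread_num()/omp_get_team_size() are
// backed by the two routines above, and that nested parallelism is enabled):
//
//   #pragma omp parallel num_threads(2) // level 1
//   #pragma omp parallel num_threads(3) // level 2
//
// From the inner region, omp_get_team_size(1) == 2, omp_get_team_size(2) == 3,
// omp_get_ancestor_thread_num(0) == 0, and omp_get_ancestor_thread_num(2) is
// the calling thread's own thread number.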
3133 
3134 kmp_r_sched_t __kmp_get_schedule_global() {
3135  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3136  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3137  // independently, so the updated schedule can be obtained here.
3138 
3139  kmp_r_sched_t r_sched;
3140 
3141  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3142  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3143  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3144  // different roots (even in OMP 2.5)
3145  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3146  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3147  if (s == kmp_sch_static) {
3148  // replace STATIC with more detailed schedule (balanced or greedy)
3149  r_sched.r_sched_type = __kmp_static;
3150  } else if (s == kmp_sch_guided_chunked) {
3151  // replace GUIDED with more detailed schedule (iterative or analytical)
3152  r_sched.r_sched_type = __kmp_guided;
3153  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3154  r_sched.r_sched_type = __kmp_sched;
3155  }
3156  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3157 
3158  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3159  // __kmp_chunk may be wrong here (if it was never set)
3160  r_sched.chunk = KMP_DEFAULT_CHUNK;
3161  } else {
3162  r_sched.chunk = __kmp_chunk;
3163  }
3164 
3165  return r_sched;
3166 }
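// Worked example of the interaction described above (illustrative): if
// __kmp_sched is kmp_sch_static with no modifiers and kmp_set_defaults() has
// switched __kmp_static to kmp_sch_static_greedy, this routine returns
// r_sched_type == kmp_sch_static_greedy; with __kmp_chunk never set
// (< KMP_DEFAULT_CHUNK) the returned chunk is KMP_DEFAULT_CHUNK.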
3167 
3168 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3169  at least argc *t_argv entries for the requested team. */
3170 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3171 
3172  KMP_DEBUG_ASSERT(team);
3173  if (!realloc || argc > team->t.t_max_argc) {
3174 
3175  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3176  "current entries=%d\n",
3177  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3178  /* if previously allocated heap space for args, free them */
3179  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3180  __kmp_free((void *)team->t.t_argv);
3181 
3182  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3183  /* use unused space in the cache line for arguments */
3184  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3185  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3186  "argv entries\n",
3187  team->t.t_id, team->t.t_max_argc));
3188  team->t.t_argv = &team->t.t_inline_argv[0];
3189  if (__kmp_storage_map) {
3190  __kmp_print_storage_map_gtid(
3191  -1, &team->t.t_inline_argv[0],
3192  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3193  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3194  team->t.t_id);
3195  }
3196  } else {
3197  /* allocate space for arguments in the heap */
3198  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3199  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3200  : 2 * argc;
3201  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3202  "argv entries\n",
3203  team->t.t_id, team->t.t_max_argc));
3204  team->t.t_argv =
3205  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3206  if (__kmp_storage_map) {
3207  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3208  &team->t.t_argv[team->t.t_max_argc],
3209  sizeof(void *) * team->t.t_max_argc,
3210  "team_%d.t_argv", team->t.t_id);
3211  }
3212  }
3213  }
3214 }
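// Sizing sketch (derived from the code above): requests of argc up to
// KMP_INLINE_ARGV_ENTRIES reuse the cache-line space in t_inline_argv; larger
// requests page-allocate 2 * argc pointers, but never fewer than
// KMP_MIN_MALLOC_ARGV_ENTRIES, so a team growing its argv a little at a time
// does not reallocate on every call.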
3215 
3216 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3217  int i;
3218  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3219  team->t.t_threads =
3220  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3221  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3222  sizeof(dispatch_shared_info_t) * num_disp_buff);
3223  team->t.t_dispatch =
3224  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3225  team->t.t_implicit_task_taskdata =
3226  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3227  team->t.t_max_nproc = max_nth;
3228 
3229  /* setup dispatch buffers */
3230  for (i = 0; i < num_disp_buff; ++i) {
3231  team->t.t_disp_buffer[i].buffer_index = i;
3232  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3233  }
3234 }
3235 
3236 static void __kmp_free_team_arrays(kmp_team_t *team) {
3237  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3238  int i;
3239  for (i = 0; i < team->t.t_max_nproc; ++i) {
3240  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3241  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3242  team->t.t_dispatch[i].th_disp_buffer = NULL;
3243  }
3244  }
3245 #if KMP_USE_HIER_SCHED
3246  __kmp_dispatch_free_hierarchies(team);
3247 #endif
3248  __kmp_free(team->t.t_threads);
3249  __kmp_free(team->t.t_disp_buffer);
3250  __kmp_free(team->t.t_dispatch);
3251  __kmp_free(team->t.t_implicit_task_taskdata);
3252  team->t.t_threads = NULL;
3253  team->t.t_disp_buffer = NULL;
3254  team->t.t_dispatch = NULL;
3255  team->t.t_implicit_task_taskdata = 0;
3256 }
3257 
3258 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3259  kmp_info_t **oldThreads = team->t.t_threads;
3260 
3261  __kmp_free(team->t.t_disp_buffer);
3262  __kmp_free(team->t.t_dispatch);
3263  __kmp_free(team->t.t_implicit_task_taskdata);
3264  __kmp_allocate_team_arrays(team, max_nth);
3265 
3266  KMP_MEMCPY(team->t.t_threads, oldThreads,
3267  team->t.t_nproc * sizeof(kmp_info_t *));
3268 
3269  __kmp_free(oldThreads);
3270 }
3271 
3272 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3273 
3274  kmp_r_sched_t r_sched =
3275  __kmp_get_schedule_global(); // get current state of scheduling globals
3276 
3277  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3278 
3279  kmp_internal_control_t g_icvs = {
3280  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3281  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3282  // adjustment of threads (per thread)
3283  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3284  // whether blocktime is explicitly set
3285  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3286 #if KMP_USE_MONITOR
3287  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3288 // intervals
3289 #endif
3290  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3291  // next parallel region (per thread)
3292  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3293  __kmp_cg_max_nth, // int thread_limit;
3294  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3295  // for max_active_levels
3296  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3297  // {sched,chunk} pair
3298  __kmp_nested_proc_bind.bind_types[0],
3299  __kmp_default_device,
3300  NULL // struct kmp_internal_control *next;
3301  };
3302 
3303  return g_icvs;
3304 }
3305 
3306 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3307 
3308  kmp_internal_control_t gx_icvs;
3309  gx_icvs.serial_nesting_level =
3310  0; // probably =team->t.t_serialized, like in __kmp_save_internal_controls
3311  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3312  gx_icvs.next = NULL;
3313 
3314  return gx_icvs;
3315 }
3316 
3317 static void __kmp_initialize_root(kmp_root_t *root) {
3318  int f;
3319  kmp_team_t *root_team;
3320  kmp_team_t *hot_team;
3321  int hot_team_max_nth;
3322  kmp_r_sched_t r_sched =
3323  __kmp_get_schedule_global(); // get current state of scheduling globals
3324  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3325  KMP_DEBUG_ASSERT(root);
3326  KMP_ASSERT(!root->r.r_begin);
3327 
3328  /* setup the root state structure */
3329  __kmp_init_lock(&root->r.r_begin_lock);
3330  root->r.r_begin = FALSE;
3331  root->r.r_active = FALSE;
3332  root->r.r_in_parallel = 0;
3333  root->r.r_blocktime = __kmp_dflt_blocktime;
3334 #if KMP_AFFINITY_SUPPORTED
3335  root->r.r_affinity_assigned = FALSE;
3336 #endif
3337 
3338  /* setup the root team for this task */
3339  /* allocate the root team structure */
3340  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3341 
3342  root_team =
3343  __kmp_allocate_team(root,
3344  1, // new_nproc
3345  1, // max_nproc
3346 #if OMPT_SUPPORT
3347  ompt_data_none, // root parallel id
3348 #endif
3349  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3350  0 // argc
3351  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3352  );
3353 #if USE_DEBUGGER
3354  // Non-NULL value should be assigned to make the debugger display the root
3355  // team.
3356  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3357 #endif
3358 
3359  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3360 
3361  root->r.r_root_team = root_team;
3362  root_team->t.t_control_stack_top = NULL;
3363 
3364  /* initialize root team */
3365  root_team->t.t_threads[0] = NULL;
3366  root_team->t.t_nproc = 1;
3367  root_team->t.t_serialized = 1;
3368  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3369  root_team->t.t_sched.sched = r_sched.sched;
3370  KA_TRACE(
3371  20,
3372  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3373  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3374 
3375  /* setup the hot team for this task */
3376  /* allocate the hot team structure */
3377  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3378 
3379  hot_team =
3380  __kmp_allocate_team(root,
3381  1, // new_nproc
3382  __kmp_dflt_team_nth_ub * 2, // max_nproc
3383 #if OMPT_SUPPORT
3384  ompt_data_none, // root parallel id
3385 #endif
3386  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3387  0 // argc
3388  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3389  );
3390  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3391 
3392  root->r.r_hot_team = hot_team;
3393  root_team->t.t_control_stack_top = NULL;
3394 
3395  /* first-time initialization */
3396  hot_team->t.t_parent = root_team;
3397 
3398  /* initialize hot team */
3399  hot_team_max_nth = hot_team->t.t_max_nproc;
3400  for (f = 0; f < hot_team_max_nth; ++f) {
3401  hot_team->t.t_threads[f] = NULL;
3402  }
3403  hot_team->t.t_nproc = 1;
3404  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3405  hot_team->t.t_sched.sched = r_sched.sched;
3406  hot_team->t.t_size_changed = 0;
3407 }
3408 
3409 #ifdef KMP_DEBUG
3410 
3411 typedef struct kmp_team_list_item {
3412  kmp_team_p const *entry;
3413  struct kmp_team_list_item *next;
3414 } kmp_team_list_item_t;
3415 typedef kmp_team_list_item_t *kmp_team_list_t;
3416 
3417 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3418  kmp_team_list_t list, // List of teams.
3419  kmp_team_p const *team // Team to add.
3420 ) {
3421 
3422  // List must terminate with item where both entry and next are NULL.
3423  // Team is added to the list only once.
3424  // List is sorted in ascending order by team id.
3425  // Team id is *not* a key.
3426 
3427  kmp_team_list_t l;
3428 
3429  KMP_DEBUG_ASSERT(list != NULL);
3430  if (team == NULL) {
3431  return;
3432  }
3433 
3434  __kmp_print_structure_team_accum(list, team->t.t_parent);
3435  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3436 
3437  // Search list for the team.
3438  l = list;
3439  while (l->next != NULL && l->entry != team) {
3440  l = l->next;
3441  }
3442  if (l->next != NULL) {
3443  return; // Team has been added before, exit.
3444  }
3445 
3446  // Team is not found. Search list again for insertion point.
3447  l = list;
3448  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3449  l = l->next;
3450  }
3451 
3452  // Insert team.
3453  {
3454  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3455  sizeof(kmp_team_list_item_t));
3456  *item = *l;
3457  l->entry = team;
3458  l->next = item;
3459  }
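  // The three assignments above insert *before* node `l` without tracking a
  // predecessor: `item` receives a copy of `l`, then `l` itself is overwritten
  // with the new team and pointed at `item`. E.g. inserting team 5 into
  // 3 -> 7 -> (NULL,NULL) at l == (7) yields 3 -> 5 -> 7 -> (NULL,NULL); the
  // (NULL,NULL) sentinel keeps the same code correct when appending at the end.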
3460 }
3461 
3462 static void __kmp_print_structure_team(char const *title,
3463  kmp_team_p const *team) {
3464 
3465  __kmp_printf("%s", title);
3466  if (team != NULL) {
3467  __kmp_printf("%2x %p\n", team->t.t_id, team);
3468  } else {
3469  __kmp_printf(" - (nil)\n");
3470  }
3471 }
3472 
3473 static void __kmp_print_structure_thread(char const *title,
3474  kmp_info_p const *thread) {
3475  __kmp_printf("%s", title);
3476  if (thread != NULL) {
3477  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3478  } else {
3479  __kmp_printf(" - (nil)\n");
3480  }
3481 }
3482 
3483 void __kmp_print_structure(void) {
3484 
3485  kmp_team_list_t list;
3486 
3487  // Initialize list of teams.
3488  list =
3489  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3490  list->entry = NULL;
3491  list->next = NULL;
3492 
3493  __kmp_printf("\n------------------------------\nGlobal Thread "
3494  "Table\n------------------------------\n");
3495  {
3496  int gtid;
3497  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3498  __kmp_printf("%2d", gtid);
3499  if (__kmp_threads != NULL) {
3500  __kmp_printf(" %p", __kmp_threads[gtid]);
3501  }
3502  if (__kmp_root != NULL) {
3503  __kmp_printf(" %p", __kmp_root[gtid]);
3504  }
3505  __kmp_printf("\n");
3506  }
3507  }
3508 
3509  // Print out __kmp_threads array.
3510  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3511  "----------\n");
3512  if (__kmp_threads != NULL) {
3513  int gtid;
3514  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3515  kmp_info_t const *thread = __kmp_threads[gtid];
3516  if (thread != NULL) {
3517  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3518  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3519  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3520  __kmp_print_structure_team(" Serial Team: ",
3521  thread->th.th_serial_team);
3522  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3523  __kmp_print_structure_thread(" Primary: ",
3524  thread->th.th_team_master);
3525  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3526  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3527  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3528  __kmp_print_structure_thread(" Next in pool: ",
3529  thread->th.th_next_pool);
3530  __kmp_printf("\n");
3531  __kmp_print_structure_team_accum(list, thread->th.th_team);
3532  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3533  }
3534  }
3535  } else {
3536  __kmp_printf("Threads array is not allocated.\n");
3537  }
3538 
3539  // Print out __kmp_root array.
3540  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3541  "--------\n");
3542  if (__kmp_root != NULL) {
3543  int gtid;
3544  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3545  kmp_root_t const *root = __kmp_root[gtid];
3546  if (root != NULL) {
3547  __kmp_printf("GTID %2d %p:\n", gtid, root);
3548  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3549  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3550  __kmp_print_structure_thread(" Uber Thread: ",
3551  root->r.r_uber_thread);
3552  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3553  __kmp_printf(" In Parallel: %2d\n",
3554  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3555  __kmp_printf("\n");
3556  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3557  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3558  }
3559  }
3560  } else {
3561  __kmp_printf("Ubers array is not allocated.\n");
3562  }
3563 
3564  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3565  "--------\n");
3566  while (list->next != NULL) {
3567  kmp_team_p const *team = list->entry;
3568  int i;
3569  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3570  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3571  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3572  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3573  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3574  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3575  for (i = 0; i < team->t.t_nproc; ++i) {
3576  __kmp_printf(" Thread %2d: ", i);
3577  __kmp_print_structure_thread("", team->t.t_threads[i]);
3578  }
3579  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3580  __kmp_printf("\n");
3581  list = list->next;
3582  }
3583 
3584  // Print out __kmp_thread_pool and __kmp_team_pool.
3585  __kmp_printf("\n------------------------------\nPools\n----------------------"
3586  "--------\n");
3587  __kmp_print_structure_thread("Thread pool: ",
3588  CCAST(kmp_info_t *, __kmp_thread_pool));
3589  __kmp_print_structure_team("Team pool: ",
3590  CCAST(kmp_team_t *, __kmp_team_pool));
3591  __kmp_printf("\n");
3592 
3593  // Free team list.
3594  while (list != NULL) {
3595  kmp_team_list_item_t *item = list;
3596  list = list->next;
3597  KMP_INTERNAL_FREE(item);
3598  }
3599 }
3600 
3601 #endif
3602 
3603 //---------------------------------------------------------------------------
3604 // Stuff for per-thread fast random number generator
3605 // Table of primes
3606 static const unsigned __kmp_primes[] = {
3607  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3608  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3609  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3610  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3611  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3612  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3613  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3614  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3615  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3616  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3617  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3618 
3619 //---------------------------------------------------------------------------
3620 // __kmp_get_random: Get a random number using a linear congruential method.
3621 unsigned short __kmp_get_random(kmp_info_t *thread) {
3622  unsigned x = thread->th.th_x;
3623  unsigned short r = (unsigned short)(x >> 16);
3624 
3625  thread->th.th_x = x * thread->th.th_a + 1;
3626 
3627  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3628  thread->th.th_info.ds.ds_tid, r));
3629 
3630  return r;
3631 }
3632 //--------------------------------------------------------
3633 // __kmp_init_random: Initialize a random number generator
3634 void __kmp_init_random(kmp_info_t *thread) {
3635  unsigned seed = thread->th.th_info.ds.ds_tid;
3636 
3637  thread->th.th_a =
3638  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3639  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3640  KA_TRACE(30,
3641  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3642 }
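// Generator sketch: each thread runs the linear congruential recurrence
//   x_{n+1} = a * x_n + 1   (mod 2^32, via unsigned wraparound)
// with a per-thread multiplier `a` taken from __kmp_primes[] above, and
// __kmp_get_random() reports the high 16 bits of x, which are the better-mixed
// bits of a power-of-two-modulus LCG. Different tids generally pick different
// (a, x0) pairs, giving cheap per-thread streams; this is not a cryptographic
// or statistically rigorous generator.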
3643 
3644 #if KMP_OS_WINDOWS
3645 /* reclaim array entries for root threads that are already dead, returns number
3646  * reclaimed */
3647 static int __kmp_reclaim_dead_roots(void) {
3648  int i, r = 0;
3649 
3650  for (i = 0; i < __kmp_threads_capacity; ++i) {
3651  if (KMP_UBER_GTID(i) &&
3652  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3653  !__kmp_root[i]
3654  ->r.r_active) { // AC: reclaim only roots died in non-active state
3655  r += __kmp_unregister_root_other_thread(i);
3656  }
3657  }
3658  return r;
3659 }
3660 #endif
3661 
3662 /* This function attempts to create free entries in __kmp_threads and
3663  __kmp_root, and returns the number of free entries generated.
3664 
3665  For Windows* OS static library, the first mechanism used is to reclaim array
3666  entries for root threads that are already dead.
3667 
3668  On all platforms, expansion is attempted on the arrays __kmp_threads and
3669  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3670  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3671  threadprivate cache array has been created. Synchronization with
3672  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3673 
3674  After any dead root reclamation, if the clipping value allows array expansion
3675  to result in the generation of a total of nNeed free slots, the function does
3676  that expansion. If not, nothing is done beyond the possible initial root
3677  thread reclamation.
3678 
3679  If any argument is negative, the behavior is undefined. */
3680 static int __kmp_expand_threads(int nNeed) {
3681  int added = 0;
3682  int minimumRequiredCapacity;
3683  int newCapacity;
3684  kmp_info_t **newThreads;
3685  kmp_root_t **newRoot;
3686 
3687  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3688  // resizing __kmp_threads does not need additional protection if foreign
3689  // threads are present
3690 
3691 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3692  /* only for Windows static library */
3693  /* reclaim array entries for root threads that are already dead */
3694  added = __kmp_reclaim_dead_roots();
3695 
3696  if (nNeed) {
3697  nNeed -= added;
3698  if (nNeed < 0)
3699  nNeed = 0;
3700  }
3701 #endif
3702  if (nNeed <= 0)
3703  return added;
3704 
3705  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3706  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3707  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3708  // > __kmp_max_nth in one of two ways:
3709  //
3710  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3711  // may not be reused by another thread, so we may need to increase
3712  // __kmp_threads_capacity to __kmp_max_nth + 1.
3713  //
3714  // 2) New foreign root(s) are encountered. We always register new foreign
3715  // roots. This may cause a smaller # of threads to be allocated at
3716  // subsequent parallel regions, but the worker threads hang around (and
3717  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3718  //
3719  // Anyway, that is the reason for moving the check to see if
3720  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3721  // instead of having it performed here. -BB
3722 
3723  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3724 
3725  /* compute expansion headroom to check if we can expand */
3726  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3727  /* possible expansion too small -- give up */
3728  return added;
3729  }
3730  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3731 
3732  newCapacity = __kmp_threads_capacity;
3733  do {
3734  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3735  : __kmp_sys_max_nth;
3736  } while (newCapacity < minimumRequiredCapacity);
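  // e.g. (illustrative numbers): with __kmp_threads_capacity == 64, nNeed == 70
  // and __kmp_sys_max_nth == 1024, the loop doubles 64 -> 128 -> 256 and stops
  // once minimumRequiredCapacity == 134 is covered; with __kmp_sys_max_nth ==
  // 200 the second step is clipped, giving 128 -> 200 instead.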
3737  newThreads = (kmp_info_t **)__kmp_allocate(
3738  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3739  newRoot =
3740  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3741  KMP_MEMCPY(newThreads, __kmp_threads,
3742  __kmp_threads_capacity * sizeof(kmp_info_t *));
3743  KMP_MEMCPY(newRoot, __kmp_root,
3744  __kmp_threads_capacity * sizeof(kmp_root_t *));
3745  // Put the old __kmp_threads array on a list. Any ongoing references to the
3746  // old array remain valid. This list is cleaned up at library shutdown.
3747  kmp_old_threads_list_t *node =
3748  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3749  node->threads = __kmp_threads;
3750  node->next = __kmp_old_threads_list;
3751  __kmp_old_threads_list = node;
3752 
3753  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3754  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3755  added += newCapacity - __kmp_threads_capacity;
3756  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3757 
3758  if (newCapacity > __kmp_tp_capacity) {
3759  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3760  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3761  __kmp_threadprivate_resize_cache(newCapacity);
3762  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3763  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3764  }
3765  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3766  }
3767 
3768  return added;
3769 }
3770 
3771 /* Register the current thread as a root thread and obtain our gtid. We must
3772  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3773  thread that calls from __kmp_do_serial_initialize() */
3774 int __kmp_register_root(int initial_thread) {
3775  kmp_info_t *root_thread;
3776  kmp_root_t *root;
3777  int gtid;
3778  int capacity;
3779  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3780  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3781  KMP_MB();
3782 
3783  /* 2007-03-02:
3784  If the initial thread has not invoked the OpenMP RTL yet, and this thread
3785  is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" check
3786  does not work as expected -- it may return false (meaning there is at least
3787  one empty slot in the __kmp_threads array), but it is possible that the only
3788  free slot is #0, which is reserved for the initial thread and so cannot be
3789  used for this one. The following code works around this bug.
3790 
3791  However, the right solution seems to be not reserving slot #0 for the
3792  initial thread, because:
3793  (1) there is no magic in slot #0,
3794  (2) we cannot reliably detect the initial thread (the first thread that does
3795  serial initialization may not be a real initial thread).
3796  */
3797  capacity = __kmp_threads_capacity;
3798  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3799  --capacity;
3800  }
3801 
3802  // If it is not for initializing the hidden helper team, we need to take
3803  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3804  // in __kmp_threads_capacity.
3805  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3806  capacity -= __kmp_hidden_helper_threads_num;
3807  }
3808 
3809  /* see if there are too many threads */
3810  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3811  if (__kmp_tp_cached) {
3812  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3813  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3814  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3815  } else {
3816  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3817  __kmp_msg_null);
3818  }
3819  }
3820 
3821  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3822  // 0: initial thread, also a regular OpenMP thread.
3823  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3824  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3825  // regular OpenMP threads.
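  // For instance, assuming the default of 8 hidden helper threads: gtid 0 is
  // the initial thread, gtids 1..8 are reserved for hidden helpers, and the
  // search for a regular root below starts at __kmp_hidden_helper_threads_num
  // + 1 == 9.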
3826  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3827  // Find an available thread slot for hidden helper thread. Slots for hidden
3828  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3829  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3830  gtid <= __kmp_hidden_helper_threads_num;
3831  gtid++)
3832  ;
3833  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3834  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3835  "hidden helper thread: T#%d\n",
3836  gtid));
3837  } else {
3838  /* find an available thread slot */
3839  // Don't reassign the zero slot since we need that to only be used by
3840  // initial thread. Slots for hidden helper threads should also be skipped.
3841  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3842  gtid = 0;
3843  } else {
3844  for (gtid = __kmp_hidden_helper_threads_num + 1;
3845  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3846  ;
3847  }
3848  KA_TRACE(
3849  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3850  KMP_ASSERT(gtid < __kmp_threads_capacity);
3851  }
3852 
3853  /* update global accounting */
3854  __kmp_all_nth++;
3855  TCW_4(__kmp_nth, __kmp_nth + 1);
3856 
3857  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3858  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3859  if (__kmp_adjust_gtid_mode) {
3860  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3861  if (TCR_4(__kmp_gtid_mode) != 2) {
3862  TCW_4(__kmp_gtid_mode, 2);
3863  }
3864  } else {
3865  if (TCR_4(__kmp_gtid_mode) != 1) {
3866  TCW_4(__kmp_gtid_mode, 1);
3867  }
3868  }
3869  }
3870 
3871 #ifdef KMP_ADJUST_BLOCKTIME
3872  /* Adjust blocktime to zero if necessary */
3873  /* Middle initialization might not have occurred yet */
3874  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3875  if (__kmp_nth > __kmp_avail_proc) {
3876  __kmp_zero_bt = TRUE;
3877  }
3878  }
3879 #endif /* KMP_ADJUST_BLOCKTIME */
3880 
3881  /* setup this new hierarchy */
3882  if (!(root = __kmp_root[gtid])) {
3883  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3884  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3885  }
3886 
3887 #if KMP_STATS_ENABLED
3888  // Initialize stats as soon as possible (right after gtid assignment).
3889  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3890  __kmp_stats_thread_ptr->startLife();
3891  KMP_SET_THREAD_STATE(SERIAL_REGION);
3892  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3893 #endif
3894  __kmp_initialize_root(root);
3895 
3896  /* setup new root thread structure */
3897  if (root->r.r_uber_thread) {
3898  root_thread = root->r.r_uber_thread;
3899  } else {
3900  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3901  if (__kmp_storage_map) {
3902  __kmp_print_thread_storage_map(root_thread, gtid);
3903  }
3904  root_thread->th.th_info.ds.ds_gtid = gtid;
3905 #if OMPT_SUPPORT
3906  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3907 #endif
3908  root_thread->th.th_root = root;
3909  if (__kmp_env_consistency_check) {
3910  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3911  }
3912 #if USE_FAST_MEMORY
3913  __kmp_initialize_fast_memory(root_thread);
3914 #endif /* USE_FAST_MEMORY */
3915 
3916 #if KMP_USE_BGET
3917  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3918  __kmp_initialize_bget(root_thread);
3919 #endif
3920  __kmp_init_random(root_thread); // Initialize random number generator
3921  }
3922 
3923  /* setup the serial team held in reserve by the root thread */
3924  if (!root_thread->th.th_serial_team) {
3925  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3926  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3927  root_thread->th.th_serial_team = __kmp_allocate_team(
3928  root, 1, 1,
3929 #if OMPT_SUPPORT
3930  ompt_data_none, // root parallel id
3931 #endif
3932  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3933  }
3934  KMP_ASSERT(root_thread->th.th_serial_team);
3935  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3936  root_thread->th.th_serial_team));
3937 
3938  /* drop root_thread into place */
3939  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3940 
3941  root->r.r_root_team->t.t_threads[0] = root_thread;
3942  root->r.r_hot_team->t.t_threads[0] = root_thread;
3943  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3944  // AC: the team is created in reserve, not for execution (it is unused for now).
3945  root_thread->th.th_serial_team->t.t_serialized = 0;
3946  root->r.r_uber_thread = root_thread;
3947 
3948  /* initialize the thread, get it ready to go */
3949  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3950  TCW_4(__kmp_init_gtid, TRUE);
3951 
3952  /* prepare the primary thread for get_gtid() */
3953  __kmp_gtid_set_specific(gtid);
3954 
3955 #if USE_ITT_BUILD
3956  __kmp_itt_thread_name(gtid);
3957 #endif /* USE_ITT_BUILD */
3958 
3959 #ifdef KMP_TDATA_GTID
3960  __kmp_gtid = gtid;
3961 #endif
3962  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3963  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3964 
3965  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3966  "plain=%u\n",
3967  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3968  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3969  KMP_INIT_BARRIER_STATE));
3970  { // Initialize barrier data.
3971  int b;
3972  for (b = 0; b < bs_last_barrier; ++b) {
3973  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3974 #if USE_DEBUGGER
3975  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3976 #endif
3977  }
3978  }
3979  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3980  KMP_INIT_BARRIER_STATE);
3981 
3982 #if KMP_AFFINITY_SUPPORTED
3983  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3984  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3985  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3986  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3987 #endif /* KMP_AFFINITY_SUPPORTED */
3988  root_thread->th.th_def_allocator = __kmp_def_allocator;
3989  root_thread->th.th_prev_level = 0;
3990  root_thread->th.th_prev_num_threads = 1;
3991 
3992  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3993  tmp->cg_root = root_thread;
3994  tmp->cg_thread_limit = __kmp_cg_max_nth;
3995  tmp->cg_nthreads = 1;
3996  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3997  " cg_nthreads init to 1\n",
3998  root_thread, tmp));
3999  tmp->up = NULL;
4000  root_thread->th.th_cg_roots = tmp;
4001 
4002  __kmp_root_counter++;
4003 
4004 #if OMPT_SUPPORT
4005  if (!initial_thread && ompt_enabled.enabled) {
4006 
4007  kmp_info_t *root_thread = ompt_get_thread();
4008 
4009  ompt_set_thread_state(root_thread, ompt_state_overhead);
4010 
4011  if (ompt_enabled.ompt_callback_thread_begin) {
4012  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4013  ompt_thread_initial, __ompt_get_thread_data_internal());
4014  }
4015  ompt_data_t *task_data;
4016  ompt_data_t *parallel_data;
4017  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4018  NULL);
4019  if (ompt_enabled.ompt_callback_implicit_task) {
4020  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4021  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4022  }
4023 
4024  ompt_set_thread_state(root_thread, ompt_state_work_serial);
4025  }
4026 #endif
4027 #if OMPD_SUPPORT
4028  if (ompd_state & OMPD_ENABLE_BP)
4029  ompd_bp_thread_begin();
4030 #endif
4031 
4032  KMP_MB();
4033  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4034 
4035  return gtid;
4036 }
4037 
4038 #if KMP_NESTED_HOT_TEAMS
4039 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4040  const int max_level) {
4041  int i, n, nth;
4042  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4043  if (!hot_teams || !hot_teams[level].hot_team) {
4044  return 0;
4045  }
4046  KMP_DEBUG_ASSERT(level < max_level);
4047  kmp_team_t *team = hot_teams[level].hot_team;
4048  nth = hot_teams[level].hot_team_nth;
4049  n = nth - 1; // primary thread is not freed
4050  if (level < max_level - 1) {
4051  for (i = 0; i < nth; ++i) {
4052  kmp_info_t *th = team->t.t_threads[i];
4053  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4054  if (i > 0 && th->th.th_hot_teams) {
4055  __kmp_free(th->th.th_hot_teams);
4056  th->th.th_hot_teams = NULL;
4057  }
4058  }
4059  }
4060  __kmp_free_team(root, team, NULL);
4061  return n;
4062 }
4063 #endif
4064 
4065 // Resets a root thread and clears its root and hot teams.
4066 // Returns the number of __kmp_threads entries directly and indirectly freed.
4067 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4068  kmp_team_t *root_team = root->r.r_root_team;
4069  kmp_team_t *hot_team = root->r.r_hot_team;
4070  int n = hot_team->t.t_nproc;
4071  int i;
4072 
4073  KMP_DEBUG_ASSERT(!root->r.r_active);
4074 
4075  root->r.r_root_team = NULL;
4076  root->r.r_hot_team = NULL;
4077  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4078  // before call to __kmp_free_team().
4079  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4080 #if KMP_NESTED_HOT_TEAMS
4081  if (__kmp_hot_teams_max_level >
4082  0) { // need to free nested hot teams and their threads if any
4083  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4084  kmp_info_t *th = hot_team->t.t_threads[i];
4085  if (__kmp_hot_teams_max_level > 1) {
4086  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4087  }
4088  if (th->th.th_hot_teams) {
4089  __kmp_free(th->th.th_hot_teams);
4090  th->th.th_hot_teams = NULL;
4091  }
4092  }
4093  }
4094 #endif
4095  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4096 
4097  // Before we can reap the thread, we need to make certain that all other
4098  // threads in the teams that had this root as ancestor have stopped trying to
4099  // steal tasks.
4100  if (__kmp_tasking_mode != tskm_immediate_exec) {
4101  __kmp_wait_to_unref_task_teams();
4102  }
4103 
4104 #if KMP_OS_WINDOWS
4105  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4106  KA_TRACE(
4107  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4108  "\n",
4109  (LPVOID) & (root->r.r_uber_thread->th),
4110  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4111  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4112 #endif /* KMP_OS_WINDOWS */
4113 
4114 #if OMPD_SUPPORT
4115  if (ompd_state & OMPD_ENABLE_BP)
4116  ompd_bp_thread_end();
4117 #endif
4118 
4119 #if OMPT_SUPPORT
4120  ompt_data_t *task_data;
4121  ompt_data_t *parallel_data;
4122  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4123  NULL);
4124  if (ompt_enabled.ompt_callback_implicit_task) {
4125  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4126  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4127  }
4128  if (ompt_enabled.ompt_callback_thread_end) {
4129  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4130  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4131  }
4132 #endif
4133 
4134  TCW_4(__kmp_nth,
4135  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4136  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4137  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4138  " to %d\n",
4139  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4140  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4141  if (i == 1) {
4142  // need to free contention group structure
4143  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4144  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4145  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4146  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4147  root->r.r_uber_thread->th.th_cg_roots = NULL;
4148  }
4149  __kmp_reap_thread(root->r.r_uber_thread, 1);
4150 
4151  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4152  // it instead of freeing it.
4153  root->r.r_uber_thread = NULL;
4154  /* mark root as no longer in use */
4155  root->r.r_begin = FALSE;
4156 
4157  return n;
4158 }
4159 
4160 void __kmp_unregister_root_current_thread(int gtid) {
4161  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4162  /* this lock should be ok, since unregister_root_current_thread is never
4163  called during an abort, only during a normal close. furthermore, if you
4164  have the forkjoin lock, you should never try to get the initz lock */
4165  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4166  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4167  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4168  "exiting T#%d\n",
4169  gtid));
4170  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4171  return;
4172  }
4173  kmp_root_t *root = __kmp_root[gtid];
4174 
4175  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4176  KMP_ASSERT(KMP_UBER_GTID(gtid));
4177  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4178  KMP_ASSERT(root->r.r_active == FALSE);
4179 
4180  KMP_MB();
4181 
4182  kmp_info_t *thread = __kmp_threads[gtid];
4183  kmp_team_t *team = thread->th.th_team;
4184  kmp_task_team_t *task_team = thread->th.th_task_team;
4185 
4186  // we need to wait for the proxy tasks before finishing the thread
4187  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4188  task_team->tt.tt_hidden_helper_task_encountered)) {
4189 #if OMPT_SUPPORT
4190  // the runtime is shutting down so we won't report any events
4191  thread->th.ompt_thread_info.state = ompt_state_undefined;
4192 #endif
4193  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4194  }
4195 
4196  __kmp_reset_root(gtid, root);
4197 
4198  KMP_MB();
4199  KC_TRACE(10,
4200  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4201 
4202  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4203 }
4204 
4205 #if KMP_OS_WINDOWS
4206 /* __kmp_forkjoin_lock must be already held
4207  Unregisters a root thread that is not the current thread. Returns the number
4208  of __kmp_threads entries freed as a result. */
4209 static int __kmp_unregister_root_other_thread(int gtid) {
4210  kmp_root_t *root = __kmp_root[gtid];
4211  int r;
4212 
4213  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4214  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4215  KMP_ASSERT(KMP_UBER_GTID(gtid));
4216  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4217  KMP_ASSERT(root->r.r_active == FALSE);
4218 
4219  r = __kmp_reset_root(gtid, root);
4220  KC_TRACE(10,
4221  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4222  return r;
4223 }
4224 #endif
4225 
4226 #if KMP_DEBUG
4227 void __kmp_task_info() {
4228 
4229  kmp_int32 gtid = __kmp_entry_gtid();
4230  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4231  kmp_info_t *this_thr = __kmp_threads[gtid];
4232  kmp_team_t *steam = this_thr->th.th_serial_team;
4233  kmp_team_t *team = this_thr->th.th_team;
4234 
4235  __kmp_printf(
4236  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4237  "ptask=%p\n",
4238  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4239  team->t.t_implicit_task_taskdata[tid].td_parent);
4240 }
4241 #endif // KMP_DEBUG
4242 
4243 /* TODO optimize with one big memclr, take out what isn't needed, split
4244  responsibility to workers as much as possible, and delay initialization of
4245  features as much as possible */
4246 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4247  int tid, int gtid) {
4248  /* this_thr->th.th_info.ds.ds_gtid is setup in
4249  kmp_allocate_thread/create_worker.
4250  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4251  KMP_DEBUG_ASSERT(this_thr != NULL);
4252  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4253  KMP_DEBUG_ASSERT(team);
4254  KMP_DEBUG_ASSERT(team->t.t_threads);
4255  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4256  kmp_info_t *master = team->t.t_threads[0];
4257  KMP_DEBUG_ASSERT(master);
4258  KMP_DEBUG_ASSERT(master->th.th_root);
4259 
4260  KMP_MB();
4261 
4262  TCW_SYNC_PTR(this_thr->th.th_team, team);
4263 
4264  this_thr->th.th_info.ds.ds_tid = tid;
4265  this_thr->th.th_set_nproc = 0;
4266  if (__kmp_tasking_mode != tskm_immediate_exec)
4267  // When tasking is possible, threads are not safe to reap until they are
4268  // done tasking; this will be set when tasking code is exited in wait
4269  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4270  else // no tasking --> always safe to reap
4271  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4272  this_thr->th.th_set_proc_bind = proc_bind_default;
4273 #if KMP_AFFINITY_SUPPORTED
4274  this_thr->th.th_new_place = this_thr->th.th_current_place;
4275 #endif
4276  this_thr->th.th_root = master->th.th_root;
4277 
4278  /* setup the thread's cache of the team structure */
4279  this_thr->th.th_team_nproc = team->t.t_nproc;
4280  this_thr->th.th_team_master = master;
4281  this_thr->th.th_team_serialized = team->t.t_serialized;
4282 
4283  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4284 
4285  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4286  tid, gtid, this_thr, this_thr->th.th_current_task));
4287 
4288  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4289  team, tid, TRUE);
4290 
4291  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4292  tid, gtid, this_thr, this_thr->th.th_current_task));
4293  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4294  // __kmp_initialize_team()?
4295 
4296  /* TODO no worksharing in speculative threads */
4297  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4298 
4299  this_thr->th.th_local.this_construct = 0;
4300 
4301  if (!this_thr->th.th_pri_common) {
4302  this_thr->th.th_pri_common =
4303  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4304  if (__kmp_storage_map) {
4305  __kmp_print_storage_map_gtid(
4306  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4307  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4308  }
4309  this_thr->th.th_pri_head = NULL;
4310  }
4311 
4312  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4313  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4314  // Make new thread's CG root same as primary thread's
4315  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4316  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4317  if (tmp) {
4318  // worker changes CG, need to check if old CG should be freed
4319  int i = tmp->cg_nthreads--;
4320  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4321  " on node %p of thread %p to %d\n",
4322  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4323  if (i == 1) {
4324  __kmp_free(tmp); // last thread left CG --> free it
4325  }
4326  }
4327  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4328  // Increment new thread's CG root's counter to add the new thread
4329  this_thr->th.th_cg_roots->cg_nthreads++;
4330  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4331  " node %p of thread %p to %d\n",
4332  this_thr, this_thr->th.th_cg_roots,
4333  this_thr->th.th_cg_roots->cg_root,
4334  this_thr->th.th_cg_roots->cg_nthreads));
4335  this_thr->th.th_current_task->td_icvs.thread_limit =
4336  this_thr->th.th_cg_roots->cg_thread_limit;
4337  }
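  // (Sketch of the bookkeeping above) th_cg_roots acts as a reference-counted
  // handle: a worker arriving from another contention group drops its old node
  // (freeing it once cg_nthreads falls to zero) and attaches to the primary
  // thread's node, inheriting that group's thread_limit ICV.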
4338 
4339  /* Initialize dynamic dispatch */
4340  {
4341  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4342  // Use team max_nproc since this will never change for the team.
4343  size_t disp_size =
4344  sizeof(dispatch_private_info_t) *
4345  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4346  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4347  team->t.t_max_nproc));
4348  KMP_ASSERT(dispatch);
4349  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4350  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4351 
4352  dispatch->th_disp_index = 0;
4353  dispatch->th_doacross_buf_idx = 0;
4354  if (!dispatch->th_disp_buffer) {
4355  dispatch->th_disp_buffer =
4356  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4357 
4358  if (__kmp_storage_map) {
4359  __kmp_print_storage_map_gtid(
4360  gtid, &dispatch->th_disp_buffer[0],
4361  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4362  ? 1
4363  : __kmp_dispatch_num_buffers],
4364  disp_size,
4365  "th_%d.th_dispatch.th_disp_buffer "
4366  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4367  gtid, team->t.t_id, gtid);
4368  }
4369  } else {
4370  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4371  }
4372 
4373  dispatch->th_dispatch_pr_current = 0;
4374  dispatch->th_dispatch_sh_current = 0;
4375 
4376  dispatch->th_deo_fcn = 0; /* ORDERED */
4377  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4378  }
4379 
4380  this_thr->th.th_next_pool = NULL;
4381 
4382  if (!this_thr->th.th_task_state_memo_stack) {
4383  size_t i;
4384  this_thr->th.th_task_state_memo_stack =
4385  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4386  this_thr->th.th_task_state_top = 0;
4387  this_thr->th.th_task_state_stack_sz = 4;
4388  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4389  ++i) // zero init the stack
4390  this_thr->th.th_task_state_memo_stack[i] = 0;
4391  }
4392 
4393  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4394  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4395 
4396  KMP_MB();
4397 }
4398 
4399 /* Allocate a new thread for the requesting team. This is only called from
4400  within a forkjoin critical section. We first try to get an available thread
4401  from the thread pool; if none is available, we fork a new one, assuming we
4402  are able to create one. This should be assured, as the caller should have
4403  checked on this first. */
4404 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4405  int new_tid) {
4406  kmp_team_t *serial_team;
4407  kmp_info_t *new_thr;
4408  int new_gtid;
4409 
4410  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4411  KMP_DEBUG_ASSERT(root && team);
4412 #if !KMP_NESTED_HOT_TEAMS
4413  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4414 #endif
4415  KMP_MB();
4416 
4417  /* first, try to get one from the thread pool */
4418  if (__kmp_thread_pool) {
4419  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4420  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4421  if (new_thr == __kmp_thread_pool_insert_pt) {
4422  __kmp_thread_pool_insert_pt = NULL;
4423  }
4424  TCW_4(new_thr->th.th_in_pool, FALSE);
4425  __kmp_suspend_initialize_thread(new_thr);
4426  __kmp_lock_suspend_mx(new_thr);
4427  if (new_thr->th.th_active_in_pool == TRUE) {
4428  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4429  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4430  new_thr->th.th_active_in_pool = FALSE;
4431  }
4432  __kmp_unlock_suspend_mx(new_thr);
4433 
4434  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4435  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4436  KMP_ASSERT(!new_thr->th.th_team);
4437  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4438 
4439  /* setup the thread structure */
4440  __kmp_initialize_info(new_thr, team, new_tid,
4441  new_thr->th.th_info.ds.ds_gtid);
4442  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4443 
4444  TCW_4(__kmp_nth, __kmp_nth + 1);
4445 
4446  new_thr->th.th_task_state = 0;
4447  new_thr->th.th_task_state_top = 0;
4448  new_thr->th.th_task_state_stack_sz = 4;
4449 
4450  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4451  // Make sure pool thread has transitioned to waiting on own thread struct
4452  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4453  // Thread activated in __kmp_allocate_team when increasing team size
4454  }
4455 
4456 #ifdef KMP_ADJUST_BLOCKTIME
4457  /* Adjust blocktime back to zero if necessary */
4458  /* Middle initialization might not have occurred yet */
4459  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4460  if (__kmp_nth > __kmp_avail_proc) {
4461  __kmp_zero_bt = TRUE;
4462  }
4463  }
4464 #endif /* KMP_ADJUST_BLOCKTIME */
4465 
4466 #if KMP_DEBUG
4467  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4468  // KMP_BARRIER_PARENT_FLAG.
4469  int b;
4470  kmp_balign_t *balign = new_thr->th.th_bar;
4471  for (b = 0; b < bs_last_barrier; ++b)
4472  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4473 #endif
4474 
4475  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4476  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4477 
4478  KMP_MB();
4479  return new_thr;
4480  }
4481 
4482  /* no, we'll fork a new one */
4483  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4484  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4485 
4486 #if KMP_USE_MONITOR
4487  // If this is the first worker thread the RTL is creating, then also
4488  // launch the monitor thread. We try to do this as early as possible.
4489  if (!TCR_4(__kmp_init_monitor)) {
4490  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4491  if (!TCR_4(__kmp_init_monitor)) {
4492  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4493  TCW_4(__kmp_init_monitor, 1);
4494  __kmp_create_monitor(&__kmp_monitor);
4495  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4496 #if KMP_OS_WINDOWS
4497  // AC: wait until monitor has started. This is a fix for CQ232808.
4498  // The reason is that if the library is loaded/unloaded in a loop with
4499  // small (parallel) work in between, there is a high probability that the
4500  // monitor thread will only start after the library shutdown has begun. At
4501  // shutdown it is too late to cope with the problem, because when the
4502  // primary thread is in DllMain (process detach) the monitor has no chance
4503  // to start (it is blocked), and the primary thread has no means to inform
4504  // the monitor that the library has gone, because all the memory the monitor
4505  // can access is going to be released/reset.
4506  while (TCR_4(__kmp_init_monitor) < 2) {
4507  KMP_YIELD(TRUE);
4508  }
4509  KF_TRACE(10, ("after monitor thread has started\n"));
4510 #endif
4511  }
4512  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4513  }
4514 #endif
4515 
4516  KMP_MB();
4517 
4518  {
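    // Pick the lowest free gtid slot. Hidden helper threads occupy gtids
    // [1, __kmp_hidden_helper_threads_num], so regular workers start searching
    // just past that range; while the hidden helpers themselves are being
    // initialized, the search starts at gtid 1.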
4519  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4520  ? 1
4521  : __kmp_hidden_helper_threads_num + 1;
4522 
4523  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4524  ++new_gtid) {
4525  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4526  }
4527 
4528  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4529  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4530  }
4531  }
4532 
4533  /* allocate space for it. */
4534  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4535 
4536  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4537 
4538 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4539  // Suppress race condition detection on synchronization flags in debug mode;
4540  // this helps to analyze library internals by eliminating false positives.
4541  __itt_suppress_mark_range(
4542  __itt_suppress_range, __itt_suppress_threading_errors,
4543  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4544  __itt_suppress_mark_range(
4545  __itt_suppress_range, __itt_suppress_threading_errors,
4546  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4547 #if KMP_OS_WINDOWS
4548  __itt_suppress_mark_range(
4549  __itt_suppress_range, __itt_suppress_threading_errors,
4550  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4551 #else
4552  __itt_suppress_mark_range(__itt_suppress_range,
4553  __itt_suppress_threading_errors,
4554  &new_thr->th.th_suspend_init_count,
4555  sizeof(new_thr->th.th_suspend_init_count));
4556 #endif
4557  // TODO: check if we need to also suppress b_arrived flags
4558  __itt_suppress_mark_range(__itt_suppress_range,
4559  __itt_suppress_threading_errors,
4560  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4561  sizeof(new_thr->th.th_bar[0].bb.b_go));
4562  __itt_suppress_mark_range(__itt_suppress_range,
4563  __itt_suppress_threading_errors,
4564  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4565  sizeof(new_thr->th.th_bar[1].bb.b_go));
4566  __itt_suppress_mark_range(__itt_suppress_range,
4567  __itt_suppress_threading_errors,
4568  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4569  sizeof(new_thr->th.th_bar[2].bb.b_go));
4570 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4571  if (__kmp_storage_map) {
4572  __kmp_print_thread_storage_map(new_thr, new_gtid);
4573  }
4574 
4575  // add the reserve serialized team, initialized from the team's primary thread
4576  {
4577  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4578  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4579  new_thr->th.th_serial_team = serial_team =
4580  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4581 #if OMPT_SUPPORT
4582  ompt_data_none, // root parallel id
4583 #endif
4584  proc_bind_default, &r_icvs,
4585  0 USE_NESTED_HOT_ARG(NULL));
4586  }
4587  KMP_ASSERT(serial_team);
4588  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for
4589  // execution (it is unused for now).
4590  serial_team->t.t_threads[0] = new_thr;
4591  KF_TRACE(10,
4592  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4593  new_thr));
4594 
4595  /* setup the thread structures */
4596  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4597 
4598 #if USE_FAST_MEMORY
4599  __kmp_initialize_fast_memory(new_thr);
4600 #endif /* USE_FAST_MEMORY */
4601 
4602 #if KMP_USE_BGET
4603  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4604  __kmp_initialize_bget(new_thr);
4605 #endif
4606 
4607  __kmp_init_random(new_thr); // Initialize random number generator
4608 
4609  /* Initialize these only once when thread is grabbed for a team allocation */
4610  KA_TRACE(20,
4611  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4612  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4613 
4614  int b;
4615  kmp_balign_t *balign = new_thr->th.th_bar;
4616  for (b = 0; b < bs_last_barrier; ++b) {
4617  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4618  balign[b].bb.team = NULL;
4619  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4620  balign[b].bb.use_oncore_barrier = 0;
4621  }
4622 
4623  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4624  new_thr->th.th_sleep_loc_type = flag_unset;
4625 
4626  new_thr->th.th_spin_here = FALSE;
4627  new_thr->th.th_next_waiting = 0;
4628 #if KMP_OS_UNIX
4629  new_thr->th.th_blocking = false;
4630 #endif
4631 
4632 #if KMP_AFFINITY_SUPPORTED
4633  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4634  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4635  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4636  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4637 #endif
4638  new_thr->th.th_def_allocator = __kmp_def_allocator;
4639  new_thr->th.th_prev_level = 0;
4640  new_thr->th.th_prev_num_threads = 1;
4641 
4642  TCW_4(new_thr->th.th_in_pool, FALSE);
4643  new_thr->th.th_active_in_pool = FALSE;
4644  TCW_4(new_thr->th.th_active, TRUE);
4645 
4646  /* adjust the global counters */
4647  __kmp_all_nth++;
4648  __kmp_nth++;
4649 
4650  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4651  // numbers of threads, and method #2 (keyed API call) for higher numbers.
4652  if (__kmp_adjust_gtid_mode) {
4653  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4654  if (TCR_4(__kmp_gtid_mode) != 2) {
4655  TCW_4(__kmp_gtid_mode, 2);
4656  }
4657  } else {
4658  if (TCR_4(__kmp_gtid_mode) != 1) {
4659  TCW_4(__kmp_gtid_mode, 1);
4660  }
4661  }
4662  }
4663 
4664 #ifdef KMP_ADJUST_BLOCKTIME
4665  /* Adjust blocktime back to zero if necessary */
4666  /* Middle initialization might not have occurred yet */
4667  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4668  if (__kmp_nth > __kmp_avail_proc) {
4669  __kmp_zero_bt = TRUE;
4670  }
4671  }
4672 #endif /* KMP_ADJUST_BLOCKTIME */
4673 
4674  /* actually fork it and create the new worker thread */
4675  KF_TRACE(
4676  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4677  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4678  KF_TRACE(10,
4679  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4680 
4681  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4682  new_gtid));
4683  KMP_MB();
4684  return new_thr;
4685 }
4686 
4687 /* Reinitialize team for reuse.
4688  The hot team code calls this routine at every fork barrier, so EPCC barrier
4689  tests are extremely sensitive to changes in it, esp. writes to the team
4690  struct, which cause a cache invalidation in all threads.
4691  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4692 static void __kmp_reinitialize_team(kmp_team_t *team,
4693  kmp_internal_control_t *new_icvs,
4694  ident_t *loc) {
4695  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4696  team->t.t_threads[0], team));
4697  KMP_DEBUG_ASSERT(team && new_icvs);
4698  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4699  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4700 
4701  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4702  // Copy ICVs to the primary thread's implicit taskdata
4703  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4704  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4705 
4706  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4707  team->t.t_threads[0], team));
4708 }
4709 
4710 /* Initialize the team data structure.
4711  This assumes the t_threads and t_max_nproc are already set.
4712  Also, we don't touch the arguments */
4713 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4714  kmp_internal_control_t *new_icvs,
4715  ident_t *loc) {
4716  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4717 
4718  /* verify */
4719  KMP_DEBUG_ASSERT(team);
4720  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4721  KMP_DEBUG_ASSERT(team->t.t_threads);
4722  KMP_MB();
4723 
4724  team->t.t_master_tid = 0; /* not needed */
4725  /* team->t.t_master_bar; not needed */
4726  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4727  team->t.t_nproc = new_nproc;
4728 
4729  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4730  team->t.t_next_pool = NULL;
4731  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4732  * up hot team */
4733 
4734  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4735  team->t.t_invoke = NULL; /* not needed */
4736 
4737  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4738  team->t.t_sched.sched = new_icvs->sched.sched;
4739 
4740 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4741  team->t.t_fp_control_saved = FALSE; /* not needed */
4742  team->t.t_x87_fpu_control_word = 0; /* not needed */
4743  team->t.t_mxcsr = 0; /* not needed */
4744 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4745 
4746  team->t.t_construct = 0;
4747 
4748  team->t.t_ordered.dt.t_value = 0;
4749  team->t.t_master_active = FALSE;
4750 
4751 #ifdef KMP_DEBUG
4752  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4753 #endif
4754 #if KMP_OS_WINDOWS
4755  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4756 #endif
4757 
4758  team->t.t_control_stack_top = NULL;
4759 
4760  __kmp_reinitialize_team(team, new_icvs, loc);
4761 
4762  KMP_MB();
4763  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4764 }
4765 
4766 #if KMP_AFFINITY_SUPPORTED
4767 
4768 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4769 // It calculates the worker + primary thread's partition based upon the parent
4770 // thread's partition, and binds each worker to a place in its partition.
4771 // The primary thread's partition should already include its current binding.
4772 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4773  // Do not partition places for the hidden helper team
4774  if (KMP_HIDDEN_HELPER_TEAM(team))
4775  return;
4776  // Copy the primary thread's place partition to the team struct
4777  kmp_info_t *master_th = team->t.t_threads[0];
4778  KMP_DEBUG_ASSERT(master_th != NULL);
4779  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4780  int first_place = master_th->th.th_first_place;
4781  int last_place = master_th->th.th_last_place;
4782  int masters_place = master_th->th.th_current_place;
4783  int num_masks = __kmp_affinity.num_masks;
4784  team->t.t_first_place = first_place;
4785  team->t.t_last_place = last_place;
4786 
4787  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4788  "bound to place %d partition = [%d,%d]\n",
4789  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4790  team->t.t_id, masters_place, first_place, last_place));
4791 
4792  switch (proc_bind) {
4793 
4794  case proc_bind_default:
4795  // Serial teams might have the proc_bind policy set to proc_bind_default.
4796  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4797  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4798  break;
4799 
4800  case proc_bind_primary: {
4801  int f;
4802  int n_th = team->t.t_nproc;
4803  for (f = 1; f < n_th; f++) {
4804  kmp_info_t *th = team->t.t_threads[f];
4805  KMP_DEBUG_ASSERT(th != NULL);
4806  th->th.th_first_place = first_place;
4807  th->th.th_last_place = last_place;
4808  th->th.th_new_place = masters_place;
4809  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4810  team->t.t_display_affinity != 1) {
4811  team->t.t_display_affinity = 1;
4812  }
4813 
4814  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4815  "partition = [%d,%d]\n",
4816  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4817  f, masters_place, first_place, last_place));
4818  }
4819  } break;
4820 
4821  case proc_bind_close: {
4822  int f;
4823  int n_th = team->t.t_nproc;
4824  int n_places;
4825  if (first_place <= last_place) {
4826  n_places = last_place - first_place + 1;
4827  } else {
4828  n_places = num_masks - first_place + last_place + 1;
4829  }
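    // A partition may wrap around the end of the place list (first_place >
    // last_place); e.g. first_place=6, last_place=1 with num_masks=8 gives
    // n_places = 8 - 6 + 1 + 1 = 4 (places 6, 7, 0, 1).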
4830  if (n_th <= n_places) {
4831  int place = masters_place;
4832  for (f = 1; f < n_th; f++) {
4833  kmp_info_t *th = team->t.t_threads[f];
4834  KMP_DEBUG_ASSERT(th != NULL);
4835 
4836  if (place == last_place) {
4837  place = first_place;
4838  } else if (place == (num_masks - 1)) {
4839  place = 0;
4840  } else {
4841  place++;
4842  }
4843  th->th.th_first_place = first_place;
4844  th->th.th_last_place = last_place;
4845  th->th.th_new_place = place;
4846  if (__kmp_display_affinity && place != th->th.th_current_place &&
4847  team->t.t_display_affinity != 1) {
4848  team->t.t_display_affinity = 1;
4849  }
4850 
4851  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4852  "partition = [%d,%d]\n",
4853  __kmp_gtid_from_thread(team->t.t_threads[f]),
4854  team->t.t_id, f, place, first_place, last_place));
4855  }
4856  } else {
4857  int S, rem, gap, s_count;
4858  S = n_th / n_places;
4859  s_count = 0;
4860  rem = n_th - (S * n_places);
4861  gap = rem > 0 ? n_places / rem : n_places;
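      // Example: n_th=10 threads over n_places=4 yields S=2, rem=2, gap=2; the
      // places receive 3, 2, 3 and 2 threads, and 'place' wraps back around to
      // the primary thread's place.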
4862  int place = masters_place;
4863  int gap_ct = gap;
4864  for (f = 0; f < n_th; f++) {
4865  kmp_info_t *th = team->t.t_threads[f];
4866  KMP_DEBUG_ASSERT(th != NULL);
4867 
4868  th->th.th_first_place = first_place;
4869  th->th.th_last_place = last_place;
4870  th->th.th_new_place = place;
4871  if (__kmp_display_affinity && place != th->th.th_current_place &&
4872  team->t.t_display_affinity != 1) {
4873  team->t.t_display_affinity = 1;
4874  }
4875  s_count++;
4876 
4877  if ((s_count == S) && rem && (gap_ct == gap)) {
4878  // do nothing, add an extra thread to place on next iteration
4879  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4880  // we added an extra thread to this place; move to next place
4881  if (place == last_place) {
4882  place = first_place;
4883  } else if (place == (num_masks - 1)) {
4884  place = 0;
4885  } else {
4886  place++;
4887  }
4888  s_count = 0;
4889  gap_ct = 1;
4890  rem--;
4891  } else if (s_count == S) { // place full; don't add extra
4892  if (place == last_place) {
4893  place = first_place;
4894  } else if (place == (num_masks - 1)) {
4895  place = 0;
4896  } else {
4897  place++;
4898  }
4899  gap_ct++;
4900  s_count = 0;
4901  }
4902 
4903  KA_TRACE(100,
4904  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4905  "partition = [%d,%d]\n",
4906  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4907  th->th.th_new_place, first_place, last_place));
4908  }
4909  KMP_DEBUG_ASSERT(place == masters_place);
4910  }
4911  } break;
4912 
4913  case proc_bind_spread: {
4914  int f;
4915  int n_th = team->t.t_nproc;
4916  int n_places;
4917  int thidx;
4918  if (first_place <= last_place) {
4919  n_places = last_place - first_place + 1;
4920  } else {
4921  n_places = num_masks - first_place + last_place + 1;
4922  }
4923  if (n_th <= n_places) {
4924  int place = -1;
4925 
4926  if (n_places != num_masks) {
4927  int S = n_places / n_th;
4928  int s_count, rem, gap, gap_ct;
4929 
4930  place = masters_place;
4931  rem = n_places - n_th * S;
4932  gap = rem ? n_th / rem : 1;
4933  gap_ct = gap;
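        // Example: spreading n_th=3 threads over n_places=8 yields S=2, rem=2,
        // gap=1, so the threads get sub-partitions of 3, 3 and 2 consecutive
        // places respectively.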
4934  thidx = n_th;
4935  if (update_master_only == 1)
4936  thidx = 1;
4937  for (f = 0; f < thidx; f++) {
4938  kmp_info_t *th = team->t.t_threads[f];
4939  KMP_DEBUG_ASSERT(th != NULL);
4940 
4941  th->th.th_first_place = place;
4942  th->th.th_new_place = place;
4943  if (__kmp_display_affinity && place != th->th.th_current_place &&
4944  team->t.t_display_affinity != 1) {
4945  team->t.t_display_affinity = 1;
4946  }
4947  s_count = 1;
4948  while (s_count < S) {
4949  if (place == last_place) {
4950  place = first_place;
4951  } else if (place == (num_masks - 1)) {
4952  place = 0;
4953  } else {
4954  place++;
4955  }
4956  s_count++;
4957  }
4958  if (rem && (gap_ct == gap)) {
4959  if (place == last_place) {
4960  place = first_place;
4961  } else if (place == (num_masks - 1)) {
4962  place = 0;
4963  } else {
4964  place++;
4965  }
4966  rem--;
4967  gap_ct = 0;
4968  }
4969  th->th.th_last_place = place;
4970  gap_ct++;
4971 
4972  if (place == last_place) {
4973  place = first_place;
4974  } else if (place == (num_masks - 1)) {
4975  place = 0;
4976  } else {
4977  place++;
4978  }
4979 
4980  KA_TRACE(100,
4981  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4982  "partition = [%d,%d], num_masks: %u\n",
4983  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4984  f, th->th.th_new_place, th->th.th_first_place,
4985  th->th.th_last_place, num_masks));
4986  }
4987  } else {
4988  /* Having a uniform space of available computation places, we can create
4989  T partitions of roughly P/T places each and put threads into the first
4990  place of each partition. */
4991  double current = static_cast<double>(masters_place);
4992  double spacing =
4993  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
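        // Example: masters_place=0, n_places=8, n_th=3 gives spacing=3.0 and
        // partitions [0,2], [3,5], [6,7].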
4994  int first, last;
4995  kmp_info_t *th;
4996 
4997  thidx = n_th + 1;
4998  if (update_master_only == 1)
4999  thidx = 1;
5000  for (f = 0; f < thidx; f++) {
5001  first = static_cast<int>(current);
5002  last = static_cast<int>(current + spacing) - 1;
5003  KMP_DEBUG_ASSERT(last >= first);
5004  if (first >= n_places) {
5005  if (masters_place) {
5006  first -= n_places;
5007  last -= n_places;
5008  if (first == (masters_place + 1)) {
5009  KMP_DEBUG_ASSERT(f == n_th);
5010  first--;
5011  }
5012  if (last == masters_place) {
5013  KMP_DEBUG_ASSERT(f == (n_th - 1));
5014  last--;
5015  }
5016  } else {
5017  KMP_DEBUG_ASSERT(f == n_th);
5018  first = 0;
5019  last = 0;
5020  }
5021  }
5022  if (last >= n_places) {
5023  last = (n_places - 1);
5024  }
5025  place = first;
5026  current += spacing;
5027  if (f < n_th) {
5028  KMP_DEBUG_ASSERT(0 <= first);
5029  KMP_DEBUG_ASSERT(n_places > first);
5030  KMP_DEBUG_ASSERT(0 <= last);
5031  KMP_DEBUG_ASSERT(n_places > last);
5032  KMP_DEBUG_ASSERT(last_place >= first_place);
5033  th = team->t.t_threads[f];
5034  KMP_DEBUG_ASSERT(th);
5035  th->th.th_first_place = first;
5036  th->th.th_new_place = place;
5037  th->th.th_last_place = last;
5038  if (__kmp_display_affinity && place != th->th.th_current_place &&
5039  team->t.t_display_affinity != 1) {
5040  team->t.t_display_affinity = 1;
5041  }
5042  KA_TRACE(100,
5043  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5044  "partition = [%d,%d], spacing = %.4f\n",
5045  __kmp_gtid_from_thread(team->t.t_threads[f]),
5046  team->t.t_id, f, th->th.th_new_place,
5047  th->th.th_first_place, th->th.th_last_place, spacing));
5048  }
5049  }
5050  }
5051  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5052  } else {
5053  int S, rem, gap, s_count;
5054  S = n_th / n_places;
5055  s_count = 0;
5056  rem = n_th - (S * n_places);
5057  gap = rem > 0 ? n_places / rem : n_places;
5058  int place = masters_place;
5059  int gap_ct = gap;
5060  thidx = n_th;
5061  if (update_master_only == 1)
5062  thidx = 1;
5063  for (f = 0; f < thidx; f++) {
5064  kmp_info_t *th = team->t.t_threads[f];
5065  KMP_DEBUG_ASSERT(th != NULL);
5066 
5067  th->th.th_first_place = place;
5068  th->th.th_last_place = place;
5069  th->th.th_new_place = place;
5070  if (__kmp_display_affinity && place != th->th.th_current_place &&
5071  team->t.t_display_affinity != 1) {
5072  team->t.t_display_affinity = 1;
5073  }
5074  s_count++;
5075 
5076  if ((s_count == S) && rem && (gap_ct == gap)) {
5077  // do nothing, add an extra thread to place on next iteration
5078  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5079  // we added an extra thread to this place; move on to next place
5080  if (place == last_place) {
5081  place = first_place;
5082  } else if (place == (num_masks - 1)) {
5083  place = 0;
5084  } else {
5085  place++;
5086  }
5087  s_count = 0;
5088  gap_ct = 1;
5089  rem--;
5090  } else if (s_count == S) { // place is full; don't add extra thread
5091  if (place == last_place) {
5092  place = first_place;
5093  } else if (place == (num_masks - 1)) {
5094  place = 0;
5095  } else {
5096  place++;
5097  }
5098  gap_ct++;
5099  s_count = 0;
5100  }
5101 
5102  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5103  "partition = [%d,%d]\n",
5104  __kmp_gtid_from_thread(team->t.t_threads[f]),
5105  team->t.t_id, f, th->th.th_new_place,
5106  th->th.th_first_place, th->th.th_last_place));
5107  }
5108  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5109  }
5110  } break;
5111 
5112  default:
5113  break;
5114  }
5115 
5116  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5117 }
5118 
5119 #endif // KMP_AFFINITY_SUPPORTED
5120 
5121 /* Allocate a new team data structure to use; take one off the free pool if
5122  available. */
5123 kmp_team_t *
5124 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5125 #if OMPT_SUPPORT
5126  ompt_data_t ompt_parallel_data,
5127 #endif
5128  kmp_proc_bind_t new_proc_bind,
5129  kmp_internal_control_t *new_icvs,
5130  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5131  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5132  int f;
5133  kmp_team_t *team;
5134  int use_hot_team = !root->r.r_active;
5135  int level = 0;
5136  int do_place_partition = 1;
5137 
5138  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5139  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5140  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5141  KMP_MB();
5142 
5143 #if KMP_NESTED_HOT_TEAMS
5144  kmp_hot_team_ptr_t *hot_teams;
5145  if (master) {
5146  team = master->th.th_team;
5147  level = team->t.t_active_level;
5148  if (master->th.th_teams_microtask) { // in teams construct?
5149  if (master->th.th_teams_size.nteams > 1 &&
5150  ( // #teams > 1
5151  team->t.t_pkfn ==
5152  (microtask_t)__kmp_teams_master || // inner fork of the teams
5153  master->th.th_teams_level <
5154  team->t.t_level)) { // or nested parallel inside the teams
5155  ++level; // no increment if #teams==1 or for the outer fork of the teams;
5156  // increment otherwise
5157  }
5158  // Do not perform the place partition for the inner fork of the teams construct;
5159  // wait until a nested parallel region is encountered inside the teams construct.
5160  if ((master->th.th_teams_size.nteams == 1 &&
5161  master->th.th_teams_level >= team->t.t_level) ||
5162  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5163  do_place_partition = 0;
5164  }
5165  hot_teams = master->th.th_hot_teams;
5166  if (level < __kmp_hot_teams_max_level && hot_teams &&
5167  hot_teams[level].hot_team) {
5168  // hot team has already been allocated for given level
5169  use_hot_team = 1;
5170  } else {
5171  use_hot_team = 0;
5172  }
5173  } else {
5174  // check we won't access uninitialized hot_teams, just in case
5175  KMP_DEBUG_ASSERT(new_nproc == 1);
5176  }
5177 #endif
5178  // Optimization to use a "hot" team
5179  if (use_hot_team && new_nproc > 1) {
5180  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5181 #if KMP_NESTED_HOT_TEAMS
5182  team = hot_teams[level].hot_team;
5183 #else
5184  team = root->r.r_hot_team;
5185 #endif
5186 #if KMP_DEBUG
5187  if (__kmp_tasking_mode != tskm_immediate_exec) {
5188  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5189  "task_team[1] = %p before reinit\n",
5190  team->t.t_task_team[0], team->t.t_task_team[1]));
5191  }
5192 #endif
5193 
5194  if (team->t.t_nproc != new_nproc &&
5195  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5196  // Distributed barrier may need a resize
5197  int old_nthr = team->t.t_nproc;
5198  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5199  }
5200 
5201  // If not doing the place partition, then reset the team's proc bind
5202  // to indicate that partitioning of all threads still needs to take place
5203  if (do_place_partition == 0)
5204  team->t.t_proc_bind = proc_bind_default;
5205  // Has the number of threads changed?
5206  /* Let's assume the most common case is that the number of threads is
5207  unchanged, and put that case first. */
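    // Three cases follow: the team size is unchanged (fast path), the hot team
    // shrinks (extra threads are released or parked on their own b_go flag), or
    // the hot team grows (threads are taken from reserve and/or newly allocated).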
5208  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5209  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5210  // This case can mean that omp_set_num_threads() was called and the hot
5211  // team size was already reduced, so we check the special flag
5212  if (team->t.t_size_changed == -1) {
5213  team->t.t_size_changed = 1;
5214  } else {
5215  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5216  }
5217 
5218  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5219  kmp_r_sched_t new_sched = new_icvs->sched;
5220  // set primary thread's schedule as new run-time schedule
5221  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5222 
5223  __kmp_reinitialize_team(team, new_icvs,
5224  root->r.r_uber_thread->th.th_ident);
5225 
5226  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5227  team->t.t_threads[0], team));
5228  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5229 
5230 #if KMP_AFFINITY_SUPPORTED
5231  if ((team->t.t_size_changed == 0) &&
5232  (team->t.t_proc_bind == new_proc_bind)) {
5233  if (new_proc_bind == proc_bind_spread) {
5234  if (do_place_partition) {
5235  // add flag to update only master for spread
5236  __kmp_partition_places(team, 1);
5237  }
5238  }
5239  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5240  "proc_bind = %d, partition = [%d,%d]\n",
5241  team->t.t_id, new_proc_bind, team->t.t_first_place,
5242  team->t.t_last_place));
5243  } else {
5244  if (do_place_partition) {
5245  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5246  __kmp_partition_places(team);
5247  }
5248  }
5249 #else
5250  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5251 #endif /* KMP_AFFINITY_SUPPORTED */
5252  } else if (team->t.t_nproc > new_nproc) {
5253  KA_TRACE(20,
5254  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5255  new_nproc));
5256 
5257  team->t.t_size_changed = 1;
5258  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5259  // Barrier size already reduced earlier in this function
5260  // Activate team threads via th_used_in_team
5261  __kmp_add_threads_to_team(team, new_nproc);
5262  }
5263 #if KMP_NESTED_HOT_TEAMS
5264  if (__kmp_hot_teams_mode == 0) {
5265  // AC: the saved number of threads should correspond to the team's value in this
5266  // mode; it can be bigger in mode 1, when the hot team has threads in reserve.
5267  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5268  hot_teams[level].hot_team_nth = new_nproc;
5269 #endif // KMP_NESTED_HOT_TEAMS
5270  /* release the extra threads we don't need any more */
5271  for (f = new_nproc; f < team->t.t_nproc; f++) {
5272  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5273  if (__kmp_tasking_mode != tskm_immediate_exec) {
5274  // When decreasing team size, threads no longer in the team should
5275  // unref task team.
5276  team->t.t_threads[f]->th.th_task_team = NULL;
5277  }
5278  __kmp_free_thread(team->t.t_threads[f]);
5279  team->t.t_threads[f] = NULL;
5280  }
5281 #if KMP_NESTED_HOT_TEAMS
5282  } // (__kmp_hot_teams_mode == 0)
5283  else {
5284  // When keeping extra threads in team, switch threads to wait on own
5285  // b_go flag
5286  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5287  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5288  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5289  for (int b = 0; b < bs_last_barrier; ++b) {
5290  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5291  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5292  }
5293  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5294  }
5295  }
5296  }
5297 #endif // KMP_NESTED_HOT_TEAMS
5298  team->t.t_nproc = new_nproc;
5299  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5300  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5301  __kmp_reinitialize_team(team, new_icvs,
5302  root->r.r_uber_thread->th.th_ident);
5303 
5304  // Update remaining threads
5305  for (f = 0; f < new_nproc; ++f) {
5306  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5307  }
5308 
5309  // restore the current task state of the primary thread: should be the
5310  // implicit task
5311  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5312  team->t.t_threads[0], team));
5313 
5314  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5315 
5316 #ifdef KMP_DEBUG
5317  for (f = 0; f < team->t.t_nproc; f++) {
5318  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5319  team->t.t_threads[f]->th.th_team_nproc ==
5320  team->t.t_nproc);
5321  }
5322 #endif
5323 
5324  if (do_place_partition) {
5325  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5326 #if KMP_AFFINITY_SUPPORTED
5327  __kmp_partition_places(team);
5328 #endif
5329  }
5330  } else { // team->t.t_nproc < new_nproc
5331 
5332  KA_TRACE(20,
5333  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5334  new_nproc));
5335  int old_nproc = team->t.t_nproc; // save old value and use to update only
5336  team->t.t_size_changed = 1;
5337 
5338 #if KMP_NESTED_HOT_TEAMS
5339  int avail_threads = hot_teams[level].hot_team_nth;
5340  if (new_nproc < avail_threads)
5341  avail_threads = new_nproc;
5342  kmp_info_t **other_threads = team->t.t_threads;
5343  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5344  // Adjust barrier data of reserved threads (if any) of the team
5345  // Other data will be set in __kmp_initialize_info() below.
5346  int b;
5347  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5348  for (b = 0; b < bs_last_barrier; ++b) {
5349  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5350  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5351 #if USE_DEBUGGER
5352  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5353 #endif
5354  }
5355  }
5356  if (hot_teams[level].hot_team_nth >= new_nproc) {
5357  // we have all needed threads in reserve, no need to allocate any
5358  // this is only possible in mode 1; there cannot be reserved threads in mode 0
5359  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5360  team->t.t_nproc = new_nproc; // just get reserved threads involved
5361  } else {
5362  // We may have some threads in reserve, but not enough;
5363  // get reserved threads involved if any.
5364  team->t.t_nproc = hot_teams[level].hot_team_nth;
5365  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5366 #endif // KMP_NESTED_HOT_TEAMS
5367  if (team->t.t_max_nproc < new_nproc) {
5368  /* reallocate larger arrays */
5369  __kmp_reallocate_team_arrays(team, new_nproc);
5370  __kmp_reinitialize_team(team, new_icvs, NULL);
5371  }
5372 
5373 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5374  /* Temporarily set full mask for primary thread before creation of
5375  workers. The reason is that workers inherit the affinity from the
5376  primary thread, so if a lot of workers are created on a single
5377  core quickly, they don't get a chance to set their own affinity for
5378  a long time. */
5379  kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5380 #endif
5381 
5382  /* allocate new threads for the hot team */
5383  for (f = team->t.t_nproc; f < new_nproc; f++) {
5384  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5385  KMP_DEBUG_ASSERT(new_worker);
5386  team->t.t_threads[f] = new_worker;
5387 
5388  KA_TRACE(20,
5389  ("__kmp_allocate_team: team %d init T#%d arrived: "
5390  "join=%llu, plain=%llu\n",
5391  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5392  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5393  team->t.t_bar[bs_plain_barrier].b_arrived));
5394 
5395  { // Initialize barrier data for new threads.
5396  int b;
5397  kmp_balign_t *balign = new_worker->th.th_bar;
5398  for (b = 0; b < bs_last_barrier; ++b) {
5399  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5400  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5401  KMP_BARRIER_PARENT_FLAG);
5402 #if USE_DEBUGGER
5403  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5404 #endif
5405  }
5406  }
5407  }
5408 
5409 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5410  /* Restore initial primary thread's affinity mask */
5411  new_temp_affinity.restore();
5412 #endif
5413 #if KMP_NESTED_HOT_TEAMS
5414  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5415 #endif // KMP_NESTED_HOT_TEAMS
5416  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5417  // Barrier size already increased earlier in this function
5418  // Activate team threads via th_used_in_team
5419  __kmp_add_threads_to_team(team, new_nproc);
5420  }
5421  /* make sure everyone is synchronized */
5422  // new threads below
5423  __kmp_initialize_team(team, new_nproc, new_icvs,
5424  root->r.r_uber_thread->th.th_ident);
5425 
5426  /* reinitialize the threads */
5427  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5428  for (f = 0; f < team->t.t_nproc; ++f)
5429  __kmp_initialize_info(team->t.t_threads[f], team, f,
5430  __kmp_gtid_from_tid(f, team));
5431 
5432  // set th_task_state for new threads in hot team with older thread's state
5433  kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5434  for (f = old_nproc; f < team->t.t_nproc; ++f)
5435  team->t.t_threads[f]->th.th_task_state = old_state;
5436 
5437 #ifdef KMP_DEBUG
5438  for (f = 0; f < team->t.t_nproc; ++f) {
5439  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5440  team->t.t_threads[f]->th.th_team_nproc ==
5441  team->t.t_nproc);
5442  }
5443 #endif
5444 
5445  if (do_place_partition) {
5446  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5447 #if KMP_AFFINITY_SUPPORTED
5448  __kmp_partition_places(team);
5449 #endif
5450  }
5451  } // Check changes in number of threads
5452 
5453  kmp_info_t *master = team->t.t_threads[0];
5454  if (master->th.th_teams_microtask) {
5455  for (f = 1; f < new_nproc; ++f) {
5456  // propagate teams construct specific info to workers
5457  kmp_info_t *thr = team->t.t_threads[f];
5458  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5459  thr->th.th_teams_level = master->th.th_teams_level;
5460  thr->th.th_teams_size = master->th.th_teams_size;
5461  }
5462  }
5463 #if KMP_NESTED_HOT_TEAMS
5464  if (level) {
5465  // Sync barrier state for nested hot teams, not needed for outermost hot
5466  // team.
5467  for (f = 1; f < new_nproc; ++f) {
5468  kmp_info_t *thr = team->t.t_threads[f];
5469  int b;
5470  kmp_balign_t *balign = thr->th.th_bar;
5471  for (b = 0; b < bs_last_barrier; ++b) {
5472  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5473  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5474 #if USE_DEBUGGER
5475  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5476 #endif
5477  }
5478  }
5479  }
5480 #endif // KMP_NESTED_HOT_TEAMS
5481 
5482  /* reallocate space for arguments if necessary */
5483  __kmp_alloc_argv_entries(argc, team, TRUE);
5484  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5485  // The hot team re-uses the previous task team,
5486  // if untouched during the previous release->gather phase.
5487 
5488  KF_TRACE(10, (" hot_team = %p\n", team));
5489 
5490 #if KMP_DEBUG
5491  if (__kmp_tasking_mode != tskm_immediate_exec) {
5492  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5493  "task_team[1] = %p after reinit\n",
5494  team->t.t_task_team[0], team->t.t_task_team[1]));
5495  }
5496 #endif
5497 
5498 #if OMPT_SUPPORT
5499  __ompt_team_assign_id(team, ompt_parallel_data);
5500 #endif
5501 
5502  KMP_MB();
5503 
5504  return team;
5505  }
5506 
5507  /* next, let's try to take one from the team pool */
5508  KMP_MB();
5509  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5510  /* TODO: consider resizing undersized teams instead of reaping them, now
5511  that we have a resizing mechanism */
5512  if (team->t.t_max_nproc >= max_nproc) {
5513  /* take this team from the team pool */
5514  __kmp_team_pool = team->t.t_next_pool;
5515 
5516  if (max_nproc > 1 &&
5517  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5518  if (!team->t.b) { // Allocate barrier structure
5519  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5520  }
5521  }
5522 
5523  /* setup the team for fresh use */
5524  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5525 
5526  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5527  "task_team[1] %p to NULL\n",
5528  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5529  team->t.t_task_team[0] = NULL;
5530  team->t.t_task_team[1] = NULL;
5531 
5532  /* reallocate space for arguments if necessary */
5533  __kmp_alloc_argv_entries(argc, team, TRUE);
5534  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5535 
5536  KA_TRACE(
5537  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5538  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5539  { // Initialize barrier data.
5540  int b;
5541  for (b = 0; b < bs_last_barrier; ++b) {
5542  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5543 #if USE_DEBUGGER
5544  team->t.t_bar[b].b_master_arrived = 0;
5545  team->t.t_bar[b].b_team_arrived = 0;
5546 #endif
5547  }
5548  }
5549 
5550  team->t.t_proc_bind = new_proc_bind;
5551 
5552  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5553  team->t.t_id));
5554 
5555 #if OMPT_SUPPORT
5556  __ompt_team_assign_id(team, ompt_parallel_data);
5557 #endif
5558 
5559  KMP_MB();
5560 
5561  return team;
5562  }
5563 
5564  /* reap team if it is too small, then loop back and check the next one */
5565  // not sure if this is wise, but it will be redone during the hot-teams
5566  // rewrite.
5567  /* TODO: Use technique to find the right size hot-team, don't reap them */
5568  team = __kmp_reap_team(team);
5569  __kmp_team_pool = team;
5570  }
5571 
5572  /* nothing available in the pool, no matter, make a new team! */
5573  KMP_MB();
5574  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5575 
5576  /* and set it up */
5577  team->t.t_max_nproc = max_nproc;
5578  if (max_nproc > 1 &&
5579  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5580  // Allocate barrier structure
5581  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5582  }
5583 
5584  /* NOTE well, for some reason allocating one big buffer and dividing it up
5585  seems to hurt performance a lot on the P4, so let's not use this */
5586  __kmp_allocate_team_arrays(team, max_nproc);
5587 
5588  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5589  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5590 
5591  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5592  "%p to NULL\n",
5593  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5594  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5595  // memory, no need to duplicate
5596  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5597  // memory, no need to duplicate
5598 
5599  if (__kmp_storage_map) {
5600  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5601  }
5602 
5603  /* allocate space for arguments */
5604  __kmp_alloc_argv_entries(argc, team, FALSE);
5605  team->t.t_argc = argc;
5606 
5607  KA_TRACE(20,
5608  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5609  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5610  { // Initialize barrier data.
5611  int b;
5612  for (b = 0; b < bs_last_barrier; ++b) {
5613  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5614 #if USE_DEBUGGER
5615  team->t.t_bar[b].b_master_arrived = 0;
5616  team->t.t_bar[b].b_team_arrived = 0;
5617 #endif
5618  }
5619  }
5620 
5621  team->t.t_proc_bind = new_proc_bind;
5622 
5623 #if OMPT_SUPPORT
5624  __ompt_team_assign_id(team, ompt_parallel_data);
5625  team->t.ompt_serialized_team_info = NULL;
5626 #endif
5627 
5628  KMP_MB();
5629 
5630  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5631  team->t.t_id));
5632 
5633  return team;
5634 }
5635 
5636 /* TODO implement hot-teams at all levels */
5637 /* TODO implement lazy thread release on demand (disband request) */
5638 
5639 /* free the team. return it to the team pool. release all the threads
5640  * associated with it */
5641 void __kmp_free_team(kmp_root_t *root,
5642  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5643  int f;
5644  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5645  team->t.t_id));
5646 
5647  /* verify state */
5648  KMP_DEBUG_ASSERT(root);
5649  KMP_DEBUG_ASSERT(team);
5650  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5651  KMP_DEBUG_ASSERT(team->t.t_threads);
5652 
5653  int use_hot_team = team == root->r.r_hot_team;
5654 #if KMP_NESTED_HOT_TEAMS
5655  int level;
5656  if (master) {
5657  level = team->t.t_active_level - 1;
5658  if (master->th.th_teams_microtask) { // in teams construct?
5659  if (master->th.th_teams_size.nteams > 1) {
5660  ++level; // level was not increased in teams construct for
5661  // team_of_masters
5662  }
5663  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5664  master->th.th_teams_level == team->t.t_level) {
5665  ++level; // level was not increased in teams construct for
5666  // team_of_workers before the parallel
5667  } // team->t.t_level will be increased inside parallel
5668  }
5669 #if KMP_DEBUG
5670  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5671 #endif
5672  if (level < __kmp_hot_teams_max_level) {
5673  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5674  use_hot_team = 1;
5675  }
5676  }
5677 #endif // KMP_NESTED_HOT_TEAMS
5678 
5679  /* team is done working */
5680  TCW_SYNC_PTR(team->t.t_pkfn,
5681  NULL); // Important for Debugging Support Library.
5682 #if KMP_OS_WINDOWS
5683  team->t.t_copyin_counter = 0; // init counter for possible reuse
5684 #endif
5685  // Do not reset pointer to parent team to NULL for hot teams.
5686 
5687  /* if this is not a hot team, release our threads */
5688  if (!use_hot_team) {
5689  if (__kmp_tasking_mode != tskm_immediate_exec) {
5690  // Wait for threads to reach reapable state
5691  for (f = 1; f < team->t.t_nproc; ++f) {
5692  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5693  kmp_info_t *th = team->t.t_threads[f];
5694  volatile kmp_uint32 *state = &th->th.th_reap_state;
5695  while (*state != KMP_SAFE_TO_REAP) {
5696 #if KMP_OS_WINDOWS
5697  // On Windows a thread can be killed at any time, check this
5698  DWORD ecode;
5699  if (!__kmp_is_thread_alive(th, &ecode)) {
5700  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5701  break;
5702  }
5703 #endif
5704  // first check if thread is sleeping
5705  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5706  if (fl.is_sleeping())
5707  fl.resume(__kmp_gtid_from_thread(th));
5708  KMP_CPU_PAUSE();
5709  }
5710  }
5711 
5712  // Delete task teams
5713  int tt_idx;
5714  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5715  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5716  if (task_team != NULL) {
5717  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5718  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5719  team->t.t_threads[f]->th.th_task_team = NULL;
5720  }
5721  KA_TRACE(
5722  20,
5723  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5724  __kmp_get_gtid(), task_team, team->t.t_id));
5725 #if KMP_NESTED_HOT_TEAMS
5726  __kmp_free_task_team(master, task_team);
5727 #endif
5728  team->t.t_task_team[tt_idx] = NULL;
5729  }
5730  }
5731  }
5732 
5733  // Reset pointer to parent team only for non-hot teams.
5734  team->t.t_parent = NULL;
5735  team->t.t_level = 0;
5736  team->t.t_active_level = 0;
5737 
5738  /* free the worker threads */
5739  for (f = 1; f < team->t.t_nproc; ++f) {
5740  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
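      // For the distributed barrier, flip th_used_in_team from 1 (in team) to 2
      // (being removed); the loop further below waits for it to drop to 0.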
5741  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5742  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5743  1, 2);
5744  }
5745  __kmp_free_thread(team->t.t_threads[f]);
5746  }
5747 
5748  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5749  if (team->t.b) {
5750  // wake up thread at old location
5751  team->t.b->go_release();
5752  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5753  for (f = 1; f < team->t.t_nproc; ++f) {
5754  if (team->t.b->sleep[f].sleep) {
5755  __kmp_atomic_resume_64(
5756  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5757  (kmp_atomic_flag_64<> *)NULL);
5758  }
5759  }
5760  }
5761  // Wait for threads to be removed from team
5762  for (int f = 1; f < team->t.t_nproc; ++f) {
5763  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5764  KMP_CPU_PAUSE();
5765  }
5766  }
5767  }
5768 
5769  for (f = 1; f < team->t.t_nproc; ++f) {
5770  team->t.t_threads[f] = NULL;
5771  }
5772 
5773  if (team->t.t_max_nproc > 1 &&
5774  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5775  distributedBarrier::deallocate(team->t.b);
5776  team->t.b = NULL;
5777  }
5778  /* put the team back in the team pool */
5779  /* TODO limit size of team pool, call reap_team if pool too large */
5780  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5781  __kmp_team_pool = (volatile kmp_team_t *)team;
5782  } else { // Check if team was created for primary threads in teams construct
5783  // See if first worker is a CG root
5784  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5785  team->t.t_threads[1]->th.th_cg_roots);
5786  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5787  // Clean up the CG root nodes on workers so that this team can be re-used
5788  for (f = 1; f < team->t.t_nproc; ++f) {
5789  kmp_info_t *thr = team->t.t_threads[f];
5790  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5791  thr->th.th_cg_roots->cg_root == thr);
5792  // Pop current CG root off list
5793  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5794  thr->th.th_cg_roots = tmp->up;
5795  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5796  " up to node %p. cg_nthreads was %d\n",
5797  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5798  int i = tmp->cg_nthreads--;
5799  if (i == 1) {
5800  __kmp_free(tmp); // free CG if we are the last thread in it
5801  }
5802  // Restore current task's thread_limit from CG root
5803  if (thr->th.th_cg_roots)
5804  thr->th.th_current_task->td_icvs.thread_limit =
5805  thr->th.th_cg_roots->cg_thread_limit;
5806  }
5807  }
5808  }
5809 
5810  KMP_MB();
5811 }
5812 
5813 /* reap the team. destroy it, reclaim all its resources and free its memory */
5814 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5815  kmp_team_t *next_pool = team->t.t_next_pool;
5816 
5817  KMP_DEBUG_ASSERT(team);
5818  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5819  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5820  KMP_DEBUG_ASSERT(team->t.t_threads);
5821  KMP_DEBUG_ASSERT(team->t.t_argv);
5822 
5823  /* TODO clean the threads that are a part of this? */
5824 
5825  /* free stuff */
5826  __kmp_free_team_arrays(team);
5827  if (team->t.t_argv != &team->t.t_inline_argv[0])
5828  __kmp_free((void *)team->t.t_argv);
5829  __kmp_free(team);
5830 
5831  KMP_MB();
5832  return next_pool;
5833 }
5834 
5835 // Free the thread. Don't reap it, just place it on the pool of available
5836 // threads.
5837 //
5838 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5839 // binding for the affinity mechanism to be useful.
5840 //
5841 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5842 // However, we want to avoid the potential performance problem of always
5843 // scanning through the list to find the correct point at which to insert
5844 // the thread (potential N**2 behavior). To do this we keep track of the
5845 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5846 // With single-level parallelism, threads will always be added to the tail
5847 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5848 // parallelism, all bets are off and we may need to scan through the entire
5849 // free list.
5850 //
5851 // This change also has a potentially large performance benefit, for some
5852 // applications. Previously, as threads were freed from the hot team, they
5853 // would be placed back on the free list in inverse order. If the hot team
5854 // grew back to its original size, then the freed threads would be placed
5855 // back on the hot team in reverse order. This could cause bad cache
5856 // locality problems on programs where the size of the hot team regularly
5857 // grew and shrunk.
5858 //
5859 // Now, for single-level parallelism, the OMP tid is always == gtid.
5860 void __kmp_free_thread(kmp_info_t *this_th) {
5861  int gtid;
5862  kmp_info_t **scan;
5863 
5864  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5865  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5866 
5867  KMP_DEBUG_ASSERT(this_th);
5868 
5869  // When moving a thread to the pool, switch it to wait on its own b_go flag,
5870  // and set its team to uninitialized (NULL team).
5871  int b;
5872  kmp_balign_t *balign = this_th->th.th_bar;
5873  for (b = 0; b < bs_last_barrier; ++b) {
5874  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5875  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5876  balign[b].bb.team = NULL;
5877  balign[b].bb.leaf_kids = 0;
5878  }
5879  this_th->th.th_task_state = 0;
5880  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5881 
5882  /* put thread back on the free pool */
5883  TCW_PTR(this_th->th.th_team, NULL);
5884  TCW_PTR(this_th->th.th_root, NULL);
5885  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5886 
5887  while (this_th->th.th_cg_roots) {
5888  this_th->th.th_cg_roots->cg_nthreads--;
5889  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5890  " %p of thread %p to %d\n",
5891  this_th, this_th->th.th_cg_roots,
5892  this_th->th.th_cg_roots->cg_root,
5893  this_th->th.th_cg_roots->cg_nthreads));
5894  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5895  if (tmp->cg_root == this_th) { // Thread is a cg_root
5896  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5897  KA_TRACE(
5898  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5899  this_th->th.th_cg_roots = tmp->up;
5900  __kmp_free(tmp);
5901  } else { // Worker thread
5902  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5903  __kmp_free(tmp);
5904  }
5905  this_th->th.th_cg_roots = NULL;
5906  break;
5907  }
5908  }
5909 
5910  /* If the implicit task assigned to this thread can be used by other threads,
5911  * multiple threads can share the data and try to free the task in
5912  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5913  * with higher probability when the hot team is disabled, but it can occur even
5914  * when the hot team is enabled. */
5915  __kmp_free_implicit_task(this_th);
5916  this_th->th.th_current_task = NULL;
5917 
5918  // If the __kmp_thread_pool_insert_pt is already past the new insert
5919  // point, then we need to re-scan the entire list.
5920  gtid = this_th->th.th_info.ds.ds_gtid;
5921  if (__kmp_thread_pool_insert_pt != NULL) {
5922  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5923  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5924  __kmp_thread_pool_insert_pt = NULL;
5925  }
5926  }
5927 
5928  // Scan down the list to find the place to insert the thread.
5929  // scan is the address of a link in the list, possibly the address of
5930  // __kmp_thread_pool itself.
5931  //
5932  // In the absence of nested parallelism, the for loop will have 0 iterations.
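  // Example: with pool gtids {2, 3, 5} and the insert point at gtid 3, freeing
  // gtid 4 starts scanning at the link after 3 and inserts right before 5;
  // freeing gtid 1 resets __kmp_thread_pool_insert_pt (above) and rescans from
  // the head of the pool.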
5933  if (__kmp_thread_pool_insert_pt != NULL) {
5934  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5935  } else {
5936  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5937  }
5938  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5939  scan = &((*scan)->th.th_next_pool))
5940  ;
5941 
5942  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5943  // to its address.
5944  TCW_PTR(this_th->th.th_next_pool, *scan);
5945  __kmp_thread_pool_insert_pt = *scan = this_th;
5946  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5947  (this_th->th.th_info.ds.ds_gtid <
5948  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5949  TCW_4(this_th->th.th_in_pool, TRUE);
5950  __kmp_suspend_initialize_thread(this_th);
5951  __kmp_lock_suspend_mx(this_th);
5952  if (this_th->th.th_active == TRUE) {
5953  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5954  this_th->th.th_active_in_pool = TRUE;
5955  }
5956 #if KMP_DEBUG
5957  else {
5958  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5959  }
5960 #endif
5961  __kmp_unlock_suspend_mx(this_th);
5962 
5963  TCW_4(__kmp_nth, __kmp_nth - 1);
5964 
5965 #ifdef KMP_ADJUST_BLOCKTIME
5966  /* Adjust blocktime back to user setting or default if necessary */
5967  /* Middle initialization might never have occurred */
5968  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5969  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5970  if (__kmp_nth <= __kmp_avail_proc) {
5971  __kmp_zero_bt = FALSE;
5972  }
5973  }
5974 #endif /* KMP_ADJUST_BLOCKTIME */
5975 
5976  KMP_MB();
5977 }
5978 
5979 /* ------------------------------------------------------------------------ */
5980 
5981 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5982 #if OMP_PROFILING_SUPPORT
5983  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5984  // TODO: add a configuration option for time granularity
5985  if (ProfileTraceFile)
5986  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5987 #endif
5988 
5989  int gtid = this_thr->th.th_info.ds.ds_gtid;
5990  /* void *stack_data;*/
5991  kmp_team_t **volatile pteam;
5992 
5993  KMP_MB();
5994  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5995 
5996  if (__kmp_env_consistency_check) {
5997  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5998  }
5999 
6000 #if OMPD_SUPPORT
6001  if (ompd_state & OMPD_ENABLE_BP)
6002  ompd_bp_thread_begin();
6003 #endif
6004 
6005 #if OMPT_SUPPORT
6006  ompt_data_t *thread_data = nullptr;
6007  if (ompt_enabled.enabled) {
6008  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6009  *thread_data = ompt_data_none;
6010 
6011  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6012  this_thr->th.ompt_thread_info.wait_id = 0;
6013  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6014  this_thr->th.ompt_thread_info.parallel_flags = 0;
6015  if (ompt_enabled.ompt_callback_thread_begin) {
6016  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6017  ompt_thread_worker, thread_data);
6018  }
6019  this_thr->th.ompt_thread_info.state = ompt_state_idle;
6020  }
6021 #endif
6022 
6023  /* This is the place where threads wait for work */
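  // Worker loop: block in the fork barrier until the primary thread releases
  // the team, run the team's microtask via t_invoke, then meet the team at the
  // join barrier; repeat until library shutdown (g_done).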
6024  while (!TCR_4(__kmp_global.g.g_done)) {
6025  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6026  KMP_MB();
6027 
6028  /* wait for work to do */
6029  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6030 
6031  /* No tid yet since not part of a team */
6032  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6033 
6034 #if OMPT_SUPPORT
6035  if (ompt_enabled.enabled) {
6036  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6037  }
6038 #endif
6039 
6040  pteam = &this_thr->th.th_team;
6041 
6042  /* have we been allocated? */
6043  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6044  /* we were just woken up, so run our new task */
6045  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6046  int rc;
6047  KA_TRACE(20,
6048  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6049  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6050  (*pteam)->t.t_pkfn));
6051 
6052  updateHWFPControl(*pteam);
6053 
6054 #if OMPT_SUPPORT
6055  if (ompt_enabled.enabled) {
6056  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6057  }
6058 #endif
6059 
6060  rc = (*pteam)->t.t_invoke(gtid);
6061  KMP_ASSERT(rc);
6062 
6063  KMP_MB();
6064  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6065  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6066  (*pteam)->t.t_pkfn));
6067  }
6068 #if OMPT_SUPPORT
6069  if (ompt_enabled.enabled) {
6070  /* no frame set while outside task */
6071  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6072 
6073  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6074  }
6075 #endif
6076  /* join barrier after parallel region */
6077  __kmp_join_barrier(gtid);
6078  }
6079  }
6080 
6081 #if OMPD_SUPPORT
6082  if (ompd_state & OMPD_ENABLE_BP)
6083  ompd_bp_thread_end();
6084 #endif
6085 
6086 #if OMPT_SUPPORT
6087  if (ompt_enabled.ompt_callback_thread_end) {
6088  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6089  }
6090 #endif
6091 
6092  this_thr->th.th_task_team = NULL;
6093  /* run the destructors for the threadprivate data for this thread */
6094  __kmp_common_destroy_gtid(gtid);
6095 
6096  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6097  KMP_MB();
6098 
6099 #if OMP_PROFILING_SUPPORT
6100  llvm::timeTraceProfilerFinishThread();
6101 #endif
6102  return this_thr;
6103 }
6104 
6105 /* ------------------------------------------------------------------------ */
6106 
6107 void __kmp_internal_end_dest(void *specific_gtid) {
6108  // Make sure no significant bits are lost
6109  int gtid;
6110  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6111 
6112  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6113  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6114  * this is because 0 is reserved for the nothing-stored case */
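  /* For example, a stored TLS value of 1 corresponds to gtid 0, which is why 1
     is subtracted above. */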
6115 
6116  __kmp_internal_end_thread(gtid);
6117 }
6118 
6119 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6120 
6121 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6122  __kmp_internal_end_atexit();
6123 }
6124 
6125 #endif
6126 
6127 /* [Windows] josh: when the atexit handler is called, there may still be more
6128  than one thread alive */
6129 void __kmp_internal_end_atexit(void) {
6130  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6131  /* [Windows]
6132  josh: ideally, we want to completely shut down the library in this atexit
6133  handler, but stat code that depends on thread-specific data for gtid fails
6134  because that data becomes unavailable at some point during the shutdown, so
6135  we call __kmp_internal_end_thread instead. We should eventually remove the
6136  dependency on __kmp_get_specific_gtid in the stat code and use
6137  __kmp_internal_end_library to cleanly shut down the library.
6138 
6139  // TODO: Can some of this comment about GVS be removed?
6140  I suspect that the offending stat code is executed when the calling thread
6141  tries to clean up a dead root thread's data structures, resulting in GVS
6142  code trying to close the GVS structures for that thread, but since the stat
6143  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6144  the calling thread is cleaning up itself instead of another thread, it gets
6145  confused. This happens because allowing a thread to unregister and clean up
6146  another thread is a recent modification for addressing an issue.
6147  Based on the current design (20050722), a thread may end up
6148  trying to unregister another thread only if thread death does not trigger
6149  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6150  thread specific data destructor function to detect thread death. For
6151  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6152  is nothing. Thus, the workaround is applicable only for Windows static
6153  stat library. */
6154  __kmp_internal_end_library(-1);
6155 #if KMP_OS_WINDOWS
6156  __kmp_close_console();
6157 #endif
6158 }
6159 
6160 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6161  // It is assumed __kmp_forkjoin_lock is acquired.
6162 
6163  int gtid;
6164 
6165  KMP_DEBUG_ASSERT(thread != NULL);
6166 
6167  gtid = thread->th.th_info.ds.ds_gtid;
6168 
6169  if (!is_root) {
6170  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6171  /* Assume the threads are at the fork barrier here */
6172  KA_TRACE(
6173  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6174  gtid));
6175  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6176  while (
6177  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6178  KMP_CPU_PAUSE();
6179  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6180  } else {
6181  /* Need release fence here to prevent seg faults for tree forkjoin
6182  barrier (GEH) */
6183  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6184  thread);
6185  __kmp_release_64(&flag);
6186  }
6187  }
6188 
6189  // Terminate OS thread.
6190  __kmp_reap_worker(thread);
6191 
6192  // The thread was killed asynchronously. If it was actively
6193  // spinning in the thread pool, decrement the global count.
6194  //
6195  // There is a small timing hole here - if the worker thread was just waking
6196  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6197  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6198  // the global counter might not get updated.
6199  //
6200  // Currently, this can only happen as the library is unloaded,
6201  // so there are no harmful side effects.
6202  if (thread->th.th_active_in_pool) {
6203  thread->th.th_active_in_pool = FALSE;
6204  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6205  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6206  }
6207  }
6208 
6209  __kmp_free_implicit_task(thread);
6210 
6211 // Free the fast memory for tasking
6212 #if USE_FAST_MEMORY
6213  __kmp_free_fast_memory(thread);
6214 #endif /* USE_FAST_MEMORY */
6215 
6216  __kmp_suspend_uninitialize_thread(thread);
6217 
6218  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6219  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6220 
6221  --__kmp_all_nth;
6222  // __kmp_nth was decremented when the thread was added to the pool.
6223 
6224 #ifdef KMP_ADJUST_BLOCKTIME
6225  /* Adjust blocktime back to user setting or default if necessary */
6226  /* Middle initialization might never have occurred */
6227  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6228  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6229  if (__kmp_nth <= __kmp_avail_proc) {
6230  __kmp_zero_bt = FALSE;
6231  }
6232  }
6233 #endif /* KMP_ADJUST_BLOCKTIME */
6234 
6235  /* free the memory being used */
6236  if (__kmp_env_consistency_check) {
6237  if (thread->th.th_cons) {
6238  __kmp_free_cons_stack(thread->th.th_cons);
6239  thread->th.th_cons = NULL;
6240  }
6241  }
6242 
6243  if (thread->th.th_pri_common != NULL) {
6244  __kmp_free(thread->th.th_pri_common);
6245  thread->th.th_pri_common = NULL;
6246  }
6247 
6248  if (thread->th.th_task_state_memo_stack != NULL) {
6249  __kmp_free(thread->th.th_task_state_memo_stack);
6250  thread->th.th_task_state_memo_stack = NULL;
6251  }
6252 
6253 #if KMP_USE_BGET
6254  if (thread->th.th_local.bget_data != NULL) {
6255  __kmp_finalize_bget(thread);
6256  }
6257 #endif
6258 
6259 #if KMP_AFFINITY_SUPPORTED
6260  if (thread->th.th_affin_mask != NULL) {
6261  KMP_CPU_FREE(thread->th.th_affin_mask);
6262  thread->th.th_affin_mask = NULL;
6263  }
6264 #endif /* KMP_AFFINITY_SUPPORTED */
6265 
6266 #if KMP_USE_HIER_SCHED
6267  if (thread->th.th_hier_bar_data != NULL) {
6268  __kmp_free(thread->th.th_hier_bar_data);
6269  thread->th.th_hier_bar_data = NULL;
6270  }
6271 #endif
6272 
6273  __kmp_reap_team(thread->th.th_serial_team);
6274  thread->th.th_serial_team = NULL;
6275  __kmp_free(thread);
6276 
6277  KMP_MB();
6278 
6279 } // __kmp_reap_thread
6280 
6281 static void __kmp_itthash_clean(kmp_info_t *th) {
6282 #if USE_ITT_NOTIFY
6283  if (__kmp_itt_region_domains.count > 0) {
6284  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6285  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6286  while (bucket) {
6287  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6288  __kmp_thread_free(th, bucket);
6289  bucket = next;
6290  }
6291  }
6292  }
6293  if (__kmp_itt_barrier_domains.count > 0) {
6294  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6295  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6296  while (bucket) {
6297  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6298  __kmp_thread_free(th, bucket);
6299  bucket = next;
6300  }
6301  }
6302  }
6303 #endif
6304 }
6305 
6306 static void __kmp_internal_end(void) {
6307  int i;
6308 
6309  /* First, unregister the library */
6310  __kmp_unregister_library();
6311 
6312 #if KMP_OS_WINDOWS
6313  /* In Win static library, we can't tell when a root actually dies, so we
6314  reclaim the data structures for any root threads that have died but not
6315  unregistered themselves, in order to shut down cleanly.
6316  In Win dynamic library we also can't tell when a thread dies. */
6317  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6318 // dead roots
6319 #endif
6320 
6321  for (i = 0; i < __kmp_threads_capacity; i++)
6322  if (__kmp_root[i])
6323  if (__kmp_root[i]->r.r_active)
6324  break;
6325  KMP_MB(); /* Flush all pending memory write invalidates. */
6326  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6327 
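  // If the loop above exited early (i < __kmp_threads_capacity), at least one
  // root is still active, so only the monitor thread is reaped; otherwise all
  // pooled worker threads, teams, and task teams are reaped below.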
6328  if (i < __kmp_threads_capacity) {
6329 #if KMP_USE_MONITOR
6330  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6331  KMP_MB(); /* Flush all pending memory write invalidates. */
6332 
6333  // Need to check that monitor was initialized before reaping it. If we are
6334  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6335  // __kmp_monitor will appear to contain valid data, but it is only valid in
6336  // the parent process, not the child.
6337  // New behavior (201008): instead of keying off of the flag
6338  // __kmp_init_parallel, the monitor thread creation is keyed off
6339  // of the new flag __kmp_init_monitor.
6340  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6341  if (TCR_4(__kmp_init_monitor)) {
6342  __kmp_reap_monitor(&__kmp_monitor);
6343  TCW_4(__kmp_init_monitor, 0);
6344  }
6345  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6346  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6347 #endif // KMP_USE_MONITOR
6348  } else {
6349 /* TODO move this to cleanup code */
6350 #ifdef KMP_DEBUG
6351  /* make sure that everything has properly ended */
6352  for (i = 0; i < __kmp_threads_capacity; i++) {
6353  if (__kmp_root[i]) {
6354  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6355  // there can be uber threads alive here
6356  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6357  }
6358  }
6359 #endif
6360 
6361  KMP_MB();
6362 
6363  // Reap the worker threads.
6364  // This is valid for now, but be careful if threads are reaped sooner.
6365  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6366  // Get the next thread from the pool.
6367  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6368  __kmp_thread_pool = thread->th.th_next_pool;
6369  // Reap it.
6370  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6371  thread->th.th_next_pool = NULL;
6372  thread->th.th_in_pool = FALSE;
6373  __kmp_reap_thread(thread, 0);
6374  }
6375  __kmp_thread_pool_insert_pt = NULL;
6376 
6377  // Reap teams.
6378  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6379  // Get the next team from the pool.
6380  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6381  __kmp_team_pool = team->t.t_next_pool;
6382  // Reap it.
6383  team->t.t_next_pool = NULL;
6384  __kmp_reap_team(team);
6385  }
6386 
6387  __kmp_reap_task_teams();
6388 
6389 #if KMP_OS_UNIX
6390  // Threads that are not reaped should not access any resources since they
6391  // are going to be deallocated soon, so the shutdown sequence should wait
6392  // until all threads either exit the final spin-waiting loop or begin
6393  // sleeping after the given blocktime.
6394  for (i = 0; i < __kmp_threads_capacity; i++) {
6395  kmp_info_t *thr = __kmp_threads[i];
6396  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6397  KMP_CPU_PAUSE();
6398  }
6399 #endif
6400 
6401  for (i = 0; i < __kmp_threads_capacity; ++i) {
6402  // TBD: Add some checking...
6403  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6404  }
6405 
6406  /* Make sure all threadprivate destructors get run by joining with all
6407  worker threads before resetting this flag */
6408  TCW_SYNC_4(__kmp_init_common, FALSE);
6409 
6410  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6411  KMP_MB();
6412 
6413 #if KMP_USE_MONITOR
6414  // See note above: One of the possible fixes for CQ138434 / CQ140126
6415  //
6416  // FIXME: push both code fragments down and CSE them?
6417  // push them into __kmp_cleanup() ?
6418  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6419  if (TCR_4(__kmp_init_monitor)) {
6420  __kmp_reap_monitor(&__kmp_monitor);
6421  TCW_4(__kmp_init_monitor, 0);
6422  }
6423  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6424  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6425 #endif
6426  } /* else !__kmp_global.t_active */
6427  TCW_4(__kmp_init_gtid, FALSE);
6428  KMP_MB(); /* Flush all pending memory write invalidates. */
6429 
6430  __kmp_cleanup();
6431 #if OMPT_SUPPORT
6432  ompt_fini();
6433 #endif
6434 }
6435 
6436 void __kmp_internal_end_library(int gtid_req) {
6437  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6438  /* this shouldn't be a race condition because __kmp_internal_end() is the
6439  only place to clear __kmp_serial_init */
6440  /* we'll check this later too, after we get the lock */
6441  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6442  // redundant, because the next check will work in any case.
6443  if (__kmp_global.g.g_abort) {
6444  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6445  /* TODO abort? */
6446  return;
6447  }
6448  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6449  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6450  return;
6451  }
6452 
6453  // If hidden helper team has been initialized, we need to deinit it
6454  if (TCR_4(__kmp_init_hidden_helper) &&
6455  !TCR_4(__kmp_hidden_helper_team_done)) {
6456  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6457  // First release the main thread to let it continue its work
6458  __kmp_hidden_helper_main_thread_release();
6459  // Wait until the hidden helper team has been destroyed
6460  __kmp_hidden_helper_threads_deinitz_wait();
6461  }
6462 
6463  KMP_MB(); /* Flush all pending memory write invalidates. */
6464  /* find out who we are and what we should do */
6465  {
6466  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6467  KA_TRACE(
6468  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6469  if (gtid == KMP_GTID_SHUTDOWN) {
6470  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6471  "already shutdown\n"));
6472  return;
6473  } else if (gtid == KMP_GTID_MONITOR) {
6474  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6475  "registered, or system shutdown\n"));
6476  return;
6477  } else if (gtid == KMP_GTID_DNE) {
6478  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6479  "shutdown\n"));
6480  /* we don't know who we are, but we may still shutdown the library */
6481  } else if (KMP_UBER_GTID(gtid)) {
6482  /* unregister ourselves as an uber thread. gtid is no longer valid */
6483  if (__kmp_root[gtid]->r.r_active) {
6484  __kmp_global.g.g_abort = -1;
6485  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6486  __kmp_unregister_library();
6487  KA_TRACE(10,
6488  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6489  gtid));
6490  return;
6491  } else {
6492  __kmp_itthash_clean(__kmp_threads[gtid]);
6493  KA_TRACE(
6494  10,
6495  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6496  __kmp_unregister_root_current_thread(gtid);
6497  }
6498  } else {
6499 /* worker threads may call this function through the atexit handler, if they
6500  * call exit() */
6501 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6502  TODO: do a thorough shutdown instead */
6503 #ifdef DUMP_DEBUG_ON_EXIT
6504  if (__kmp_debug_buf)
6505  __kmp_dump_debug_buffer();
6506 #endif
6507  // Unregister the library here as well: when shared memory is used on Linux,
6508  // skipping this would leave stale files in /dev/shm.
6509  // Clean up the shared memory file before exiting.
6510  __kmp_unregister_library();
6511  return;
6512  }
6513  }
6514  /* synchronize the termination process */
6515  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6516 
6517  /* have we already finished */
6518  if (__kmp_global.g.g_abort) {
6519  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6520  /* TODO abort? */
6521  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6522  return;
6523  }
6524  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6525  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6526  return;
6527  }
6528 
6529  /* We need this lock to enforce mutual exclusion between this reading of
6530  __kmp_threads_capacity and the writing by __kmp_register_root.
6531  Alternatively, we can use a counter of roots that is atomically updated by
6532  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6533  __kmp_internal_end_*. */
6534  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6535 
6536  /* now we can safely conduct the actual termination */
6537  __kmp_internal_end();
6538 
6539  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6540  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6541 
6542  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6543 
6544 #ifdef DUMP_DEBUG_ON_EXIT
6545  if (__kmp_debug_buf)
6546  __kmp_dump_debug_buffer();
6547 #endif
6548 
6549 #if KMP_OS_WINDOWS
6550  __kmp_close_console();
6551 #endif
6552 
6553  __kmp_fini_allocator();
6554 
6555 } // __kmp_internal_end_library
6556 
6557 void __kmp_internal_end_thread(int gtid_req) {
6558  int i;
6559 
6560  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6561  /* this shouldn't be a race condition because __kmp_internal_end() is the
6562  * only place to clear __kmp_serial_init */
6563  /* we'll check this later too, after we get the lock */
6564  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6565  // redundant, because the next check will work in any case.
6566  if (__kmp_global.g.g_abort) {
6567  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6568  /* TODO abort? */
6569  return;
6570  }
6571  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6572  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6573  return;
6574  }
6575 
6576  // If hidden helper team has been initialized, we need to deinit it
6577  if (TCR_4(__kmp_init_hidden_helper) &&
6578  !TCR_4(__kmp_hidden_helper_team_done)) {
6579  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6580  // First release the main thread to let it continue its work
6581  __kmp_hidden_helper_main_thread_release();
6582  // Wait until the hidden helper team has been destroyed
6583  __kmp_hidden_helper_threads_deinitz_wait();
6584  }
6585 
6586  KMP_MB(); /* Flush all pending memory write invalidates. */
6587 
6588  /* find out who we are and what we should do */
6589  {
6590  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6591  KA_TRACE(10,
6592  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6593  if (gtid == KMP_GTID_SHUTDOWN) {
6594  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6595  "already shutdown\n"));
6596  return;
6597  } else if (gtid == KMP_GTID_MONITOR) {
6598  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6599  "registered, or system shutdown\n"));
6600  return;
6601  } else if (gtid == KMP_GTID_DNE) {
6602  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6603  "shutdown\n"));
6604  return;
6605  /* we don't know who we are */
6606  } else if (KMP_UBER_GTID(gtid)) {
6607  /* unregister ourselves as an uber thread. gtid is no longer valid */
6608  if (__kmp_root[gtid]->r.r_active) {
6609  __kmp_global.g.g_abort = -1;
6610  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6611  KA_TRACE(10,
6612  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6613  gtid));
6614  return;
6615  } else {
6616  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6617  gtid));
6618  __kmp_unregister_root_current_thread(gtid);
6619  }
6620  } else {
6621  /* just a worker thread, let's leave */
6622  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6623 
6624  if (gtid >= 0) {
6625  __kmp_threads[gtid]->th.th_task_team = NULL;
6626  }
6627 
6628  KA_TRACE(10,
6629  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6630  gtid));
6631  return;
6632  }
6633  }
6634 #if KMP_DYNAMIC_LIB
6635  if (__kmp_pause_status != kmp_hard_paused)
6636  // AC: let's not shut down the dynamic library at the exit of an uber thread,
6637  // because it is better to shut down later in the library destructor.
6638  {
6639  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6640  return;
6641  }
6642 #endif
6643  /* synchronize the termination process */
6644  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6645 
6646  /* have we already finished */
6647  if (__kmp_global.g.g_abort) {
6648  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6649  /* TODO abort? */
6650  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6651  return;
6652  }
6653  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6654  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6655  return;
6656  }
6657 
6658  /* We need this lock to enforce mutual exclusion between this reading of
6659  __kmp_threads_capacity and the writing by __kmp_register_root.
6660  Alternatively, we can use a counter of roots that is atomically updated by
6661  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6662  __kmp_internal_end_*. */
6663 
6664  /* should we finish the run-time? are all siblings done? */
6665  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6666 
6667  for (i = 0; i < __kmp_threads_capacity; ++i) {
6668  if (KMP_UBER_GTID(i)) {
6669  KA_TRACE(
6670  10,
6671  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6672  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6673  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6674  return;
6675  }
6676  }
6677 
6678  /* now we can safely conduct the actual termination */
6679 
6680  __kmp_internal_end();
6681 
6682  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6683  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6684 
6685  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6686 
6687 #ifdef DUMP_DEBUG_ON_EXIT
6688  if (__kmp_debug_buf)
6689  __kmp_dump_debug_buffer();
6690 #endif
6691 } // __kmp_internal_end_thread
6692 
6693 // -----------------------------------------------------------------------------
6694 // Library registration stuff.
6695 
6696 static long __kmp_registration_flag = 0;
6697 // Random value used to indicate library initialization.
6698 static char *__kmp_registration_str = NULL;
6699 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6700 
6701 static inline char *__kmp_reg_status_name() {
6702 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6703  each thread. If registration and unregistration go in different threads
6704  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6705  env var cannot be found, because the name will contain a different pid. */
6706 // macOS* complains about name being too long with additional getuid()
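// The resulting name looks like (illustrative) __KMP_REGISTERED_LIB_12345_1000
// when the uid is appended, or __KMP_REGISTERED_LIB_12345 otherwise.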
6707 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6708  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6709  (int)getuid());
6710 #else
6711  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6712 #endif
6713 } // __kmp_reg_status_name
6714 
6715 #if defined(KMP_USE_SHM)
6716 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6717 char *temp_reg_status_file_name = nullptr;
6718 #endif
6719 
6720 void __kmp_register_library_startup(void) {
6721 
6722  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6723  int done = 0;
6724  union {
6725  double dtime;
6726  long ltime;
6727  } time;
6728 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6729  __kmp_initialize_system_tick();
6730 #endif
6731  __kmp_read_system_time(&time.dtime);
6732  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6733  __kmp_registration_str =
6734  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6735  __kmp_registration_flag, KMP_LIBRARY_FILE);
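  // The string takes the form "<flag address>-<flag value>-<library file>",
  // e.g. (illustrative) "0x7f...-cafe1234-libomp.so". A later process can parse
  // it back and, if the address is still mapped and holds the same value,
  // conclude that this copy of the runtime is still alive.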
6736 
6737  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6738  __kmp_registration_str));
6739 
6740  while (!done) {
6741 
6742  char *value = NULL; // Actual value of the environment variable.
6743 
6744 #if defined(KMP_USE_SHM)
6745  char *shm_name = __kmp_str_format("/%s", name);
6746  int shm_preexist = 0;
6747  char *data1;
6748  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6749  if ((fd1 == -1) && (errno == EEXIST)) {
6750  // The file didn't open because it already exists;
6751  // try opening the existing file instead.
6752  fd1 = shm_open(shm_name, O_RDWR, 0666);
6753  if (fd1 == -1) { // file didn't open
6754  // error out here
6755  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6756  __kmp_msg_null);
6757  } else {
6758  // able to open existing file
6759  shm_preexist = 1;
6760  }
6761  } else if (fd1 == -1) {
6762  // SHM didn't open due to an error other than "already exists". Try to
6763  // create a temp file under /tmp.
6764  // TODO: /tmp might not always be the temporary directory. For now we will
6765  // not consider TMPDIR. If /tmp is not accessible, we simply error out.
6766  char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
6767  fd1 = mkstemp(temp_file_name);
6768  if (fd1 == -1) {
6769  // error out here.
6770  __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
6771  __kmp_msg_null);
6772  }
6773  temp_reg_status_file_name = temp_file_name;
6774  }
6775  if (shm_preexist == 0) {
6776  // we created SHM now set size
6777  if (ftruncate(fd1, SHM_SIZE) == -1) {
6778  // error occurred while setting the size
6779  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6780  KMP_ERR(errno), __kmp_msg_null);
6781  }
6782  }
6783  data1 =
6784  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6785  if (data1 == MAP_FAILED) {
6786  // failed to map shared memory
6787  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6788  __kmp_msg_null);
6789  }
6790  if (shm_preexist == 0) { // set data to SHM, set value
6791  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6792  }
6793  // Read value from either what we just wrote or existing file.
6794  value = __kmp_str_format("%s", data1); // read value from SHM
6795  munmap(data1, SHM_SIZE);
6796  close(fd1);
6797 #else // Windows and unix with static library
6798  // Set the environment variable, but do not overwrite it if it already exists.
6799  __kmp_env_set(name, __kmp_registration_str, 0);
6800  // read value to see if it got set
6801  value = __kmp_env_get(name);
6802 #endif
6803 
6804  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6805  done = 1; // Ok, environment variable set successfully, exit the loop.
6806  } else {
6807  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6808  // Check whether it is alive or dead.
6809  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6810  char *tail = value;
6811  char *flag_addr_str = NULL;
6812  char *flag_val_str = NULL;
6813  char const *file_name = NULL;
6814  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6815  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6816  file_name = tail;
6817  if (tail != NULL) {
6818  unsigned long *flag_addr = 0;
6819  unsigned long flag_val = 0;
6820  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6821  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6822  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6823  // First, check whether environment-encoded address is mapped into
6824  // addr space.
6825  // If so, dereference it to see if it still has the right value.
6826  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6827  neighbor = 1;
6828  } else {
6829  // If not, then we know the other copy of the library is no longer
6830  // running.
6831  neighbor = 2;
6832  }
6833  }
6834  }
6835  switch (neighbor) {
6836  case 0: // Cannot parse environment variable -- neighbor status unknown.
6837  // Assume it is the incompatible format of a future version of the
6838  // library, and assume the other library is alive.
6839  // WARN( ... ); // TODO: Issue a warning.
6840  file_name = "unknown library";
6841  KMP_FALLTHROUGH();
6842  // Attention! Falling through to the next case. That's intentional.
6843  case 1: { // Neighbor is alive.
6844  // Check it is allowed.
6845  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6846  if (!__kmp_str_match_true(duplicate_ok)) {
6847  // That's not allowed. Issue fatal error.
6848  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6849  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6850  }
6851  KMP_INTERNAL_FREE(duplicate_ok);
6852  __kmp_duplicate_library_ok = 1;
6853  done = 1; // Exit the loop.
6854  } break;
6855  case 2: { // Neighbor is dead.
6856 
6857 #if defined(KMP_USE_SHM)
6858  // close shared memory.
6859  shm_unlink(shm_name); // this removes file in /dev/shm
6860 #else
6861  // Clear the variable and try to register library again.
6862  __kmp_env_unset(name);
6863 #endif
6864  } break;
6865  default: {
6866  KMP_DEBUG_ASSERT(0);
6867  } break;
6868  }
6869  }
6870  KMP_INTERNAL_FREE((void *)value);
6871 #if defined(KMP_USE_SHM)
6872  KMP_INTERNAL_FREE((void *)shm_name);
6873 #endif
6874  } // while
6875  KMP_INTERNAL_FREE((void *)name);
6876 
6877 } // func __kmp_register_library_startup
6878 
6879 void __kmp_unregister_library(void) {
6880 
6881  char *name = __kmp_reg_status_name();
6882  char *value = NULL;
6883 
6884 #if defined(KMP_USE_SHM)
6885  bool use_shm = true;
6886  char *shm_name = __kmp_str_format("/%s", name);
6887  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6888  if (fd1 == -1) {
6889  // File did not open. Try the temporary file.
6890  use_shm = false;
6891  KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6892  fd1 = open(temp_reg_status_file_name, O_RDONLY);
6893  if (fd1 == -1) {
6894  // give up now.
6895  return;
6896  }
6897  }
6898  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6899  if (data1 != MAP_FAILED) {
6900  value = __kmp_str_format("%s", data1); // read value from SHM
6901  munmap(data1, SHM_SIZE);
6902  }
6903  close(fd1);
6904 #else
6905  value = __kmp_env_get(name);
6906 #endif
6907 
6908  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6909  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6910  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6911 // Ok, this is our variable. Delete it.
6912 #if defined(KMP_USE_SHM)
6913  if (use_shm) {
6914  shm_unlink(shm_name); // this removes file in /dev/shm
6915  } else {
6916  KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6917  unlink(temp_reg_status_file_name); // this removes the temp file
6918  }
6919 #else
6920  __kmp_env_unset(name);
6921 #endif
6922  }
6923 
6924 #if defined(KMP_USE_SHM)
6925  KMP_INTERNAL_FREE(shm_name);
6926  if (!use_shm) {
6927  KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6928  KMP_INTERNAL_FREE(temp_reg_status_file_name);
6929  }
6930 #endif
6931 
6932  KMP_INTERNAL_FREE(__kmp_registration_str);
6933  KMP_INTERNAL_FREE(value);
6934  KMP_INTERNAL_FREE(name);
6935 
6936  __kmp_registration_flag = 0;
6937  __kmp_registration_str = NULL;
6938 
6939 } // __kmp_unregister_library
6940 
6941 // End of Library registration stuff.
6942 // -----------------------------------------------------------------------------
6943 
6944 #if KMP_MIC_SUPPORTED
6945 
6946 static void __kmp_check_mic_type() {
6947  kmp_cpuid_t cpuid_state = {0};
6948  kmp_cpuid_t *cs_p = &cpuid_state;
6949  __kmp_x86_cpuid(1, 0, cs_p);
6950  // We don't support mic1 at the moment
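  // Note: the EAX decode below is assumed to match KNC (family 0x0B) for mic2
  // and KNL (family 0x06, model 0x57) for mic3.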
6951  if ((cs_p->eax & 0xff0) == 0xB10) {
6952  __kmp_mic_type = mic2;
6953  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6954  __kmp_mic_type = mic3;
6955  } else {
6956  __kmp_mic_type = non_mic;
6957  }
6958 }
6959 
6960 #endif /* KMP_MIC_SUPPORTED */
6961 
6962 #if KMP_HAVE_UMWAIT
6963 static void __kmp_user_level_mwait_init() {
6964  struct kmp_cpuid buf;
6965  __kmp_x86_cpuid(7, 0, &buf);
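  // CPUID leaf 7, sub-leaf 0: ECX bit 5 reports WAITPKG support
  // (umwait/umonitor/tpause).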
6966  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6967  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6968  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6969  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6970  __kmp_umwait_enabled));
6971 }
6972 #elif KMP_HAVE_MWAIT
6973 #ifndef AT_INTELPHIUSERMWAIT
6974 // Spurious, non-existent value that should always fail to return anything.
6975 // Will be replaced with the correct value when we know that.
6976 #define AT_INTELPHIUSERMWAIT 10000
6977 #endif
6978 // getauxval() function is available in RHEL7 and SLES12. If a system with an
6979 // earlier OS is used to build the RTL, we'll use the following internal
6980 // function when the entry is not found.
6981 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6982 unsigned long getauxval(unsigned long) { return 0; }
6983 
6984 static void __kmp_user_level_mwait_init() {
6985  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
6986  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
6987  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6988  // KMP_USER_LEVEL_MWAIT was set to TRUE.
6989  if (__kmp_mic_type == mic3) {
6990  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6991  if ((res & 0x1) || __kmp_user_level_mwait) {
6992  __kmp_mwait_enabled = TRUE;
6993  if (__kmp_user_level_mwait) {
6994  KMP_INFORM(EnvMwaitWarn);
6995  }
6996  } else {
6997  __kmp_mwait_enabled = FALSE;
6998  }
6999  }
7000  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7001  "__kmp_mwait_enabled = %d\n",
7002  __kmp_mic_type, __kmp_mwait_enabled));
7003 }
7004 #endif /* KMP_HAVE_UMWAIT */
7005 
7006 static void __kmp_do_serial_initialize(void) {
7007  int i, gtid;
7008  size_t size;
7009 
7010  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7011 
7012  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7013  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7014  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7015  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7016  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7017 
7018 #if OMPT_SUPPORT
7019  ompt_pre_init();
7020 #endif
7021 #if OMPD_SUPPORT
7022  __kmp_env_dump();
7023  ompd_init();
7024 #endif
7025 
7026  __kmp_validate_locks();
7027 
7028 #if ENABLE_LIBOMPTARGET
7029  /* Initialize functions from libomptarget */
7030  __kmp_init_omptarget();
7031 #endif
7032 
7033  /* Initialize internal memory allocator */
7034  __kmp_init_allocator();
7035 
7036  /* Register the library startup via an environment variable or via mapped
7037  shared memory file and check to see whether another copy of the library is
7038  already registered. Since a forked child process is often terminated, we
7039  postpone the registration until middle initialization in the child. */
7040  if (__kmp_need_register_serial)
7041  __kmp_register_library_startup();
7042 
7043  /* TODO reinitialization of library */
7044  if (TCR_4(__kmp_global.g.g_done)) {
7045  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7046  }
7047 
7048  __kmp_global.g.g_abort = 0;
7049  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7050 
7051 /* initialize the locks */
7052 #if KMP_USE_ADAPTIVE_LOCKS
7053 #if KMP_DEBUG_ADAPTIVE_LOCKS
7054  __kmp_init_speculative_stats();
7055 #endif
7056 #endif
7057 #if KMP_STATS_ENABLED
7058  __kmp_stats_init();
7059 #endif
7060  __kmp_init_lock(&__kmp_global_lock);
7061  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7062  __kmp_init_lock(&__kmp_debug_lock);
7063  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7064  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7065  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7066  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7067  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7068  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7069  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7070  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7071  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7072  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7073  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7074  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7075  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7076  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7077  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7078 #if KMP_USE_MONITOR
7079  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7080 #endif
7081  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7082 
7083  /* conduct initialization and initial setup of configuration */
7084 
7085  __kmp_runtime_initialize();
7086 
7087 #if KMP_MIC_SUPPORTED
7088  __kmp_check_mic_type();
7089 #endif
7090 
7091 // Some global variable initialization moved here from kmp_env_initialize()
7092 #ifdef KMP_DEBUG
7093  kmp_diag = 0;
7094 #endif
7095  __kmp_abort_delay = 0;
7096 
7097  // From __kmp_init_dflt_team_nth()
7098  /* assume the entire machine will be used */
7099  __kmp_dflt_team_nth_ub = __kmp_xproc;
7100  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7101  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7102  }
7103  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7104  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7105  }
7106  __kmp_max_nth = __kmp_sys_max_nth;
7107  __kmp_cg_max_nth = __kmp_sys_max_nth;
7108  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7109  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7110  __kmp_teams_max_nth = __kmp_sys_max_nth;
7111  }
7112 
7113  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7114  // part
7115  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7116 #if KMP_USE_MONITOR
7117  __kmp_monitor_wakeups =
7118  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7119  __kmp_bt_intervals =
7120  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7121 #endif
7122  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7123  __kmp_library = library_throughput;
7124  // From KMP_SCHEDULE initialization
7125  __kmp_static = kmp_sch_static_balanced;
7126 // AC: do not use analytical here, because it is non-monotonic
7127 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7128 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7129 // need to repeat assignment
7130 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7131 // bit control and barrier method control parts
7132 #if KMP_FAST_REDUCTION_BARRIER
7133 #define kmp_reduction_barrier_gather_bb ((int)1)
7134 #define kmp_reduction_barrier_release_bb ((int)1)
7135 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7136 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7137 #endif // KMP_FAST_REDUCTION_BARRIER
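  // The branch bits are the log2 of the branching factor used by the tree/hyper
  // barrier algorithms (e.g. a gather value of 2 lets each parent gather from up
  // to 4 children); the patterns select the barrier algorithm itself.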
7138  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7139  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7140  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7141  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7142  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7143 #if KMP_FAST_REDUCTION_BARRIER
7144  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7145  // lin_64 ): hyper,1
7146  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7147  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7148  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7149  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7150  }
7151 #endif // KMP_FAST_REDUCTION_BARRIER
7152  }
7153 #if KMP_FAST_REDUCTION_BARRIER
7154 #undef kmp_reduction_barrier_release_pat
7155 #undef kmp_reduction_barrier_gather_pat
7156 #undef kmp_reduction_barrier_release_bb
7157 #undef kmp_reduction_barrier_gather_bb
7158 #endif // KMP_FAST_REDUCTION_BARRIER
7159 #if KMP_MIC_SUPPORTED
7160  if (__kmp_mic_type == mic2) { // KNC
7161  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7162  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7163  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7164  1; // forkjoin release
7165  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7166  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7167  }
7168 #if KMP_FAST_REDUCTION_BARRIER
7169  if (__kmp_mic_type == mic2) { // KNC
7170  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7171  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7172  }
7173 #endif // KMP_FAST_REDUCTION_BARRIER
7174 #endif // KMP_MIC_SUPPORTED
7175 
7176 // From KMP_CHECKS initialization
7177 #ifdef KMP_DEBUG
7178  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7179 #else
7180  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7181 #endif
7182 
7183  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7184  __kmp_foreign_tp = TRUE;
7185 
7186  __kmp_global.g.g_dynamic = FALSE;
7187  __kmp_global.g.g_dynamic_mode = dynamic_default;
7188 
7189  __kmp_init_nesting_mode();
7190 
7191  __kmp_env_initialize(NULL);
7192 
7193 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7194  __kmp_user_level_mwait_init();
7195 #endif
7196 // Print all messages in message catalog for testing purposes.
7197 #ifdef KMP_DEBUG
7198  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7199  if (__kmp_str_match_true(val)) {
7200  kmp_str_buf_t buffer;
7201  __kmp_str_buf_init(&buffer);
7202  __kmp_i18n_dump_catalog(&buffer);
7203  __kmp_printf("%s", buffer.str);
7204  __kmp_str_buf_free(&buffer);
7205  }
7206  __kmp_env_free(&val);
7207 #endif
7208 
7209  __kmp_threads_capacity =
7210  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7211  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7212  __kmp_tp_capacity = __kmp_default_tp_capacity(
7213  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7214 
7215  // If the library is shut down properly, all of these pools must be NULL. Just in
7216  // case, set them to NULL -- some memory may leak, but subsequent code will
7217  // work even if pools are not freed.
7218  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7219  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7220  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7221  __kmp_thread_pool = NULL;
7222  __kmp_thread_pool_insert_pt = NULL;
7223  __kmp_team_pool = NULL;
7224 
7225  /* Allocate all of the variable sized records */
7226  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7227  * expandable */
7228  /* Since allocation is cache-aligned, just add extra padding at the end */
7229  size =
7230  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7231  CACHE_LINE;
7232  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7233  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7234  sizeof(kmp_info_t *) * __kmp_threads_capacity);
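  // __kmp_threads and __kmp_root share a single cache-aligned allocation: the
  // root pointers begin immediately after the __kmp_threads_capacity thread
  // pointers.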
7235 
7236  /* init thread counts */
7237  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7238  0); // Asserts fail if the library is reinitializing and
7239  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7240  __kmp_all_nth = 0;
7241  __kmp_nth = 0;
7242 
7243  /* setup the uber master thread and hierarchy */
7244  gtid = __kmp_register_root(TRUE);
7245  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7246  KMP_ASSERT(KMP_UBER_GTID(gtid));
7247  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7248 
7249  KMP_MB(); /* Flush all pending memory write invalidates. */
7250 
7251  __kmp_common_initialize();
7252 
7253 #if KMP_OS_UNIX
7254  /* invoke the child fork handler */
7255  __kmp_register_atfork();
7256 #endif
7257 
7258 #if !KMP_DYNAMIC_LIB || \
7259  ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7260  {
7261  /* Invoke the exit handler when the program finishes, only for static
7262  library and macOS* dynamic. For other dynamic libraries, we already
7263  have _fini and DllMain. */
7264  int rc = atexit(__kmp_internal_end_atexit);
7265  if (rc != 0) {
7266  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7267  __kmp_msg_null);
7268  }
7269  }
7270 #endif
7271 
7272 #if KMP_HANDLE_SIGNALS
7273 #if KMP_OS_UNIX
7274  /* NOTE: make sure that this is called before the user installs their own
7275  signal handlers so that the user handlers are called first. This way they
7276  can return false, not call our handler, avoid terminating the library, and
7277  continue execution where they left off. */
7278  __kmp_install_signals(FALSE);
7279 #endif /* KMP_OS_UNIX */
7280 #if KMP_OS_WINDOWS
7281  __kmp_install_signals(TRUE);
7282 #endif /* KMP_OS_WINDOWS */
7283 #endif
7284 
7285  /* we have finished the serial initialization */
7286  __kmp_init_counter++;
7287 
7288  __kmp_init_serial = TRUE;
7289 
7290  if (__kmp_version) {
7291  __kmp_print_version_1();
7292  }
7293 
7294  if (__kmp_settings) {
7295  __kmp_env_print();
7296  }
7297 
7298  if (__kmp_display_env || __kmp_display_env_verbose) {
7299  __kmp_env_print_2();
7300  }
7301 
7302 #if OMPT_SUPPORT
7303  ompt_post_init();
7304 #endif
7305 
7306  KMP_MB();
7307 
7308  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7309 }
7310 
7311 void __kmp_serial_initialize(void) {
7312  if (__kmp_init_serial) {
7313  return;
7314  }
7315  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7316  if (__kmp_init_serial) {
7317  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7318  return;
7319  }
7320  __kmp_do_serial_initialize();
7321  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7322 }
7323 
7324 static void __kmp_do_middle_initialize(void) {
7325  int i, j;
7326  int prev_dflt_team_nth;
7327 
7328  if (!__kmp_init_serial) {
7329  __kmp_do_serial_initialize();
7330  }
7331 
7332  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7333 
7334  if (UNLIKELY(!__kmp_need_register_serial)) {
7335  // We are in a forked child process. The registration was skipped during
7336  // serial initialization in __kmp_atfork_child handler. Do it here.
7337  __kmp_register_library_startup();
7338  }
7339 
7340  // Save the previous value for the __kmp_dflt_team_nth so that
7341  // we can avoid some reinitialization if it hasn't changed.
7342  prev_dflt_team_nth = __kmp_dflt_team_nth;
7343 
7344 #if KMP_AFFINITY_SUPPORTED
7345  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7346  // number of cores on the machine.
7347  __kmp_affinity_initialize(__kmp_affinity);
7348 
7349 #endif /* KMP_AFFINITY_SUPPORTED */
7350 
7351  KMP_ASSERT(__kmp_xproc > 0);
7352  if (__kmp_avail_proc == 0) {
7353  __kmp_avail_proc = __kmp_xproc;
7354  }
7355 
7356  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7357  // correct them now
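  // e.g. with OMP_NUM_THREADS=,,2,3 the two empty leading levels are filled in
  // with __kmp_avail_proc.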
7358  j = 0;
7359  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7360  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7361  __kmp_avail_proc;
7362  j++;
7363  }
7364 
7365  if (__kmp_dflt_team_nth == 0) {
7366 #ifdef KMP_DFLT_NTH_CORES
7367  // Default #threads = #cores
7368  __kmp_dflt_team_nth = __kmp_ncores;
7369  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7370  "__kmp_ncores (%d)\n",
7371  __kmp_dflt_team_nth));
7372 #else
7373  // Default #threads = #available OS procs
7374  __kmp_dflt_team_nth = __kmp_avail_proc;
7375  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7376  "__kmp_avail_proc(%d)\n",
7377  __kmp_dflt_team_nth));
7378 #endif /* KMP_DFLT_NTH_CORES */
7379  }
7380 
7381  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7382  __kmp_dflt_team_nth = KMP_MIN_NTH;
7383  }
7384  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7385  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7386  }
7387 
7388  if (__kmp_nesting_mode > 0)
7389  __kmp_set_nesting_mode_threads();
7390 
7391  // There's no harm in continuing if the following check fails,
7392  // but it indicates an error in the previous logic.
7393  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7394 
7395  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7396  // Run through the __kmp_threads array and set the num threads icv for each
7397  // root thread that is currently registered with the RTL (which has not
7398  // already explicitly set its nthreads-var with a call to
7399  // omp_set_num_threads()).
7400  for (i = 0; i < __kmp_threads_capacity; i++) {
7401  kmp_info_t *thread = __kmp_threads[i];
7402  if (thread == NULL)
7403  continue;
7404  if (thread->th.th_current_task->td_icvs.nproc != 0)
7405  continue;
7406 
7407  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7408  }
7409  }
7410  KA_TRACE(
7411  20,
7412  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7413  __kmp_dflt_team_nth));
7414 
7415 #ifdef KMP_ADJUST_BLOCKTIME
7416  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7417  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7418  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7419  if (__kmp_nth > __kmp_avail_proc) {
7420  __kmp_zero_bt = TRUE;
7421  }
7422  }
7423 #endif /* KMP_ADJUST_BLOCKTIME */
7424 
7425  /* we have finished middle initialization */
7426  TCW_SYNC_4(__kmp_init_middle, TRUE);
7427 
7428  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7429 }
7430 
7431 void __kmp_middle_initialize(void) {
7432  if (__kmp_init_middle) {
7433  return;
7434  }
7435  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7436  if (__kmp_init_middle) {
7437  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7438  return;
7439  }
7440  __kmp_do_middle_initialize();
7441  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7442 }
7443 
7444 void __kmp_parallel_initialize(void) {
7445  int gtid = __kmp_entry_gtid(); // this might be a new root
7446 
7447  /* synchronize parallel initialization (for sibling) */
7448  if (TCR_4(__kmp_init_parallel))
7449  return;
7450  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7451  if (TCR_4(__kmp_init_parallel)) {
7452  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7453  return;
7454  }
7455 
7456  /* TODO reinitialization after we have already shut down */
7457  if (TCR_4(__kmp_global.g.g_done)) {
7458  KA_TRACE(
7459  10,
7460  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7461  __kmp_infinite_loop();
7462  }
7463 
7464  /* jc: The lock __kmp_initz_lock is already held, so calling
7465  __kmp_serial_initialize would cause a deadlock. So we call
7466  __kmp_do_serial_initialize directly. */
7467  if (!__kmp_init_middle) {
7468  __kmp_do_middle_initialize();
7469  }
7470  __kmp_assign_root_init_mask();
7471  __kmp_resume_if_hard_paused();
7472 
7473  /* begin initialization */
7474  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7475  KMP_ASSERT(KMP_UBER_GTID(gtid));
7476 
7477 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7478  // Save the FP control regs.
7479  // Worker threads will set theirs to these values at thread startup.
7480  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7481  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7482  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7483 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7484 
7485 #if KMP_OS_UNIX
7486 #if KMP_HANDLE_SIGNALS
7487  /* must be after __kmp_serial_initialize */
7488  __kmp_install_signals(TRUE);
7489 #endif
7490 #endif
7491 
7492  __kmp_suspend_initialize();
7493 
7494 #if defined(USE_LOAD_BALANCE)
7495  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7496  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7497  }
7498 #else
7499  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7500  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7501  }
7502 #endif
7503 
7504  if (__kmp_version) {
7505  __kmp_print_version_2();
7506  }
7507 
7508  /* we have finished parallel initialization */
7509  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7510 
7511  KMP_MB();
7512  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7513 
7514  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7515 }
7516 
7517 void __kmp_hidden_helper_initialize() {
7518  if (TCR_4(__kmp_init_hidden_helper))
7519  return;
7520 
7521  // __kmp_parallel_initialize is required before we initialize hidden helper
7522  if (!TCR_4(__kmp_init_parallel))
7523  __kmp_parallel_initialize();
7524 
7525  // Double check. Note that this double check should not be placed before
7526  // __kmp_parallel_initialize, as that would cause a deadlock.
7527  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7528  if (TCR_4(__kmp_init_hidden_helper)) {
7529  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7530  return;
7531  }
7532 
7533 #if KMP_AFFINITY_SUPPORTED
7534  // Initialize hidden helper affinity settings.
7535  // The above __kmp_parallel_initialize() will initialize
7536  // regular affinity (and topology) if not already done.
7537  if (!__kmp_hh_affinity.flags.initialized)
7538  __kmp_affinity_initialize(__kmp_hh_affinity);
7539 #endif
7540 
7541  // Set the count of hidden helper tasks to be executed to zero
7542  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7543 
7544  // Set the global variable indicating that we're initializing hidden helper
7545  // team/threads
7546  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7547 
7548  // Platform independent initialization
7549  __kmp_do_initialize_hidden_helper_threads();
7550 
7551  // Wait here for the finish of initialization of hidden helper teams
7552  __kmp_hidden_helper_threads_initz_wait();
7553 
7554  // We have finished hidden helper initialization
7555  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7556 
7557  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7558 }
7559 
7560 /* ------------------------------------------------------------------------ */
7561 
7562 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7563  kmp_team_t *team) {
7564  kmp_disp_t *dispatch;
7565 
7566  KMP_MB();
7567 
7568  /* none of the threads have encountered any constructs, yet. */
7569  this_thr->th.th_local.this_construct = 0;
7570 #if KMP_CACHE_MANAGE
7571  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7572 #endif /* KMP_CACHE_MANAGE */
7573  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7574  KMP_DEBUG_ASSERT(dispatch);
7575  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7576  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7577  // this_thr->th.th_info.ds.ds_tid ] );
7578 
7579  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7580  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7581  if (__kmp_env_consistency_check)
7582  __kmp_push_parallel(gtid, team->t.t_ident);
7583 
7584  KMP_MB(); /* Flush all pending memory write invalidates. */
7585 }
7586 
7587 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7588  kmp_team_t *team) {
7589  if (__kmp_env_consistency_check)
7590  __kmp_pop_parallel(gtid, team->t.t_ident);
7591 
7592  __kmp_finish_implicit_task(this_thr);
7593 }
7594 
7595 int __kmp_invoke_task_func(int gtid) {
7596  int rc;
7597  int tid = __kmp_tid_from_gtid(gtid);
7598  kmp_info_t *this_thr = __kmp_threads[gtid];
7599  kmp_team_t *team = this_thr->th.th_team;
7600 
7601  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7602 #if USE_ITT_BUILD
7603  if (__itt_stack_caller_create_ptr) {
7604  // inform ittnotify about entering user's code
7605  if (team->t.t_stack_id != NULL) {
7606  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7607  } else {
7608  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7609  __kmp_itt_stack_callee_enter(
7610  (__itt_caller)team->t.t_parent->t.t_stack_id);
7611  }
7612  }
7613 #endif /* USE_ITT_BUILD */
7614 #if INCLUDE_SSC_MARKS
7615  SSC_MARK_INVOKING();
7616 #endif
7617 
7618 #if OMPT_SUPPORT
7619  void *dummy;
7620  void **exit_frame_p;
7621  ompt_data_t *my_task_data;
7622  ompt_data_t *my_parallel_data;
7623  int ompt_team_size;
7624 
7625  if (ompt_enabled.enabled) {
7626  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7627  .ompt_task_info.frame.exit_frame.ptr);
7628  } else {
7629  exit_frame_p = &dummy;
7630  }
7631 
7632  my_task_data =
7633  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7634  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7635  if (ompt_enabled.ompt_callback_implicit_task) {
7636  ompt_team_size = team->t.t_nproc;
7637  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7638  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7639  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7640  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7641  }
7642 #endif
7643 
7644 #if KMP_STATS_ENABLED
7645  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7646  if (previous_state == stats_state_e::TEAMS_REGION) {
7647  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7648  } else {
7649  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7650  }
7651  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7652 #endif
7653 
7654  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7655  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7656 #if OMPT_SUPPORT
7657  ,
7658  exit_frame_p
7659 #endif
7660  );
7661 #if OMPT_SUPPORT
7662  *exit_frame_p = NULL;
7663  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7664 #endif
7665 
7666 #if KMP_STATS_ENABLED
7667  if (previous_state == stats_state_e::TEAMS_REGION) {
7668  KMP_SET_THREAD_STATE(previous_state);
7669  }
7670  KMP_POP_PARTITIONED_TIMER();
7671 #endif
7672 
7673 #if USE_ITT_BUILD
7674  if (__itt_stack_caller_create_ptr) {
7675  // inform ittnotify about leaving user's code
7676  if (team->t.t_stack_id != NULL) {
7677  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7678  } else {
7679  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7680  __kmp_itt_stack_callee_leave(
7681  (__itt_caller)team->t.t_parent->t.t_stack_id);
7682  }
7683  }
7684 #endif /* USE_ITT_BUILD */
7685  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7686 
7687  return rc;
7688 }
7689 
7690 void __kmp_teams_master(int gtid) {
7691  // This routine is called by all primary threads in teams construct
7692  kmp_info_t *thr = __kmp_threads[gtid];
7693  kmp_team_t *team = thr->th.th_team;
7694  ident_t *loc = team->t.t_ident;
7695  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7696  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7697  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7698  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7699  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7700 
7701  // This thread is a new CG root. Set up the proper variables.
7702  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7703  tmp->cg_root = thr; // Make thr the CG root
7704  // Init to thread limit stored when league primary threads were forked
7705  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7706  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7707  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7708  " cg_nthreads to 1\n",
7709  thr, tmp));
7710  tmp->up = thr->th.th_cg_roots;
7711  thr->th.th_cg_roots = tmp;
7712 
7713 // Launch the league of teams now, but do not let the workers execute
7714 // (they hang at the fork barrier until the next parallel region)
7715 #if INCLUDE_SSC_MARKS
7716  SSC_MARK_FORKING();
7717 #endif
7718  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7719  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7720  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7721 #if INCLUDE_SSC_MARKS
7722  SSC_MARK_JOINING();
7723 #endif
7724  // If the team size was reduced from the limit, set it to the new size
7725  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7726  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7727  // AC: last parameter "1" eliminates join barrier which won't work because
7728  // worker threads are in a fork barrier waiting for more parallel regions
7729  __kmp_join_call(loc, gtid
7730 #if OMPT_SUPPORT
7731  ,
7732  fork_context_intel
7733 #endif
7734  ,
7735  1);
7736 }
7737 
7738 int __kmp_invoke_teams_master(int gtid) {
7739  kmp_info_t *this_thr = __kmp_threads[gtid];
7740  kmp_team_t *team = this_thr->th.th_team;
7741 #if KMP_DEBUG
7742  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7743  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7744  (void *)__kmp_teams_master);
7745 #endif
7746  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7747 #if OMPT_SUPPORT
7748  int tid = __kmp_tid_from_gtid(gtid);
7749  ompt_data_t *task_data =
7750  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7751  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7752  if (ompt_enabled.ompt_callback_implicit_task) {
7753  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7754  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7755  ompt_task_initial);
7756  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7757  }
7758 #endif
7759  __kmp_teams_master(gtid);
7760 #if OMPT_SUPPORT
7761  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7762 #endif
7763  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7764  return 1;
7765 }
7766 
7767 /* this sets the requested number of threads for the next parallel region
7768  encountered by this team. since this should be enclosed in the forkjoin
7769  critical section it should avoid race conditions with asymmetrical nested
7770  parallelism */
7771 
7772 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7773  kmp_info_t *thr = __kmp_threads[gtid];
7774 
7775  if (num_threads > 0)
7776  thr->th.th_set_nproc = num_threads;
7777 }
7778 
7779 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7780  int num_threads) {
7781  KMP_DEBUG_ASSERT(thr);
7782  // Remember the number of threads for inner parallel regions
7783  if (!TCR_4(__kmp_init_middle))
7784  __kmp_middle_initialize(); // get internal globals calculated
7785  __kmp_assign_root_init_mask();
7786  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7787  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7788 
7789  if (num_threads == 0) {
7790  if (__kmp_teams_thread_limit > 0) {
7791  num_threads = __kmp_teams_thread_limit;
7792  } else {
7793  num_threads = __kmp_avail_proc / num_teams;
7794  }
7795  // adjust num_threads w/o warning as it is not a user setting
7796  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7797  // no thread_limit clause specified - do not change thread-limit-var ICV
7798  if (num_threads > __kmp_dflt_team_nth) {
7799  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7800  }
7801  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7802  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7803  } // prevent team size from exceeding thread-limit-var
7804  if (num_teams * num_threads > __kmp_teams_max_nth) {
7805  num_threads = __kmp_teams_max_nth / num_teams;
7806  }
7807  if (num_threads == 0) {
7808  num_threads = 1;
7809  }
7810  } else {
7811  if (num_threads < 0) {
7812  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7813  __kmp_msg_null);
7814  num_threads = 1;
7815  }
7816  // This thread will be the primary thread of the league of primary threads
7817  // Store new thread limit; old limit is saved in th_cg_roots list
7818  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7819  // num_threads = min(num_threads, nthreads-var)
7820  if (num_threads > __kmp_dflt_team_nth) {
7821  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7822  }
7823  if (num_teams * num_threads > __kmp_teams_max_nth) {
7824  int new_threads = __kmp_teams_max_nth / num_teams;
7825  if (new_threads == 0) {
7826  new_threads = 1;
7827  }
7828  if (new_threads != num_threads) {
7829  if (!__kmp_reserve_warn) { // user asked for too many threads
7830  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7831  __kmp_msg(kmp_ms_warning,
7832  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7833  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7834  }
7835  }
7836  num_threads = new_threads;
7837  }
7838  }
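  // Worked example for the no-thread_limit branch above (assumed values, not
  // taken from this source): with num_threads == 0, KMP_TEAMS_THREAD_LIMIT
  // unset, __kmp_avail_proc = 64 and num_teams = 4, the starting value is
  // 64 / 4 = 16 threads per team, which is then clamped by nthreads-var,
  // thread-limit-var and __kmp_teams_max_nth before being stored below.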
7839  thr->th.th_teams_size.nth = num_threads;
7840 }
7841 
7842 /* this sets the requested number of teams for the teams region and/or
7843  the number of threads for the next parallel region encountered */
7844 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7845  int num_threads) {
7846  kmp_info_t *thr = __kmp_threads[gtid];
7847  if (num_teams < 0) {
7848  // OpenMP specification requires requested values to be positive,
7849  // but people can send us any value, so we'd better check
7850  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7851  __kmp_msg_null);
7852  num_teams = 1;
7853  }
7854  if (num_teams == 0) {
7855  if (__kmp_nteams > 0) {
7856  num_teams = __kmp_nteams;
7857  } else {
7858  num_teams = 1; // default number of teams is 1.
7859  }
7860  }
7861  if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7862  if (!__kmp_reserve_warn) {
7863  __kmp_reserve_warn = 1;
7864  __kmp_msg(kmp_ms_warning,
7865  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7866  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7867  }
7868  num_teams = __kmp_teams_max_nth;
7869  }
7870  // Set number of teams (number of threads in the outer "parallel" of the
7871  // teams)
7872  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7873 
7874  __kmp_push_thread_limit(thr, num_teams, num_threads);
7875 }
7876 
7877 /* This sets the requested number of teams for the teams region and/or
7878  the number of threads for the next parallel region encountered */
7879 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7880  int num_teams_ub, int num_threads) {
7881  kmp_info_t *thr = __kmp_threads[gtid];
7882  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7883  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7884  KMP_DEBUG_ASSERT(num_threads >= 0);
7885 
7886  if (num_teams_lb > num_teams_ub) {
7887  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7888  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7889  }
7890 
7891  int num_teams = 1; // defalt number of teams is 1.
7892 
7893  if (num_teams_lb == 0 && num_teams_ub > 0)
7894  num_teams_lb = num_teams_ub;
7895 
7896  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7897  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7898  if (num_teams > __kmp_teams_max_nth) {
7899  if (!__kmp_reserve_warn) {
7900  __kmp_reserve_warn = 1;
7901  __kmp_msg(kmp_ms_warning,
7902  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7903  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7904  }
7905  num_teams = __kmp_teams_max_nth;
7906  }
7907  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7908  num_teams = num_teams_ub;
7909  } else { // num_teams_lb <= num_teams <= num_teams_ub
7910  if (num_threads <= 0) {
7911  if (num_teams_ub > __kmp_teams_max_nth) {
7912  num_teams = num_teams_lb;
7913  } else {
7914  num_teams = num_teams_ub;
7915  }
7916  } else {
7917  num_teams = (num_threads > __kmp_teams_max_nth)
7918  ? num_teams
7919  : __kmp_teams_max_nth / num_threads;
7920  if (num_teams < num_teams_lb) {
7921  num_teams = num_teams_lb;
7922  } else if (num_teams > num_teams_ub) {
7923  num_teams = num_teams_ub;
7924  }
7925  }
7926  }
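  // Worked example for the clause handling above (assumed values, not taken
  // from this source): for a num_teams(4:8) clause with no thread_limit
  // (num_threads <= 0) and __kmp_teams_max_nth = 256, the upper bound fits,
  // so num_teams becomes 8; had num_teams_ub exceeded __kmp_teams_max_nth,
  // the lower bound of 4 would have been chosen instead.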
7927  // Set number of teams (number of threads in the outer "parallel" of the
7928  // teams)
7929  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7930 
7931  __kmp_push_thread_limit(thr, num_teams, num_threads);
7932 }
7933 
7934 // Set the proc_bind var to use in the following parallel region.
7935 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7936  kmp_info_t *thr = __kmp_threads[gtid];
7937  thr->th.th_set_proc_bind = proc_bind;
7938 }
7939 
7940 /* Launch the worker threads into the microtask. */
7941 
7942 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7943  kmp_info_t *this_thr = __kmp_threads[gtid];
7944 
7945 #ifdef KMP_DEBUG
7946  int f;
7947 #endif /* KMP_DEBUG */
7948 
7949  KMP_DEBUG_ASSERT(team);
7950  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7951  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7952  KMP_MB(); /* Flush all pending memory write invalidates. */
7953 
7954  team->t.t_construct = 0; /* no single directives seen yet */
7955  team->t.t_ordered.dt.t_value =
7956  0; /* thread 0 enters the ordered section first */
7957 
7958  /* Reset the identifiers on the dispatch buffer */
7959  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7960  if (team->t.t_max_nproc > 1) {
7961  int i;
7962  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7963  team->t.t_disp_buffer[i].buffer_index = i;
7964  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7965  }
7966  } else {
7967  team->t.t_disp_buffer[0].buffer_index = 0;
7968  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7969  }
7970 
7971  KMP_MB(); /* Flush all pending memory write invalidates. */
7972  KMP_ASSERT(this_thr->th.th_team == team);
7973 
7974 #ifdef KMP_DEBUG
7975  for (f = 0; f < team->t.t_nproc; f++) {
7976  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7977  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7978  }
7979 #endif /* KMP_DEBUG */
7980 
7981  /* release the worker threads so they may begin working */
7982  __kmp_fork_barrier(gtid, 0);
7983 }
7984 
7985 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7986  kmp_info_t *this_thr = __kmp_threads[gtid];
7987 
7988  KMP_DEBUG_ASSERT(team);
7989  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7990  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7991  KMP_MB(); /* Flush all pending memory write invalidates. */
7992 
7993  /* Join barrier after fork */
7994 
7995 #ifdef KMP_DEBUG
7996  if (__kmp_threads[gtid] &&
7997  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7998  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7999  __kmp_threads[gtid]);
8000  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8001  "team->t.t_nproc=%d\n",
8002  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8003  team->t.t_nproc);
8004  __kmp_print_structure();
8005  }
8006  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8007  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8008 #endif /* KMP_DEBUG */
8009 
8010  __kmp_join_barrier(gtid); /* wait for everyone */
8011 #if OMPT_SUPPORT
8012  if (ompt_enabled.enabled &&
8013  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8014  int ds_tid = this_thr->th.th_info.ds.ds_tid;
8015  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8016  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8017 #if OMPT_OPTIONAL
8018  void *codeptr = NULL;
8019  if (KMP_MASTER_TID(ds_tid) &&
8020  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8021  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8022  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8023 
8024  if (ompt_enabled.ompt_callback_sync_region_wait) {
8025  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8026  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8027  codeptr);
8028  }
8029  if (ompt_enabled.ompt_callback_sync_region) {
8030  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8031  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8032  codeptr);
8033  }
8034 #endif
8035  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8036  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8037  ompt_scope_end, NULL, task_data, 0, ds_tid,
8038  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8039  }
8040  }
8041 #endif
8042 
8043  KMP_MB(); /* Flush all pending memory write invalidates. */
8044  KMP_ASSERT(this_thr->th.th_team == team);
8045 }
8046 
8047 /* ------------------------------------------------------------------------ */
8048 
8049 #ifdef USE_LOAD_BALANCE
8050 
8051 // Return the number of worker threads actively spinning in the hot team, if
8052 // we are at the outermost level of parallelism. Otherwise, return 0.
8053 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8054  int i;
8055  int retval;
8056  kmp_team_t *hot_team;
8057 
8058  if (root->r.r_active) {
8059  return 0;
8060  }
8061  hot_team = root->r.r_hot_team;
8062  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8063  return hot_team->t.t_nproc - 1; // Don't count primary thread
8064  }
8065 
8066  // Skip the primary thread - it is accounted for elsewhere.
8067  retval = 0;
8068  for (i = 1; i < hot_team->t.t_nproc; i++) {
8069  if (hot_team->t.t_threads[i]->th.th_active) {
8070  retval++;
8071  }
8072  }
8073  return retval;
8074 }
8075 
8076 // Perform an automatic adjustment to the number of
8077 // threads used by the next parallel region.
8078 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8079  int retval;
8080  int pool_active;
8081  int hot_team_active;
8082  int team_curr_active;
8083  int system_active;
8084 
8085  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8086  set_nproc));
8087  KMP_DEBUG_ASSERT(root);
8088  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8089  ->th.th_current_task->td_icvs.dynamic == TRUE);
8090  KMP_DEBUG_ASSERT(set_nproc > 1);
8091 
8092  if (set_nproc == 1) {
8093  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8094  return 1;
8095  }
8096 
8097  // Threads that are active in the thread pool, active in the hot team for this
8098  // particular root (if we are at the outer par level), and the currently
8099  // executing thread (to become the primary thread) are available to add to the
8100  // new team, but are currently contributing to the system load, and must be
8101  // accounted for.
8102  pool_active = __kmp_thread_pool_active_nth;
8103  hot_team_active = __kmp_active_hot_team_nproc(root);
8104  team_curr_active = pool_active + hot_team_active + 1;
8105 
8106  // Check the system load.
8107  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8108  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8109  "hot team active = %d\n",
8110  system_active, pool_active, hot_team_active));
8111 
8112  if (system_active < 0) {
8113  // There was an error reading the necessary info from /proc, so use the
8114  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8115  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8116  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8117  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8118 
8119  // Make this call behave like the thread limit algorithm.
8120  retval = __kmp_avail_proc - __kmp_nth +
8121  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8122  if (retval > set_nproc) {
8123  retval = set_nproc;
8124  }
8125  if (retval < KMP_MIN_NTH) {
8126  retval = KMP_MIN_NTH;
8127  }
8128 
8129  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8130  retval));
8131  return retval;
8132  }
8133 
8134  // There is a slight delay in the load balance algorithm in detecting new
8135  // running procs. The real system load at this instant should be at least as
8136  // large as the number of active OMP threads available to add to the team.
8137  if (system_active < team_curr_active) {
8138  system_active = team_curr_active;
8139  }
8140  retval = __kmp_avail_proc - system_active + team_curr_active;
8141  if (retval > set_nproc) {
8142  retval = set_nproc;
8143  }
8144  if (retval < KMP_MIN_NTH) {
8145  retval = KMP_MIN_NTH;
8146  }
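  // Worked example (assumed numbers, for illustration only): with
  // __kmp_avail_proc = 16, system_active = 10, pool_active = 2 and
  // hot_team_active = 3, team_curr_active is 2 + 3 + 1 = 6 and
  // retval = 16 - 10 + 6 = 12; a request of set_nproc = 8 is then honored
  // as min(12, 8) = 8 threads.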
8147 
8148  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8149  return retval;
8150 } // __kmp_load_balance_nproc()
8151 
8152 #endif /* USE_LOAD_BALANCE */
8153 
8154 /* ------------------------------------------------------------------------ */
8155 
8156 /* NOTE: this is called with the __kmp_init_lock held */
8157 void __kmp_cleanup(void) {
8158  int f;
8159 
8160  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8161 
8162  if (TCR_4(__kmp_init_parallel)) {
8163 #if KMP_HANDLE_SIGNALS
8164  __kmp_remove_signals();
8165 #endif
8166  TCW_4(__kmp_init_parallel, FALSE);
8167  }
8168 
8169  if (TCR_4(__kmp_init_middle)) {
8170 #if KMP_AFFINITY_SUPPORTED
8171  __kmp_affinity_uninitialize();
8172 #endif /* KMP_AFFINITY_SUPPORTED */
8173  __kmp_cleanup_hierarchy();
8174  TCW_4(__kmp_init_middle, FALSE);
8175  }
8176 
8177  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8178 
8179  if (__kmp_init_serial) {
8180  __kmp_runtime_destroy();
8181  __kmp_init_serial = FALSE;
8182  }
8183 
8184  __kmp_cleanup_threadprivate_caches();
8185 
8186  for (f = 0; f < __kmp_threads_capacity; f++) {
8187  if (__kmp_root[f] != NULL) {
8188  __kmp_free(__kmp_root[f]);
8189  __kmp_root[f] = NULL;
8190  }
8191  }
8192  __kmp_free(__kmp_threads);
8193  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8194  // there is no need to free __kmp_root separately.
8195  __kmp_threads = NULL;
8196  __kmp_root = NULL;
8197  __kmp_threads_capacity = 0;
8198 
8199  // Free old __kmp_threads arrays if they exist.
8200  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8201  while (ptr) {
8202  kmp_old_threads_list_t *next = ptr->next;
8203  __kmp_free(ptr->threads);
8204  __kmp_free(ptr);
8205  ptr = next;
8206  }
8207 
8208 #if KMP_USE_DYNAMIC_LOCK
8209  __kmp_cleanup_indirect_user_locks();
8210 #else
8211  __kmp_cleanup_user_locks();
8212 #endif
8213 #if OMPD_SUPPORT
8214  if (ompd_state) {
8215  __kmp_free(ompd_env_block);
8216  ompd_env_block = NULL;
8217  ompd_env_block_size = 0;
8218  }
8219 #endif
8220 
8221 #if KMP_AFFINITY_SUPPORTED
8222  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8223  __kmp_cpuinfo_file = NULL;
8224 #endif /* KMP_AFFINITY_SUPPORTED */
8225 
8226 #if KMP_USE_ADAPTIVE_LOCKS
8227 #if KMP_DEBUG_ADAPTIVE_LOCKS
8228  __kmp_print_speculative_stats();
8229 #endif
8230 #endif
8231  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8232  __kmp_nested_nth.nth = NULL;
8233  __kmp_nested_nth.size = 0;
8234  __kmp_nested_nth.used = 0;
8235  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8236  __kmp_nested_proc_bind.bind_types = NULL;
8237  __kmp_nested_proc_bind.size = 0;
8238  __kmp_nested_proc_bind.used = 0;
8239  if (__kmp_affinity_format) {
8240  KMP_INTERNAL_FREE(__kmp_affinity_format);
8241  __kmp_affinity_format = NULL;
8242  }
8243 
8244  __kmp_i18n_catclose();
8245 
8246 #if KMP_USE_HIER_SCHED
8247  __kmp_hier_scheds.deallocate();
8248 #endif
8249 
8250 #if KMP_STATS_ENABLED
8251  __kmp_stats_fini();
8252 #endif
8253 
8254  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8255 }
8256 
8257 /* ------------------------------------------------------------------------ */
8258 
8259 int __kmp_ignore_mppbeg(void) {
8260  char *env;
8261 
8262  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8263  if (__kmp_str_match_false(env))
8264  return FALSE;
8265  }
8266  // By default __kmpc_begin() is a no-op.
8267  return TRUE;
8268 }
8269 
8270 int __kmp_ignore_mppend(void) {
8271  char *env;
8272 
8273  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8274  if (__kmp_str_match_false(env))
8275  return FALSE;
8276  }
8277  // By default __kmpc_end() is a no-op.
8278  return TRUE;
8279 }
8280 
8281 void __kmp_internal_begin(void) {
8282  int gtid;
8283  kmp_root_t *root;
8284 
8285  /* this is a very important step as it will register new sibling threads
8286  and assign these new uber threads a new gtid */
8287  gtid = __kmp_entry_gtid();
8288  root = __kmp_threads[gtid]->th.th_root;
8289  KMP_ASSERT(KMP_UBER_GTID(gtid));
8290 
8291  if (root->r.r_begin)
8292  return;
8293  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8294  if (root->r.r_begin) {
8295  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8296  return;
8297  }
8298 
8299  root->r.r_begin = TRUE;
8300 
8301  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8302 }
8303 
8304 /* ------------------------------------------------------------------------ */
8305 
8306 void __kmp_user_set_library(enum library_type arg) {
8307  int gtid;
8308  kmp_root_t *root;
8309  kmp_info_t *thread;
8310 
8311  /* first, make sure we are initialized so we can get our gtid */
8312 
8313  gtid = __kmp_entry_gtid();
8314  thread = __kmp_threads[gtid];
8315 
8316  root = thread->th.th_root;
8317 
8318  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8319  library_serial));
8320  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8321  thread */
8322  KMP_WARNING(SetLibraryIncorrectCall);
8323  return;
8324  }
8325 
8326  switch (arg) {
8327  case library_serial:
8328  thread->th.th_set_nproc = 0;
8329  set__nproc(thread, 1);
8330  break;
8331  case library_turnaround:
8332  thread->th.th_set_nproc = 0;
8333  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8334  : __kmp_dflt_team_nth_ub);
8335  break;
8336  case library_throughput:
8337  thread->th.th_set_nproc = 0;
8338  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8339  : __kmp_dflt_team_nth_ub);
8340  break;
8341  default:
8342  KMP_FATAL(UnknownLibraryType, arg);
8343  }
8344 
8345  __kmp_aux_set_library(arg);
8346 }
8347 
8348 void __kmp_aux_set_stacksize(size_t arg) {
8349  if (!__kmp_init_serial)
8350  __kmp_serial_initialize();
8351 
8352 #if KMP_OS_DARWIN
8353  if (arg & (0x1000 - 1)) {
8354  arg &= ~(0x1000 - 1);
8355  if (arg + 0x1000) /* check for overflow if we round up */
8356  arg += 0x1000;
8357  }
8358 #endif
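  // Worked example of the Darwin rounding above (assumed value): a request of
  // 0x12345 bytes is truncated to 0x12000 and then rounded up to the next
  // 4 KB page boundary, 0x13000, before the min/max clamping below is applied.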
8359  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8360 
8361  /* only change the default stacksize before the first parallel region */
8362  if (!TCR_4(__kmp_init_parallel)) {
8363  size_t value = arg; /* argument is in bytes */
8364 
8365  if (value < __kmp_sys_min_stksize)
8366  value = __kmp_sys_min_stksize;
8367  else if (value > KMP_MAX_STKSIZE)
8368  value = KMP_MAX_STKSIZE;
8369 
8370  __kmp_stksize = value;
8371 
8372  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8373  }
8374 
8375  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8376 }
8377 
8378 /* set the behaviour of the runtime library */
8379 /* TODO this can cause some odd behaviour with sibling parallelism... */
8380 void __kmp_aux_set_library(enum library_type arg) {
8381  __kmp_library = arg;
8382 
8383  switch (__kmp_library) {
8384  case library_serial: {
8385  KMP_INFORM(LibraryIsSerial);
8386  } break;
8387  case library_turnaround:
8388  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8389  __kmp_use_yield = 2; // only yield when oversubscribed
8390  break;
8391  case library_throughput:
8392  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8393  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8394  break;
8395  default:
8396  KMP_FATAL(UnknownLibraryType, arg);
8397  }
8398 }
8399 
8400 /* Getting team information common for all team API */
8401 // Returns NULL if not in teams construct
8402 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8403  kmp_info_t *thr = __kmp_entry_thread();
8404  teams_serialized = 0;
8405  if (thr->th.th_teams_microtask) {
8406  kmp_team_t *team = thr->th.th_team;
8407  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8408  int ii = team->t.t_level;
8409  teams_serialized = team->t.t_serialized;
8410  int level = tlevel + 1;
8411  KMP_DEBUG_ASSERT(ii >= tlevel);
8412  while (ii > level) {
8413  for (teams_serialized = team->t.t_serialized;
8414  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8415  }
8416  if (team->t.t_serialized && (!teams_serialized)) {
8417  team = team->t.t_parent;
8418  continue;
8419  }
8420  if (ii > level) {
8421  team = team->t.t_parent;
8422  ii--;
8423  }
8424  }
8425  return team;
8426  }
8427  return NULL;
8428 }
8429 
8430 int __kmp_aux_get_team_num() {
8431  int serialized;
8432  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8433  if (team) {
8434  if (serialized > 1) {
8435  return 0; // teams region is serialized ( 1 team of 1 thread ).
8436  } else {
8437  return team->t.t_master_tid;
8438  }
8439  }
8440  return 0;
8441 }
8442 
8443 int __kmp_aux_get_num_teams() {
8444  int serialized;
8445  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8446  if (team) {
8447  if (serialized > 1) {
8448  return 1;
8449  } else {
8450  return team->t.t_parent->t.t_nproc;
8451  }
8452  }
8453  return 1;
8454 }
8455 
8456 /* ------------------------------------------------------------------------ */
8457 
8458 /*
8459  * Affinity Format Parser
8460  *
8461  * Field is in form of: %[[[0].]size]type
8462  * % and type are required (%% means print a literal '%')
8463  * type is either single char or long name surrounded by {},
8464  * e.g., N or {num_threads}
8465  * 0 => leading zeros
8466  * . => right justified when size is specified
8467  * by default output is left justified
8468  * size is the *minimum* field length
8469  * All other characters are printed as is
8470  *
8471  * Available field types:
8472  * L {thread_level} - omp_get_level()
8473  * n {thread_num} - omp_get_thread_num()
8474  * h {host} - name of host machine
8475  * P {process_id} - process id (integer)
8476  * T {thread_identifier} - native thread identifier (integer)
8477  * N {num_threads} - omp_get_num_threads()
8478  * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8479  * a {thread_affinity} - comma separated list of integers or integer ranges
8480  * (values of affinity mask)
8481  *
8482  * Implementation-specific field types can be added
8483  * If a type is unknown, print "undefined"
8484  */
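// Hedged usage sketch (illustrative only, not part of this file): user code
// typically reaches this parser through the OpenMP 5.0 affinity-format API:
//
//   #include <omp.h>
//   int main() {
//     omp_set_affinity_format("host=%H thread=%0.4n of %N");
//     #pragma omp parallel
//     omp_display_affinity(NULL); // NULL => use the affinity-format-var ICV
//     return 0;
//   }
//
// Each thread would then print something like "host=node01 thread=0003 of 4",
// where "node01" is an assumed host name and the team has 4 threads.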
8485 
8486 // Structure holding the short name, long name, and corresponding data type
8487 // for snprintf. A table of these will represent the entire valid keyword
8488 // field types.
8489 typedef struct kmp_affinity_format_field_t {
8490  char short_name; // from spec e.g., L -> thread level
8491  const char *long_name; // from spec thread_level -> thread level
8492  char field_format; // data type for snprintf (typically 'd' or 's'
8493  // for integer or string)
8494 } kmp_affinity_format_field_t;
8495 
8496 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8497 #if KMP_AFFINITY_SUPPORTED
8498  {'A', "thread_affinity", 's'},
8499 #endif
8500  {'t', "team_num", 'd'},
8501  {'T', "num_teams", 'd'},
8502  {'L', "nesting_level", 'd'},
8503  {'n', "thread_num", 'd'},
8504  {'N', "num_threads", 'd'},
8505  {'a', "ancestor_tnum", 'd'},
8506  {'H', "host", 's'},
8507  {'P', "process_id", 'd'},
8508  {'i', "native_thread_id", 'd'}};
8509 
8510 // Return the number of characters it takes to hold field
8511 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8512  const char **ptr,
8513  kmp_str_buf_t *field_buffer) {
8514  int rc, format_index, field_value;
8515  const char *width_left, *width_right;
8516  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8517  static const int FORMAT_SIZE = 20;
8518  char format[FORMAT_SIZE] = {0};
8519  char absolute_short_name = 0;
8520 
8521  KMP_DEBUG_ASSERT(gtid >= 0);
8522  KMP_DEBUG_ASSERT(th);
8523  KMP_DEBUG_ASSERT(**ptr == '%');
8524  KMP_DEBUG_ASSERT(field_buffer);
8525 
8526  __kmp_str_buf_clear(field_buffer);
8527 
8528  // Skip the initial %
8529  (*ptr)++;
8530 
8531  // Check for %% first
8532  if (**ptr == '%') {
8533  __kmp_str_buf_cat(field_buffer, "%", 1);
8534  (*ptr)++; // skip over the second %
8535  return 1;
8536  }
8537 
8538  // Parse field modifiers if they are present
8539  pad_zeros = false;
8540  if (**ptr == '0') {
8541  pad_zeros = true;
8542  (*ptr)++; // skip over 0
8543  }
8544  right_justify = false;
8545  if (**ptr == '.') {
8546  right_justify = true;
8547  (*ptr)++; // skip over .
8548  }
8549  // Parse width of field: [width_left, width_right)
8550  width_left = width_right = NULL;
8551  if (**ptr >= '0' && **ptr <= '9') {
8552  width_left = *ptr;
8553  SKIP_DIGITS(*ptr);
8554  width_right = *ptr;
8555  }
8556 
8557  // Create the format for KMP_SNPRINTF based on flags parsed above
8558  format_index = 0;
8559  format[format_index++] = '%';
8560  if (!right_justify)
8561  format[format_index++] = '-';
8562  if (pad_zeros)
8563  format[format_index++] = '0';
8564  if (width_left && width_right) {
8565  int i = 0;
8566  // Only allow 8 digit number widths.
8567  // This also prevents overflowing the format variable
8568  while (i < 8 && width_left < width_right) {
8569  format[format_index++] = *width_left;
8570  width_left++;
8571  i++;
8572  }
8573  }
8574 
8575  // Parse a name (long or short)
8576  // Canonicalize the name into absolute_short_name
8577  found_valid_name = false;
8578  parse_long_name = (**ptr == '{');
8579  if (parse_long_name)
8580  (*ptr)++; // skip initial left brace
8581  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8582  sizeof(__kmp_affinity_format_table[0]);
8583  ++i) {
8584  char short_name = __kmp_affinity_format_table[i].short_name;
8585  const char *long_name = __kmp_affinity_format_table[i].long_name;
8586  char field_format = __kmp_affinity_format_table[i].field_format;
8587  if (parse_long_name) {
8588  size_t length = KMP_STRLEN(long_name);
8589  if (strncmp(*ptr, long_name, length) == 0) {
8590  found_valid_name = true;
8591  (*ptr) += length; // skip the long name
8592  }
8593  } else if (**ptr == short_name) {
8594  found_valid_name = true;
8595  (*ptr)++; // skip the short name
8596  }
8597  if (found_valid_name) {
8598  format[format_index++] = field_format;
8599  format[format_index++] = '\0';
8600  absolute_short_name = short_name;
8601  break;
8602  }
8603  }
8604  if (parse_long_name) {
8605  if (**ptr != '}') {
8606  absolute_short_name = 0;
8607  } else {
8608  (*ptr)++; // skip over the right brace
8609  }
8610  }
8611 
8612  // Attempt to fill the buffer with the requested
8613  // value using snprintf within __kmp_str_buf_print()
8614  switch (absolute_short_name) {
8615  case 't':
8616  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8617  break;
8618  case 'T':
8619  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8620  break;
8621  case 'L':
8622  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8623  break;
8624  case 'n':
8625  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8626  break;
8627  case 'H': {
8628  static const int BUFFER_SIZE = 256;
8629  char buf[BUFFER_SIZE];
8630  __kmp_expand_host_name(buf, BUFFER_SIZE);
8631  rc = __kmp_str_buf_print(field_buffer, format, buf);
8632  } break;
8633  case 'P':
8634  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8635  break;
8636  case 'i':
8637  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8638  break;
8639  case 'N':
8640  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8641  break;
8642  case 'a':
8643  field_value =
8644  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8645  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8646  break;
8647 #if KMP_AFFINITY_SUPPORTED
8648  case 'A': {
8649  kmp_str_buf_t buf;
8650  __kmp_str_buf_init(&buf);
8651  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8652  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8653  __kmp_str_buf_free(&buf);
8654  } break;
8655 #endif
8656  default:
8657  // According to the spec, if an implementation does not have info for a field
8658  // type, then "undefined" is printed
8659  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8660  // Skip the field
8661  if (parse_long_name) {
8662  SKIP_TOKEN(*ptr);
8663  if (**ptr == '}')
8664  (*ptr)++;
8665  } else {
8666  (*ptr)++;
8667  }
8668  }
8669 
8670  KMP_ASSERT(format_index <= FORMAT_SIZE);
8671  return rc;
8672 }
8673 
8674 /*
8675  * Return number of characters needed to hold the affinity string
8676  * (not including null byte character)
8677  * The resultant string is printed to buffer, which the caller can then
8678  * handle afterwards
8679  */
8680 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8681  kmp_str_buf_t *buffer) {
8682  const char *parse_ptr;
8683  size_t retval;
8684  const kmp_info_t *th;
8685  kmp_str_buf_t field;
8686 
8687  KMP_DEBUG_ASSERT(buffer);
8688  KMP_DEBUG_ASSERT(gtid >= 0);
8689 
8690  __kmp_str_buf_init(&field);
8691  __kmp_str_buf_clear(buffer);
8692 
8693  th = __kmp_threads[gtid];
8694  retval = 0;
8695 
8696  // If format is NULL or zero-length string, then we use
8697  // affinity-format-var ICV
8698  parse_ptr = format;
8699  if (parse_ptr == NULL || *parse_ptr == '\0') {
8700  parse_ptr = __kmp_affinity_format;
8701  }
8702  KMP_DEBUG_ASSERT(parse_ptr);
8703 
8704  while (*parse_ptr != '\0') {
8705  // Parse a field
8706  if (*parse_ptr == '%') {
8707  // Put field in the buffer
8708  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8709  __kmp_str_buf_catbuf(buffer, &field);
8710  retval += rc;
8711  } else {
8712  // Put literal character in buffer
8713  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8714  retval++;
8715  parse_ptr++;
8716  }
8717  }
8718  __kmp_str_buf_free(&field);
8719  return retval;
8720 }
8721 
8722 // Displays the affinity string to stdout
8723 void __kmp_aux_display_affinity(int gtid, const char *format) {
8724  kmp_str_buf_t buf;
8725  __kmp_str_buf_init(&buf);
8726  __kmp_aux_capture_affinity(gtid, format, &buf);
8727  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8728  __kmp_str_buf_free(&buf);
8729 }
8730 
8731 /* ------------------------------------------------------------------------ */
8732 
8733 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8734  int blocktime = arg; /* argument is in milliseconds */
8735 #if KMP_USE_MONITOR
8736  int bt_intervals;
8737 #endif
8738  kmp_int8 bt_set;
8739 
8740  __kmp_save_internal_controls(thread);
8741 
8742  /* Normalize and set blocktime for the teams */
8743  if (blocktime < KMP_MIN_BLOCKTIME)
8744  blocktime = KMP_MIN_BLOCKTIME;
8745  else if (blocktime > KMP_MAX_BLOCKTIME)
8746  blocktime = KMP_MAX_BLOCKTIME;
8747 
8748  set__blocktime_team(thread->th.th_team, tid, blocktime);
8749  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8750 
8751 #if KMP_USE_MONITOR
8752  /* Calculate and set blocktime intervals for the teams */
8753  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8754 
8755  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8756  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8757 #endif
8758 
8759  /* Set whether blocktime has been set to "TRUE" */
8760  bt_set = TRUE;
8761 
8762  set__bt_set_team(thread->th.th_team, tid, bt_set);
8763  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8764 #if KMP_USE_MONITOR
8765  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8766  "bt_intervals=%d, monitor_updates=%d\n",
8767  __kmp_gtid_from_tid(tid, thread->th.th_team),
8768  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8769  __kmp_monitor_wakeups));
8770 #else
8771  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8772  __kmp_gtid_from_tid(tid, thread->th.th_team),
8773  thread->th.th_team->t.t_id, tid, blocktime));
8774 #endif
8775 }
8776 
8777 void __kmp_aux_set_defaults(char const *str, size_t len) {
8778  if (!__kmp_init_serial) {
8779  __kmp_serial_initialize();
8780  }
8781  __kmp_env_initialize(str);
8782 
8783  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8784  __kmp_env_print();
8785  }
8786 } // __kmp_aux_set_defaults
8787 
8788 /* ------------------------------------------------------------------------ */
8789 /* internal fast reduction routines */
8790 
8791 PACKED_REDUCTION_METHOD_T
8792 __kmp_determine_reduction_method(
8793  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8794  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8795  kmp_critical_name *lck) {
8796 
8797  // Default reduction method: critical construct ( lck != NULL, like in current
8798  // PAROPT )
8799  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8800  // can be selected by RTL
8801  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8802  // can be selected by RTL
8803  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8804  // among those generated by PAROPT.
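  // Illustrative example (assumed scenario, not from this source): for a
  // reduction such as "#pragma omp parallel for reduction(+:sum)" compiled so
  // that reduce_data and reduce_func are both non-NULL and loc->flags carries
  // KMP_IDENT_ATOMIC_REDUCE, both the tree and atomic methods are available;
  // the code below then chooses between them based on architecture, OS and
  // team size (small teams tend to get the atomic method).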
8805 
8806  PACKED_REDUCTION_METHOD_T retval;
8807 
8808  int team_size;
8809 
8810  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8811 
8812 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8813  (loc && \
8814  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8815 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8816 
8817  retval = critical_reduce_block;
8818 
8819  // another choice of getting a team size (with 1 dynamic dereference) is slower
8820  team_size = __kmp_get_team_num_threads(global_tid);
8821  if (team_size == 1) {
8822 
8823  retval = empty_reduce_block;
8824 
8825  } else {
8826 
8827  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8828 
8829 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8830  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
8831 
8832 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8833  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8834 
8835  int teamsize_cutoff = 4;
8836 
8837 #if KMP_MIC_SUPPORTED
8838  if (__kmp_mic_type != non_mic) {
8839  teamsize_cutoff = 8;
8840  }
8841 #endif
8842  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8843  if (tree_available) {
8844  if (team_size <= teamsize_cutoff) {
8845  if (atomic_available) {
8846  retval = atomic_reduce_block;
8847  }
8848  } else {
8849  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8850  }
8851  } else if (atomic_available) {
8852  retval = atomic_reduce_block;
8853  }
8854 #else
8855 #error "Unknown or unsupported OS"
8856 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8857  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8858 
8859 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8860 
8861 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8862  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8863 
8864  // basic tuning
8865 
8866  if (atomic_available) {
8867  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8868  retval = atomic_reduce_block;
8869  }
8870  } // otherwise: use critical section
8871 
8872 #elif KMP_OS_DARWIN
8873 
8874  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8875  if (atomic_available && (num_vars <= 3)) {
8876  retval = atomic_reduce_block;
8877  } else if (tree_available) {
8878  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8879  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8880  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8881  }
8882  } // otherwise: use critical section
8883 
8884 #else
8885 #error "Unknown or unsupported OS"
8886 #endif
8887 
8888 #else
8889 #error "Unknown or unsupported architecture"
8890 #endif
8891  }
8892 
8893  // KMP_FORCE_REDUCTION
8894 
8895  // If the team is serialized (team_size == 1), ignore the forced reduction
8896  // method and stay with the unsynchronized method (empty_reduce_block)
8897  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8898  team_size != 1) {
8899 
8900  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8901 
8902  int atomic_available, tree_available;
8903 
8904  switch ((forced_retval = __kmp_force_reduction_method)) {
8905  case critical_reduce_block:
8906  KMP_ASSERT(lck); // lck should be != 0
8907  break;
8908 
8909  case atomic_reduce_block:
8910  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8911  if (!atomic_available) {
8912  KMP_WARNING(RedMethodNotSupported, "atomic");
8913  forced_retval = critical_reduce_block;
8914  }
8915  break;
8916 
8917  case tree_reduce_block:
8918  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8919  if (!tree_available) {
8920  KMP_WARNING(RedMethodNotSupported, "tree");
8921  forced_retval = critical_reduce_block;
8922  } else {
8923 #if KMP_FAST_REDUCTION_BARRIER
8924  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8925 #endif
8926  }
8927  break;
8928 
8929  default:
8930  KMP_ASSERT(0); // "unsupported method specified"
8931  }
8932 
8933  retval = forced_retval;
8934  }
8935 
8936  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8937 
8938 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8939 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8940 
8941  return (retval);
8942 }
8943 // this function is for testing set/get/determine reduce method
8944 kmp_int32 __kmp_get_reduce_method(void) {
8945  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8946 }
8947 
8948 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8949 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8950 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8951 
8952 // Hard pause shuts down the runtime completely. Resume happens naturally when
8953 // OpenMP is used subsequently.
8954 void __kmp_hard_pause() {
8955  __kmp_pause_status = kmp_hard_paused;
8956  __kmp_internal_end_thread(-1);
8957 }
8958 
8959 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8960 void __kmp_resume_if_soft_paused() {
8961  if (__kmp_pause_status == kmp_soft_paused) {
8962  __kmp_pause_status = kmp_not_paused;
8963 
8964  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8965  kmp_info_t *thread = __kmp_threads[gtid];
8966  if (thread) { // Wake it if sleeping
8967  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8968  thread);
8969  if (fl.is_sleeping())
8970  fl.resume(gtid);
8971  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8972  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8973  } else { // thread holds the lock and may sleep soon
8974  do { // until either the thread sleeps, or we can get the lock
8975  if (fl.is_sleeping()) {
8976  fl.resume(gtid);
8977  break;
8978  } else if (__kmp_try_suspend_mx(thread)) {
8979  __kmp_unlock_suspend_mx(thread);
8980  break;
8981  }
8982  } while (1);
8983  }
8984  }
8985  }
8986  }
8987 }
8988 
8989 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8990 // TODO: add warning messages
8991 int __kmp_pause_resource(kmp_pause_status_t level) {
8992  if (level == kmp_not_paused) { // requesting resume
8993  if (__kmp_pause_status == kmp_not_paused) {
8994  // error message about runtime not being paused, so can't resume
8995  return 1;
8996  } else {
8997  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8998  __kmp_pause_status == kmp_hard_paused);
8999  __kmp_pause_status = kmp_not_paused;
9000  return 0;
9001  }
9002  } else if (level == kmp_soft_paused) { // requesting soft pause
9003  if (__kmp_pause_status != kmp_not_paused) {
9004  // error message about already being paused
9005  return 1;
9006  } else {
9007  __kmp_soft_pause();
9008  return 0;
9009  }
9010  } else if (level == kmp_hard_paused) { // requesting hard pause
9011  if (__kmp_pause_status != kmp_not_paused) {
9012  // error message about already being paused
9013  return 1;
9014  } else {
9015  __kmp_hard_pause();
9016  return 0;
9017  }
9018  } else {
9019  // error message about invalid level
9020  return 1;
9021  }
9022 }
9023 
9024 void __kmp_omp_display_env(int verbose) {
9025  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9026  if (__kmp_init_serial == 0)
9027  __kmp_do_serial_initialize();
9028  __kmp_display_env_impl(!verbose, verbose);
9029  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9030 }
9031 
9032 // The team size is changing, so distributed barrier must be modified
9033 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9034  int new_nthreads) {
9035  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9036  bp_dist_bar);
9037  kmp_info_t **other_threads = team->t.t_threads;
9038 
9039  // We want all the workers to stop waiting on the barrier while we adjust the
9040  // size of the team.
9041  for (int f = 1; f < old_nthreads; ++f) {
9042  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9043  // Ignore threads that are already inactive or not present in the team
9044  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9045  // teams construct causes thread_limit to get passed in, and some of
9046  // those could be inactive; just ignore them
9047  continue;
9048  }
9049  // If the thread is still transitioning to the in_use state, wait for it
9050  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9051  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9052  KMP_CPU_PAUSE();
9053  }
9054  // The thread should be in_use now
9055  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9056  // Transition to unused state
9057  team->t.t_threads[f]->th.th_used_in_team.store(2);
9058  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9059  }
9060  // Release all the workers
9061  team->t.b->go_release();
9062 
9063  KMP_MFENCE();
9064 
9065  // Workers should see transition status 2 and move to 0; but may need to be
9066  // woken up first
9067  int count = old_nthreads - 1;
9068  while (count > 0) {
9069  count = old_nthreads - 1;
9070  for (int f = 1; f < old_nthreads; ++f) {
9071  if (other_threads[f]->th.th_used_in_team.load() != 0) {
9072  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9073  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9074  void *, other_threads[f]->th.th_sleep_loc);
9075  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9076  }
9077  } else {
9078  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9079  count--;
9080  }
9081  }
9082  }
9083  // Now update the barrier size
9084  team->t.b->update_num_threads(new_nthreads);
9085  team->t.b->go_reset();
9086 }
9087 
9088 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9089  // Add the threads back to the team
9090  KMP_DEBUG_ASSERT(team);
9091  // Threads were paused and pointed at th_used_in_team temporarily during a
9092  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9093  // the thread that it should transition itself back into the team. Then, if
9094  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9095  // to wake it up.
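  // Illustrative summary of the th_used_in_team states as used here and in
  // __kmp_resize_dist_barrier above (derived from the surrounding code):
  //   0 -> 3 : primary marks a worker as transitioning back into the team
  //   3 -> 1 : the worker finishes the transition itself
  //   1 -> 2 -> 0 : used when shrinking the team in __kmp_resize_dist_barrier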
9096  for (int f = 1; f < new_nthreads; ++f) {
9097  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9098  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9099  3);
9100  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9101  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9102  (kmp_flag_32<false, false> *)NULL);
9103  }
9104  }
9105  // The threads should be transitioning to the team; when they are done, they
9106  // should have set th_used_in_team to 1. This loop forces the primary thread to
9107  // wait until all threads have moved into the team and are waiting in the barrier.
9108  int count = new_nthreads - 1;
9109  while (count > 0) {
9110  count = new_nthreads - 1;
9111  for (int f = 1; f < new_nthreads; ++f) {
9112  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9113  count--;
9114  }
9115  }
9116  }
9117 }
9118 
9119 // Globals and functions for hidden helper task
9120 kmp_info_t **__kmp_hidden_helper_threads;
9121 kmp_info_t *__kmp_hidden_helper_main_thread;
9122 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9123 #if KMP_OS_LINUX
9124 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9125 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9126 #else
9127 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9128 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9129 #endif
9130 
9131 namespace {
9132 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9133 
9134 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9135  // This is an explicit synchronization of all hidden helper threads, in case
9136  // a regular thread pushes a hidden helper task to a hidden helper thread
9137  // that has not been awakened even once since the main thread released them
9138  // after creating the team.
9139  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9140  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9141  __kmp_hidden_helper_threads_num)
9142  ;
9143 
9144  // If main thread, then wait for signal
9145  if (__kmpc_master(nullptr, *gtid)) {
9146  // First, unset the initial state and release the initial thread
9147  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9148  __kmp_hidden_helper_initz_release();
9149  __kmp_hidden_helper_main_thread_wait();
9150  // Now wake up all worker threads
9151  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9152  __kmp_hidden_helper_worker_thread_signal();
9153  }
9154  }
9155 }
9156 } // namespace
9157 
9158 void __kmp_hidden_helper_threads_initz_routine() {
9159  // Create a new root for hidden helper team/threads
9160  const int gtid = __kmp_register_root(TRUE);
9161  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9162  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9163  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9164  __kmp_hidden_helper_threads_num;
9165 
9166  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9167 
9168  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9169 
9170  // Set the initialization flag to FALSE
9171  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9172 
9173  __kmp_hidden_helper_threads_deinitz_release();
9174 }
9175 
9176 /* Nesting Mode:
9177  Set via KMP_NESTING_MODE, which takes an integer.
9178  Note: we skip duplicate topology levels, and skip levels with only
9179  one entity.
9180  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9181  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9182  in the topology, and initializes the number of threads at each of those
9183  levels to the number of entities at each level, respectively, below the
9184  entity at the parent level.
9185  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9186  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9187  the user to turn nesting on explicitly. This is an even more experimental
9188  option to this experimental feature, and may change or go away in the
9189  future.
9190 */
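// Illustrative example (assumed topology, not from this source): on a machine
// with 2 sockets, 8 cores per socket and 2 hardware threads per core,
// KMP_NESTING_MODE=1 would set up three nesting levels with roughly
// 2, 8 and 2 threads respectively, while KMP_NESTING_MODE=0 (the default)
// leaves nesting behavior unchanged.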
9191 
9192 // Allocate space to store nesting levels
9193 void __kmp_init_nesting_mode() {
9194  int levels = KMP_HW_LAST;
9195  __kmp_nesting_mode_nlevels = levels;
9196  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9197  for (int i = 0; i < levels; ++i)
9198  __kmp_nesting_nth_level[i] = 0;
9199  if (__kmp_nested_nth.size < levels) {
9200  __kmp_nested_nth.nth =
9201  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9202  __kmp_nested_nth.size = levels;
9203  }
9204 }
9205 
9206 // Set # threads for top levels of nesting; must be called after topology set
9207 void __kmp_set_nesting_mode_threads() {
9208  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9209 
9210  if (__kmp_nesting_mode == 1)
9211  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9212  else if (__kmp_nesting_mode > 1)
9213  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9214 
9215  if (__kmp_topology) { // use topology info
9216  int loc, hw_level;
9217  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9218  loc < __kmp_nesting_mode_nlevels;
9219  loc++, hw_level++) {
9220  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9221  if (__kmp_nesting_nth_level[loc] == 1)
9222  loc--;
9223  }
9224  // Make sure all cores are used
9225  if (__kmp_nesting_mode > 1 && loc > 1) {
9226  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9227  int num_cores = __kmp_topology->get_count(core_level);
9228  int upper_levels = 1;
9229  for (int level = 0; level < loc - 1; ++level)
9230  upper_levels *= __kmp_nesting_nth_level[level];
9231  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9232  __kmp_nesting_nth_level[loc - 1] =
9233  num_cores / __kmp_nesting_nth_level[loc - 2];
9234  }
9235  __kmp_nesting_mode_nlevels = loc;
9236  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9237  } else { // no topology info available; provide a reasonable guesstimation
9238  if (__kmp_avail_proc >= 4) {
9239  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9240  __kmp_nesting_nth_level[1] = 2;
9241  __kmp_nesting_mode_nlevels = 2;
9242  } else {
9243  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9244  __kmp_nesting_mode_nlevels = 1;
9245  }
9246  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9247  }
9248  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9249  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9250  }
9251  set__nproc(thread, __kmp_nesting_nth_level[0]);
9252  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9253  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9254  if (get__max_active_levels(thread) > 1) {
9255  // if max levels was set, set nesting mode levels to same
9256  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9257  }
9258  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9259  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9260 }
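// A standalone sketch replaying the level-filling logic above on an assumed
// topology of 2 sockets x 4 NUMA domains x 8 cores x 2 threads with
// KMP_NESTING_MODE=2 (both the ratios and the mode are illustrative
// assumptions, not values taken from this file):
//
//   #include <cstdio>
//   int main() {
//     const int ratio[] = {2, 4, 8, 2}; // entities per parent entity, per level
//     const int depth = 4, num_cores = 2 * 4 * 8;
//     const int nlevels = 2; // KMP_NESTING_MODE=2 caps the nesting depth
//     int nth[4], loc, hw;
//     for (loc = 0, hw = 0; hw < depth && loc < nlevels; loc++, hw++) {
//       nth[loc] = ratio[hw];
//       if (nth[loc] == 1) // skip levels with a single entity
//         loc--;
//     }
//     // Mirror the "make sure all cores are used" fix-up: 2 * 4 = 8 threads
//     // would leave cores idle, so the last level widens to 64 / 2 = 32.
//     if (nlevels > 1 && loc > 1) {
//       int upper = 1;
//       for (int l = 0; l < loc - 1; ++l)
//         upper *= nth[l];
//       if (upper * nth[loc - 1] < num_cores)
//         nth[loc - 1] = num_cores / nth[loc - 2];
//     }
//     std::printf("levels=%d nth[0]=%d nth[1]=%d\n", loc, nth[0], nth[1]);
//     // prints: levels=2 nth[0]=2 nth[1]=32
//     return 0;
//   }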
9261 
9262 // Empty symbols to export (see exports_so.txt) when feature is disabled
9263 extern "C" {
9264 #if !KMP_STATS_ENABLED
9265 void __kmp_reset_stats() {}
9266 #endif
9267 #if !USE_DEBUGGER
9268 int __kmp_omp_debug_struct_info = FALSE;
9269 int __kmp_debugging = FALSE;
9270 #endif
9271 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9272 void __kmp_itt_fini_ittlib() {}
9273 void __kmp_itt_init_ittlib() {}
9274 #endif
9275 }
9276 
9277 // end of file