LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 #if OMPD_SUPPORT
35 #include "ompd-specific.h"
36 #endif
37 
38 #if OMP_PROFILING_SUPPORT
39 #include "llvm/Support/TimeProfiler.h"
40 static char *ProfileTraceFile = nullptr;
41 #endif
42 
43 /* these are temporary issues to be dealt with */
44 #define KMP_USE_PRCTL 0
45 
46 #if KMP_OS_WINDOWS
47 #include <process.h>
48 #endif
49 
50 #if KMP_OS_WINDOWS
51 // Windows does not need these include files because it does not use shared memory
52 #else
53 #include <sys/mman.h>
54 #include <sys/stat.h>
55 #include <fcntl.h>
56 #define SHM_SIZE 1024
57 #endif
58 
59 #if defined(KMP_GOMP_COMPAT)
60 char const __kmp_version_alt_comp[] =
61  KMP_VERSION_PREFIX "alternative compiler support: yes";
62 #endif /* defined(KMP_GOMP_COMPAT) */
63 
64 char const __kmp_version_omp_api[] =
65  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66 
67 #ifdef KMP_DEBUG
68 char const __kmp_version_lock[] =
69  KMP_VERSION_PREFIX "lock type: run time selectable";
70 #endif /* KMP_DEBUG */
71 
72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73 
74 /* ------------------------------------------------------------------------ */
75 
76 #if KMP_USE_MONITOR
77 kmp_info_t __kmp_monitor;
78 #endif
79 
80 /* Forward declarations */
81 
82 void __kmp_cleanup(void);
83 
84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85  int gtid);
86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87  kmp_internal_control_t *new_icvs,
88  ident_t *loc);
89 #if KMP_AFFINITY_SUPPORTED
90 static void __kmp_partition_places(kmp_team_t *team,
91  int update_master_only = 0);
92 #endif
93 static void __kmp_do_serial_initialize(void);
94 void __kmp_fork_barrier(int gtid, int tid);
95 void __kmp_join_barrier(int gtid);
96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97  kmp_internal_control_t *new_icvs, ident_t *loc);
98 
99 #ifdef USE_LOAD_BALANCE
100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101 #endif
102 
103 static int __kmp_expand_threads(int nNeed);
104 #if KMP_OS_WINDOWS
105 static int __kmp_unregister_root_other_thread(int gtid);
106 #endif
107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109 
110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111  int new_nthreads);
112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113 
114 /* Calculate the identifier of the current thread */
115 /* fast (and somewhat portable) way to get a unique identifier for the executing
116  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117 int __kmp_get_global_thread_id() {
118  int i;
119  kmp_info_t **other_threads;
120  size_t stack_data;
121  char *stack_addr;
122  size_t stack_size;
123  char *stack_base;
124 
125  KA_TRACE(
126  1000,
127  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128  __kmp_nth, __kmp_all_nth));
129 
130  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133  __kmp_init_gtid for this to work. */
134 
135  if (!TCR_4(__kmp_init_gtid))
136  return KMP_GTID_DNE;
137 
138 #ifdef KMP_TDATA_GTID
139  if (TCR_4(__kmp_gtid_mode) >= 3) {
140  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141  return __kmp_gtid;
142  }
143 #endif
144  if (TCR_4(__kmp_gtid_mode) >= 2) {
145  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146  return __kmp_gtid_get_specific();
147  }
148  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149 
150  stack_addr = (char *)&stack_data;
151  other_threads = __kmp_threads;
152 
153  /* ATT: The code below is a source of potential bugs due to unsynchronized
154  access to __kmp_threads array. For example:
155  1. Current thread loads other_threads[i] to thr and checks it, it is
156  non-NULL.
157  2. Current thread is suspended by OS.
158  3. Another thread unregisters and finishes (debug versions of free()
159  may fill memory with something like 0xEF).
160  4. Current thread is resumed.
161  5. Current thread reads junk from *thr.
162  TODO: Fix it. --ln */
163 
164  for (i = 0; i < __kmp_threads_capacity; i++) {
165 
166  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167  if (!thr)
168  continue;
169 
170  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172 
173  /* stack grows down -- search through all of the active threads */
174 
175  if (stack_addr <= stack_base) {
176  size_t stack_diff = stack_base - stack_addr;
177 
178  if (stack_diff <= stack_size) {
179  /* The only way we can be closer than the allocated */
180  /* stack size is if we are running on this thread. */
181  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182  return i;
183  }
184  }
185  }
186 
187  /* get specific to try and determine our gtid */
188  KA_TRACE(1000,
189  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190  "thread, using TLS\n"));
191  i = __kmp_gtid_get_specific();
192 
193  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194 
195  /* if we haven't been assigned a gtid, then return the error code */
196  if (i < 0)
197  return i;
198 
199  /* dynamically updated stack window for uber threads to avoid get_specific
200  call */
201  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202  KMP_FATAL(StackOverflow, i);
203  }
204 
205  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206  if (stack_addr > stack_base) {
207  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210  stack_base);
211  } else {
212  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213  stack_base - stack_addr);
214  }
215 
216  /* Reprint stack bounds for ubermaster since they have been refined */
217  if (__kmp_storage_map) {
218  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221  other_threads[i]->th.th_info.ds.ds_stacksize,
222  "th_%d stack (refinement)", i);
223  }
224  return i;
225 }
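/* A minimal, self-contained sketch of the stack-window test used by the
   internal algorithm above, assuming a downward-growing stack. The helper
   names (stack_window_contains, probe) are illustrative, not part of the
   runtime. Guarded out so it is not compiled. */
#if 0
#include <cstddef>

// Returns true if 'addr' lies inside [stack_base - stack_size, stack_base].
static bool stack_window_contains(const char *addr, const char *stack_base,
                                  size_t stack_size) {
  if (addr > stack_base)
    return false; // above the recorded base: cannot belong to this thread
  size_t diff = (size_t)(stack_base - addr);
  return diff <= stack_size; // within the recorded window
}

// Usage: probe the current stack position against a thread's recorded bounds.
static bool probe(const char *recorded_base, size_t recorded_size) {
  char local; // lives on the caller's stack
  return stack_window_contains(&local, recorded_base, recorded_size);
}
#endif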
226 
227 int __kmp_get_global_thread_id_reg() {
228  int gtid;
229 
230  if (!__kmp_init_serial) {
231  gtid = KMP_GTID_DNE;
232  } else
233 #ifdef KMP_TDATA_GTID
234  if (TCR_4(__kmp_gtid_mode) >= 3) {
235  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236  gtid = __kmp_gtid;
237  } else
238 #endif
239  if (TCR_4(__kmp_gtid_mode) >= 2) {
240  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241  gtid = __kmp_gtid_get_specific();
242  } else {
243  KA_TRACE(1000,
244  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245  gtid = __kmp_get_global_thread_id();
246  }
247 
248  /* we must be a new uber master sibling thread */
249  if (gtid == KMP_GTID_DNE) {
250  KA_TRACE(10,
251  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252  "Registering a new gtid.\n"));
253  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254  if (!__kmp_init_serial) {
255  __kmp_do_serial_initialize();
256  gtid = __kmp_gtid_get_specific();
257  } else {
258  gtid = __kmp_register_root(FALSE);
259  }
260  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262  }
263 
264  KMP_DEBUG_ASSERT(gtid >= 0);
265 
266  return gtid;
267 }
268 
269 /* caller must hold forkjoin_lock */
270 void __kmp_check_stack_overlap(kmp_info_t *th) {
271  int f;
272  char *stack_beg = NULL;
273  char *stack_end = NULL;
274  int gtid;
275 
276  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277  if (__kmp_storage_map) {
278  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280 
281  gtid = __kmp_gtid_from_thread(th);
282 
283  if (gtid == KMP_GTID_MONITOR) {
284  __kmp_print_storage_map_gtid(
285  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286  "th_%s stack (%s)", "mon",
287  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288  } else {
289  __kmp_print_storage_map_gtid(
290  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291  "th_%d stack (%s)", gtid,
292  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293  }
294  }
295 
296  /* No point in checking ubermaster threads since they use refinement and
297  * cannot overlap */
298  gtid = __kmp_gtid_from_thread(th);
299  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300  KA_TRACE(10,
301  ("__kmp_check_stack_overlap: performing extensive checking\n"));
302  if (stack_beg == NULL) {
303  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305  }
306 
307  for (f = 0; f < __kmp_threads_capacity; f++) {
308  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309 
310  if (f_th && f_th != th) {
311  char *other_stack_end =
312  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313  char *other_stack_beg =
314  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317 
318  /* Print the other stack values before the abort */
319  if (__kmp_storage_map)
320  __kmp_print_storage_map_gtid(
321  -1, other_stack_beg, other_stack_end,
322  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324 
325  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326  __kmp_msg_null);
327  }
328  }
329  }
330  }
331  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332 }
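/* A small sketch of the overlap test applied above: two stacks conflict when
   either endpoint of one falls strictly inside the other's [beg, end) range.
   Purely illustrative; the helper name is not part of the runtime. */
#if 0
static bool stacks_overlap(const char *beg, const char *end,
                           const char *other_beg, const char *other_end) {
  return (beg > other_beg && beg < other_end) ||
         (end > other_beg && end < other_end);
}
#endif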
333 
334 /* ------------------------------------------------------------------------ */
335 
336 void __kmp_infinite_loop(void) {
337  static int done = FALSE;
338 
339  while (!done) {
340  KMP_YIELD(TRUE);
341  }
342 }
343 
344 #define MAX_MESSAGE 512
345 
346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347  char const *format, ...) {
348  char buffer[MAX_MESSAGE];
349  va_list ap;
350 
351  va_start(ap, format);
352  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353  p2, (unsigned long)size, format);
354  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355  __kmp_vprintf(kmp_err, buffer, ap);
356 #if KMP_PRINT_DATA_PLACEMENT
357  int node;
358  if (gtid >= 0) {
359  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360  if (__kmp_storage_map_verbose) {
361  node = __kmp_get_host_node(p1);
362  if (node < 0) /* doesn't work, so don't try this next time */
363  __kmp_storage_map_verbose = FALSE;
364  else {
365  char *last;
366  int lastNode;
367  int localProc = __kmp_get_cpu_from_gtid(gtid);
368 
369  const int page_size = KMP_GET_PAGE_SIZE();
370 
371  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373  if (localProc >= 0)
374  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375  localProc >> 1);
376  else
377  __kmp_printf_no_lock(" GTID %d\n", gtid);
378 #if KMP_USE_PRCTL
379  /* The more elaborate format is disabled for now because of the prctl
380  * hanging bug. */
381  do {
382  last = p1;
383  lastNode = node;
384  /* This loop collates adjacent pages with the same host node. */
385  do {
386  (char *)p1 += page_size;
387  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389  lastNode);
390  } while (p1 <= p2);
391 #else
392  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393  (char *)p1 + (page_size - 1),
394  __kmp_get_host_node(p1));
395  if (p1 < p2) {
396  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397  (char *)p2 + (page_size - 1),
398  __kmp_get_host_node(p2));
399  }
400 #endif
401  }
402  }
403  } else
404  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405  }
406 #endif /* KMP_PRINT_DATA_PLACEMENT */
407  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408 }
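/* Sketch of the page-boundary arithmetic used in the data-placement branch
   above: round an address down to the start of its page by masking off the
   low bits, assuming page_size is a power of two. Illustrative only. */
#if 0
#include <cstddef>
#include <cstdint>

static void *page_align_down(void *p, size_t page_size) {
  return (void *)((uintptr_t)p & ~((uintptr_t)page_size - 1));
}
// Example: with a 4096-byte page, 0x12345678 maps down to 0x12345000.
#endif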
409 
410 void __kmp_warn(char const *format, ...) {
411  char buffer[MAX_MESSAGE];
412  va_list ap;
413 
414  if (__kmp_generate_warnings == kmp_warnings_off) {
415  return;
416  }
417 
418  va_start(ap, format);
419 
420  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422  __kmp_vprintf(kmp_err, buffer, ap);
423  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424 
425  va_end(ap);
426 }
427 
428 void __kmp_abort_process() {
429  // Later threads may stall here, but that's ok because abort() will kill them.
430  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431 
432  if (__kmp_debug_buf) {
433  __kmp_dump_debug_buffer();
434  }
435 
436  if (KMP_OS_WINDOWS) {
437  // Let other threads know of abnormal termination and prevent deadlock
438  // if abort happened during library initialization or shutdown
439  __kmp_global.g.g_abort = SIGABRT;
440 
441  /* On Windows* OS, by default abort() causes a pop-up error box, which stalls
442  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
443  boxes. _set_abort_behavior() works well, but this function is not
444  available in VS7 (this is not a problem for the DLL, but it is a problem for
445  the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does not
446  help, at least in some versions of the MS C RTL.
447 
448  It seems the following sequence is the only way to simulate abort() and
449  avoid the pop-up error box. */
450  raise(SIGABRT);
451  _exit(3); // Just in case, if signal ignored, exit anyway.
452  } else {
453  __kmp_unregister_library();
454  abort();
455  }
456 
457  __kmp_infinite_loop();
458  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459 
460 } // __kmp_abort_process
461 
462 void __kmp_abort_thread(void) {
463  // TODO: Eliminate g_abort global variable and this function.
464  // In case of abort just call abort(), it will kill all the threads.
465  __kmp_infinite_loop();
466 } // __kmp_abort_thread
467 
468 /* Print out the storage map for the major kmp_info_t thread data structures
469  that are allocated together. */
470 
471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473  gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477 
478  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479  sizeof(kmp_local_t), "th_%d.th_local", gtid);
480 
481  __kmp_print_storage_map_gtid(
482  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484 
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486  &thr->th.th_bar[bs_plain_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488  gtid);
489 
490  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491  &thr->th.th_bar[bs_forkjoin_barrier + 1],
492  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493  gtid);
494 
495 #if KMP_FAST_REDUCTION_BARRIER
496  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497  &thr->th.th_bar[bs_reduction_barrier + 1],
498  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499  gtid);
500 #endif // KMP_FAST_REDUCTION_BARRIER
501 }
502 
503 /* Print out the storage map for the major kmp_team_t team data structures
504  that are allocated together. */
505 
506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507  int team_id, int num_thr) {
508  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513  &team->t.t_bar[bs_last_barrier],
514  sizeof(kmp_balign_team_t) * bs_last_barrier,
515  "%s_%d.t_bar", header, team_id);
516 
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518  &team->t.t_bar[bs_plain_barrier + 1],
519  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520  header, team_id);
521 
522  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523  &team->t.t_bar[bs_forkjoin_barrier + 1],
524  sizeof(kmp_balign_team_t),
525  "%s_%d.t_bar[forkjoin]", header, team_id);
526 
527 #if KMP_FAST_REDUCTION_BARRIER
528  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529  &team->t.t_bar[bs_reduction_barrier + 1],
530  sizeof(kmp_balign_team_t),
531  "%s_%d.t_bar[reduction]", header, team_id);
532 #endif // KMP_FAST_REDUCTION_BARRIER
533 
534  __kmp_print_storage_map_gtid(
535  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537 
538  __kmp_print_storage_map_gtid(
539  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543  &team->t.t_disp_buffer[num_disp_buff],
544  sizeof(dispatch_shared_info_t) * num_disp_buff,
545  "%s_%d.t_disp_buffer", header, team_id);
546 }
547 
548 static void __kmp_init_allocator() {
549  __kmp_init_memkind();
550  __kmp_init_target_mem();
551 }
552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553 
554 /* ------------------------------------------------------------------------ */
555 
556 #if KMP_DYNAMIC_LIB
557 #if KMP_OS_WINDOWS
558 
559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561 
562  switch (fdwReason) {
563 
564  case DLL_PROCESS_ATTACH:
565  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566 
567  return TRUE;
568 
569  case DLL_PROCESS_DETACH:
570  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571 
572  // According to Windows* documentation for DllMain entry point:
573  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574  // lpReserved == NULL when FreeLibrary() is called,
575  // lpReserved != NULL when the process is terminated.
576  // When FreeLibrary() is called, worker threads remain alive. So the
577  // runtime's state is consistent and executing proper shutdown is OK.
578  // When the process is terminated, worker threads have exited or been
579  // forcefully terminated by the OS and only the shutdown thread remains.
580  // This can leave the runtime in an inconsistent state.
581  // Hence, only attempt proper cleanup when FreeLibrary() is called.
582  // Otherwise, rely on OS to reclaim resources.
583  if (lpReserved == NULL)
584  __kmp_internal_end_library(__kmp_gtid_get_specific());
585 
586  return TRUE;
587 
588  case DLL_THREAD_ATTACH:
589  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590 
591  /* if we want to register new siblings all the time, call
592  * __kmp_get_gtid() here */
593  return TRUE;
594 
595  case DLL_THREAD_DETACH:
596  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597 
598  __kmp_internal_end_thread(__kmp_gtid_get_specific());
599  return TRUE;
600  }
601 
602  return TRUE;
603 }
604 
605 #endif /* KMP_OS_WINDOWS */
606 #endif /* KMP_DYNAMIC_LIB */
607 
608 /* __kmp_parallel_deo -- Wait until it's our turn. */
609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610  int gtid = *gtid_ref;
611 #ifdef BUILD_PARALLEL_ORDERED
612  kmp_team_t *team = __kmp_team_from_gtid(gtid);
613 #endif /* BUILD_PARALLEL_ORDERED */
614 
615  if (__kmp_env_consistency_check) {
616  if (__kmp_threads[gtid]->th.th_root->r.r_active)
617 #if KMP_USE_DYNAMIC_LOCK
618  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619 #else
620  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621 #endif
622  }
623 #ifdef BUILD_PARALLEL_ORDERED
624  if (!team->t.t_serialized) {
625  KMP_MB();
626  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627  NULL);
628  KMP_MB();
629  }
630 #endif /* BUILD_PARALLEL_ORDERED */
631 }
632 
633 /* __kmp_parallel_dxo -- Signal the next task. */
634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635  int gtid = *gtid_ref;
636 #ifdef BUILD_PARALLEL_ORDERED
637  int tid = __kmp_tid_from_gtid(gtid);
638  kmp_team_t *team = __kmp_team_from_gtid(gtid);
639 #endif /* BUILD_PARALLEL_ORDERED */
640 
641  if (__kmp_env_consistency_check) {
642  if (__kmp_threads[gtid]->th.th_root->r.r_active)
643  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644  }
645 #ifdef BUILD_PARALLEL_ORDERED
646  if (!team->t.t_serialized) {
647  KMP_MB(); /* Flush all pending memory write invalidates. */
648 
649  /* use the tid of the next thread in this team */
650  /* TODO replace with general release procedure */
651  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652 
653  KMP_MB(); /* Flush all pending memory write invalidates. */
654  }
655 #endif /* BUILD_PARALLEL_ORDERED */
656 }
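/* A compact sketch of the ordered "ticket" scheme implemented by
   __kmp_parallel_deo/__kmp_parallel_dxo above: each thread waits until the
   shared counter equals its tid, runs its ordered chunk, then hands the turn
   to (tid + 1) % nproc. Uses std::atomic instead of the runtime's
   KMP_WAIT/KMP_MB machinery; illustrative only. */
#if 0
#include <atomic>
#include <thread>

static void ordered_section(std::atomic<int> &turn, int tid, int nproc,
                            void (*body)(int)) {
  while (turn.load(std::memory_order_acquire) != tid)
    std::this_thread::yield(); // wait until it is our turn
  body(tid);                   // the ordered work
  turn.store((tid + 1) % nproc, std::memory_order_release); // signal next
}
#endif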
657 
658 /* ------------------------------------------------------------------------ */
659 /* The BARRIER for a SINGLE process section is always explicit */
660 
661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662  int status;
663  kmp_info_t *th;
664  kmp_team_t *team;
665 
666  if (!TCR_4(__kmp_init_parallel))
667  __kmp_parallel_initialize();
668  __kmp_resume_if_soft_paused();
669 
670  th = __kmp_threads[gtid];
671  team = th->th.th_team;
672  status = 0;
673 
674  th->th.th_ident = id_ref;
675 
676  if (team->t.t_serialized) {
677  status = 1;
678  } else {
679  kmp_int32 old_this = th->th.th_local.this_construct;
680 
681  ++th->th.th_local.this_construct;
682  /* try to set team count to thread count--success means thread got the
683  single block */
684  /* TODO: Should this be acquire or release? */
685  if (team->t.t_construct == old_this) {
686  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687  th->th.th_local.this_construct);
688  }
689 #if USE_ITT_BUILD
690  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692  team->t.t_active_level == 1) {
693  // Only report metadata by primary thread of active team at level 1
694  __kmp_itt_metadata_single(id_ref);
695  }
696 #endif /* USE_ITT_BUILD */
697  }
698 
699  if (__kmp_env_consistency_check) {
700  if (status && push_ws) {
701  __kmp_push_workshare(gtid, ct_psingle, id_ref);
702  } else {
703  __kmp_check_workshare(gtid, ct_psingle, id_ref);
704  }
705  }
706 #if USE_ITT_BUILD
707  if (status) {
708  __kmp_itt_single_start(gtid);
709  }
710 #endif /* USE_ITT_BUILD */
711  return status;
712 }
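/* Sketch of the winner-selection step in __kmp_enter_single above: every
   thread bumps a private construct counter and attempts one compare-and-swap
   on the shared team counter; exactly one thread succeeds and executes the
   single block. std::atomic stands in for __kmp_atomic_compare_store_acq;
   illustrative only. */
#if 0
#include <atomic>

static bool try_enter_single(std::atomic<int> &team_construct,
                             int &my_construct) {
  int old_val = my_construct;
  ++my_construct; // private count of constructs seen by this thread
  // Only one thread can move the team counter from old_val to old_val + 1.
  return team_construct.compare_exchange_strong(old_val, my_construct,
                                                std::memory_order_acquire);
}
#endif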
713 
714 void __kmp_exit_single(int gtid) {
715 #if USE_ITT_BUILD
716  __kmp_itt_single_end(gtid);
717 #endif /* USE_ITT_BUILD */
718  if (__kmp_env_consistency_check)
719  __kmp_pop_workshare(gtid, ct_psingle, NULL);
720 }
721 
722 /* Determine whether we can go parallel or must use a serialized parallel region,
723  * and how many threads we can use.
724  * set_nthreads is the number of threads requested for the team.
725  * Returns 1 if we should serialize or only use one thread,
726  * otherwise the number of threads to use.
727  * The forkjoin lock is held by the caller. */
728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729  int master_tid, int set_nthreads,
730  int enter_teams) {
731  int capacity;
732  int new_nthreads;
733  KMP_DEBUG_ASSERT(__kmp_init_serial);
734  KMP_DEBUG_ASSERT(root && parent_team);
735  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736 
737  // If dyn-var is set, dynamically adjust the number of desired threads,
738  // according to the method specified by dynamic_mode.
739  new_nthreads = set_nthreads;
740  if (!get__dynamic_2(parent_team, master_tid)) {
741  ;
742  }
743 #ifdef USE_LOAD_BALANCE
744  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746  if (new_nthreads == 1) {
747  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748  "reservation to 1 thread\n",
749  master_tid));
750  return 1;
751  }
752  if (new_nthreads < set_nthreads) {
753  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754  "reservation to %d threads\n",
755  master_tid, new_nthreads));
756  }
757  }
758 #endif /* USE_LOAD_BALANCE */
759  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760  new_nthreads = __kmp_avail_proc - __kmp_nth +
761  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762  if (new_nthreads <= 1) {
763  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764  "reservation to 1 thread\n",
765  master_tid));
766  return 1;
767  }
768  if (new_nthreads < set_nthreads) {
769  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770  "reservation to %d threads\n",
771  master_tid, new_nthreads));
772  } else {
773  new_nthreads = set_nthreads;
774  }
775  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776  if (set_nthreads > 2) {
777  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778  new_nthreads = (new_nthreads % set_nthreads) + 1;
779  if (new_nthreads == 1) {
780  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781  "reservation to 1 thread\n",
782  master_tid));
783  return 1;
784  }
785  if (new_nthreads < set_nthreads) {
786  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787  "reservation to %d threads\n",
788  master_tid, new_nthreads));
789  }
790  }
791  } else {
792  KMP_ASSERT(0);
793  }
794 
795  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796  if (__kmp_nth + new_nthreads -
797  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798  __kmp_max_nth) {
799  int tl_nthreads = __kmp_max_nth - __kmp_nth +
800  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801  if (tl_nthreads <= 0) {
802  tl_nthreads = 1;
803  }
804 
805  // If dyn-var is false, emit a 1-time warning.
806  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807  __kmp_reserve_warn = 1;
808  __kmp_msg(kmp_ms_warning,
809  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811  }
812  if (tl_nthreads == 1) {
813  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814  "reduced reservation to 1 thread\n",
815  master_tid));
816  return 1;
817  }
818  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819  "reservation to %d threads\n",
820  master_tid, tl_nthreads));
821  new_nthreads = tl_nthreads;
822  }
823 
824  // Respect OMP_THREAD_LIMIT
825  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827  if (cg_nthreads + new_nthreads -
828  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829  max_cg_threads) {
830  int tl_nthreads = max_cg_threads - cg_nthreads +
831  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832  if (tl_nthreads <= 0) {
833  tl_nthreads = 1;
834  }
835 
836  // If dyn-var is false, emit a 1-time warning.
837  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838  __kmp_reserve_warn = 1;
839  __kmp_msg(kmp_ms_warning,
840  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842  }
843  if (tl_nthreads == 1) {
844  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845  "reduced reservation to 1 thread\n",
846  master_tid));
847  return 1;
848  }
849  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850  "reservation to %d threads\n",
851  master_tid, tl_nthreads));
852  new_nthreads = tl_nthreads;
853  }
854 
855  // Check if the threads array is large enough, or needs expanding.
856  // See comment in __kmp_register_root() about the adjustment if
857  // __kmp_threads[0] == NULL.
858  capacity = __kmp_threads_capacity;
859  if (TCR_PTR(__kmp_threads[0]) == NULL) {
860  --capacity;
861  }
862  // If it is not for initializing the hidden helper team, we need to take
863  // __kmp_hidden_helper_threads_num out of the capacity because it is included
864  // in __kmp_threads_capacity.
865  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866  capacity -= __kmp_hidden_helper_threads_num;
867  }
868  if (__kmp_nth + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870  capacity) {
871  // Expand the threads array.
872  int slotsRequired = __kmp_nth + new_nthreads -
873  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874  capacity;
875  int slotsAdded = __kmp_expand_threads(slotsRequired);
876  if (slotsAdded < slotsRequired) {
877  // The threads array was not expanded enough.
878  new_nthreads -= (slotsRequired - slotsAdded);
879  KMP_ASSERT(new_nthreads >= 1);
880 
881  // If dyn-var is false, emit a 1-time warning.
882  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883  __kmp_reserve_warn = 1;
884  if (__kmp_tp_cached) {
885  __kmp_msg(kmp_ms_warning,
886  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889  } else {
890  __kmp_msg(kmp_ms_warning,
891  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893  }
894  }
895  }
896  }
897 
898 #ifdef KMP_DEBUG
899  if (new_nthreads == 1) {
900  KC_TRACE(10,
901  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902  "dead roots and rechecking; requested %d threads\n",
903  __kmp_get_gtid(), set_nthreads));
904  } else {
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906  " %d threads\n",
907  __kmp_get_gtid(), new_nthreads, set_nthreads));
908  }
909 #endif // KMP_DEBUG
910  return new_nthreads;
911 }
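/* A worked sketch of the clamping arithmetic above, using illustrative
   numbers: with a device limit of 64 threads, 60 already running, and an
   active root (so one slot is reused by the primary thread), a request for 8
   threads is trimmed to 64 - 60 + 1 = 5. Helper name is hypothetical. */
#if 0
static int clamp_to_limit(int requested, int limit, int running,
                          int reused_from_root) {
  int allowed = limit - running + reused_from_root;
  if (allowed <= 0)
    allowed = 1; // there is always room for the primary thread
  return requested > allowed ? allowed : requested;
}
// clamp_to_limit(8, 64, 60, 1) == 5
#endif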
912 
913 /* Allocate threads from the thread pool and assign them to the new team. We are
914  assured that there are enough threads available, because we checked on that
915  earlier while holding the forkjoin lock. */
916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917  kmp_info_t *master_th, int master_gtid,
918  int fork_teams_workers) {
919  int i;
920  int use_hot_team;
921 
922  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924  KMP_MB();
925 
926  /* first, let's set up the primary thread */
927  master_th->th.th_info.ds.ds_tid = 0;
928  master_th->th.th_team = team;
929  master_th->th.th_team_nproc = team->t.t_nproc;
930  master_th->th.th_team_master = master_th;
931  master_th->th.th_team_serialized = FALSE;
932  master_th->th.th_dispatch = &team->t.t_dispatch[0];
933 
934 /* make sure we are not the optimized hot team */
935 #if KMP_NESTED_HOT_TEAMS
936  use_hot_team = 0;
937  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938  if (hot_teams) { // hot teams array is not allocated if
939  // KMP_HOT_TEAMS_MAX_LEVEL=0
940  int level = team->t.t_active_level - 1; // index in array of hot teams
941  if (master_th->th.th_teams_microtask) { // are we inside the teams?
942  if (master_th->th.th_teams_size.nteams > 1) {
943  ++level; // level was not increased in teams construct for
944  // team_of_masters
945  }
946  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947  master_th->th.th_teams_level == team->t.t_level) {
948  ++level; // level was not increased in teams construct for
949  // team_of_workers before the parallel
950  } // team->t.t_level will be increased inside parallel
951  }
952  if (level < __kmp_hot_teams_max_level) {
953  if (hot_teams[level].hot_team) {
954  // hot team has already been allocated for given level
955  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956  use_hot_team = 1; // the team is ready to use
957  } else {
958  use_hot_team = 0; // AC: threads are not allocated yet
959  hot_teams[level].hot_team = team; // remember new hot team
960  hot_teams[level].hot_team_nth = team->t.t_nproc;
961  }
962  } else {
963  use_hot_team = 0;
964  }
965  }
966 #else
967  use_hot_team = team == root->r.r_hot_team;
968 #endif
969  if (!use_hot_team) {
970 
971  /* install the primary thread */
972  team->t.t_threads[0] = master_th;
973  __kmp_initialize_info(master_th, team, 0, master_gtid);
974 
975  /* now, install the worker threads */
976  for (i = 1; i < team->t.t_nproc; i++) {
977 
978  /* fork or reallocate a new thread and install it in team */
979  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980  team->t.t_threads[i] = thr;
981  KMP_DEBUG_ASSERT(thr);
982  KMP_DEBUG_ASSERT(thr->th.th_team == team);
983  /* align team and thread arrived states */
984  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985  "T#%d(%d:%d) join =%llu, plain=%llu\n",
986  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989  team->t.t_bar[bs_plain_barrier].b_arrived));
990  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991  thr->th.th_teams_level = master_th->th.th_teams_level;
992  thr->th.th_teams_size = master_th->th.th_teams_size;
993  { // Initialize threads' barrier data.
994  int b;
995  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996  for (b = 0; b < bs_last_barrier; ++b) {
997  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999 #if USE_DEBUGGER
1000  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001 #endif
1002  }
1003  }
1004  }
1005 
1006 #if KMP_AFFINITY_SUPPORTED
1007  // Do not partition the places list for teams construct workers who
1008  // haven't actually been forked to do real work yet. This partitioning
1009  // will take place in the parallel region nested within the teams construct.
1010  if (!fork_teams_workers) {
1011  __kmp_partition_places(team);
1012  }
1013 #endif
1014  }
1015 
1016  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1017  for (i = 0; i < team->t.t_nproc; i++) {
1018  kmp_info_t *thr = team->t.t_threads[i];
1019  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1020  thr->th.th_prev_level != team->t.t_level) {
1021  team->t.t_display_affinity = 1;
1022  break;
1023  }
1024  }
1025  }
1026 
1027  KMP_MB();
1028 }
1029 
1030 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1031 // Propagate any changes to the floating point control registers out to the team.
1032 // We try to avoid unnecessary writes to the relevant cache line in the team
1033 // structure, so we don't make changes unless they are needed.
1034 inline static void propagateFPControl(kmp_team_t *team) {
1035  if (__kmp_inherit_fp_control) {
1036  kmp_int16 x87_fpu_control_word;
1037  kmp_uint32 mxcsr;
1038 
1039  // Get primary thread's values of FPU control flags (both X87 and vector)
1040  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1041  __kmp_store_mxcsr(&mxcsr);
1042  mxcsr &= KMP_X86_MXCSR_MASK;
1043 
1044  // There is no point looking at t_fp_control_saved here.
1045  // If it is TRUE, we still have to update the values if they are different
1046  // from those we now have. If it is FALSE we didn't save anything yet, but
1047  // our objective is the same. We have to ensure that the values in the team
1048  // are the same as those we have.
1049  // So, this code achieves what we need whether or not t_fp_control_saved is
1050  // true. By checking whether the value needs updating we avoid unnecessary
1051  // writes that would put the cache-line into a written state, causing all
1052  // threads in the team to have to read it again.
1053  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1054  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1055  // Although we don't use this value, other code in the runtime wants to know
1056  // whether it should restore them. So we must ensure it is correct.
1057  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1058  } else {
1059  // Similarly here. Don't write to this cache-line in the team structure
1060  // unless we have to.
1061  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1062  }
1063 }
1064 
1065 // Do the opposite, setting the hardware registers to the updated values from
1066 // the team.
1067 inline static void updateHWFPControl(kmp_team_t *team) {
1068  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1069  // Only reset the fp control regs if they have been changed in the team by
1070  // the parallel region that we are exiting.
1071  kmp_int16 x87_fpu_control_word;
1072  kmp_uint32 mxcsr;
1073  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1074  __kmp_store_mxcsr(&mxcsr);
1075  mxcsr &= KMP_X86_MXCSR_MASK;
1076 
1077  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1078  __kmp_clear_x87_fpu_status_word();
1079  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1080  }
1081 
1082  if (team->t.t_mxcsr != mxcsr) {
1083  __kmp_load_mxcsr(&team->t.t_mxcsr);
1084  }
1085  }
1086 }
1087 #else
1088 #define propagateFPControl(x) ((void)0)
1089 #define updateHWFPControl(x) ((void)0)
1090 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
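/* Sketch of the check-before-write idea behind KMP_CHECK_UPDATE as used in
   propagateFPControl above: skip the store when the shared field already
   holds the desired value, so the cache line is not needlessly put into a
   modified state that every other thread in the team would then have to
   re-read. Illustrative only. */
#if 0
template <typename T> static void check_update(T &shared_field, T value) {
  if (shared_field != value) // the read is cheap; avoid the write when possible
    shared_field = value;
}
#endif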
1091 
1092 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1093  int realloc); // forward declaration
1094 
1095 /* Run a parallel region that has been serialized, so it runs only in a team of
1096  the single primary thread. */
1097 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1098  kmp_info_t *this_thr;
1099  kmp_team_t *serial_team;
1100 
1101  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1102 
1103  /* Skip all this code for autopar serialized loops since it results in
1104  unacceptable overhead */
1105  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1106  return;
1107 
1108  if (!TCR_4(__kmp_init_parallel))
1109  __kmp_parallel_initialize();
1110  __kmp_resume_if_soft_paused();
1111 
1112  this_thr = __kmp_threads[global_tid];
1113  serial_team = this_thr->th.th_serial_team;
1114 
1115  /* utilize the serialized team held by this thread */
1116  KMP_DEBUG_ASSERT(serial_team);
1117  KMP_MB();
1118 
1119  if (__kmp_tasking_mode != tskm_immediate_exec) {
1120  KMP_DEBUG_ASSERT(
1121  this_thr->th.th_task_team ==
1122  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1123  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1124  NULL);
1125  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1126  "team %p, new task_team = NULL\n",
1127  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1128  this_thr->th.th_task_team = NULL;
1129  }
1130 
1131  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1132  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1133  proc_bind = proc_bind_false;
1134  } else if (proc_bind == proc_bind_default) {
1135  // No proc_bind clause was specified, so use the current value
1136  // of proc-bind-var for this parallel region.
1137  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1138  }
1139  // Reset for next parallel region
1140  this_thr->th.th_set_proc_bind = proc_bind_default;
1141 
1142 #if OMPT_SUPPORT
1143  ompt_data_t ompt_parallel_data = ompt_data_none;
1144  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1145  if (ompt_enabled.enabled &&
1146  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1147 
1148  ompt_task_info_t *parent_task_info;
1149  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1150 
1151  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1152  if (ompt_enabled.ompt_callback_parallel_begin) {
1153  int team_size = 1;
1154 
1155  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1156  &(parent_task_info->task_data), &(parent_task_info->frame),
1157  &ompt_parallel_data, team_size,
1158  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1159  }
1160  }
1161 #endif // OMPT_SUPPORT
1162 
1163  if (this_thr->th.th_team != serial_team) {
1164  // Nested level will be an index in the nested nthreads array
1165  int level = this_thr->th.th_team->t.t_level;
1166 
1167  if (serial_team->t.t_serialized) {
1168  /* this serial team was already used
1169  TODO: increase performance by making these locks more specific */
1170  kmp_team_t *new_team;
1171 
1172  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1173 
1174  new_team =
1175  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1176 #if OMPT_SUPPORT
1177  ompt_parallel_data,
1178 #endif
1179  proc_bind, &this_thr->th.th_current_task->td_icvs,
1180  0 USE_NESTED_HOT_ARG(NULL));
1181  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1182  KMP_ASSERT(new_team);
1183 
1184  /* setup new serialized team and install it */
1185  new_team->t.t_threads[0] = this_thr;
1186  new_team->t.t_parent = this_thr->th.th_team;
1187  serial_team = new_team;
1188  this_thr->th.th_serial_team = serial_team;
1189 
1190  KF_TRACE(
1191  10,
1192  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1193  global_tid, serial_team));
1194 
1195  /* TODO the above breaks the requirement that if we run out of resources,
1196  then we can still guarantee that serialized teams are ok, since we may
1197  need to allocate a new one */
1198  } else {
1199  KF_TRACE(
1200  10,
1201  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1202  global_tid, serial_team));
1203  }
1204 
1205  /* we have to initialize this serial team */
1206  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1207  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1208  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1209  serial_team->t.t_ident = loc;
1210  serial_team->t.t_serialized = 1;
1211  serial_team->t.t_nproc = 1;
1212  serial_team->t.t_parent = this_thr->th.th_team;
1213  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1214  this_thr->th.th_team = serial_team;
1215  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1216 
1217  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1218  this_thr->th.th_current_task));
1219  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1220  this_thr->th.th_current_task->td_flags.executing = 0;
1221 
1222  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1223 
1224  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1225  implicit task for each serialized task represented by
1226  team->t.t_serialized? */
1227  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1228  &this_thr->th.th_current_task->td_parent->td_icvs);
1229 
1230  // Thread value exists in the nested nthreads array for the next nested
1231  // level
1232  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1233  this_thr->th.th_current_task->td_icvs.nproc =
1234  __kmp_nested_nth.nth[level + 1];
1235  }
1236 
1237  if (__kmp_nested_proc_bind.used &&
1238  (level + 1 < __kmp_nested_proc_bind.used)) {
1239  this_thr->th.th_current_task->td_icvs.proc_bind =
1240  __kmp_nested_proc_bind.bind_types[level + 1];
1241  }
1242 
1243 #if USE_DEBUGGER
1244  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1245 #endif
1246  this_thr->th.th_info.ds.ds_tid = 0;
1247 
1248  /* set thread cache values */
1249  this_thr->th.th_team_nproc = 1;
1250  this_thr->th.th_team_master = this_thr;
1251  this_thr->th.th_team_serialized = 1;
1252 
1253  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1254  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1255  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1256 
1257  propagateFPControl(serial_team);
1258 
1259  /* check if we need to allocate dispatch buffers stack */
1260  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1261  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1262  serial_team->t.t_dispatch->th_disp_buffer =
1263  (dispatch_private_info_t *)__kmp_allocate(
1264  sizeof(dispatch_private_info_t));
1265  }
1266  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1267 
1268  KMP_MB();
1269 
1270  } else {
1271  /* this serialized team is already being used,
1272  * that's fine, just add another nested level */
1273  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1274  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1275  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1276  ++serial_team->t.t_serialized;
1277  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1278 
1279  // Nested level will be an index in the nested nthreads array
1280  int level = this_thr->th.th_team->t.t_level;
1281  // Thread value exists in the nested nthreads array for the next nested
1282  // level
1283  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1284  this_thr->th.th_current_task->td_icvs.nproc =
1285  __kmp_nested_nth.nth[level + 1];
1286  }
1287  serial_team->t.t_level++;
1288  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1289  "of serial team %p to %d\n",
1290  global_tid, serial_team, serial_team->t.t_level));
1291 
1292  /* allocate/push dispatch buffers stack */
1293  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1294  {
1295  dispatch_private_info_t *disp_buffer =
1296  (dispatch_private_info_t *)__kmp_allocate(
1297  sizeof(dispatch_private_info_t));
1298  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1299  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1300  }
1301  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1302 
1303  KMP_MB();
1304  }
1305  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1306 
1307  // Perform the display affinity functionality for
1308  // serialized parallel regions
1309  if (__kmp_display_affinity) {
1310  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1311  this_thr->th.th_prev_num_threads != 1) {
1312  // NULL means use the affinity-format-var ICV
1313  __kmp_aux_display_affinity(global_tid, NULL);
1314  this_thr->th.th_prev_level = serial_team->t.t_level;
1315  this_thr->th.th_prev_num_threads = 1;
1316  }
1317  }
1318 
1319  if (__kmp_env_consistency_check)
1320  __kmp_push_parallel(global_tid, NULL);
1321 #if OMPT_SUPPORT
1322  serial_team->t.ompt_team_info.master_return_address = codeptr;
1323  if (ompt_enabled.enabled &&
1324  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1325  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1326  OMPT_GET_FRAME_ADDRESS(0);
1327 
1328  ompt_lw_taskteam_t lw_taskteam;
1329  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1330  &ompt_parallel_data, codeptr);
1331 
1332  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1333  // don't use lw_taskteam after linking. content was swapped
1334 
1335  /* OMPT implicit task begin */
1336  if (ompt_enabled.ompt_callback_implicit_task) {
1337  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1338  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1339  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1340  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1341  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1342  __kmp_tid_from_gtid(global_tid);
1343  }
1344 
1345  /* OMPT state */
1346  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1347  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1348  OMPT_GET_FRAME_ADDRESS(0);
1349  }
1350 #endif
1351 }
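/* Sketch of the per-nesting-level dispatch-buffer stack maintained above:
   entering another serialized level pushes a freshly allocated buffer onto a
   singly linked list headed by th_disp_buffer. Field and type names are
   simplified stand-ins for the runtime's dispatch_private_info_t. */
#if 0
#include <cstdlib>

struct disp_buffer_t {
  disp_buffer_t *next;
  // ... per-level dispatch state would live here ...
};

static void push_disp_buffer(disp_buffer_t *&head) {
  disp_buffer_t *buf = (disp_buffer_t *)std::calloc(1, sizeof(disp_buffer_t));
  buf->next = head; // the new level points at the enclosing level's buffer
  head = buf;
}
#endif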
1352 
1353 /* most of the work for a fork */
1354 /* return true if we really went parallel, false if serialized */
1355 int __kmp_fork_call(ident_t *loc, int gtid,
1356  enum fork_context_e call_context, // Intel, GNU, ...
1357  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1358  kmp_va_list ap) {
1359  void **argv;
1360  int i;
1361  int master_tid;
1362  int master_this_cons;
1363  kmp_team_t *team;
1364  kmp_team_t *parent_team;
1365  kmp_info_t *master_th;
1366  kmp_root_t *root;
1367  int nthreads;
1368  int master_active;
1369  int master_set_numthreads;
1370  int level;
1371  int active_level;
1372  int teams_level;
1373 #if KMP_NESTED_HOT_TEAMS
1374  kmp_hot_team_ptr_t **p_hot_teams;
1375 #endif
1376  { // KMP_TIME_BLOCK
1377  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1378  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1379 
1380  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1381  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1382  /* Some systems prefer the stack for the root thread(s) to start with */
1383  /* some gap from the parent stack to prevent false sharing. */
1384  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1385  /* These 2 lines below are so this does not get optimized out */
1386  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1387  __kmp_stkpadding += (short)((kmp_int64)dummy);
1388  }
1389 
1390  /* initialize if needed */
1391  KMP_DEBUG_ASSERT(
1392  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1393  if (!TCR_4(__kmp_init_parallel))
1394  __kmp_parallel_initialize();
1395  __kmp_resume_if_soft_paused();
1396 
1397  /* setup current data */
1398  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1399  // shutdown
1400  parent_team = master_th->th.th_team;
1401  master_tid = master_th->th.th_info.ds.ds_tid;
1402  master_this_cons = master_th->th.th_local.this_construct;
1403  root = master_th->th.th_root;
1404  master_active = root->r.r_active;
1405  master_set_numthreads = master_th->th.th_set_nproc;
1406 
1407 #if OMPT_SUPPORT
1408  ompt_data_t ompt_parallel_data = ompt_data_none;
1409  ompt_data_t *parent_task_data;
1410  ompt_frame_t *ompt_frame;
1411  ompt_data_t *implicit_task_data;
1412  void *return_address = NULL;
1413 
1414  if (ompt_enabled.enabled) {
1415  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1416  NULL, NULL);
1417  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1418  }
1419 #endif
1420 
1421  // Assign affinity to root thread if it hasn't happened yet
1422  __kmp_assign_root_init_mask();
1423 
1424  // Nested level will be an index in the nested nthreads array
1425  level = parent_team->t.t_level;
1426  // used to launch non-serial teams even if nested is not allowed
1427  active_level = parent_team->t.t_active_level;
1428  // needed to check nesting inside the teams
1429  teams_level = master_th->th.th_teams_level;
1430 #if KMP_NESTED_HOT_TEAMS
1431  p_hot_teams = &master_th->th.th_hot_teams;
1432  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1433  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1434  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1435  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1436  // it is either actual or not needed (when active_level > 0)
1437  (*p_hot_teams)[0].hot_team_nth = 1;
1438  }
1439 #endif
1440 
1441 #if OMPT_SUPPORT
1442  if (ompt_enabled.enabled) {
1443  if (ompt_enabled.ompt_callback_parallel_begin) {
1444  int team_size = master_set_numthreads
1445  ? master_set_numthreads
1446  : get__nproc_2(parent_team, master_tid);
1447  int flags = OMPT_INVOKER(call_context) |
1448  ((microtask == (microtask_t)__kmp_teams_master)
1449  ? ompt_parallel_league
1450  : ompt_parallel_team);
1451  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1452  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1453  return_address);
1454  }
1455  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1456  }
1457 #endif
1458 
1459  master_th->th.th_ident = loc;
1460 
1461  if (master_th->th.th_teams_microtask && ap &&
1462  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1463  // AC: This is the start of a parallel region nested inside a teams construct.
1464  // The team is actual (hot); all workers are ready at the fork barrier.
1465  // No lock is needed to initialize the team a bit, then release the workers.
1466  parent_team->t.t_ident = loc;
1467  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1468  parent_team->t.t_argc = argc;
1469  argv = (void **)parent_team->t.t_argv;
1470  for (i = argc - 1; i >= 0; --i)
1471  *argv++ = va_arg(kmp_va_deref(ap), void *);
1472  // Increment our nested depth level, but do not increase the serialization count
1473  if (parent_team == master_th->th.th_serial_team) {
1474  // AC: we are in serialized parallel
1475  __kmpc_serialized_parallel(loc, gtid);
1476  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1477 
1478  if (call_context == fork_context_gnu) {
1479  // AC: need to decrement t_serialized for enquiry functions to work
1480  // correctly, will restore at join time
1481  parent_team->t.t_serialized--;
1482  return TRUE;
1483  }
1484 
1485 #if OMPD_SUPPORT
1486  parent_team->t.t_pkfn = microtask;
1487 #endif
1488 
1489 #if OMPT_SUPPORT
1490  void *dummy;
1491  void **exit_frame_p;
1492 
1493  ompt_lw_taskteam_t lw_taskteam;
1494 
1495  if (ompt_enabled.enabled) {
1496  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1497  &ompt_parallel_data, return_address);
1498  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1499 
1500  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1501  // don't use lw_taskteam after linking. content was swapped
1502 
1503  /* OMPT implicit task begin */
1504  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1505  if (ompt_enabled.ompt_callback_implicit_task) {
1506  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1507  __kmp_tid_from_gtid(gtid);
1508  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1509  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1510  implicit_task_data, 1,
1511  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1512  }
1513 
1514  /* OMPT state */
1515  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1516  } else {
1517  exit_frame_p = &dummy;
1518  }
1519 #endif
1520  // AC: need to decrement t_serialized for enquiry functions to work
1521  // correctly, will restore at join time
1522  parent_team->t.t_serialized--;
1523 
1524  {
1525  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1526  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1527  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1528 #if OMPT_SUPPORT
1529  ,
1530  exit_frame_p
1531 #endif
1532  );
1533  }
1534 
1535 #if OMPT_SUPPORT
1536  if (ompt_enabled.enabled) {
1537  *exit_frame_p = NULL;
1538  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1539  if (ompt_enabled.ompt_callback_implicit_task) {
1540  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1541  ompt_scope_end, NULL, implicit_task_data, 1,
1542  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1543  }
1544  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1545  __ompt_lw_taskteam_unlink(master_th);
1546  if (ompt_enabled.ompt_callback_parallel_end) {
1547  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1548  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1549  OMPT_INVOKER(call_context) | ompt_parallel_team,
1550  return_address);
1551  }
1552  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1553  }
1554 #endif
1555  return TRUE;
1556  }
1557 
1558  parent_team->t.t_pkfn = microtask;
1559  parent_team->t.t_invoke = invoker;
1560  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1561  parent_team->t.t_active_level++;
1562  parent_team->t.t_level++;
1563  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1564 
1565 #if OMPT_SUPPORT
1566  if (ompt_enabled.enabled) {
1567  ompt_lw_taskteam_t lw_taskteam;
1568  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1569  &ompt_parallel_data, return_address);
1570  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1571  }
1572 #endif
1573 
1574  /* Change number of threads in the team if requested */
1575  if (master_set_numthreads) { // The parallel has num_threads clause
1576  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1577  // AC: can only reduce the number of threads dynamically; cannot increase
1578  kmp_info_t **other_threads = parent_team->t.t_threads;
1579  // NOTE: if using distributed barrier, we need to run this code block
1580  // even when the team size appears not to have changed from the max.
1581  int old_proc = master_th->th.th_teams_size.nth;
1582  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
1583  bp_dist_bar) {
1584  __kmp_resize_dist_barrier(parent_team, old_proc,
1585  master_set_numthreads);
1586  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1587  }
1588  parent_team->t.t_nproc = master_set_numthreads;
1589  for (i = 0; i < master_set_numthreads; ++i) {
1590  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1591  }
1592  }
1593  // Keep extra threads hot in the team for possible next parallels
1594  master_th->th.th_set_nproc = 0;
1595  }
1596 
1597 #if USE_DEBUGGER
1598  if (__kmp_debugging) { // Let debugger override number of threads.
1599  int nth = __kmp_omp_num_threads(loc);
1600  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1601  master_set_numthreads = nth;
1602  }
1603  }
1604 #endif
1605 
1606  // Figure out the proc_bind policy for the nested parallel within teams
1607  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1608  // proc_bind_default means don't update
1609  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1610  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1611  proc_bind = proc_bind_false;
1612  } else {
1613  // No proc_bind clause specified; use current proc-bind-var
1614  if (proc_bind == proc_bind_default) {
1615  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1616  }
1617  /* else: The proc_bind policy was specified explicitly on the parallel
1618  clause.
1619  This overrides proc-bind-var for this parallel region, but does not
1620  change proc-bind-var. */
1621  // Figure the value of proc-bind-var for the child threads.
1622  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1623  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1624  master_th->th.th_current_task->td_icvs.proc_bind)) {
1625  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1626  }
1627  }
1628  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1629  // Need to change the bind-var ICV to correct value for each implicit task
1630  if (proc_bind_icv != proc_bind_default &&
1631  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1632  kmp_info_t **other_threads = parent_team->t.t_threads;
1633  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1634  other_threads[i]->th.th_current_task->td_icvs.proc_bind =
1635  proc_bind_icv;
1636  }
1637  }
1638  // Reset for next parallel region
1639  master_th->th.th_set_proc_bind = proc_bind_default;
1640 
1641 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1642  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1643  KMP_ITT_DEBUG) &&
1644  __kmp_forkjoin_frames_mode == 3 &&
1645  parent_team->t.t_active_level == 1 // only report frames at level 1
1646  && master_th->th.th_teams_size.nteams == 1) {
1647  kmp_uint64 tmp_time = __itt_get_timestamp();
1648  master_th->th.th_frame_time = tmp_time;
1649  parent_team->t.t_region_time = tmp_time;
1650  }
1651  if (__itt_stack_caller_create_ptr) {
1652  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1653  // create new stack stitching id before entering fork barrier
1654  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1655  }
1656 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657 #if KMP_AFFINITY_SUPPORTED
1658  __kmp_partition_places(parent_team);
1659 #endif
1660 
1661  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1662  "master_th=%p, gtid=%d\n",
1663  root, parent_team, master_th, gtid));
1664  __kmp_internal_fork(loc, gtid, parent_team);
1665  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1666  "master_th=%p, gtid=%d\n",
1667  root, parent_team, master_th, gtid));
1668 
1669  if (call_context == fork_context_gnu)
1670  return TRUE;
1671 
1672  /* Invoke microtask for PRIMARY thread */
1673  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1674  parent_team->t.t_id, parent_team->t.t_pkfn));
1675 
1676  if (!parent_team->t.t_invoke(gtid)) {
1677  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1678  }
1679  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1680  parent_team->t.t_id, parent_team->t.t_pkfn));
1681  KMP_MB(); /* Flush all pending memory write invalidates. */
1682 
1683  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1684 
1685  return TRUE;
1686  } // Parallel closely nested in teams construct
1687 
1688 #if KMP_DEBUG
1689  if (__kmp_tasking_mode != tskm_immediate_exec) {
1690  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1691  parent_team->t.t_task_team[master_th->th.th_task_state]);
1692  }
1693 #endif
1694 
1695  // Need this to happen before we determine the number of threads, not while
1696  // we are allocating the team
1697  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
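  // Note: enter_teams (computed below) is set when this fork either creates
  // the league of a teams construct (no varargs pointer at active level 0) or
  // starts the parallel region immediately nested inside a teams construct
  // (teams_level == level).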
1698  int enter_teams = 0;
1699  if (parent_team->t.t_active_level >=
1700  master_th->th.th_current_task->td_icvs.max_active_levels) {
1701  nthreads = 1;
1702  } else {
1703  enter_teams = ((ap == NULL && active_level == 0) ||
1704  (ap && teams_level > 0 && teams_level == level));
1705  nthreads = master_set_numthreads
1706  ? master_set_numthreads
1707  // TODO: get nproc directly from current task
1708  : get__nproc_2(parent_team, master_tid);
1709  // Check if we need to take the forkjoin lock (no need for a serialized
1710  // parallel outside of a teams construct). This code was moved here from
1711  // __kmp_reserve_threads() to speed up nested serialized parallels.
1712  if (nthreads > 1) {
1713  if ((get__max_active_levels(master_th) == 1 &&
1714  (root->r.r_in_parallel && !enter_teams)) ||
1715  (__kmp_library == library_serial)) {
1716  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1717  " threads\n",
1718  gtid, nthreads));
1719  nthreads = 1;
1720  }
1721  }
1722  if (nthreads > 1) {
1723  /* determine how many new threads we can use */
1724  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1725  /* AC: If we execute teams from a parallel region (on the host), then the
1726  teams should be created, but each can only have 1 thread if nesting is
1727  disabled. If teams is called from a serial region, then the teams and their
1728  threads should be created regardless of the nesting setting. */
1729  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1730  nthreads, enter_teams);
1731  if (nthreads == 1) {
1732  // Free the lock for single-thread execution here; for multi-thread
1733  // execution it will be freed later, after the team of threads has been
1734  // created and initialized
1735  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1736  }
1737  }
1738  }
1739  KMP_DEBUG_ASSERT(nthreads > 0);
1740 
1741  // If we temporarily changed the set number of threads then restore it now
1742  master_th->th.th_set_nproc = 0;
1743 
1744  /* create a serialized parallel region? */
1745  if (nthreads == 1) {
1746 /* josh todo: hypothetical question: what do we do for OS X*? */
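// The microtask argument array lives on the stack: a C99-style VLA on the
// Linux x86/ARM family of targets below, KMP_ALLOCA everywhere else.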
1747 #if KMP_OS_LINUX && \
1748  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1749  void *args[argc];
1750 #else
1751  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1752 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1753  KMP_ARCH_AARCH64) */
1754 
1755  KA_TRACE(20,
1756  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1757 
1758  __kmpc_serialized_parallel(loc, gtid);
1759 
1760 #if OMPD_SUPPORT
1761  master_th->th.th_serial_team->t.t_pkfn = microtask;
1762 #endif
1763 
1764  if (call_context == fork_context_intel) {
1765  /* TODO: improve this; ideally the compiler itself would pass the args. */
1766  master_th->th.th_serial_team->t.t_ident = loc;
1767  if (!ap) {
1768  // revert change made in __kmpc_serialized_parallel()
1769  master_th->th.th_serial_team->t.t_level--;
1770  // Get args from parent team for teams construct
1771 
1772 #if OMPT_SUPPORT
1773  void *dummy;
1774  void **exit_frame_p;
1775  ompt_task_info_t *task_info;
1776 
1777  ompt_lw_taskteam_t lw_taskteam;
1778 
1779  if (ompt_enabled.enabled) {
1780  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1781  &ompt_parallel_data, return_address);
1782 
1783  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1784  // don't use lw_taskteam after linking. content was swapped
1785 
1786  task_info = OMPT_CUR_TASK_INFO(master_th);
1787  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1788  if (ompt_enabled.ompt_callback_implicit_task) {
1789  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1790  __kmp_tid_from_gtid(gtid);
1791  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1792  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1793  &(task_info->task_data), 1,
1794  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1795  ompt_task_implicit);
1796  }
1797 
1798  /* OMPT state */
1799  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1800  } else {
1801  exit_frame_p = &dummy;
1802  }
1803 #endif
1804 
1805  {
1806  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1807  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1808  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1809  parent_team->t.t_argv
1810 #if OMPT_SUPPORT
1811  ,
1812  exit_frame_p
1813 #endif
1814  );
1815  }
1816 
1817 #if OMPT_SUPPORT
1818  if (ompt_enabled.enabled) {
1819  *exit_frame_p = NULL;
1820  if (ompt_enabled.ompt_callback_implicit_task) {
1821  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822  ompt_scope_end, NULL, &(task_info->task_data), 1,
1823  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1824  ompt_task_implicit);
1825  }
1826  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1827  __ompt_lw_taskteam_unlink(master_th);
1828  if (ompt_enabled.ompt_callback_parallel_end) {
1829  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1830  &ompt_parallel_data, parent_task_data,
1831  OMPT_INVOKER(call_context) | ompt_parallel_team,
1832  return_address);
1833  }
1834  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1835  }
1836 #endif
1837  } else if (microtask == (microtask_t)__kmp_teams_master) {
1838  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1839  master_th->th.th_serial_team);
1840  team = master_th->th.th_team;
1841  // team->t.t_pkfn = microtask;
1842  team->t.t_invoke = invoker;
1843  __kmp_alloc_argv_entries(argc, team, TRUE);
1844  team->t.t_argc = argc;
1845  argv = (void **)team->t.t_argv;
1846  if (ap) {
1847  for (i = argc - 1; i >= 0; --i)
1848  *argv++ = va_arg(kmp_va_deref(ap), void *);
1849  } else {
1850  for (i = 0; i < argc; ++i)
1851  // Get args from parent team for teams construct
1852  argv[i] = parent_team->t.t_argv[i];
1853  }
1854  // AC: revert change made in __kmpc_serialized_parallel()
1855  // because initial code in teams should have level=0
1856  team->t.t_level--;
1857  // AC: call special invoker for outer "parallel" of teams construct
1858  invoker(gtid);
1859 #if OMPT_SUPPORT
1860  if (ompt_enabled.enabled) {
1861  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1862  if (ompt_enabled.ompt_callback_implicit_task) {
1863  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1864  ompt_scope_end, NULL, &(task_info->task_data), 0,
1865  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1866  }
1867  if (ompt_enabled.ompt_callback_parallel_end) {
1868  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1869  &ompt_parallel_data, parent_task_data,
1870  OMPT_INVOKER(call_context) | ompt_parallel_league,
1871  return_address);
1872  }
1873  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874  }
1875 #endif
1876  } else {
1877  argv = args;
1878  for (i = argc - 1; i >= 0; --i)
1879  *argv++ = va_arg(kmp_va_deref(ap), void *);
1880  KMP_MB();
1881 
1882 #if OMPT_SUPPORT
1883  void *dummy;
1884  void **exit_frame_p;
1885  ompt_task_info_t *task_info;
1886 
1887  ompt_lw_taskteam_t lw_taskteam;
1888 
1889  if (ompt_enabled.enabled) {
1890  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1891  &ompt_parallel_data, return_address);
1892  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1893  // don't use lw_taskteam after linking. content was swapped
1894  task_info = OMPT_CUR_TASK_INFO(master_th);
1895  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1896 
1897  /* OMPT implicit task begin */
1898  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1899  if (ompt_enabled.ompt_callback_implicit_task) {
1900  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1901  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1902  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1903  ompt_task_implicit);
1904  OMPT_CUR_TASK_INFO(master_th)->thread_num =
1905  __kmp_tid_from_gtid(gtid);
1906  }
1907 
1908  /* OMPT state */
1909  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1910  } else {
1911  exit_frame_p = &dummy;
1912  }
1913 #endif
1914 
1915  {
1916  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1917  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1918  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1919 #if OMPT_SUPPORT
1920  ,
1921  exit_frame_p
1922 #endif
1923  );
1924  }
1925 
1926 #if OMPT_SUPPORT
1927  if (ompt_enabled.enabled) {
1928  *exit_frame_p = NULL;
1929  if (ompt_enabled.ompt_callback_implicit_task) {
1930  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1931  ompt_scope_end, NULL, &(task_info->task_data), 1,
1932  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1933  ompt_task_implicit);
1934  }
1935 
1936  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1937  __ompt_lw_taskteam_unlink(master_th);
1938  if (ompt_enabled.ompt_callback_parallel_end) {
1939  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1940  &ompt_parallel_data, parent_task_data,
1941  OMPT_INVOKER(call_context) | ompt_parallel_team,
1942  return_address);
1943  }
1944  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1945  }
1946 #endif
1947  }
1948  } else if (call_context == fork_context_gnu) {
1949 #if OMPT_SUPPORT
1950  ompt_lw_taskteam_t lwt;
1951  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1952  return_address);
1953 
1954  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1955  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1956 // don't use lw_taskteam after linking. content was swapped
1957 #endif
1958 
1959  // we were called from GNU native code
1960  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1961  return FALSE;
1962  } else {
1963  KMP_ASSERT2(call_context < fork_context_last,
1964  "__kmp_fork_call: unknown fork_context parameter");
1965  }
1966 
1967  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1968  KMP_MB();
1969  return FALSE;
1970  } // if (nthreads == 1)
1971 
1972  // GEH: only modify the executing flag in the case when not serialized;
1973  // the serialized case is handled in __kmpc_serialized_parallel
1974  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1975  "curtask=%p, curtask_max_aclevel=%d\n",
1976  parent_team->t.t_active_level, master_th,
1977  master_th->th.th_current_task,
1978  master_th->th.th_current_task->td_icvs.max_active_levels));
1979  // TODO: GEH - cannot do this assertion because root thread not set up as
1980  // executing
1981  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1982  master_th->th.th_current_task->td_flags.executing = 0;
1983 
1984  if (!master_th->th.th_teams_microtask || level > teams_level) {
1985  /* Increment our nested depth level */
1986  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1987  }
1988 
1989  // See if we need to make a copy of the ICVs.
1990  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1991  if ((level + 1 < __kmp_nested_nth.used) &&
1992  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1993  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1994  } else {
1995  nthreads_icv = 0; // don't update
1996  }
1997 
1998  // Figure out the proc_bind_policy for the new team.
1999  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2000  // proc_bind_default means don't update
2001  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2002  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2003  proc_bind = proc_bind_false;
2004  } else {
2005  // No proc_bind clause specified; use current proc-bind-var for this
2006  // parallel region
2007  if (proc_bind == proc_bind_default) {
2008  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2009  }
2010  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2011  if (master_th->th.th_teams_microtask &&
2012  microtask == (microtask_t)__kmp_teams_master) {
2013  proc_bind = __kmp_teams_proc_bind;
2014  }
2015  /* else: The proc_bind policy was specified explicitly on the parallel clause.
2016  This overrides proc-bind-var for this parallel region, but does not
2017  change proc-bind-var. */
2018  // Figure the value of proc-bind-var for the child threads.
2019  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2020  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2021  master_th->th.th_current_task->td_icvs.proc_bind)) {
2022  // Do not modify the proc-bind ICV for the two teams-construct forks;
2023  // they just let the proc-bind ICV pass through
2024  if (!master_th->th.th_teams_microtask ||
2025  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2026  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2027  }
2028  }
2029 
2030  // Reset for next parallel region
2031  master_th->th.th_set_proc_bind = proc_bind_default;
2032 
2033  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2034  kmp_internal_control_t new_icvs;
2035  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2036  new_icvs.next = NULL;
2037  if (nthreads_icv > 0) {
2038  new_icvs.nproc = nthreads_icv;
2039  }
2040  if (proc_bind_icv != proc_bind_default) {
2041  new_icvs.proc_bind = proc_bind_icv;
2042  }
2043 
2044  /* allocate a new parallel team */
2045  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2046  team = __kmp_allocate_team(root, nthreads, nthreads,
2047 #if OMPT_SUPPORT
2048  ompt_parallel_data,
2049 #endif
2050  proc_bind, &new_icvs,
2051  argc USE_NESTED_HOT_ARG(master_th));
2052  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2053  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2054  } else {
2055  /* allocate a new parallel team */
2056  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2057  team = __kmp_allocate_team(root, nthreads, nthreads,
2058 #if OMPT_SUPPORT
2059  ompt_parallel_data,
2060 #endif
2061  proc_bind,
2062  &master_th->th.th_current_task->td_icvs,
2063  argc USE_NESTED_HOT_ARG(master_th));
2064  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2065  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2066  &master_th->th.th_current_task->td_icvs);
2067  }
2068  KF_TRACE(
2069  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2070 
2071  /* setup the new team */
2072  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2073  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2074  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2075  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2076  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2077 #if OMPT_SUPPORT
2078  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2079  return_address);
2080 #endif
2081  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2082  // TODO: parent_team->t.t_level == INT_MAX ???
2083  if (!master_th->th.th_teams_microtask || level > teams_level) {
2084  int new_level = parent_team->t.t_level + 1;
2085  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2086  new_level = parent_team->t.t_active_level + 1;
2087  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2088  } else {
2089  // AC: Do not increase parallel level at start of the teams construct
2090  int new_level = parent_team->t.t_level;
2091  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2092  new_level = parent_team->t.t_active_level;
2093  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2094  }
2095  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2096  // set primary thread's schedule as new run-time schedule
2097  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2098 
2099  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2100  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2101 
2102  // Update the floating point rounding in the team if required.
2103  propagateFPControl(team);
2104 #if OMPD_SUPPORT
2105  if (ompd_state & OMPD_ENABLE_BP)
2106  ompd_bp_parallel_begin();
2107 #endif
2108 
2109  if (__kmp_tasking_mode != tskm_immediate_exec) {
2110  // Set the primary thread's task team to the team's task team. Unless this
2111  // is a hot team, it should be NULL.
2112  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2113  parent_team->t.t_task_team[master_th->th.th_task_state]);
2114  KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2115  "%p, new task_team %p / team %p\n",
2116  __kmp_gtid_from_thread(master_th),
2117  master_th->th.th_task_team, parent_team,
2118  team->t.t_task_team[master_th->th.th_task_state], team));
2119 
2120  if (active_level || master_th->th.th_task_team) {
2121  // Save the primary thread's task_state on the memo stack
2122  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2123  if (master_th->th.th_task_state_top >=
2124  master_th->th.th_task_state_stack_sz) { // increase size
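  // Grow the memo stack by doubling its size: copy the existing entries,
  // zero-fill the remainder, then install the new stack and free the old one.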
2125  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2126  kmp_uint8 *old_stack, *new_stack;
2127  kmp_uint32 i;
2128  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2129  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2130  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2131  }
2132  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2133  ++i) { // zero-init rest of stack
2134  new_stack[i] = 0;
2135  }
2136  old_stack = master_th->th.th_task_state_memo_stack;
2137  master_th->th.th_task_state_memo_stack = new_stack;
2138  master_th->th.th_task_state_stack_sz = new_size;
2139  __kmp_free(old_stack);
2140  }
2141  // Store primary thread's task_state on stack
2142  master_th->th
2143  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2144  master_th->th.th_task_state;
2145  master_th->th.th_task_state_top++;
2146 #if KMP_NESTED_HOT_TEAMS
2147  if (master_th->th.th_hot_teams &&
2148  active_level < __kmp_hot_teams_max_level &&
2149  team == master_th->th.th_hot_teams[active_level].hot_team) {
2150  // Restore primary thread's nested state if nested hot team
2151  master_th->th.th_task_state =
2152  master_th->th
2153  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2154  } else {
2155 #endif
2156  master_th->th.th_task_state = 0;
2157 #if KMP_NESTED_HOT_TEAMS
2158  }
2159 #endif
2160  }
2161 #if !KMP_NESTED_HOT_TEAMS
2162  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2163  (team == root->r.r_hot_team));
2164 #endif
2165  }
2166 
2167  KA_TRACE(
2168  20,
2169  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2170  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2171  team->t.t_nproc));
2172  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2173  (team->t.t_master_tid == 0 &&
2174  (team->t.t_parent == root->r.r_root_team ||
2175  team->t.t_parent->t.t_serialized)));
2176  KMP_MB();
2177 
2178  /* now, setup the arguments */
2179  argv = (void **)team->t.t_argv;
2180  if (ap) {
2181  for (i = argc - 1; i >= 0; --i) {
2182  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2183  KMP_CHECK_UPDATE(*argv, new_argv);
2184  argv++;
2185  }
2186  } else {
2187  for (i = 0; i < argc; ++i) {
2188  // Get args from parent team for teams construct
2189  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2190  }
2191  }
2192 
2193  /* now actually fork the threads */
2194  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2195  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2196  root->r.r_active = TRUE;
2197 
2198  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2199  __kmp_setup_icv_copy(team, nthreads,
2200  &master_th->th.th_current_task->td_icvs, loc);
2201 
2202 #if OMPT_SUPPORT
2203  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2204 #endif
2205 
2206  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2207 
2208 #if USE_ITT_BUILD
2209  if (team->t.t_active_level == 1 // only report frames at level 1
2210  && !master_th->th.th_teams_microtask) { // not in teams construct
2211 #if USE_ITT_NOTIFY
2212  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2213  (__kmp_forkjoin_frames_mode == 3 ||
2214  __kmp_forkjoin_frames_mode == 1)) {
2215  kmp_uint64 tmp_time = 0;
2216  if (__itt_get_timestamp_ptr)
2217  tmp_time = __itt_get_timestamp();
2218  // Internal fork - report frame begin
2219  master_th->th.th_frame_time = tmp_time;
2220  if (__kmp_forkjoin_frames_mode == 3)
2221  team->t.t_region_time = tmp_time;
2222  } else
2223 // only one notification scheme (either "submit" or "forking/joined", not both)
2224 #endif /* USE_ITT_NOTIFY */
2225  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2226  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2227  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2228  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2229  }
2230  }
2231 #endif /* USE_ITT_BUILD */
2232 
2233  /* now go on and do the work */
2234  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2235  KMP_MB();
2236  KF_TRACE(10,
2237  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2238  root, team, master_th, gtid));
2239 
2240 #if USE_ITT_BUILD
2241  if (__itt_stack_caller_create_ptr) {
2242  // create new stack stitching id before entering fork barrier
2243  if (!enter_teams) {
2244  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2245  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2246  } else if (parent_team->t.t_serialized) {
2247  // keep stack stitching id in the serialized parent_team;
2248  // current team will be used for parallel inside the teams;
2249  // if parent_team is active, then it already keeps stack stitching id
2250  // for the league of teams
2251  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2252  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2253  }
2254  }
2255 #endif /* USE_ITT_BUILD */
2256 
2257  // AC: skip __kmp_internal_fork at teams construct, let only primary
2258  // threads execute
2259  if (ap) {
2260  __kmp_internal_fork(loc, gtid, team);
2261  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2262  "master_th=%p, gtid=%d\n",
2263  root, team, master_th, gtid));
2264  }
2265 
2266  if (call_context == fork_context_gnu) {
2267  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2268  return TRUE;
2269  }
2270 
2271  /* Invoke microtask for PRIMARY thread */
2272  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2273  team->t.t_id, team->t.t_pkfn));
2274  } // END of timer KMP_fork_call block
2275 
2276 #if KMP_STATS_ENABLED
2277  // If beginning a teams construct, then change thread state
2278  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2279  if (!ap) {
2280  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2281  }
2282 #endif
2283 
2284  if (!team->t.t_invoke(gtid)) {
2285  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2286  }
2287 
2288 #if KMP_STATS_ENABLED
2289  // If was beginning of a teams construct, then reset thread state
2290  if (!ap) {
2291  KMP_SET_THREAD_STATE(previous_state);
2292  }
2293 #endif
2294 
2295  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2296  team->t.t_id, team->t.t_pkfn));
2297  KMP_MB(); /* Flush all pending memory write invalidates. */
2298 
2299  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2300 #if OMPT_SUPPORT
2301  if (ompt_enabled.enabled) {
2302  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2303  }
2304 #endif
2305 
2306  return TRUE;
2307 }
2308 
2309 #if OMPT_SUPPORT
2310 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2311  kmp_team_t *team) {
2312  // restore state outside the region
2313  thread->th.ompt_thread_info.state =
2314  ((team->t.t_serialized) ? ompt_state_work_serial
2315  : ompt_state_work_parallel);
2316 }
2317 
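// Helper used at join time when OMPT is enabled: reports the parallel-end
// event (if a callback is registered), clears the task's enter frame, and
// restores the thread state for the enclosing region.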
2318 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2319  kmp_team_t *team, ompt_data_t *parallel_data,
2320  int flags, void *codeptr) {
2321  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2322  if (ompt_enabled.ompt_callback_parallel_end) {
2323  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2324  parallel_data, &(task_info->task_data), flags, codeptr);
2325  }
2326 
2327  task_info->frame.enter_frame = ompt_data_none;
2328  __kmp_join_restore_state(thread, team);
2329 }
2330 #endif
2331 
2332 void __kmp_join_call(ident_t *loc, int gtid
2333 #if OMPT_SUPPORT
2334  ,
2335  enum fork_context_e fork_context
2336 #endif
2337  ,
2338  int exit_teams) {
2339  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2340  kmp_team_t *team;
2341  kmp_team_t *parent_team;
2342  kmp_info_t *master_th;
2343  kmp_root_t *root;
2344  int master_active;
2345 
2346  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2347 
2348  /* setup current data */
2349  master_th = __kmp_threads[gtid];
2350  root = master_th->th.th_root;
2351  team = master_th->th.th_team;
2352  parent_team = team->t.t_parent;
2353 
2354  master_th->th.th_ident = loc;
2355 
2356 #if OMPT_SUPPORT
2357  void *team_microtask = (void *)team->t.t_pkfn;
2358  // For GOMP interface with serialized parallel, need the
2359  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2360  // and end-parallel events.
2361  if (ompt_enabled.enabled &&
2362  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2363  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2364  }
2365 #endif
2366 
2367 #if KMP_DEBUG
2368  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2369  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2370  "th_task_team = %p\n",
2371  __kmp_gtid_from_thread(master_th), team,
2372  team->t.t_task_team[master_th->th.th_task_state],
2373  master_th->th.th_task_team));
2374  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2375  team->t.t_task_team[master_th->th.th_task_state]);
2376  }
2377 #endif
2378 
2379  if (team->t.t_serialized) {
2380  if (master_th->th.th_teams_microtask) {
2381  // We are in teams construct
2382  int level = team->t.t_level;
2383  int tlevel = master_th->th.th_teams_level;
2384  if (level == tlevel) {
2385  // AC: we haven't incremented it earlier at start of teams construct,
2386  // so do it here - at the end of teams construct
2387  team->t.t_level++;
2388  } else if (level == tlevel + 1) {
2389  // AC: we are exiting parallel inside teams, need to increment
2390  // serialization in order to restore it in the next call to
2391  // __kmpc_end_serialized_parallel
2392  team->t.t_serialized++;
2393  }
2394  }
2395  __kmpc_end_serialized_parallel(loc, gtid);
2396 
2397 #if OMPT_SUPPORT
2398  if (ompt_enabled.enabled) {
2399  __kmp_join_restore_state(master_th, parent_team);
2400  }
2401 #endif
2402 
2403  return;
2404  }
2405 
2406  master_active = team->t.t_master_active;
2407 
2408  if (!exit_teams) {
2409  // AC: No barrier for internal teams at exit from teams construct.
2410  // But there is barrier for external team (league).
2411  __kmp_internal_join(loc, gtid, team);
2412 #if USE_ITT_BUILD
2413  if (__itt_stack_caller_create_ptr) {
2414  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2415  // destroy the stack stitching id after join barrier
2416  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2417  team->t.t_stack_id = NULL;
2418  }
2419 #endif
2420  } else {
2421  master_th->th.th_task_state =
2422  0; // AC: no tasking in teams (out of any parallel)
2423 #if USE_ITT_BUILD
2424  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2425  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2426  // destroy the stack stitching id on exit from the teams construct
2427  // if parent_team is active, then the id will be destroyed later on
2428  // by master of the league of teams
2429  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2430  parent_team->t.t_stack_id = NULL;
2431  }
2432 #endif
2433 
2434  if (team->t.t_nproc > 1 &&
2435  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2436  team->t.b->update_num_threads(team->t.t_nproc);
2437  __kmp_add_threads_to_team(team, team->t.t_nproc);
2438  }
2439  }
2440 
2441  KMP_MB();
2442 
2443 #if OMPT_SUPPORT
2444  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2445  void *codeptr = team->t.ompt_team_info.master_return_address;
2446 #endif
2447 
2448 #if USE_ITT_BUILD
2449  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2450  if (team->t.t_active_level == 1 &&
2451  (!master_th->th.th_teams_microtask || /* not in teams construct */
2452  master_th->th.th_teams_size.nteams == 1)) {
2453  master_th->th.th_ident = loc;
2454  // only one notification scheme (either "submit" or "forking/joined", not
2455  // both)
2456  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2457  __kmp_forkjoin_frames_mode == 3)
2458  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2459  master_th->th.th_frame_time, 0, loc,
2460  master_th->th.th_team_nproc, 1);
2461  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2462  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2463  __kmp_itt_region_joined(gtid);
2464  } // active_level == 1
2465 #endif /* USE_ITT_BUILD */
2466 
2467 #if KMP_AFFINITY_SUPPORTED
2468  if (!exit_teams) {
2469  // Restore master thread's partition.
2470  master_th->th.th_first_place = team->t.t_first_place;
2471  master_th->th.th_last_place = team->t.t_last_place;
2472  }
2473 #endif // KMP_AFFINITY_SUPPORTED
2474 
2475  if (master_th->th.th_teams_microtask && !exit_teams &&
2476  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2477  team->t.t_level == master_th->th.th_teams_level + 1) {
2478 // AC: We need to leave the team structure intact at the end of a parallel
2479 // inside the teams construct, so that the same (hot) team works at the next
2480 // parallel; only adjust the nesting levels
2481 #if OMPT_SUPPORT
2482  ompt_data_t ompt_parallel_data = ompt_data_none;
2483  if (ompt_enabled.enabled) {
2484  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2485  if (ompt_enabled.ompt_callback_implicit_task) {
2486  int ompt_team_size = team->t.t_nproc;
2487  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2488  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2489  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2490  }
2491  task_info->frame.exit_frame = ompt_data_none;
2492  task_info->task_data = ompt_data_none;
2493  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2494  __ompt_lw_taskteam_unlink(master_th);
2495  }
2496 #endif
2497  /* Decrement our nested depth level */
2498  team->t.t_level--;
2499  team->t.t_active_level--;
2500  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2501 
2502  // Restore number of threads in the team if needed. This code relies on
2503  // the proper adjustment of th_teams_size.nth after the fork in
2504  // __kmp_teams_master on each teams primary thread in the case that
2505  // __kmp_reserve_threads reduced it.
2506  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2507  int old_num = master_th->th.th_team_nproc;
2508  int new_num = master_th->th.th_teams_size.nth;
2509  kmp_info_t **other_threads = team->t.t_threads;
2510  team->t.t_nproc = new_num;
2511  for (int i = 0; i < old_num; ++i) {
2512  other_threads[i]->th.th_team_nproc = new_num;
2513  }
2514  // Adjust the states of the unused threads of the team
2515  for (int i = old_num; i < new_num; ++i) {
2516  // Re-initialize thread's barrier data.
2517  KMP_DEBUG_ASSERT(other_threads[i]);
2518  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2519  for (int b = 0; b < bs_last_barrier; ++b) {
2520  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2521  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2522 #if USE_DEBUGGER
2523  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2524 #endif
2525  }
2526  if (__kmp_tasking_mode != tskm_immediate_exec) {
2527  // Synchronize thread's task state
2528  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2529  }
2530  }
2531  }
2532 
2533 #if OMPT_SUPPORT
2534  if (ompt_enabled.enabled) {
2535  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2536  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2537  }
2538 #endif
2539 
2540  return;
2541  }
2542 
2543  /* do cleanup and restore the parent team */
2544  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2545  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2546 
2547  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2548 
2549  /* jc: The following lock has instructions with REL and ACQ semantics,
2550  separating the parallel user code called in this parallel region
2551  from the serial user code called after this function returns. */
2552  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2553 
2554  if (!master_th->th.th_teams_microtask ||
2555  team->t.t_level > master_th->th.th_teams_level) {
2556  /* Decrement our nested depth level */
2557  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2558  }
2559  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2560 
2561 #if OMPT_SUPPORT
2562  if (ompt_enabled.enabled) {
2563  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2564  if (ompt_enabled.ompt_callback_implicit_task) {
2565  int flags = (team_microtask == (void *)__kmp_teams_master)
2566  ? ompt_task_initial
2567  : ompt_task_implicit;
2568  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2569  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2570  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2571  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2572  }
2573  task_info->frame.exit_frame = ompt_data_none;
2574  task_info->task_data = ompt_data_none;
2575  }
2576 #endif
2577 
2578  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2579  master_th, team));
2580  __kmp_pop_current_task_from_thread(master_th);
2581 
2582  master_th->th.th_def_allocator = team->t.t_def_allocator;
2583 
2584 #if OMPD_SUPPORT
2585  if (ompd_state & OMPD_ENABLE_BP)
2586  ompd_bp_parallel_end();
2587 #endif
2588  updateHWFPControl(team);
2589 
2590  if (root->r.r_active != master_active)
2591  root->r.r_active = master_active;
2592 
2593  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2594  master_th)); // this will free worker threads
2595 
2596  /* This race was fun to find. Make sure the following is in the critical
2597  region; otherwise assertions may fail occasionally since the old team may be
2598  reallocated and the hierarchy appears inconsistent. It is actually safe to
2599  run and won't cause any bugs, but it will cause those assertion failures. It's
2600  only one deref & assign, so we might as well put it in the critical region. */
2601  master_th->th.th_team = parent_team;
2602  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2603  master_th->th.th_team_master = parent_team->t.t_threads[0];
2604  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2605 
2606  /* restore serialized team, if need be */
2607  if (parent_team->t.t_serialized &&
2608  parent_team != master_th->th.th_serial_team &&
2609  parent_team != root->r.r_root_team) {
2610  __kmp_free_team(root,
2611  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2612  master_th->th.th_serial_team = parent_team;
2613  }
2614 
2615  if (__kmp_tasking_mode != tskm_immediate_exec) {
2616  if (master_th->th.th_task_state_top >
2617  0) { // Restore task state from memo stack
2618  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2619  // Remember primary thread's state if we re-use this nested hot team
2620  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2621  master_th->th.th_task_state;
2622  --master_th->th.th_task_state_top; // pop
2623  // Now restore state at this level
2624  master_th->th.th_task_state =
2625  master_th->th
2626  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2627  }
2628  // Copy the task team from the parent team to the primary thread
2629  master_th->th.th_task_team =
2630  parent_team->t.t_task_team[master_th->th.th_task_state];
2631  KA_TRACE(20,
2632  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2633  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2634  parent_team));
2635  }
2636 
2637  // TODO: GEH - cannot do this assertion because root thread not set up as
2638  // executing
2639  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2640  master_th->th.th_current_task->td_flags.executing = 1;
2641 
2642  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2643 
2644 #if OMPT_SUPPORT
2645  int flags =
2646  OMPT_INVOKER(fork_context) |
2647  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2648  : ompt_parallel_team);
2649  if (ompt_enabled.enabled) {
2650  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2651  codeptr);
2652  }
2653 #endif
2654 
2655  KMP_MB();
2656  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2657 }
2658 
2659 /* Check whether we should push an internal control record onto the
2660  serial team stack. If so, do it. */
2661 void __kmp_save_internal_controls(kmp_info_t *thread) {
2662 
2663  if (thread->th.th_team != thread->th.th_serial_team) {
2664  return;
2665  }
2666  if (thread->th.th_team->t.t_serialized > 1) {
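  // Only nested serialized regions need a record; push at most one record
  // per serialization level (i.e., only when the top of the control stack
  // does not already correspond to the current t_serialized value).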
2667  int push = 0;
2668 
2669  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2670  push = 1;
2671  } else {
2672  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2673  thread->th.th_team->t.t_serialized) {
2674  push = 1;
2675  }
2676  }
2677  if (push) { /* push a record on the serial team's stack */
2678  kmp_internal_control_t *control =
2679  (kmp_internal_control_t *)__kmp_allocate(
2680  sizeof(kmp_internal_control_t));
2681 
2682  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2683 
2684  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2685 
2686  control->next = thread->th.th_team->t.t_control_stack_top;
2687  thread->th.th_team->t.t_control_stack_top = control;
2688  }
2689  }
2690 }
2691 
2692 /* Changes set_nproc */
2693 void __kmp_set_num_threads(int new_nth, int gtid) {
2694  kmp_info_t *thread;
2695  kmp_root_t *root;
2696 
2697  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2698  KMP_DEBUG_ASSERT(__kmp_init_serial);
2699 
2700  if (new_nth < 1)
2701  new_nth = 1;
2702  else if (new_nth > __kmp_max_nth)
2703  new_nth = __kmp_max_nth;
2704 
2705  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2706  thread = __kmp_threads[gtid];
2707  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2708  return; // nothing to do
2709 
2710  __kmp_save_internal_controls(thread);
2711 
2712  set__nproc(thread, new_nth);
2713 
2714  // If this omp_set_num_threads() call will cause the hot team size to be
2715  // reduced (in the absence of a num_threads clause), then reduce it now,
2716  // rather than waiting for the next parallel region.
2717  root = thread->th.th_root;
2718  if (__kmp_init_parallel && (!root->r.r_active) &&
2719  (root->r.r_hot_team->t.t_nproc > new_nth)
2720 #if KMP_NESTED_HOT_TEAMS
2721  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2722 #endif
2723  ) {
2724  kmp_team_t *hot_team = root->r.r_hot_team;
2725  int f;
2726 
2727  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2728 
2729  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2730  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2731  }
2732  // Release the extra threads we don't need any more.
2733  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2734  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2735  if (__kmp_tasking_mode != tskm_immediate_exec) {
2736  // When decreasing team size, threads no longer in the team should unref
2737  // task team.
2738  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2739  }
2740  __kmp_free_thread(hot_team->t.t_threads[f]);
2741  hot_team->t.t_threads[f] = NULL;
2742  }
2743  hot_team->t.t_nproc = new_nth;
2744 #if KMP_NESTED_HOT_TEAMS
2745  if (thread->th.th_hot_teams) {
2746  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2747  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2748  }
2749 #endif
2750 
2751  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2752  hot_team->t.b->update_num_threads(new_nth);
2753  __kmp_add_threads_to_team(hot_team, new_nth);
2754  }
2755 
2756  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2757 
2758  // Update the t_nproc field in the threads that are still active.
2759  for (f = 0; f < new_nth; f++) {
2760  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2761  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2762  }
2763  // Special flag to indicate an omp_set_num_threads() call
2764  hot_team->t.t_size_changed = -1;
2765  }
2766 }
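// A minimal usage sketch (assuming the standard user API entry points): an
// application call such as
//   omp_set_num_threads(4);
// reaches this routine through the API layer roughly as
//   __kmp_set_num_threads(4, __kmp_entry_gtid());
// after which the value is clamped to [1, __kmp_max_nth], the current internal
// controls are saved, nproc is updated for subsequent parallel regions, and an
// oversized hot team may be trimmed immediately.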
2767 
2768 /* Changes max_active_levels */
2769 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2770  kmp_info_t *thread;
2771 
2772  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2773  "%d = (%d)\n",
2774  gtid, max_active_levels));
2775  KMP_DEBUG_ASSERT(__kmp_init_serial);
2776 
2777  // validate max_active_levels
2778  if (max_active_levels < 0) {
2779  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2780  // We ignore this call if the user has specified a negative value.
2781  // The current setting won't be changed. The last valid setting will be
2782  // used. A warning will be issued (if warnings are allowed as controlled by
2783  // the KMP_WARNINGS env var).
2784  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2785  "max_active_levels for thread %d = (%d)\n",
2786  gtid, max_active_levels));
2787  return;
2788  }
2789  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2790  // it's OK, the max_active_levels is within the valid range: [ 0;
2791  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2792  // We allow a zero value. (implementation defined behavior)
2793  } else {
2794  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2795  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2796  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2797  // Current upper limit is MAX_INT. (implementation defined behavior)
2798  // If the input exceeds the upper limit, we correct the input to be the
2799  // upper limit. (implementation defined behavior)
2800  // Actually, the flow should never get here while we use the MAX_INT limit.
2801  }
2802  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2803  "max_active_levels for thread %d = (%d)\n",
2804  gtid, max_active_levels));
2805 
2806  thread = __kmp_threads[gtid];
2807 
2808  __kmp_save_internal_controls(thread);
2809 
2810  set__max_active_levels(thread, max_active_levels);
2811 }
2812 
2813 /* Gets max_active_levels */
2814 int __kmp_get_max_active_levels(int gtid) {
2815  kmp_info_t *thread;
2816 
2817  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2818  KMP_DEBUG_ASSERT(__kmp_init_serial);
2819 
2820  thread = __kmp_threads[gtid];
2821  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2822  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2823  "curtask_maxaclevel=%d\n",
2824  gtid, thread->th.th_current_task,
2825  thread->th.th_current_task->td_icvs.max_active_levels));
2826  return thread->th.th_current_task->td_icvs.max_active_levels;
2827 }
2828 
2829 // nteams-var per-device ICV
2830 void __kmp_set_num_teams(int num_teams) {
2831  if (num_teams > 0)
2832  __kmp_nteams = num_teams;
2833 }
2834 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2835 // teams-thread-limit-var per-device ICV
2836 void __kmp_set_teams_thread_limit(int limit) {
2837  if (limit > 0)
2838  __kmp_teams_thread_limit = limit;
2839 }
2840 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2841 
2842 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2843 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2844 
2845 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2846 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2847  kmp_info_t *thread;
2848  kmp_sched_t orig_kind;
2849  // kmp_team_t *team;
2850 
2851  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2852  gtid, (int)kind, chunk));
2853  KMP_DEBUG_ASSERT(__kmp_init_serial);
2854 
2855  // Check if the kind parameter is valid, correct if needed.
2856  // Valid parameters should fit in one of two intervals - standard or extended:
2857  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2858  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2859  orig_kind = kind;
2860  kind = __kmp_sched_without_mods(kind);
2861 
2862  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2863  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2864  // TODO: Hint needs attention in case we change the default schedule.
2865  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2866  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2867  __kmp_msg_null);
2868  kind = kmp_sched_default;
2869  chunk = 0; // ignore chunk value in case of bad kind
2870  }
2871 
2872  thread = __kmp_threads[gtid];
2873 
2874  __kmp_save_internal_controls(thread);
2875 
2876  if (kind < kmp_sched_upper_std) {
2877  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2878  // differentiate static chunked vs. unchunked: chunk should be invalid to
2879  // indicate an unchunked schedule (which is the default)
2880  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2881  } else {
2882  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2883  __kmp_sch_map[kind - kmp_sched_lower - 1];
2884  }
2885  } else {
2886  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2887  // kmp_sched_lower - 2 ];
2888  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2889  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2890  kmp_sched_lower - 2];
2891  }
2892  __kmp_sched_apply_mods_intkind(
2893  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2894  if (kind == kmp_sched_auto || chunk < 1) {
2895  // ignore parameter chunk for schedule auto
2896  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2897  } else {
2898  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2899  }
2900 }
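// Example (a sketch of the intended behavior): __kmp_set_schedule(gtid,
// kmp_sched_dynamic, 4) records a dynamic run-time schedule with chunk 4 in
// the calling task's ICVs after saving the previous internal controls; a
// subsequent __kmp_get_schedule(gtid, &kind, &chunk) reports
// (kmp_sched_dynamic, 4).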
2901 
2902 /* Gets def_sched_var ICV values */
2903 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2904  kmp_info_t *thread;
2905  enum sched_type th_type;
2906 
2907  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2908  KMP_DEBUG_ASSERT(__kmp_init_serial);
2909 
2910  thread = __kmp_threads[gtid];
2911 
2912  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2913  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2914  case kmp_sch_static:
2915  case kmp_sch_static_greedy:
2916  case kmp_sch_static_balanced:
2917  *kind = kmp_sched_static;
2918  __kmp_sched_apply_mods_stdkind(kind, th_type);
2919  *chunk = 0; // chunk was not set; indicate this fact via a zero value
2920  return;
2921  case kmp_sch_static_chunked:
2922  *kind = kmp_sched_static;
2923  break;
2924  case kmp_sch_dynamic_chunked:
2925  *kind = kmp_sched_dynamic;
2926  break;
2927  case kmp_sch_guided_chunked:
2928  case kmp_sch_guided_iterative_chunked:
2929  case kmp_sch_guided_analytical_chunked:
2930  *kind = kmp_sched_guided;
2931  break;
2932  case kmp_sch_auto:
2933  *kind = kmp_sched_auto;
2934  break;
2935  case kmp_sch_trapezoidal:
2936  *kind = kmp_sched_trapezoidal;
2937  break;
2938 #if KMP_STATIC_STEAL_ENABLED
2939  case kmp_sch_static_steal:
2940  *kind = kmp_sched_static_steal;
2941  break;
2942 #endif
2943  default:
2944  KMP_FATAL(UnknownSchedulingType, th_type);
2945  }
2946 
2947  __kmp_sched_apply_mods_stdkind(kind, th_type);
2948  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2949 }
2950 
2951 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2952 
2953  int ii, dd;
2954  kmp_team_t *team;
2955  kmp_info_t *thr;
2956 
2957  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2958  KMP_DEBUG_ASSERT(__kmp_init_serial);
2959 
2960  // validate level
2961  if (level == 0)
2962  return 0;
2963  if (level < 0)
2964  return -1;
2965  thr = __kmp_threads[gtid];
2966  team = thr->th.th_team;
2967  ii = team->t.t_level;
2968  if (level > ii)
2969  return -1;
2970 
2971  if (thr->th.th_teams_microtask) {
2972  // AC: we are in teams region where multiple nested teams have same level
2973  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2974  if (level <=
2975  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2976  KMP_DEBUG_ASSERT(ii >= tlevel);
2977  // AC: As we need to pass by the teams league, we need to artificially
2978  // increase ii
2979  if (ii == tlevel) {
2980  ii += 2; // three teams have same level
2981  } else {
2982  ii++; // two teams have same level
2983  }
2984  }
2985  }
2986 
2987  if (ii == level)
2988  return __kmp_tid_from_gtid(gtid);
2989 
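  // Walk up the team hierarchy: consume the serialized levels recorded in
  // each team and move to the parent team until the requested level is
  // reached; dd then tells whether that level was a serialized region.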
2990  dd = team->t.t_serialized;
2991  level++;
2992  while (ii > level) {
2993  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2994  }
2995  if ((team->t.t_serialized) && (!dd)) {
2996  team = team->t.t_parent;
2997  continue;
2998  }
2999  if (ii > level) {
3000  team = team->t.t_parent;
3001  dd = team->t.t_serialized;
3002  ii--;
3003  }
3004  }
3005 
3006  return (dd > 1) ? (0) : (team->t.t_master_tid);
3007 }
3008 
3009 int __kmp_get_team_size(int gtid, int level) {
3010 
3011  int ii, dd;
3012  kmp_team_t *team;
3013  kmp_info_t *thr;
3014 
3015  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3016  KMP_DEBUG_ASSERT(__kmp_init_serial);
3017 
3018  // validate level
3019  if (level == 0)
3020  return 1;
3021  if (level < 0)
3022  return -1;
3023  thr = __kmp_threads[gtid];
3024  team = thr->th.th_team;
3025  ii = team->t.t_level;
3026  if (level > ii)
3027  return -1;
3028 
3029  if (thr->th.th_teams_microtask) {
3030  // AC: we are in teams region where multiple nested teams have same level
3031  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3032  if (level <=
3033  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3034  KMP_DEBUG_ASSERT(ii >= tlevel);
3035  // AC: As we need to pass by the teams league, we need to artificially
3036  // increase ii
3037  if (ii == tlevel) {
3038  ii += 2; // three teams have same level
3039  } else {
3040  ii++; // two teams have same level
3041  }
3042  }
3043  }
3044 
3045  while (ii > level) {
3046  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3047  }
3048  if (team->t.t_serialized && (!dd)) {
3049  team = team->t.t_parent;
3050  continue;
3051  }
3052  if (ii > level) {
3053  team = team->t.t_parent;
3054  ii--;
3055  }
3056  }
3057 
3058  return team->t.t_nproc;
3059 }
3060 
3061 kmp_r_sched_t __kmp_get_schedule_global() {
3062  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3063  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3064  // independently. So one can get the updated schedule here.
3065 
3066  kmp_r_sched_t r_sched;
3067 
3068  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3069  // __kmp_guided. __kmp_sched should keep its original value, so that the user
3070  // can set KMP_SCHEDULE multiple times and thus have different run-time
3071  // schedules in different roots (even in OMP 2.5).
3072  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3073  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3074  if (s == kmp_sch_static) {
3075  // replace STATIC with more detailed schedule (balanced or greedy)
3076  r_sched.r_sched_type = __kmp_static;
3077  } else if (s == kmp_sch_guided_chunked) {
3078  // replace GUIDED with more detailed schedule (iterative or analytical)
3079  r_sched.r_sched_type = __kmp_guided;
3080  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3081  r_sched.r_sched_type = __kmp_sched;
3082  }
3083  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3084 
3085  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3086  // __kmp_chunk may be wrong here (if it was not ever set)
3087  r_sched.chunk = KMP_DEFAULT_CHUNK;
3088  } else {
3089  r_sched.chunk = __kmp_chunk;
3090  }
3091 
3092  return r_sched;
3093 }
3094 
3095 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3096  at least argc number of *t_argv entries for the requested team. */
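/* Sizing policy: small argument lists (argc <= KMP_INLINE_ARGV_ENTRIES) reuse
   the inline storage embedded in the team structure; larger ones get a heap
   buffer of effectively max(KMP_MIN_MALLOC_ARGV_ENTRIES, 2 * argc) entries so
   that repeated small growth does not cause repeated reallocation. */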
3097 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3098 
3099  KMP_DEBUG_ASSERT(team);
3100  if (!realloc || argc > team->t.t_max_argc) {
3101 
3102  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3103  "current entries=%d\n",
3104  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3105  /* if previously allocated heap space for args, free them */
3106  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3107  __kmp_free((void *)team->t.t_argv);
3108 
3109  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3110  /* use unused space in the cache line for arguments */
3111  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3112  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3113  "argv entries\n",
3114  team->t.t_id, team->t.t_max_argc));
3115  team->t.t_argv = &team->t.t_inline_argv[0];
3116  if (__kmp_storage_map) {
3117  __kmp_print_storage_map_gtid(
3118  -1, &team->t.t_inline_argv[0],
3119  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3120  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3121  team->t.t_id);
3122  }
3123  } else {
3124  /* allocate space for arguments in the heap */
3125  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3126  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3127  : 2 * argc;
3128  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3129  "argv entries\n",
3130  team->t.t_id, team->t.t_max_argc));
3131  team->t.t_argv =
3132  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3133  if (__kmp_storage_map) {
3134  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3135  &team->t.t_argv[team->t.t_max_argc],
3136  sizeof(void *) * team->t.t_max_argc,
3137  "team_%d.t_argv", team->t.t_id);
3138  }
3139  }
3140  }
3141 }
3142 
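/* Allocate the per-team arrays (thread pointers, shared dispatch buffers,
   per-thread dispatch state and implicit task data) sized for up to max_nth
   threads; only two dispatch buffers are kept when the team can never have
   more than one thread. */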
3143 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3144  int i;
3145  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3146  team->t.t_threads =
3147  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3148  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3149  sizeof(dispatch_shared_info_t) * num_disp_buff);
3150  team->t.t_dispatch =
3151  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3152  team->t.t_implicit_task_taskdata =
3153  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3154  team->t.t_max_nproc = max_nth;
3155 
3156  /* setup dispatch buffers */
3157  for (i = 0; i < num_disp_buff; ++i) {
3158  team->t.t_disp_buffer[i].buffer_index = i;
3159  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3160  }
3161 }
3162 
3163 static void __kmp_free_team_arrays(kmp_team_t *team) {
3164  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3165  int i;
3166  for (i = 0; i < team->t.t_max_nproc; ++i) {
3167  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3168  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3169  team->t.t_dispatch[i].th_disp_buffer = NULL;
3170  }
3171  }
3172 #if KMP_USE_HIER_SCHED
3173  __kmp_dispatch_free_hierarchies(team);
3174 #endif
3175  __kmp_free(team->t.t_threads);
3176  __kmp_free(team->t.t_disp_buffer);
3177  __kmp_free(team->t.t_dispatch);
3178  __kmp_free(team->t.t_implicit_task_taskdata);
3179  team->t.t_threads = NULL;
3180  team->t.t_disp_buffer = NULL;
3181  team->t.t_dispatch = NULL;
3182  team->t.t_implicit_task_taskdata = 0;
3183 }
3184 
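/* Grow the team's arrays to max_nth slots: dispatch buffers, dispatch state
   and implicit task data are freed and reallocated from scratch, while the
   existing t_threads pointers are copied into the new, larger array. */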
3185 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3186  kmp_info_t **oldThreads = team->t.t_threads;
3187 
3188  __kmp_free(team->t.t_disp_buffer);
3189  __kmp_free(team->t.t_dispatch);
3190  __kmp_free(team->t.t_implicit_task_taskdata);
3191  __kmp_allocate_team_arrays(team, max_nth);
3192 
3193  KMP_MEMCPY(team->t.t_threads, oldThreads,
3194  team->t.t_nproc * sizeof(kmp_info_t *));
3195 
3196  __kmp_free(oldThreads);
3197 }
3198 
3199 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3200 
3201  kmp_r_sched_t r_sched =
3202  __kmp_get_schedule_global(); // get current state of scheduling globals
3203 
3204  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3205 
3206  kmp_internal_control_t g_icvs = {
3207  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3208  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3209  // adjustment of threads (per thread)
3210  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3211  // whether blocktime is explicitly set
3212  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3213 #if KMP_USE_MONITOR
3214  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3215 // intervals
3216 #endif
3217  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3218  // next parallel region (per thread)
3219  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3220  __kmp_cg_max_nth, // int thread_limit;
3221  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3222  // for max_active_levels
3223  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3224  // {sched,chunk} pair
3225  __kmp_nested_proc_bind.bind_types[0],
3226  __kmp_default_device,
3227  NULL // struct kmp_internal_control *next;
3228  };
3229 
3230  return g_icvs;
3231 }
3232 
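/* Return a copy of the ICVs of the given team's primary thread (taken from its
   current task), with the serial nesting level reset to zero; used below when
   seeding a new thread's reserve serial team. */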
3233 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3234 
3235  kmp_internal_control_t gx_icvs;
3236  gx_icvs.serial_nesting_level =
3237  0; // probably =team->t.t_serial like in save_inter_controls
3238  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3239  gx_icvs.next = NULL;
3240 
3241  return gx_icvs;
3242 }
3243 
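/* One-time setup of a root structure: initialize its lock and flags, then
   create the (single-thread, serialized) root team and the hot team that will
   run this root's parallel regions, sized for up to __kmp_dflt_team_nth_ub * 2
   threads; both teams start from the current global ICVs. */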
3244 static void __kmp_initialize_root(kmp_root_t *root) {
3245  int f;
3246  kmp_team_t *root_team;
3247  kmp_team_t *hot_team;
3248  int hot_team_max_nth;
3249  kmp_r_sched_t r_sched =
3250  __kmp_get_schedule_global(); // get current state of scheduling globals
3251  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3252  KMP_DEBUG_ASSERT(root);
3253  KMP_ASSERT(!root->r.r_begin);
3254 
3255  /* setup the root state structure */
3256  __kmp_init_lock(&root->r.r_begin_lock);
3257  root->r.r_begin = FALSE;
3258  root->r.r_active = FALSE;
3259  root->r.r_in_parallel = 0;
3260  root->r.r_blocktime = __kmp_dflt_blocktime;
3261 #if KMP_AFFINITY_SUPPORTED
3262  root->r.r_affinity_assigned = FALSE;
3263 #endif
3264 
3265  /* setup the root team for this task */
3266  /* allocate the root team structure */
3267  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3268 
3269  root_team =
3270  __kmp_allocate_team(root,
3271  1, // new_nproc
3272  1, // max_nproc
3273 #if OMPT_SUPPORT
3274  ompt_data_none, // root parallel id
3275 #endif
3276  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3277  0 // argc
3278  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3279  );
3280 #if USE_DEBUGGER
3281  // Non-NULL value should be assigned to make the debugger display the root
3282  // team.
3283  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3284 #endif
3285 
3286  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3287 
3288  root->r.r_root_team = root_team;
3289  root_team->t.t_control_stack_top = NULL;
3290 
3291  /* initialize root team */
3292  root_team->t.t_threads[0] = NULL;
3293  root_team->t.t_nproc = 1;
3294  root_team->t.t_serialized = 1;
3295  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3296  root_team->t.t_sched.sched = r_sched.sched;
3297  KA_TRACE(
3298  20,
3299  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3300  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3301 
3302  /* setup the hot team for this task */
3303  /* allocate the hot team structure */
3304  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3305 
3306  hot_team =
3307  __kmp_allocate_team(root,
3308  1, // new_nproc
3309  __kmp_dflt_team_nth_ub * 2, // max_nproc
3310 #if OMPT_SUPPORT
3311  ompt_data_none, // root parallel id
3312 #endif
3313  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3314  0 // argc
3315  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3316  );
3317  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3318 
3319  root->r.r_hot_team = hot_team;
3320  root_team->t.t_control_stack_top = NULL;
3321 
3322  /* first-time initialization */
3323  hot_team->t.t_parent = root_team;
3324 
3325  /* initialize hot team */
3326  hot_team_max_nth = hot_team->t.t_max_nproc;
3327  for (f = 0; f < hot_team_max_nth; ++f) {
3328  hot_team->t.t_threads[f] = NULL;
3329  }
3330  hot_team->t.t_nproc = 1;
3331  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3332  hot_team->t.t_sched.sched = r_sched.sched;
3333  hot_team->t.t_size_changed = 0;
3334 }
3335 
3336 #ifdef KMP_DEBUG
3337 
3338 typedef struct kmp_team_list_item {
3339  kmp_team_p const *entry;
3340  struct kmp_team_list_item *next;
3341 } kmp_team_list_item_t;
3342 typedef kmp_team_list_item_t *kmp_team_list_t;
3343 
3344 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3345  kmp_team_list_t list, // List of teams.
3346  kmp_team_p const *team // Team to add.
3347 ) {
3348 
3349  // List must terminate with item where both entry and next are NULL.
3350  // Team is added to the list only once.
3351  // List is sorted in ascending order by team id.
3352  // Team id is *not* a key.
3353 
3354  kmp_team_list_t l;
3355 
3356  KMP_DEBUG_ASSERT(list != NULL);
3357  if (team == NULL) {
3358  return;
3359  }
3360 
3361  __kmp_print_structure_team_accum(list, team->t.t_parent);
3362  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3363 
3364  // Search list for the team.
3365  l = list;
3366  while (l->next != NULL && l->entry != team) {
3367  l = l->next;
3368  }
3369  if (l->next != NULL) {
3370  return; // Team has been added before, exit.
3371  }
3372 
3373  // Team is not found. Search list again for insertion point.
3374  l = list;
3375  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3376  l = l->next;
3377  }
3378 
3379  // Insert team.
3380  {
3381  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3382  sizeof(kmp_team_list_item_t));
3383  *item = *l;
3384  l->entry = team;
3385  l->next = item;
3386  }
3387 }
3388 
3389 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3390 
3391 ) {
3392  __kmp_printf("%s", title);
3393  if (team != NULL) {
3394  __kmp_printf("%2x %p\n", team->t.t_id, team);
3395  } else {
3396  __kmp_printf(" - (nil)\n");
3397  }
3398 }
3399 
3400 static void __kmp_print_structure_thread(char const *title,
3401  kmp_info_p const *thread) {
3402  __kmp_printf("%s", title);
3403  if (thread != NULL) {
3404  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3405  } else {
3406  __kmp_printf(" - (nil)\n");
3407  }
3408 }
3409 
3410 void __kmp_print_structure(void) {
3411 
3412  kmp_team_list_t list;
3413 
3414  // Initialize list of teams.
3415  list =
3416  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3417  list->entry = NULL;
3418  list->next = NULL;
3419 
3420  __kmp_printf("\n------------------------------\nGlobal Thread "
3421  "Table\n------------------------------\n");
3422  {
3423  int gtid;
3424  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3425  __kmp_printf("%2d", gtid);
3426  if (__kmp_threads != NULL) {
3427  __kmp_printf(" %p", __kmp_threads[gtid]);
3428  }
3429  if (__kmp_root != NULL) {
3430  __kmp_printf(" %p", __kmp_root[gtid]);
3431  }
3432  __kmp_printf("\n");
3433  }
3434  }
3435 
3436  // Print out __kmp_threads array.
3437  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3438  "----------\n");
3439  if (__kmp_threads != NULL) {
3440  int gtid;
3441  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3442  kmp_info_t const *thread = __kmp_threads[gtid];
3443  if (thread != NULL) {
3444  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3445  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3446  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3447  __kmp_print_structure_team(" Serial Team: ",
3448  thread->th.th_serial_team);
3449  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3450  __kmp_print_structure_thread(" Primary: ",
3451  thread->th.th_team_master);
3452  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3453  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3454  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3455  __kmp_print_structure_thread(" Next in pool: ",
3456  thread->th.th_next_pool);
3457  __kmp_printf("\n");
3458  __kmp_print_structure_team_accum(list, thread->th.th_team);
3459  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3460  }
3461  }
3462  } else {
3463  __kmp_printf("Threads array is not allocated.\n");
3464  }
3465 
3466  // Print out __kmp_root array.
3467  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3468  "--------\n");
3469  if (__kmp_root != NULL) {
3470  int gtid;
3471  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3472  kmp_root_t const *root = __kmp_root[gtid];
3473  if (root != NULL) {
3474  __kmp_printf("GTID %2d %p:\n", gtid, root);
3475  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3476  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3477  __kmp_print_structure_thread(" Uber Thread: ",
3478  root->r.r_uber_thread);
3479  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3480  __kmp_printf(" In Parallel: %2d\n",
3481  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3482  __kmp_printf("\n");
3483  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3484  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3485  }
3486  }
3487  } else {
3488  __kmp_printf("Ubers array is not allocated.\n");
3489  }
3490 
3491  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3492  "--------\n");
3493  while (list->next != NULL) {
3494  kmp_team_p const *team = list->entry;
3495  int i;
3496  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3497  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3498  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3499  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3500  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3501  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3502  for (i = 0; i < team->t.t_nproc; ++i) {
3503  __kmp_printf(" Thread %2d: ", i);
3504  __kmp_print_structure_thread("", team->t.t_threads[i]);
3505  }
3506  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3507  __kmp_printf("\n");
3508  list = list->next;
3509  }
3510 
3511  // Print out __kmp_thread_pool and __kmp_team_pool.
3512  __kmp_printf("\n------------------------------\nPools\n----------------------"
3513  "--------\n");
3514  __kmp_print_structure_thread("Thread pool: ",
3515  CCAST(kmp_info_t *, __kmp_thread_pool));
3516  __kmp_print_structure_team("Team pool: ",
3517  CCAST(kmp_team_t *, __kmp_team_pool));
3518  __kmp_printf("\n");
3519 
3520  // Free team list.
3521  while (list != NULL) {
3522  kmp_team_list_item_t *item = list;
3523  list = list->next;
3524  KMP_INTERNAL_FREE(item);
3525  }
3526 }
3527 
3528 #endif
3529 
3530 //---------------------------------------------------------------------------
3531 // Stuff for per-thread fast random number generator
3532 // Table of primes
3533 static const unsigned __kmp_primes[] = {
3534  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3535  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3536  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3537  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3538  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3539  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3540  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3541  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3542  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3543  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3544  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3545 
3546 //---------------------------------------------------------------------------
3547 // __kmp_get_random: Get a random number using a linear congruential method.
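// The per-thread state is advanced as x_{n+1} = a * x_n + 1 (mod 2^32), where
// the multiplier 'a' is a per-thread prime picked from __kmp_primes in
// __kmp_init_random; the upper 16 bits of the current state are returned as
// the random value.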
3548 unsigned short __kmp_get_random(kmp_info_t *thread) {
3549  unsigned x = thread->th.th_x;
3550  unsigned short r = (unsigned short)(x >> 16);
3551 
3552  thread->th.th_x = x * thread->th.th_a + 1;
3553 
3554  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3555  thread->th.th_info.ds.ds_tid, r));
3556 
3557  return r;
3558 }
3559 //--------------------------------------------------------
3560 // __kmp_init_random: Initialize a random number generator
3561 void __kmp_init_random(kmp_info_t *thread) {
3562  unsigned seed = thread->th.th_info.ds.ds_tid;
3563 
3564  thread->th.th_a =
3565  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3566  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3567  KA_TRACE(30,
3568  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3569 }
3570 
3571 #if KMP_OS_WINDOWS
3572 /* reclaim array entries for root threads that are already dead, returns number
3573  * reclaimed */
3574 static int __kmp_reclaim_dead_roots(void) {
3575  int i, r = 0;
3576 
3577  for (i = 0; i < __kmp_threads_capacity; ++i) {
3578  if (KMP_UBER_GTID(i) &&
3579  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3580  !__kmp_root[i]
3581  ->r.r_active) { // AC: reclaim only roots died in non-active state
3582  r += __kmp_unregister_root_other_thread(i);
3583  }
3584  }
3585  return r;
3586 }
3587 #endif
3588 
3589 /* This function attempts to create free entries in __kmp_threads and
3590  __kmp_root, and returns the number of free entries generated.
3591 
3592  For Windows* OS static library, the first mechanism used is to reclaim array
3593  entries for root threads that are already dead.
3594 
3595  On all platforms, expansion is attempted on the arrays __kmp_threads and
3596  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3597  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3598  threadprivate cache array has been created. Synchronization with
3599  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3600 
3601  After any dead root reclamation, if the clipping value allows array expansion
3602  to result in the generation of a total of nNeed free slots, the function does
3603  that expansion. If not, nothing is done beyond the possible initial root
3604  thread reclamation.
3605 
3606  If any argument is negative, the behavior is undefined. */
3607 static int __kmp_expand_threads(int nNeed) {
3608  int added = 0;
3609  int minimumRequiredCapacity;
3610  int newCapacity;
3611  kmp_info_t **newThreads;
3612  kmp_root_t **newRoot;
3613 
3614  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3615  // resizing __kmp_threads does not need additional protection if foreign
3616  // threads are present
3617 
3618 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3619  /* only for Windows static library */
3620  /* reclaim array entries for root threads that are already dead */
3621  added = __kmp_reclaim_dead_roots();
3622 
3623  if (nNeed) {
3624  nNeed -= added;
3625  if (nNeed < 0)
3626  nNeed = 0;
3627  }
3628 #endif
3629  if (nNeed <= 0)
3630  return added;
3631 
3632  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3633  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3634  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3635  // > __kmp_max_nth in one of two ways:
3636  //
3637  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3638  // may not be reused by another thread, so we may need to increase
3639  // __kmp_threads_capacity to __kmp_max_nth + 1.
3640  //
3641  // 2) New foreign root(s) are encountered. We always register new foreign
3642  // roots. This may cause a smaller # of threads to be allocated at
3643  // subsequent parallel regions, but the worker threads hang around (and
3644  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3645  //
3646  // Anyway, that is the reason for moving the check to see if
3647  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3648  // instead of having it performed here. -BB
3649 
3650  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3651 
3652  /* compute expansion headroom to check if we can expand */
3653  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3654  /* possible expansion too small -- give up */
3655  return added;
3656  }
3657  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3658 
3659  newCapacity = __kmp_threads_capacity;
3660  do {
3661  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3662  : __kmp_sys_max_nth;
3663  } while (newCapacity < minimumRequiredCapacity);
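  // Example: with __kmp_threads_capacity == 32 and nNeed == 5 the loop above
  // doubles once to newCapacity == 64; doubling stops being possible once the
  // capacity exceeds half of __kmp_sys_max_nth, at which point the capacity is
  // clamped to __kmp_sys_max_nth itself.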
3664  newThreads = (kmp_info_t **)__kmp_allocate(
3665  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3666  newRoot =
3667  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3668  KMP_MEMCPY(newThreads, __kmp_threads,
3669  __kmp_threads_capacity * sizeof(kmp_info_t *));
3670  KMP_MEMCPY(newRoot, __kmp_root,
3671  __kmp_threads_capacity * sizeof(kmp_root_t *));
3672 
3673  kmp_info_t **temp_threads = __kmp_threads;
3674  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3675  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3676  __kmp_free(temp_threads);
3677  added += newCapacity - __kmp_threads_capacity;
3678  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3679 
3680  if (newCapacity > __kmp_tp_capacity) {
3681  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3682  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3683  __kmp_threadprivate_resize_cache(newCapacity);
3684  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3685  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3686  }
3687  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3688  }
3689 
3690  return added;
3691 }
3692 
3693 /* Register the current thread as a root thread and obtain our gtid. We must
3694  have the __kmp_initz_lock held at this point. The argument is TRUE only if we
3695  are the thread that calls this from __kmp_do_serial_initialize(). */
3696 int __kmp_register_root(int initial_thread) {
3697  kmp_info_t *root_thread;
3698  kmp_root_t *root;
3699  int gtid;
3700  int capacity;
3701  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3702  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3703  KMP_MB();
3704 
3705  /* 2007-03-02:
3706  If the initial thread has not invoked the OpenMP RTL yet, and this thread is
3707  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3708  does not work as expected -- it may return false (meaning there is at least one
3709  empty slot in __kmp_threads array), but it is possible the only free slot
3710  is #0, which is reserved for initial thread and so cannot be used for this
3711  one. The following code works around this bug.
3712 
3713  However, the right solution seems to be to not reserve slot #0 for the
3714  initial thread, because:
3715  (1) there is no magic in slot #0,
3716  (2) we cannot detect the initial thread reliably (the first thread that does
3717  serial initialization may not be a real initial thread).
3718  */
3719  capacity = __kmp_threads_capacity;
3720  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3721  --capacity;
3722  }
3723 
3724  // If it is not for initializing the hidden helper team, we need to take
3725  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3726  // in __kmp_threads_capacity.
3727  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3728  capacity -= __kmp_hidden_helper_threads_num;
3729  }
3730 
3731  /* see if there are too many threads */
3732  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3733  if (__kmp_tp_cached) {
3734  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3735  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3736  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3737  } else {
3738  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3739  __kmp_msg_null);
3740  }
3741  }
3742 
3743  // When hidden helper tasks are enabled, __kmp_threads is organized as follows:
3744  // 0: initial thread, also a regular OpenMP thread.
3745  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3746  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3747  // regular OpenMP threads.
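// For example, with __kmp_hidden_helper_threads_num == 8: gtid 0 is the
// initial thread, gtids 1-8 are reserved for hidden helper threads, and
// regular roots and workers are placed at gtid 9 and above.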
3748  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3749  // Find an available thread slot for hidden helper thread. Slots for hidden
3750  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3751  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3752  gtid <= __kmp_hidden_helper_threads_num;
3753  gtid++)
3754  ;
3755  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3756  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3757  "hidden helper thread: T#%d\n",
3758  gtid));
3759  } else {
3760  /* find an available thread slot */
3761  // Don't reassign the zero slot since we need that to only be used by
3762  // initial thread. Slots for hidden helper threads should also be skipped.
3763  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3764  gtid = 0;
3765  } else {
3766  for (gtid = __kmp_hidden_helper_threads_num + 1;
3767  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3768  ;
3769  }
3770  KA_TRACE(
3771  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3772  KMP_ASSERT(gtid < __kmp_threads_capacity);
3773  }
3774 
3775  /* update global accounting */
3776  __kmp_all_nth++;
3777  TCW_4(__kmp_nth, __kmp_nth + 1);
3778 
3779  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3780  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3781  if (__kmp_adjust_gtid_mode) {
3782  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3783  if (TCR_4(__kmp_gtid_mode) != 2) {
3784  TCW_4(__kmp_gtid_mode, 2);
3785  }
3786  } else {
3787  if (TCR_4(__kmp_gtid_mode) != 1) {
3788  TCW_4(__kmp_gtid_mode, 1);
3789  }
3790  }
3791  }
3792 
3793 #ifdef KMP_ADJUST_BLOCKTIME
3794  /* Adjust blocktime to zero if necessary */
3795  /* Middle initialization might not have occurred yet */
3796  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3797  if (__kmp_nth > __kmp_avail_proc) {
3798  __kmp_zero_bt = TRUE;
3799  }
3800  }
3801 #endif /* KMP_ADJUST_BLOCKTIME */
3802 
3803  /* setup this new hierarchy */
3804  if (!(root = __kmp_root[gtid])) {
3805  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3806  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3807  }
3808 
3809 #if KMP_STATS_ENABLED
3810  // Initialize stats as soon as possible (right after gtid assignment).
3811  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3812  __kmp_stats_thread_ptr->startLife();
3813  KMP_SET_THREAD_STATE(SERIAL_REGION);
3814  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3815 #endif
3816  __kmp_initialize_root(root);
3817 
3818  /* setup new root thread structure */
3819  if (root->r.r_uber_thread) {
3820  root_thread = root->r.r_uber_thread;
3821  } else {
3822  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3823  if (__kmp_storage_map) {
3824  __kmp_print_thread_storage_map(root_thread, gtid);
3825  }
3826  root_thread->th.th_info.ds.ds_gtid = gtid;
3827 #if OMPT_SUPPORT
3828  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3829 #endif
3830  root_thread->th.th_root = root;
3831  if (__kmp_env_consistency_check) {
3832  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3833  }
3834 #if USE_FAST_MEMORY
3835  __kmp_initialize_fast_memory(root_thread);
3836 #endif /* USE_FAST_MEMORY */
3837 
3838 #if KMP_USE_BGET
3839  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3840  __kmp_initialize_bget(root_thread);
3841 #endif
3842  __kmp_init_random(root_thread); // Initialize random number generator
3843  }
3844 
3845  /* setup the serial team held in reserve by the root thread */
3846  if (!root_thread->th.th_serial_team) {
3847  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3848  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3849  root_thread->th.th_serial_team = __kmp_allocate_team(
3850  root, 1, 1,
3851 #if OMPT_SUPPORT
3852  ompt_data_none, // root parallel id
3853 #endif
3854  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3855  }
3856  KMP_ASSERT(root_thread->th.th_serial_team);
3857  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3858  root_thread->th.th_serial_team));
3859 
3860  /* drop root_thread into place */
3861  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3862 
3863  root->r.r_root_team->t.t_threads[0] = root_thread;
3864  root->r.r_hot_team->t.t_threads[0] = root_thread;
3865  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3866  // AC: the team created in reserve, not for execution (it is unused for now).
3867  root_thread->th.th_serial_team->t.t_serialized = 0;
3868  root->r.r_uber_thread = root_thread;
3869 
3870  /* initialize the thread, get it ready to go */
3871  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3872  TCW_4(__kmp_init_gtid, TRUE);
3873 
3874  /* prepare the primary thread for get_gtid() */
3875  __kmp_gtid_set_specific(gtid);
3876 
3877 #if USE_ITT_BUILD
3878  __kmp_itt_thread_name(gtid);
3879 #endif /* USE_ITT_BUILD */
3880 
3881 #ifdef KMP_TDATA_GTID
3882  __kmp_gtid = gtid;
3883 #endif
3884  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3885  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3886 
3887  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3888  "plain=%u\n",
3889  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3890  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3891  KMP_INIT_BARRIER_STATE));
3892  { // Initialize barrier data.
3893  int b;
3894  for (b = 0; b < bs_last_barrier; ++b) {
3895  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3896 #if USE_DEBUGGER
3897  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3898 #endif
3899  }
3900  }
3901  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3902  KMP_INIT_BARRIER_STATE);
3903 
3904 #if KMP_AFFINITY_SUPPORTED
3905  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3906  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3907  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3908  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3909 #endif /* KMP_AFFINITY_SUPPORTED */
3910  root_thread->th.th_def_allocator = __kmp_def_allocator;
3911  root_thread->th.th_prev_level = 0;
3912  root_thread->th.th_prev_num_threads = 1;
3913 
3914  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3915  tmp->cg_root = root_thread;
3916  tmp->cg_thread_limit = __kmp_cg_max_nth;
3917  tmp->cg_nthreads = 1;
3918  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3919  " cg_nthreads init to 1\n",
3920  root_thread, tmp));
3921  tmp->up = NULL;
3922  root_thread->th.th_cg_roots = tmp;
3923 
3924  __kmp_root_counter++;
3925 
3926 #if OMPT_SUPPORT
3927  if (!initial_thread && ompt_enabled.enabled) {
3928 
3929  kmp_info_t *root_thread = ompt_get_thread();
3930 
3931  ompt_set_thread_state(root_thread, ompt_state_overhead);
3932 
3933  if (ompt_enabled.ompt_callback_thread_begin) {
3934  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3935  ompt_thread_initial, __ompt_get_thread_data_internal());
3936  }
3937  ompt_data_t *task_data;
3938  ompt_data_t *parallel_data;
3939  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3940  NULL);
3941  if (ompt_enabled.ompt_callback_implicit_task) {
3942  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3943  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3944  }
3945 
3946  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3947  }
3948 #endif
3949 #if OMPD_SUPPORT
3950  if (ompd_state & OMPD_ENABLE_BP)
3951  ompd_bp_thread_begin();
3952 #endif
3953 
3954  KMP_MB();
3955  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3956 
3957  return gtid;
3958 }
3959 
3960 #if KMP_NESTED_HOT_TEAMS
3961 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3962  const int max_level) {
3963  int i, n, nth;
3964  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3965  if (!hot_teams || !hot_teams[level].hot_team) {
3966  return 0;
3967  }
3968  KMP_DEBUG_ASSERT(level < max_level);
3969  kmp_team_t *team = hot_teams[level].hot_team;
3970  nth = hot_teams[level].hot_team_nth;
3971  n = nth - 1; // primary thread is not freed
3972  if (level < max_level - 1) {
3973  for (i = 0; i < nth; ++i) {
3974  kmp_info_t *th = team->t.t_threads[i];
3975  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3976  if (i > 0 && th->th.th_hot_teams) {
3977  __kmp_free(th->th.th_hot_teams);
3978  th->th.th_hot_teams = NULL;
3979  }
3980  }
3981  }
3982  __kmp_free_team(root, team, NULL);
3983  return n;
3984 }
3985 #endif
3986 
3987 // Resets a root thread and clears its root and hot teams.
3988 // Returns the number of __kmp_threads entries directly and indirectly freed.
3989 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3990  kmp_team_t *root_team = root->r.r_root_team;
3991  kmp_team_t *hot_team = root->r.r_hot_team;
3992  int n = hot_team->t.t_nproc;
3993  int i;
3994 
3995  KMP_DEBUG_ASSERT(!root->r.r_active);
3996 
3997  root->r.r_root_team = NULL;
3998  root->r.r_hot_team = NULL;
3999  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4000  // before call to __kmp_free_team().
4001  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4002 #if KMP_NESTED_HOT_TEAMS
4003  if (__kmp_hot_teams_max_level >
4004  0) { // need to free nested hot teams and their threads if any
4005  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4006  kmp_info_t *th = hot_team->t.t_threads[i];
4007  if (__kmp_hot_teams_max_level > 1) {
4008  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4009  }
4010  if (th->th.th_hot_teams) {
4011  __kmp_free(th->th.th_hot_teams);
4012  th->th.th_hot_teams = NULL;
4013  }
4014  }
4015  }
4016 #endif
4017  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4018 
4019  // Before we can reap the thread, we need to make certain that all other
4020  // threads in the teams that had this root as ancestor have stopped trying to
4021  // steal tasks.
4022  if (__kmp_tasking_mode != tskm_immediate_exec) {
4023  __kmp_wait_to_unref_task_teams();
4024  }
4025 
4026 #if KMP_OS_WINDOWS
4027  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4028  KA_TRACE(
4029  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4030  "\n",
4031  (LPVOID) & (root->r.r_uber_thread->th),
4032  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4033  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4034 #endif /* KMP_OS_WINDOWS */
4035 
4036 #if OMPD_SUPPORT
4037  if (ompd_state & OMPD_ENABLE_BP)
4038  ompd_bp_thread_end();
4039 #endif
4040 
4041 #if OMPT_SUPPORT
4042  ompt_data_t *task_data;
4043  ompt_data_t *parallel_data;
4044  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4045  NULL);
4046  if (ompt_enabled.ompt_callback_implicit_task) {
4047  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4048  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4049  }
4050  if (ompt_enabled.ompt_callback_thread_end) {
4051  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4052  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4053  }
4054 #endif
4055 
4056  TCW_4(__kmp_nth,
4057  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4058  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4059  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4060  " to %d\n",
4061  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4062  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4063  if (i == 1) {
4064  // need to free contention group structure
4065  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4066  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4067  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4068  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4069  root->r.r_uber_thread->th.th_cg_roots = NULL;
4070  }
4071  __kmp_reap_thread(root->r.r_uber_thread, 1);
4072 
4073  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4074  // it instead of freeing it.
4075  root->r.r_uber_thread = NULL;
4076  /* mark root as no longer in use */
4077  root->r.r_begin = FALSE;
4078 
4079  return n;
4080 }
4081 
4082 void __kmp_unregister_root_current_thread(int gtid) {
4083  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4084  /* this lock should be ok, since unregister_root_current_thread is never
4085  called during an abort, only during a normal close. furthermore, if you
4086  have the forkjoin lock, you should never try to get the initz lock */
4087  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4088  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4089  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4090  "exiting T#%d\n",
4091  gtid));
4092  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4093  return;
4094  }
4095  kmp_root_t *root = __kmp_root[gtid];
4096 
4097  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4098  KMP_ASSERT(KMP_UBER_GTID(gtid));
4099  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4100  KMP_ASSERT(root->r.r_active == FALSE);
4101 
4102  KMP_MB();
4103 
4104  kmp_info_t *thread = __kmp_threads[gtid];
4105  kmp_team_t *team = thread->th.th_team;
4106  kmp_task_team_t *task_team = thread->th.th_task_team;
4107 
4108  // we need to wait for the proxy tasks before finishing the thread
4109  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4110 #if OMPT_SUPPORT
4111  // the runtime is shutting down so we won't report any events
4112  thread->th.ompt_thread_info.state = ompt_state_undefined;
4113 #endif
4114  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4115  }
4116 
4117  __kmp_reset_root(gtid, root);
4118 
4119  KMP_MB();
4120  KC_TRACE(10,
4121  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4122 
4123  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4124 }
4125 
4126 #if KMP_OS_WINDOWS
4127 /* __kmp_forkjoin_lock must be already held
4128  Unregisters a root thread that is not the current thread. Returns the number
4129  of __kmp_threads entries freed as a result. */
4130 static int __kmp_unregister_root_other_thread(int gtid) {
4131  kmp_root_t *root = __kmp_root[gtid];
4132  int r;
4133 
4134  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4135  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4136  KMP_ASSERT(KMP_UBER_GTID(gtid));
4137  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4138  KMP_ASSERT(root->r.r_active == FALSE);
4139 
4140  r = __kmp_reset_root(gtid, root);
4141  KC_TRACE(10,
4142  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4143  return r;
4144 }
4145 #endif
4146 
4147 #if KMP_DEBUG
4148 void __kmp_task_info() {
4149 
4150  kmp_int32 gtid = __kmp_entry_gtid();
4151  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4152  kmp_info_t *this_thr = __kmp_threads[gtid];
4153  kmp_team_t *steam = this_thr->th.th_serial_team;
4154  kmp_team_t *team = this_thr->th.th_team;
4155 
4156  __kmp_printf(
4157  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4158  "ptask=%p\n",
4159  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4160  team->t.t_implicit_task_taskdata[tid].td_parent);
4161 }
4162 #endif // KMP_DEBUG
4163 
4164 /* TODO optimize with one big memclr, take out what isn't needed, split
4165  responsibility to workers as much as possible, and delay initialization of
4166  features as much as possible */
4167 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4168  int tid, int gtid) {
4169  /* this_thr->th.th_info.ds.ds_gtid is setup in
4170  kmp_allocate_thread/create_worker.
4171  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4172  KMP_DEBUG_ASSERT(this_thr != NULL);
4173  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4174  KMP_DEBUG_ASSERT(team);
4175  KMP_DEBUG_ASSERT(team->t.t_threads);
4176  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4177  kmp_info_t *master = team->t.t_threads[0];
4178  KMP_DEBUG_ASSERT(master);
4179  KMP_DEBUG_ASSERT(master->th.th_root);
4180 
4181  KMP_MB();
4182 
4183  TCW_SYNC_PTR(this_thr->th.th_team, team);
4184 
4185  this_thr->th.th_info.ds.ds_tid = tid;
4186  this_thr->th.th_set_nproc = 0;
4187  if (__kmp_tasking_mode != tskm_immediate_exec)
4188  // When tasking is possible, threads are not safe to reap until they are
4189  // done tasking; this will be set when tasking code is exited in wait
4190  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4191  else // no tasking --> always safe to reap
4192  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4193  this_thr->th.th_set_proc_bind = proc_bind_default;
4194 #if KMP_AFFINITY_SUPPORTED
4195  this_thr->th.th_new_place = this_thr->th.th_current_place;
4196 #endif
4197  this_thr->th.th_root = master->th.th_root;
4198 
4199  /* setup the thread's cache of the team structure */
4200  this_thr->th.th_team_nproc = team->t.t_nproc;
4201  this_thr->th.th_team_master = master;
4202  this_thr->th.th_team_serialized = team->t.t_serialized;
4203 
4204  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4205 
4206  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4207  tid, gtid, this_thr, this_thr->th.th_current_task));
4208 
4209  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4210  team, tid, TRUE);
4211 
4212  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4213  tid, gtid, this_thr, this_thr->th.th_current_task));
4214  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4215  // __kmp_initialize_team()?
4216 
4217  /* TODO no worksharing in speculative threads */
4218  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4219 
4220  this_thr->th.th_local.this_construct = 0;
4221 
4222  if (!this_thr->th.th_pri_common) {
4223  this_thr->th.th_pri_common =
4224  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4225  if (__kmp_storage_map) {
4226  __kmp_print_storage_map_gtid(
4227  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4228  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4229  }
4230  this_thr->th.th_pri_head = NULL;
4231  }
4232 
4233  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4234  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4235  // Make new thread's CG root same as primary thread's
4236  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4237  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4238  if (tmp) {
4239  // worker changes CG, need to check if old CG should be freed
4240  int i = tmp->cg_nthreads--;
4241  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4242  " on node %p of thread %p to %d\n",
4243  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4244  if (i == 1) {
4245  __kmp_free(tmp); // last thread left CG --> free it
4246  }
4247  }
4248  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4249  // Increment new thread's CG root's counter to add the new thread
4250  this_thr->th.th_cg_roots->cg_nthreads++;
4251  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4252  " node %p of thread %p to %d\n",
4253  this_thr, this_thr->th.th_cg_roots,
4254  this_thr->th.th_cg_roots->cg_root,
4255  this_thr->th.th_cg_roots->cg_nthreads));
4256  this_thr->th.th_current_task->td_icvs.thread_limit =
4257  this_thr->th.th_cg_roots->cg_thread_limit;
4258  }
4259 
4260  /* Initialize dynamic dispatch */
4261  {
4262  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4263  // Use team max_nproc since this will never change for the team.
4264  size_t disp_size =
4265  sizeof(dispatch_private_info_t) *
4266  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4267  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4268  team->t.t_max_nproc));
4269  KMP_ASSERT(dispatch);
4270  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4271  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4272 
4273  dispatch->th_disp_index = 0;
4274  dispatch->th_doacross_buf_idx = 0;
4275  if (!dispatch->th_disp_buffer) {
4276  dispatch->th_disp_buffer =
4277  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4278 
4279  if (__kmp_storage_map) {
4280  __kmp_print_storage_map_gtid(
4281  gtid, &dispatch->th_disp_buffer[0],
4282  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4283  ? 1
4284  : __kmp_dispatch_num_buffers],
4285  disp_size,
4286  "th_%d.th_dispatch.th_disp_buffer "
4287  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4288  gtid, team->t.t_id, gtid);
4289  }
4290  } else {
4291  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4292  }
4293 
4294  dispatch->th_dispatch_pr_current = 0;
4295  dispatch->th_dispatch_sh_current = 0;
4296 
4297  dispatch->th_deo_fcn = 0; /* ORDERED */
4298  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4299  }
4300 
4301  this_thr->th.th_next_pool = NULL;
4302 
4303  if (!this_thr->th.th_task_state_memo_stack) {
4304  size_t i;
4305  this_thr->th.th_task_state_memo_stack =
4306  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4307  this_thr->th.th_task_state_top = 0;
4308  this_thr->th.th_task_state_stack_sz = 4;
4309  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4310  ++i) // zero init the stack
4311  this_thr->th.th_task_state_memo_stack[i] = 0;
4312  }
4313 
4314  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4315  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4316 
4317  KMP_MB();
4318 }
4319 
4320 /* allocate a new thread for the requesting team. this is only called from
4321  within a forkjoin critical section. we will first try to get an available
4322  thread from the thread pool. if none is available, we will fork a new one
4323  assuming we are able to create a new one. this should be assured, as the
4324  caller should check on this first. */
4325 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4326  int new_tid) {
4327  kmp_team_t *serial_team;
4328  kmp_info_t *new_thr;
4329  int new_gtid;
4330 
4331  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4332  KMP_DEBUG_ASSERT(root && team);
4333 #if !KMP_NESTED_HOT_TEAMS
4334  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4335 #endif
4336  KMP_MB();
4337 
4338  /* first, try to get one from the thread pool */
4339  if (__kmp_thread_pool) {
4340  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4341  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4342  if (new_thr == __kmp_thread_pool_insert_pt) {
4343  __kmp_thread_pool_insert_pt = NULL;
4344  }
4345  TCW_4(new_thr->th.th_in_pool, FALSE);
4346  __kmp_suspend_initialize_thread(new_thr);
4347  __kmp_lock_suspend_mx(new_thr);
4348  if (new_thr->th.th_active_in_pool == TRUE) {
4349  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4350  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4351  new_thr->th.th_active_in_pool = FALSE;
4352  }
4353  __kmp_unlock_suspend_mx(new_thr);
4354 
4355  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4356  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4357  KMP_ASSERT(!new_thr->th.th_team);
4358  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4359 
4360  /* setup the thread structure */
4361  __kmp_initialize_info(new_thr, team, new_tid,
4362  new_thr->th.th_info.ds.ds_gtid);
4363  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4364 
4365  TCW_4(__kmp_nth, __kmp_nth + 1);
4366 
4367  new_thr->th.th_task_state = 0;
4368  new_thr->th.th_task_state_top = 0;
4369  new_thr->th.th_task_state_stack_sz = 4;
4370 
4371  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4372  // Make sure pool thread has transitioned to waiting on own thread struct
4373  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4374  // Thread activated in __kmp_allocate_team when increasing team size
4375  }
4376 
4377 #ifdef KMP_ADJUST_BLOCKTIME
4378  /* Adjust blocktime back to zero if necessary */
4379  /* Middle initialization might not have occurred yet */
4380  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4381  if (__kmp_nth > __kmp_avail_proc) {
4382  __kmp_zero_bt = TRUE;
4383  }
4384  }
4385 #endif /* KMP_ADJUST_BLOCKTIME */
4386 
4387 #if KMP_DEBUG
4388  // If the thread entered the pool via __kmp_free_thread, wait_flag should not
4389  // equal KMP_BARRIER_PARENT_FLAG.
4390  int b;
4391  kmp_balign_t *balign = new_thr->th.th_bar;
4392  for (b = 0; b < bs_last_barrier; ++b)
4393  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4394 #endif
4395 
4396  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4397  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4398 
4399  KMP_MB();
4400  return new_thr;
4401  }
4402 
4403  /* no, we'll fork a new one */
4404  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4405  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4406 
4407 #if KMP_USE_MONITOR
4408  // If this is the first worker thread the RTL is creating, then also
4409  // launch the monitor thread. We try to do this as early as possible.
4410  if (!TCR_4(__kmp_init_monitor)) {
4411  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4412  if (!TCR_4(__kmp_init_monitor)) {
4413  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4414  TCW_4(__kmp_init_monitor, 1);
4415  __kmp_create_monitor(&__kmp_monitor);
4416  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4417 #if KMP_OS_WINDOWS
4418  // AC: wait until monitor has started. This is a fix for CQ232808.
4419  // The reason is that if the library is loaded/unloaded in a loop with
4420  // small (parallel) work in between, then there is high probability that
4421  // monitor thread started after the library shutdown. At shutdown it is
4422  // too late to cope with the problem, because when the primary thread is
4423  // in DllMain (process detach) the monitor has no chances to start (it is
4424  // blocked), and primary thread has no means to inform the monitor that
4425  // the library has gone, because all the memory which the monitor can
4426  // access is going to be released/reset.
4427  while (TCR_4(__kmp_init_monitor) < 2) {
4428  KMP_YIELD(TRUE);
4429  }
4430  KF_TRACE(10, ("after monitor thread has started\n"));
4431 #endif
4432  }
4433  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4434  }
4435 #endif
4436 
4437  KMP_MB();
4438 
4439  {
4440  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4441  ? 1
4442  : __kmp_hidden_helper_threads_num + 1;
4443 
4444  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4445  ++new_gtid) {
4446  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4447  }
4448 
4449  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4450  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4451  }
4452  }
4453 
4454  /* allocate space for it. */
4455  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4456 
4457  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4458 
4459 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4460  // suppress race conditions detection on synchronization flags in debug mode
4461  // this helps to analyze library internals eliminating false positives
4462  __itt_suppress_mark_range(
4463  __itt_suppress_range, __itt_suppress_threading_errors,
4464  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4465  __itt_suppress_mark_range(
4466  __itt_suppress_range, __itt_suppress_threading_errors,
4467  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4468 #if KMP_OS_WINDOWS
4469  __itt_suppress_mark_range(
4470  __itt_suppress_range, __itt_suppress_threading_errors,
4471  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4472 #else
4473  __itt_suppress_mark_range(__itt_suppress_range,
4474  __itt_suppress_threading_errors,
4475  &new_thr->th.th_suspend_init_count,
4476  sizeof(new_thr->th.th_suspend_init_count));
4477 #endif
4478  // TODO: check if we need to also suppress b_arrived flags
4479  __itt_suppress_mark_range(__itt_suppress_range,
4480  __itt_suppress_threading_errors,
4481  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4482  sizeof(new_thr->th.th_bar[0].bb.b_go));
4483  __itt_suppress_mark_range(__itt_suppress_range,
4484  __itt_suppress_threading_errors,
4485  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4486  sizeof(new_thr->th.th_bar[1].bb.b_go));
4487  __itt_suppress_mark_range(__itt_suppress_range,
4488  __itt_suppress_threading_errors,
4489  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4490  sizeof(new_thr->th.th_bar[2].bb.b_go));
4491 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4492  if (__kmp_storage_map) {
4493  __kmp_print_thread_storage_map(new_thr, new_gtid);
4494  }
4495 
4496  // add the reserve serialized team, initialized from the team's primary thread
4497  {
4498  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4499  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4500  new_thr->th.th_serial_team = serial_team =
4501  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4502 #if OMPT_SUPPORT
4503  ompt_data_none, // root parallel id
4504 #endif
4505  proc_bind_default, &r_icvs,
4506  0 USE_NESTED_HOT_ARG(NULL));
4507  }
4508  KMP_ASSERT(serial_team);
4509  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4510  // for execution (it is unused for now).
4511  serial_team->t.t_threads[0] = new_thr;
4512  KF_TRACE(10,
4513  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4514  new_thr));
4515 
4516  /* setup the thread structures */
4517  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4518 
4519 #if USE_FAST_MEMORY
4520  __kmp_initialize_fast_memory(new_thr);
4521 #endif /* USE_FAST_MEMORY */
4522 
4523 #if KMP_USE_BGET
4524  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4525  __kmp_initialize_bget(new_thr);
4526 #endif
4527 
4528  __kmp_init_random(new_thr); // Initialize random number generator
4529 
4530  /* Initialize these only once when thread is grabbed for a team allocation */
4531  KA_TRACE(20,
4532  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4533  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4534 
4535  int b;
4536  kmp_balign_t *balign = new_thr->th.th_bar;
4537  for (b = 0; b < bs_last_barrier; ++b) {
4538  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4539  balign[b].bb.team = NULL;
4540  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4541  balign[b].bb.use_oncore_barrier = 0;
4542  }
4543 
4544  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4545  new_thr->th.th_sleep_loc_type = flag_unset;
4546 
4547  new_thr->th.th_spin_here = FALSE;
4548  new_thr->th.th_next_waiting = 0;
4549 #if KMP_OS_UNIX
4550  new_thr->th.th_blocking = false;
4551 #endif
4552 
4553 #if KMP_AFFINITY_SUPPORTED
4554  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4555  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4556  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4557  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4558 #endif
4559  new_thr->th.th_def_allocator = __kmp_def_allocator;
4560  new_thr->th.th_prev_level = 0;
4561  new_thr->th.th_prev_num_threads = 1;
4562 
4563  TCW_4(new_thr->th.th_in_pool, FALSE);
4564  new_thr->th.th_active_in_pool = FALSE;
4565  TCW_4(new_thr->th.th_active, TRUE);
4566 
4567  /* adjust the global counters */
4568  __kmp_all_nth++;
4569  __kmp_nth++;
4570 
4571  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4572  // numbers of procs, and method #2 (keyed API call) for higher numbers.
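// In other words: with only a few threads it is cheap to recover the gtid by
// searching the registered stack ranges for the current stack pointer, but
// that search degrades as the thread count grows, so once __kmp_all_nth
// reaches __kmp_tls_gtid_min the runtime switches to the keyed TLS lookup.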
4573  if (__kmp_adjust_gtid_mode) {
4574  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4575  if (TCR_4(__kmp_gtid_mode) != 2) {
4576  TCW_4(__kmp_gtid_mode, 2);
4577  }
4578  } else {
4579  if (TCR_4(__kmp_gtid_mode) != 1) {
4580  TCW_4(__kmp_gtid_mode, 1);
4581  }
4582  }
4583  }
4584 
4585 #ifdef KMP_ADJUST_BLOCKTIME
4586  /* Adjust blocktime back to zero if necessary */
4587  /* Middle initialization might not have occurred yet */
4588  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4589  if (__kmp_nth > __kmp_avail_proc) {
4590  __kmp_zero_bt = TRUE;
4591  }
4592  }
4593 #endif /* KMP_ADJUST_BLOCKTIME */
4594 
4595  /* actually fork it and create the new worker thread */
4596  KF_TRACE(
4597  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4598  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4599  KF_TRACE(10,
4600  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4601 
4602  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4603  new_gtid));
4604  KMP_MB();
4605  return new_thr;
4606 }
4607 
4608 /* Reinitialize team for reuse.
4609  The hot team code calls this routine at every fork barrier, so EPCC barrier
4610  tests are extremely sensitive to changes in it, especially writes to the team
4611  struct, which cause a cache invalidation in all threads.
4612  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4613 static void __kmp_reinitialize_team(kmp_team_t *team,
4614  kmp_internal_control_t *new_icvs,
4615  ident_t *loc) {
4616  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4617  team->t.t_threads[0], team));
4618  KMP_DEBUG_ASSERT(team && new_icvs);
4619  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4620  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4621 
4622  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4623  // Copy ICVs to the primary thread's implicit taskdata
4624  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4625  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4626 
4627  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4628  team->t.t_threads[0], team));
4629 }
4630 
4631 /* Initialize the team data structure.
4632  This assumes the t_threads and t_max_nproc are already set.
4633  Also, we don't touch the arguments */
4634 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4635  kmp_internal_control_t *new_icvs,
4636  ident_t *loc) {
4637  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4638 
4639  /* verify */
4640  KMP_DEBUG_ASSERT(team);
4641  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4642  KMP_DEBUG_ASSERT(team->t.t_threads);
4643  KMP_MB();
4644 
4645  team->t.t_master_tid = 0; /* not needed */
4646  /* team->t.t_master_bar; not needed */
4647  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4648  team->t.t_nproc = new_nproc;
4649 
4650  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4651  team->t.t_next_pool = NULL;
4652  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4653  * up hot team */
4654 
4655  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4656  team->t.t_invoke = NULL; /* not needed */
4657 
4658  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4659  team->t.t_sched.sched = new_icvs->sched.sched;
4660 
4661 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4662  team->t.t_fp_control_saved = FALSE; /* not needed */
4663  team->t.t_x87_fpu_control_word = 0; /* not needed */
4664  team->t.t_mxcsr = 0; /* not needed */
4665 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4666 
4667  team->t.t_construct = 0;
4668 
4669  team->t.t_ordered.dt.t_value = 0;
4670  team->t.t_master_active = FALSE;
4671 
4672 #ifdef KMP_DEBUG
4673  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4674 #endif
4675 #if KMP_OS_WINDOWS
4676  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4677 #endif
4678 
4679  team->t.t_control_stack_top = NULL;
4680 
4681  __kmp_reinitialize_team(team, new_icvs, loc);
4682 
4683  KMP_MB();
4684  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4685 }
4686 
4687 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4688 /* Sets the full mask for the thread and returns the old mask; no changes to structures. */
4689 static void
4690 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4691  if (KMP_AFFINITY_CAPABLE()) {
4692  int status;
4693  if (old_mask != NULL) {
4694  status = __kmp_get_system_affinity(old_mask, TRUE);
4695  int error = errno;
4696  if (status != 0) {
4697  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4698  __kmp_msg_null);
4699  }
4700  }
4701  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4702  }
4703 }
4704 #endif
4705 
4706 #if KMP_AFFINITY_SUPPORTED
4707 
4708 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4709 // It calculates the worker + primary thread's partition based upon the parent
4710 // thread's partition, and binds each worker to a thread in their partition.
4711 // The primary thread's partition should already include its current binding.
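//
// For example (illustrative, assuming more places exist beyond place 5): if
// the primary thread's partition is [2,5] and it is bound to place 3, then
// with proc_bind_close and 4 threads the workers are bound to places 4, 5,
// and 2 (wrapping within the partition), while with proc_bind_primary all
// workers are bound to place 3.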
4712 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4713  // Do not partition places for the hidden helper team
4714  if (KMP_HIDDEN_HELPER_TEAM(team))
4715  return;
4716  // Copy the primary thread's place partition to the team struct
4717  kmp_info_t *master_th = team->t.t_threads[0];
4718  KMP_DEBUG_ASSERT(master_th != NULL);
4719  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4720  int first_place = master_th->th.th_first_place;
4721  int last_place = master_th->th.th_last_place;
4722  int masters_place = master_th->th.th_current_place;
4723  team->t.t_first_place = first_place;
4724  team->t.t_last_place = last_place;
4725 
4726  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4727  "bound to place %d partition = [%d,%d]\n",
4728  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4729  team->t.t_id, masters_place, first_place, last_place));
4730 
4731  switch (proc_bind) {
4732 
4733  case proc_bind_default:
4734  // Serial teams might have the proc_bind policy set to proc_bind_default.
4735  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4736  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4737  break;
4738 
4739  case proc_bind_primary: {
4740  int f;
4741  int n_th = team->t.t_nproc;
4742  for (f = 1; f < n_th; f++) {
4743  kmp_info_t *th = team->t.t_threads[f];
4744  KMP_DEBUG_ASSERT(th != NULL);
4745  th->th.th_first_place = first_place;
4746  th->th.th_last_place = last_place;
4747  th->th.th_new_place = masters_place;
4748  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4749  team->t.t_display_affinity != 1) {
4750  team->t.t_display_affinity = 1;
4751  }
4752 
4753  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4754  "partition = [%d,%d]\n",
4755  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4756  f, masters_place, first_place, last_place));
4757  }
4758  } break;
4759 
4760  case proc_bind_close: {
4761  int f;
4762  int n_th = team->t.t_nproc;
4763  int n_places;
4764  if (first_place <= last_place) {
4765  n_places = last_place - first_place + 1;
4766  } else {
4767  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4768  }
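// The partition may wrap around the end of the place list. For example, with
// __kmp_affinity_num_masks == 8 and a partition of [6,2], the count is
// 8 - 6 + 2 + 1 = 5 places (6, 7, 0, 1, 2).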
4769  if (n_th <= n_places) {
4770  int place = masters_place;
4771  for (f = 1; f < n_th; f++) {
4772  kmp_info_t *th = team->t.t_threads[f];
4773  KMP_DEBUG_ASSERT(th != NULL);
4774 
4775  if (place == last_place) {
4776  place = first_place;
4777  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4778  place = 0;
4779  } else {
4780  place++;
4781  }
4782  th->th.th_first_place = first_place;
4783  th->th.th_last_place = last_place;
4784  th->th.th_new_place = place;
4785  if (__kmp_display_affinity && place != th->th.th_current_place &&
4786  team->t.t_display_affinity != 1) {
4787  team->t.t_display_affinity = 1;
4788  }
4789 
4790  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4791  "partition = [%d,%d]\n",
4792  __kmp_gtid_from_thread(team->t.t_threads[f]),
4793  team->t.t_id, f, place, first_place, last_place));
4794  }
4795  } else {
4796  int S, rem, gap, s_count;
4797  S = n_th / n_places;
4798  s_count = 0;
4799  rem = n_th - (S * n_places);
4800  gap = rem > 0 ? n_places / rem : n_places;
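// Each place gets at least S threads, and rem places get one extra, spaced
// roughly every 'gap' places. For example (illustrative), 10 threads over 4
// places gives S = 2, rem = 2, gap = 2, so the places receive 3, 2, 3, 2
// threads, starting from the primary thread's place.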
4801  int place = masters_place;
4802  int gap_ct = gap;
4803  for (f = 0; f < n_th; f++) {
4804  kmp_info_t *th = team->t.t_threads[f];
4805  KMP_DEBUG_ASSERT(th != NULL);
4806 
4807  th->th.th_first_place = first_place;
4808  th->th.th_last_place = last_place;
4809  th->th.th_new_place = place;
4810  if (__kmp_display_affinity && place != th->th.th_current_place &&
4811  team->t.t_display_affinity != 1) {
4812  team->t.t_display_affinity = 1;
4813  }
4814  s_count++;
4815 
4816  if ((s_count == S) && rem && (gap_ct == gap)) {
4817  // do nothing; add an extra thread to this place on the next iteration
4818  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4819  // we added an extra thread to this place; move to next place
4820  if (place == last_place) {
4821  place = first_place;
4822  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4823  place = 0;
4824  } else {
4825  place++;
4826  }
4827  s_count = 0;
4828  gap_ct = 1;
4829  rem--;
4830  } else if (s_count == S) { // place full; don't add extra
4831  if (place == last_place) {
4832  place = first_place;
4833  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4834  place = 0;
4835  } else {
4836  place++;
4837  }
4838  gap_ct++;
4839  s_count = 0;
4840  }
4841 
4842  KA_TRACE(100,
4843  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4844  "partition = [%d,%d]\n",
4845  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4846  th->th.th_new_place, first_place, last_place));
4847  }
4848  KMP_DEBUG_ASSERT(place == masters_place);
4849  }
4850  } break;
4851 
4852  case proc_bind_spread: {
4853  int f;
4854  int n_th = team->t.t_nproc;
4855  int n_places;
4856  int thidx;
4857  if (first_place <= last_place) {
4858  n_places = last_place - first_place + 1;
4859  } else {
4860  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4861  }
4862  if (n_th <= n_places) {
4863  int place = -1;
4864 
4865  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4866  int S = n_places / n_th;
4867  int s_count, rem, gap, gap_ct;
4868 
4869  place = masters_place;
4870  rem = n_places - n_th * S;
4871  gap = rem ? n_th / rem : 1;
4872  gap_ct = gap;
4873  thidx = n_th;
4874  if (update_master_only == 1)
4875  thidx = 1;
4876  for (f = 0; f < thidx; f++) {
4877  kmp_info_t *th = team->t.t_threads[f];
4878  KMP_DEBUG_ASSERT(th != NULL);
4879 
4880  th->th.th_first_place = place;
4881  th->th.th_new_place = place;
4882  if (__kmp_display_affinity && place != th->th.th_current_place &&
4883  team->t.t_display_affinity != 1) {
4884  team->t.t_display_affinity = 1;
4885  }
4886  s_count = 1;
4887  while (s_count < S) {
4888  if (place == last_place) {
4889  place = first_place;
4890  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4891  place = 0;
4892  } else {
4893  place++;
4894  }
4895  s_count++;
4896  }
4897  if (rem && (gap_ct == gap)) {
4898  if (place == last_place) {
4899  place = first_place;
4900  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4901  place = 0;
4902  } else {
4903  place++;
4904  }
4905  rem--;
4906  gap_ct = 0;
4907  }
4908  th->th.th_last_place = place;
4909  gap_ct++;
4910 
4911  if (place == last_place) {
4912  place = first_place;
4913  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4914  place = 0;
4915  } else {
4916  place++;
4917  }
4918 
4919  KA_TRACE(100,
4920  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4921  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4922  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4923  f, th->th.th_new_place, th->th.th_first_place,
4924  th->th.th_last_place, __kmp_affinity_num_masks));
4925  }
4926  } else {
4927  /* Having a uniform space of available computation places, we can create
4928  T partitions of round(P/T) size and put each thread into the first
4929  place of its partition. */
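// For example (illustrative): 3 threads over a uniform space of 8 places,
// with the primary thread on place 0, gives spacing = (8 + 1) / 3 = 3.0 and
// partitions [0,2], [3,5], [6,7]; the threads land on places 0, 3, and 6,
// respectively.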
4930  double current = static_cast<double>(masters_place);
4931  double spacing =
4932  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4933  int first, last;
4934  kmp_info_t *th;
4935 
4936  thidx = n_th + 1;
4937  if (update_master_only == 1)
4938  thidx = 1;
4939  for (f = 0; f < thidx; f++) {
4940  first = static_cast<int>(current);
4941  last = static_cast<int>(current + spacing) - 1;
4942  KMP_DEBUG_ASSERT(last >= first);
4943  if (first >= n_places) {
4944  if (masters_place) {
4945  first -= n_places;
4946  last -= n_places;
4947  if (first == (masters_place + 1)) {
4948  KMP_DEBUG_ASSERT(f == n_th);
4949  first--;
4950  }
4951  if (last == masters_place) {
4952  KMP_DEBUG_ASSERT(f == (n_th - 1));
4953  last--;
4954  }
4955  } else {
4956  KMP_DEBUG_ASSERT(f == n_th);
4957  first = 0;
4958  last = 0;
4959  }
4960  }
4961  if (last >= n_places) {
4962  last = (n_places - 1);
4963  }
4964  place = first;
4965  current += spacing;
4966  if (f < n_th) {
4967  KMP_DEBUG_ASSERT(0 <= first);
4968  KMP_DEBUG_ASSERT(n_places > first);
4969  KMP_DEBUG_ASSERT(0 <= last);
4970  KMP_DEBUG_ASSERT(n_places > last);
4971  KMP_DEBUG_ASSERT(last_place >= first_place);
4972  th = team->t.t_threads[f];
4973  KMP_DEBUG_ASSERT(th);
4974  th->th.th_first_place = first;
4975  th->th.th_new_place = place;
4976  th->th.th_last_place = last;
4977  if (__kmp_display_affinity && place != th->th.th_current_place &&
4978  team->t.t_display_affinity != 1) {
4979  team->t.t_display_affinity = 1;
4980  }
4981  KA_TRACE(100,
4982  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4983  "partition = [%d,%d], spacing = %.4f\n",
4984  __kmp_gtid_from_thread(team->t.t_threads[f]),
4985  team->t.t_id, f, th->th.th_new_place,
4986  th->th.th_first_place, th->th.th_last_place, spacing));
4987  }
4988  }
4989  }
4990  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4991  } else {
4992  int S, rem, gap, s_count;
4993  S = n_th / n_places;
4994  s_count = 0;
4995  rem = n_th - (S * n_places);
4996  gap = rem > 0 ? n_places / rem : n_places;
4997  int place = masters_place;
4998  int gap_ct = gap;
4999  thidx = n_th;
5000  if (update_master_only == 1)
5001  thidx = 1;
5002  for (f = 0; f < thidx; f++) {
5003  kmp_info_t *th = team->t.t_threads[f];
5004  KMP_DEBUG_ASSERT(th != NULL);
5005 
5006  th->th.th_first_place = place;
5007  th->th.th_last_place = place;
5008  th->th.th_new_place = place;
5009  if (__kmp_display_affinity && place != th->th.th_current_place &&
5010  team->t.t_display_affinity != 1) {
5011  team->t.t_display_affinity = 1;
5012  }
5013  s_count++;
5014 
5015  if ((s_count == S) && rem && (gap_ct == gap)) {
5016  // do nothing; add an extra thread to this place on the next iteration
5017  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5018  // we added an extra thread to this place; move on to next place
5019  if (place == last_place) {
5020  place = first_place;
5021  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5022  place = 0;
5023  } else {
5024  place++;
5025  }
5026  s_count = 0;
5027  gap_ct = 1;
5028  rem--;
5029  } else if (s_count == S) { // place is full; don't add extra thread
5030  if (place == last_place) {
5031  place = first_place;
5032  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
5033  place = 0;
5034  } else {
5035  place++;
5036  }
5037  gap_ct++;
5038  s_count = 0;
5039  }
5040 
5041  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5042  "partition = [%d,%d]\n",
5043  __kmp_gtid_from_thread(team->t.t_threads[f]),
5044  team->t.t_id, f, th->th.th_new_place,
5045  th->th.th_first_place, th->th.th_last_place));
5046  }
5047  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5048  }
5049  } break;
5050 
5051  default:
5052  break;
5053  }
5054 
5055  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5056 }
5057 
5058 #endif // KMP_AFFINITY_SUPPORTED
5059 
5060 /* allocate a new team data structure to use. take one off of the free pool if
5061  available */
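// In outline, the code below tries three sources in order: reuse the root's
// (or nested) hot team when possible, otherwise take a sufficiently large
// team from the team free pool, and only as a last resort allocate and
// initialize a brand new kmp_team_t.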
5062 kmp_team_t *
5063 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5064 #if OMPT_SUPPORT
5065  ompt_data_t ompt_parallel_data,
5066 #endif
5067  kmp_proc_bind_t new_proc_bind,
5068  kmp_internal_control_t *new_icvs,
5069  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5070  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5071  int f;
5072  kmp_team_t *team;
5073  int use_hot_team = !root->r.r_active;
5074  int level = 0;
5075  int do_place_partition = 1;
5076 
5077  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5078  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5079  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5080  KMP_MB();
5081 
5082 #if KMP_NESTED_HOT_TEAMS
5083  kmp_hot_team_ptr_t *hot_teams;
5084  if (master) {
5085  team = master->th.th_team;
5086  level = team->t.t_active_level;
5087  if (master->th.th_teams_microtask) { // in teams construct?
5088  if (master->th.th_teams_size.nteams > 1 &&
5089  ( // #teams > 1
5090  team->t.t_pkfn ==
5091  (microtask_t)__kmp_teams_master || // inner fork of the teams
5092  master->th.th_teams_level <
5093  team->t.t_level)) { // or nested parallel inside the teams
5094  ++level; // do not increment if #teams==1 or for the outer fork of the teams;
5095  // increment otherwise
5096  }
5097  // Do not perform the place partition if inner fork of the teams
5098  // Wait until nested parallel region encountered inside teams construct
5099  if ((master->th.th_teams_size.nteams == 1 &&
5100  master->th.th_teams_level >= team->t.t_level) ||
5101  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5102  do_place_partition = 0;
5103  }
5104  hot_teams = master->th.th_hot_teams;
5105  if (level < __kmp_hot_teams_max_level && hot_teams &&
5106  hot_teams[level].hot_team) {
5107  // hot team has already been allocated for given level
5108  use_hot_team = 1;
5109  } else {
5110  use_hot_team = 0;
5111  }
5112  } else {
5113  // check we won't access uninitialized hot_teams, just in case
5114  KMP_DEBUG_ASSERT(new_nproc == 1);
5115  }
5116 #endif
5117  // Optimization to use a "hot" team
5118  if (use_hot_team && new_nproc > 1) {
5119  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5120 #if KMP_NESTED_HOT_TEAMS
5121  team = hot_teams[level].hot_team;
5122 #else
5123  team = root->r.r_hot_team;
5124 #endif
5125 #if KMP_DEBUG
5126  if (__kmp_tasking_mode != tskm_immediate_exec) {
5127  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5128  "task_team[1] = %p before reinit\n",
5129  team->t.t_task_team[0], team->t.t_task_team[1]));
5130  }
5131 #endif
5132 
5133  if (team->t.t_nproc != new_nproc &&
5134  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5135  // Distributed barrier may need a resize
5136  int old_nthr = team->t.t_nproc;
5137  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5138  }
5139 
5140  // If not doing the place partition, then reset the team's proc bind
5141  // to indicate that partitioning of all threads still needs to take place
5142  if (do_place_partition == 0)
5143  team->t.t_proc_bind = proc_bind_default;
5144  // Has the number of threads changed?
5145  /* Let's assume the most common case is that the number of threads is
5146  unchanged, and put that case first. */
5147  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5148  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5149  // This case can mean that omp_set_num_threads() was called and the hot
5150  // team size was already reduced, so we check the special flag
5151  if (team->t.t_size_changed == -1) {
5152  team->t.t_size_changed = 1;
5153  } else {
5154  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5155  }
5156 
5157  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5158  kmp_r_sched_t new_sched = new_icvs->sched;
5159  // set primary thread's schedule as new run-time schedule
5160  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5161 
5162  __kmp_reinitialize_team(team, new_icvs,
5163  root->r.r_uber_thread->th.th_ident);
5164 
5165  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5166  team->t.t_threads[0], team));
5167  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5168 
5169 #if KMP_AFFINITY_SUPPORTED
5170  if ((team->t.t_size_changed == 0) &&
5171  (team->t.t_proc_bind == new_proc_bind)) {
5172  if (new_proc_bind == proc_bind_spread) {
5173  if (do_place_partition) {
5174  // add flag to update only master for spread
5175  __kmp_partition_places(team, 1);
5176  }
5177  }
5178  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5179  "proc_bind = %d, partition = [%d,%d]\n",
5180  team->t.t_id, new_proc_bind, team->t.t_first_place,
5181  team->t.t_last_place));
5182  } else {
5183  if (do_place_partition) {
5184  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5185  __kmp_partition_places(team);
5186  }
5187  }
5188 #else
5189  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5190 #endif /* KMP_AFFINITY_SUPPORTED */
5191  } else if (team->t.t_nproc > new_nproc) {
5192  KA_TRACE(20,
5193  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5194  new_nproc));
5195 
5196  team->t.t_size_changed = 1;
5197  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5198  // Barrier size already reduced earlier in this function
5199  // Activate team threads via th_used_in_team
5200  __kmp_add_threads_to_team(team, new_nproc);
5201  }
5202 #if KMP_NESTED_HOT_TEAMS
5203  if (__kmp_hot_teams_mode == 0) {
5204  // AC: the saved number of threads should correspond to the team's value in
5205  // this mode; it can be bigger in mode 1, when the hot team has threads in reserve
5206  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5207  hot_teams[level].hot_team_nth = new_nproc;
5208 #endif // KMP_NESTED_HOT_TEAMS
5209  /* release the extra threads we don't need any more */
5210  for (f = new_nproc; f < team->t.t_nproc; f++) {
5211  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5212  if (__kmp_tasking_mode != tskm_immediate_exec) {
5213  // When decreasing team size, threads no longer in the team should
5214  // unref task team.
5215  team->t.t_threads[f]->th.th_task_team = NULL;
5216  }
5217  __kmp_free_thread(team->t.t_threads[f]);
5218  team->t.t_threads[f] = NULL;
5219  }
5220 #if KMP_NESTED_HOT_TEAMS
5221  } // (__kmp_hot_teams_mode == 0)
5222  else {
5223  // When keeping extra threads in team, switch threads to wait on own
5224  // b_go flag
5225  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5226  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5227  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5228  for (int b = 0; b < bs_last_barrier; ++b) {
5229  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5230  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5231  }
5232  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5233  }
5234  }
5235  }
5236 #endif // KMP_NESTED_HOT_TEAMS
5237  team->t.t_nproc = new_nproc;
5238  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5239  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5240  __kmp_reinitialize_team(team, new_icvs,
5241  root->r.r_uber_thread->th.th_ident);
5242 
5243  // Update remaining threads
5244  for (f = 0; f < new_nproc; ++f) {
5245  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5246  }
5247 
5248  // restore the current task state of the primary thread: should be the
5249  // implicit task
5250  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5251  team->t.t_threads[0], team));
5252 
5253  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5254 
5255 #ifdef KMP_DEBUG
5256  for (f = 0; f < team->t.t_nproc; f++) {
5257  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5258  team->t.t_threads[f]->th.th_team_nproc ==
5259  team->t.t_nproc);
5260  }
5261 #endif
5262 
5263  if (do_place_partition) {
5264  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5265 #if KMP_AFFINITY_SUPPORTED
5266  __kmp_partition_places(team);
5267 #endif
5268  }
5269  } else { // team->t.t_nproc < new_nproc
5270 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5271  kmp_affin_mask_t *old_mask;
5272  if (KMP_AFFINITY_CAPABLE()) {
5273  KMP_CPU_ALLOC(old_mask);
5274  }
5275 #endif
5276 
5277  KA_TRACE(20,
5278  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5279  new_nproc));
5280  int old_nproc = team->t.t_nproc; // save old value and use to update only
5281  team->t.t_size_changed = 1;
5282 
5283 #if KMP_NESTED_HOT_TEAMS
5284  int avail_threads = hot_teams[level].hot_team_nth;
5285  if (new_nproc < avail_threads)
5286  avail_threads = new_nproc;
5287  kmp_info_t **other_threads = team->t.t_threads;
5288  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5289  // Adjust barrier data of reserved threads (if any) of the team
5290  // Other data will be set in __kmp_initialize_info() below.
5291  int b;
5292  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5293  for (b = 0; b < bs_last_barrier; ++b) {
5294  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5295  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5296 #if USE_DEBUGGER
5297  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5298 #endif
5299  }
5300  }
5301  if (hot_teams[level].hot_team_nth >= new_nproc) {
5302  // we have all needed threads in reserve, no need to allocate any;
5303  // this is only possible in mode 1 -- we cannot have reserved threads in mode 0
5304  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5305  team->t.t_nproc = new_nproc; // just get reserved threads involved
5306  } else {
5307  // We may have some threads in reserve, but not enough;
5308  // get reserved threads involved if any.
5309  team->t.t_nproc = hot_teams[level].hot_team_nth;
5310  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5311 #endif // KMP_NESTED_HOT_TEAMS
5312  if (team->t.t_max_nproc < new_nproc) {
5313  /* reallocate larger arrays */
5314  __kmp_reallocate_team_arrays(team, new_nproc);
5315  __kmp_reinitialize_team(team, new_icvs, NULL);
5316  }
5317 
5318 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5319  /* Temporarily set full mask for primary thread before creation of
5320  workers. The reason is that workers inherit the affinity from the
5321  primary thread, so if a lot of workers are created on a single
5322  core quickly, they don't get a chance to set their own affinity for
5323  a long time. */
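// (The call below also captures the primary thread's original mask into
// old_mask; it is restored once the new workers have been created, further
// below.)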
5324  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5325 #endif
5326 
5327  /* allocate new threads for the hot team */
5328  for (f = team->t.t_nproc; f < new_nproc; f++) {
5329  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5330  KMP_DEBUG_ASSERT(new_worker);
5331  team->t.t_threads[f] = new_worker;
5332 
5333  KA_TRACE(20,
5334  ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5335  "join=%llu, plain=%llu\n",
5336  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5337  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5338  team->t.t_bar[bs_plain_barrier].b_arrived));
5339 
5340  { // Initialize barrier data for new threads.
5341  int b;
5342  kmp_balign_t *balign = new_worker->th.th_bar;
5343  for (b = 0; b < bs_last_barrier; ++b) {
5344  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5345  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5346  KMP_BARRIER_PARENT_FLAG);
5347 #if USE_DEBUGGER
5348  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5349 #endif
5350  }
5351  }
5352  }
5353 
5354 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5355  if (KMP_AFFINITY_CAPABLE()) {
5356  /* Restore initial primary thread's affinity mask */
5357  __kmp_set_system_affinity(old_mask, TRUE);
5358  KMP_CPU_FREE(old_mask);
5359  }
5360 #endif
5361 #if KMP_NESTED_HOT_TEAMS
5362  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5363 #endif // KMP_NESTED_HOT_TEAMS
5364  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5365  // Barrier size already increased earlier in this function
5366  // Activate team threads via th_used_in_team
5367  __kmp_add_threads_to_team(team, new_nproc);
5368  }
5369  /* make sure everyone is synchronized */
5370  // new threads are initialized below
5371  __kmp_initialize_team(team, new_nproc, new_icvs,
5372  root->r.r_uber_thread->th.th_ident);
5373 
5374  /* reinitialize the threads */
5375  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5376  for (f = 0; f < team->t.t_nproc; ++f)
5377  __kmp_initialize_info(team->t.t_threads[f], team, f,
5378  __kmp_gtid_from_tid(f, team));
5379 
5380  if (level) { // set th_task_state for new threads in nested hot team
5381  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5382  // only need to set the th_task_state for the new threads. th_task_state
5383  // for primary thread will not be accurate until after this in
5384  // __kmp_fork_call(), so we look to the primary thread's memo_stack to
5385  // get the correct value.
5386  for (f = old_nproc; f < team->t.t_nproc; ++f)
5387  team->t.t_threads[f]->th.th_task_state =
5388  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5389  } else { // set th_task_state for new threads in non-nested hot team
5390  // copy primary thread's state
5391  kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state;
5392  for (f = old_nproc; f < team->t.t_nproc; ++f)
5393  team->t.t_threads[f]->th.th_task_state = old_state;
5394  }
5395 
5396 #ifdef KMP_DEBUG
5397  for (f = 0; f < team->t.t_nproc; ++f) {
5398  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5399  team->t.t_threads[f]->th.th_team_nproc ==
5400  team->t.t_nproc);
5401  }
5402 #endif
5403 
5404  if (do_place_partition) {
5405  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5406 #if KMP_AFFINITY_SUPPORTED
5407  __kmp_partition_places(team);
5408 #endif
5409  }
5410  } // Check changes in number of threads
5411 
5412  kmp_info_t *master = team->t.t_threads[0];
5413  if (master->th.th_teams_microtask) {
5414  for (f = 1; f < new_nproc; ++f) {
5415  // propagate teams construct specific info to workers
5416  kmp_info_t *thr = team->t.t_threads[f];
5417  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5418  thr->th.th_teams_level = master->th.th_teams_level;
5419  thr->th.th_teams_size = master->th.th_teams_size;
5420  }
5421  }
5422 #if KMP_NESTED_HOT_TEAMS
5423  if (level) {
5424  // Sync barrier state for nested hot teams, not needed for outermost hot
5425  // team.
5426  for (f = 1; f < new_nproc; ++f) {
5427  kmp_info_t *thr = team->t.t_threads[f];
5428  int b;
5429  kmp_balign_t *balign = thr->th.th_bar;
5430  for (b = 0; b < bs_last_barrier; ++b) {
5431  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5432  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5433 #if USE_DEBUGGER
5434  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5435 #endif
5436  }
5437  }
5438  }
5439 #endif // KMP_NESTED_HOT_TEAMS
5440 
5441  /* reallocate space for arguments if necessary */
5442  __kmp_alloc_argv_entries(argc, team, TRUE);
5443  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5444  // The hot team re-uses the previous task team,
5445  // if untouched during the previous release->gather phase.
5446 
5447  KF_TRACE(10, (" hot_team = %p\n", team));
5448 
5449 #if KMP_DEBUG
5450  if (__kmp_tasking_mode != tskm_immediate_exec) {
5451  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5452  "task_team[1] = %p after reinit\n",
5453  team->t.t_task_team[0], team->t.t_task_team[1]));
5454  }
5455 #endif
5456 
5457 #if OMPT_SUPPORT
5458  __ompt_team_assign_id(team, ompt_parallel_data);
5459 #endif
5460 
5461  KMP_MB();
5462 
5463  return team;
5464  }
5465 
5466  /* next, let's try to take one from the team pool */
5467  KMP_MB();
5468  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5469  /* TODO: consider resizing undersized teams instead of reaping them, now
5470  that we have a resizing mechanism */
5471  if (team->t.t_max_nproc >= max_nproc) {
5472  /* take this team from the team pool */
5473  __kmp_team_pool = team->t.t_next_pool;
5474 
5475  if (max_nproc > 1 &&
5476  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5477  if (!team->t.b) { // Allocate barrier structure
5478  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5479  }
5480  }
5481 
5482  /* setup the team for fresh use */
5483  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5484 
5485  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5486  "task_team[1] %p to NULL\n",
5487  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5488  team->t.t_task_team[0] = NULL;
5489  team->t.t_task_team[1] = NULL;
5490 
5491  /* reallocate space for arguments if necessary */
5492  __kmp_alloc_argv_entries(argc, team, TRUE);
5493  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5494 
5495  KA_TRACE(
5496  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5497  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5498  { // Initialize barrier data.
5499  int b;
5500  for (b = 0; b < bs_last_barrier; ++b) {
5501  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5502 #if USE_DEBUGGER
5503  team->t.t_bar[b].b_master_arrived = 0;
5504  team->t.t_bar[b].b_team_arrived = 0;
5505 #endif
5506  }
5507  }
5508 
5509  team->t.t_proc_bind = new_proc_bind;
5510 
5511  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5512  team->t.t_id));
5513 
5514 #if OMPT_SUPPORT
5515  __ompt_team_assign_id(team, ompt_parallel_data);
5516 #endif
5517 
5518  KMP_MB();
5519 
5520  return team;
5521  }
5522 
5523  /* reap team if it is too small, then loop back and check the next one */
5524  // not sure if this is wise, but it will be redone during the hot-teams
5525  // rewrite.
5526  /* TODO: Use technique to find the right size hot-team, don't reap them */
5527  team = __kmp_reap_team(team);
5528  __kmp_team_pool = team;
5529  }
5530 
5531  /* nothing available in the pool, no matter, make a new team! */
5532  KMP_MB();
5533  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5534 
5535  /* and set it up */
5536  team->t.t_max_nproc = max_nproc;
5537  if (max_nproc > 1 &&
5538  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5539  // Allocate barrier structure
5540  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5541  }
5542 
5543  /* NOTE well: for some reason, allocating one big buffer and dividing it up
5544  seems to really hurt performance on the P4, so let's not use this. */
5545  __kmp_allocate_team_arrays(team, max_nproc);
5546 
5547  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5548  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5549 
5550  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5551  "%p to NULL\n",
5552  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5553  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5554  // memory, no need to duplicate
5555  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5556  // memory, no need to duplicate
5557 
5558  if (__kmp_storage_map) {
5559  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5560  }
5561 
5562  /* allocate space for arguments */
5563  __kmp_alloc_argv_entries(argc, team, FALSE);
5564  team->t.t_argc = argc;
5565 
5566  KA_TRACE(20,
5567  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5568  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5569  { // Initialize barrier data.
5570  int b;
5571  for (b = 0; b < bs_last_barrier; ++b) {
5572  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5573 #if USE_DEBUGGER
5574  team->t.t_bar[b].b_master_arrived = 0;
5575  team->t.t_bar[b].b_team_arrived = 0;
5576 #endif
5577  }
5578  }
5579 
5580  team->t.t_proc_bind = new_proc_bind;
5581 
5582 #if OMPT_SUPPORT
5583  __ompt_team_assign_id(team, ompt_parallel_data);
5584  team->t.ompt_serialized_team_info = NULL;
5585 #endif
5586 
5587  KMP_MB();
5588 
5589  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5590  team->t.t_id));
5591 
5592  return team;
5593 }
5594 
5595 /* TODO implement hot-teams at all levels */
5596 /* TODO implement lazy thread release on demand (disband request) */
5597 
5598 /* free the team. return it to the team pool. release all the threads
5599  * associated with it */
5600 void __kmp_free_team(kmp_root_t *root,
5601  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5602  int f;
5603  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5604  team->t.t_id));
5605 
5606  /* verify state */
5607  KMP_DEBUG_ASSERT(root);
5608  KMP_DEBUG_ASSERT(team);
5609  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5610  KMP_DEBUG_ASSERT(team->t.t_threads);
5611 
5612  int use_hot_team = team == root->r.r_hot_team;
5613 #if KMP_NESTED_HOT_TEAMS
5614  int level;
5615  if (master) {
5616  level = team->t.t_active_level - 1;
5617  if (master->th.th_teams_microtask) { // in teams construct?
5618  if (master->th.th_teams_size.nteams > 1) {
5619  ++level; // level was not increased in teams construct for
5620  // team_of_masters
5621  }
5622  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5623  master->th.th_teams_level == team->t.t_level) {
5624  ++level; // level was not increased in teams construct for
5625  // team_of_workers before the parallel
5626  } // team->t.t_level will be increased inside parallel
5627  }
5628 #if KMP_DEBUG
5629  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5630 #endif
5631  if (level < __kmp_hot_teams_max_level) {
5632  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5633  use_hot_team = 1;
5634  }
5635  }
5636 #endif // KMP_NESTED_HOT_TEAMS
5637 
5638  /* team is done working */
5639  TCW_SYNC_PTR(team->t.t_pkfn,
5640  NULL); // Important for Debugging Support Library.
5641 #if KMP_OS_WINDOWS
5642  team->t.t_copyin_counter = 0; // init counter for possible reuse
5643 #endif
5644  // Do not reset pointer to parent team to NULL for hot teams.
5645 
5646  /* if we are non-hot team, release our threads */
5647  if (!use_hot_team) {
5648  if (__kmp_tasking_mode != tskm_immediate_exec) {
5649  // Wait for threads to reach reapable state
5650  for (f = 1; f < team->t.t_nproc; ++f) {
5651  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5652  kmp_info_t *th = team->t.t_threads[f];
5653  volatile kmp_uint32 *state = &th->th.th_reap_state;
5654  while (*state != KMP_SAFE_TO_REAP) {
5655 #if KMP_OS_WINDOWS
5656  // On Windows a thread can be killed at any time, check this
5657  DWORD ecode;
5658  if (!__kmp_is_thread_alive(th, &ecode)) {
5659  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5660  break;
5661  }
5662 #endif
5663  // first check if thread is sleeping
5664  kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5665  if (fl.is_sleeping())
5666  fl.resume(__kmp_gtid_from_thread(th));
5667  KMP_CPU_PAUSE();
5668  }
5669  }
5670 
5671  // Delete task teams
5672  int tt_idx;
5673  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5674  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5675  if (task_team != NULL) {
5676  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5677  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5678  team->t.t_threads[f]->th.th_task_team = NULL;
5679  }
5680  KA_TRACE(
5681  20,
5682  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5683  __kmp_get_gtid(), task_team, team->t.t_id));
5684 #if KMP_NESTED_HOT_TEAMS
5685  __kmp_free_task_team(master, task_team);
5686 #endif
5687  team->t.t_task_team[tt_idx] = NULL;
5688  }
5689  }
5690  }
5691 
5692  // Reset pointer to parent team only for non-hot teams.
5693  team->t.t_parent = NULL;
5694  team->t.t_level = 0;
5695  team->t.t_active_level = 0;
5696 
5697  /* free the worker threads */
5698  for (f = 1; f < team->t.t_nproc; ++f) {
5699  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5700  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5701  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5702  1, 2);
5703  }
5704  __kmp_free_thread(team->t.t_threads[f]);
5705  }
5706 
5707  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5708  if (team->t.b) {
5709  // wake up thread at old location
5710  team->t.b->go_release();
5711  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5712  for (f = 1; f < team->t.t_nproc; ++f) {
5713  if (team->t.b->sleep[f].sleep) {
5714  __kmp_atomic_resume_64(
5715  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5716  (kmp_atomic_flag_64<> *)NULL);
5717  }
5718  }
5719  }
5720  // Wait for threads to be removed from team
5721  for (int f = 1; f < team->t.t_nproc; ++f) {
5722  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5723  KMP_CPU_PAUSE();
5724  }
5725  }
5726  }
5727 
5728  for (f = 1; f < team->t.t_nproc; ++f) {
5729  team->t.t_threads[f] = NULL;
5730  }
5731 
5732  if (team->t.t_max_nproc > 1 &&
5733  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5734  distributedBarrier::deallocate(team->t.b);
5735  team->t.b = NULL;
5736  }
5737  /* put the team back in the team pool */
5738  /* TODO limit size of team pool, call reap_team if pool too large */
5739  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5740  __kmp_team_pool = (volatile kmp_team_t *)team;
5741  } else { // Check if team was created for primary threads in teams construct
5742  // See if first worker is a CG root
5743  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5744  team->t.t_threads[1]->th.th_cg_roots);
5745  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5746  // Clean up the CG root nodes on workers so that this team can be re-used
5747  for (f = 1; f < team->t.t_nproc; ++f) {
5748  kmp_info_t *thr = team->t.t_threads[f];
5749  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5750  thr->th.th_cg_roots->cg_root == thr);
5751  // Pop current CG root off list
5752  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5753  thr->th.th_cg_roots = tmp->up;
5754  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5755  " up to node %p. cg_nthreads was %d\n",
5756  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5757  int i = tmp->cg_nthreads--;
5758  if (i == 1) {
5759  __kmp_free(tmp); // free CG if we are the last thread in it
5760  }
5761  // Restore current task's thread_limit from CG root
5762  if (thr->th.th_cg_roots)
5763  thr->th.th_current_task->td_icvs.thread_limit =
5764  thr->th.th_cg_roots->cg_thread_limit;
5765  }
5766  }
5767  }
5768 
5769  KMP_MB();
5770 }
5771 
5772 /* reap the team. destroy it, reclaim all its resources and free its memory */
5773 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5774  kmp_team_t *next_pool = team->t.t_next_pool;
5775 
5776  KMP_DEBUG_ASSERT(team);
5777  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5778  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5779  KMP_DEBUG_ASSERT(team->t.t_threads);
5780  KMP_DEBUG_ASSERT(team->t.t_argv);
5781 
5782  /* TODO clean the threads that are a part of this? */
5783 
5784  /* free stuff */
5785  __kmp_free_team_arrays(team);
5786  if (team->t.t_argv != &team->t.t_inline_argv[0])
5787  __kmp_free((void *)team->t.t_argv);
5788  __kmp_free(team);
5789 
5790  KMP_MB();
5791  return next_pool;
5792 }
5793 
5794 // Free the thread. Don't reap it, just place it on the pool of available
5795 // threads.
5796 //
5797 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5798 // binding for the affinity mechanism to be useful.
5799 //
5800 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5801 // However, we want to avoid a potential performance problem by always
5802 // scanning through the list to find the correct point at which to insert
5803 // the thread (potential N**2 behavior). To do this we keep track of the
5804 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5805 // With single-level parallelism, threads will always be added to the tail
5806 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5807 // parallelism, all bets are off and we may need to scan through the entire
5808 // free list.
5809 //
5810 // This change also has a potentially large performance benefit, for some
5811 // applications. Previously, as threads were freed from the hot team, they
5812 // would be placed back on the free list in inverse order. If the hot team
5813  // grew back to its original size, then the freed threads would be placed
5814 // back on the hot team in reverse order. This could cause bad cache
5815 // locality problems on programs where the size of the hot team regularly
5816  // grew and shrank.
5817 //
5818 // Now, for single-level parallelism, the OMP tid is always == gtid.
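//
// For example (illustrative): if the pool currently holds threads with gtids
// 2 -> 3 -> 5 and the thread with gtid 4 is freed, it is inserted between 3
// and 5 and __kmp_thread_pool_insert_pt is left pointing at the new element;
// freeing gtid 6 afterwards then starts the scan from that point instead of
// from the head of the list.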
5819 void __kmp_free_thread(kmp_info_t *this_th) {
5820  int gtid;
5821  kmp_info_t **scan;
5822 
5823  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5824  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5825 
5826  KMP_DEBUG_ASSERT(this_th);
5827 
5828  // When moving a thread to the pool, switch it to wait on its own b_go flag,
5829  // and reset its team pointer to NULL (uninitialized).
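// (A pooled thread has no team, and therefore no parent barrier location to
// wait on, so it must spin or sleep on its own b_go flag until it is handed
// to a new team.)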
5830  int b;
5831  kmp_balign_t *balign = this_th->th.th_bar;
5832  for (b = 0; b < bs_last_barrier; ++b) {
5833  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5834  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5835  balign[b].bb.team = NULL;
5836  balign[b].bb.leaf_kids = 0;
5837  }
5838  this_th->th.th_task_state = 0;
5839  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5840 
5841  /* put thread back on the free pool */
5842  TCW_PTR(this_th->th.th_team, NULL);
5843  TCW_PTR(this_th->th.th_root, NULL);
5844  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5845 
5846  while (this_th->th.th_cg_roots) {
5847  this_th->th.th_cg_roots->cg_nthreads--;
5848  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5849  " %p of thread %p to %d\n",
5850  this_th, this_th->th.th_cg_roots,
5851  this_th->th.th_cg_roots->cg_root,
5852  this_th->th.th_cg_roots->cg_nthreads));
5853  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5854  if (tmp->cg_root == this_th) { // Thread is a cg_root
5855  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5856  KA_TRACE(
5857  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5858  this_th->th.th_cg_roots = tmp->up;
5859  __kmp_free(tmp);
5860  } else { // Worker thread
5861  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5862  __kmp_free(tmp);
5863  }
5864  this_th->th.th_cg_roots = NULL;
5865  break;
5866  }
5867  }
5868 
5869  /* If the implicit task assigned to this thread can be used by other threads,
5870  * multiple threads can share the data and try to free the task at
5871  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5872  * with higher probability when the hot team is disabled, but it can occur even
5873  * when the hot team is enabled. */
5874  __kmp_free_implicit_task(this_th);
5875  this_th->th.th_current_task = NULL;
5876 
5877  // If the __kmp_thread_pool_insert_pt is already past the new insert
5878  // point, then we need to re-scan the entire list.
5879  gtid = this_th->th.th_info.ds.ds_gtid;
5880  if (__kmp_thread_pool_insert_pt != NULL) {
5881  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5882  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5883  __kmp_thread_pool_insert_pt = NULL;
5884  }
5885  }
5886 
5887  // Scan down the list to find the place to insert the thread.
5888  // scan is the address of a link in the list, possibly the address of
5889  // __kmp_thread_pool itself.
5890  //
5891  // In the absence of nested parallelism, the for loop will have 0 iterations.
5892  if (__kmp_thread_pool_insert_pt != NULL) {
5893  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5894  } else {
5895  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5896  }
5897  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5898  scan = &((*scan)->th.th_next_pool))
5899  ;
5900 
5901  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5902  // to its address.
5903  TCW_PTR(this_th->th.th_next_pool, *scan);
5904  __kmp_thread_pool_insert_pt = *scan = this_th;
5905  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5906  (this_th->th.th_info.ds.ds_gtid <
5907  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5908  TCW_4(this_th->th.th_in_pool, TRUE);
5909  __kmp_suspend_initialize_thread(this_th);
5910  __kmp_lock_suspend_mx(this_th);
5911  if (this_th->th.th_active == TRUE) {
5912  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5913  this_th->th.th_active_in_pool = TRUE;
5914  }
5915 #if KMP_DEBUG
5916  else {
5917  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5918  }
5919 #endif
5920  __kmp_unlock_suspend_mx(this_th);
5921 
5922  TCW_4(__kmp_nth, __kmp_nth - 1);
5923 
5924 #ifdef KMP_ADJUST_BLOCKTIME
5925  /* Adjust blocktime back to user setting or default if necessary */
5926  /* Middle initialization might never have occurred */
5927  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5928  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5929  if (__kmp_nth <= __kmp_avail_proc) {
5930  __kmp_zero_bt = FALSE;
5931  }
5932  }
5933 #endif /* KMP_ADJUST_BLOCKTIME */
5934 
5935  KMP_MB();
5936 }
5937 
5938 /* ------------------------------------------------------------------------ */
5939 
5940 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5941 #if OMP_PROFILING_SUPPORT
5942  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5943  // TODO: add a configuration option for time granularity
5944  if (ProfileTraceFile)
5945  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5946 #endif
5947 
5948  int gtid = this_thr->th.th_info.ds.ds_gtid;
5949  /* void *stack_data;*/
5950  kmp_team_t **volatile pteam;
5951 
5952  KMP_MB();
5953  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5954 
5955  if (__kmp_env_consistency_check) {
5956  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5957  }
5958 
5959 #if OMPD_SUPPORT
5960  if (ompd_state & OMPD_ENABLE_BP)
5961  ompd_bp_thread_begin();
5962 #endif
5963 
5964 #if OMPT_SUPPORT
5965  ompt_data_t *thread_data = nullptr;
5966  if (ompt_enabled.enabled) {
5967  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5968  *thread_data = ompt_data_none;
5969 
5970  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5971  this_thr->th.ompt_thread_info.wait_id = 0;
5972  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5973  this_thr->th.ompt_thread_info.parallel_flags = 0;
5974  if (ompt_enabled.ompt_callback_thread_begin) {
5975  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5976  ompt_thread_worker, thread_data);
5977  }
5978  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5979  }
5980 #endif
5981 
5982  /* This is the place where threads wait for work */
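// In outline: the worker parks at the fork barrier until the primary thread
// releases it with a team and a microtask, runs the microtask through
// t_invoke, passes the join barrier, and loops until library shutdown sets
// __kmp_global.g.g_done.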
5983  while (!TCR_4(__kmp_global.g.g_done)) {
5984  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5985  KMP_MB();
5986 
5987  /* wait for work to do */
5988  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5989 
5990  /* No tid yet since not part of a team */
5991  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5992 
5993 #if OMPT_SUPPORT
5994  if (ompt_enabled.enabled) {
5995  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5996  }
5997 #endif
5998 
5999  pteam = &this_thr->th.th_team;
6000 
6001  /* have we been allocated? */
6002  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6003  /* we were just woken up, so run our new task */
6004  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6005  int rc;
6006  KA_TRACE(20,
6007  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6008  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6009  (*pteam)->t.t_pkfn));
6010 
6011  updateHWFPControl(*pteam);
6012 
6013 #if OMPT_SUPPORT
6014  if (ompt_enabled.enabled) {
6015  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6016  }
6017 #endif
6018 
6019  rc = (*pteam)->t.t_invoke(gtid);
6020  KMP_ASSERT(rc);
6021 
6022  KMP_MB();
6023  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6024  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6025  (*pteam)->t.t_pkfn));
6026  }
6027 #if OMPT_SUPPORT
6028  if (ompt_enabled.enabled) {
6029  /* no frame set while outside task */
6030  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6031 
6032  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6033  }
6034 #endif
6035  /* join barrier after parallel region */
6036  __kmp_join_barrier(gtid);
6037  }
6038  }
6039  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6040 
6041 #if OMPD_SUPPORT
6042  if (ompd_state & OMPD_ENABLE_BP)
6043  ompd_bp_thread_end();
6044 #endif
6045 
6046 #if OMPT_SUPPORT
6047  if (ompt_enabled.ompt_callback_thread_end) {
6048  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6049  }
6050 #endif
6051 
6052  this_thr->th.th_task_team = NULL;
6053  /* run the destructors for the threadprivate data for this thread */
6054  __kmp_common_destroy_gtid(gtid);
6055 
6056  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6057  KMP_MB();
6058 
6059 #if OMP_PROFILING_SUPPORT
6060  llvm::timeTraceProfilerFinishThread();
6061 #endif
6062  return this_thr;
6063 }
6064 
6065 /* ------------------------------------------------------------------------ */
6066 
6067 void __kmp_internal_end_dest(void *specific_gtid) {
6068  // Make sure no significant bits are lost
6069  int gtid;
6070  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6071 
6072  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6073  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
6074  * because 0 is reserved for the nothing-stored case */
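  /* Illustrative example (values made up): __kmp_gtid_set_specific(5) stores 6
     in the thread-local slot; reading it back here and subtracting 1 recovers
     gtid 5, while a raw value of 0 decodes to -1, i.e. "nothing stored". */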
6075 
6076  __kmp_internal_end_thread(gtid);
6077 }
6078 
6079 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6080 
6081 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6082  __kmp_internal_end_atexit();
6083 }
6084 
6085 #endif
6086 
6087 /* [Windows] josh: when the atexit handler is called, there may still be more
6088  than one thread alive */
6089 void __kmp_internal_end_atexit(void) {
6090  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6091  /* [Windows]
6092  josh: ideally, we want to completely shutdown the library in this atexit
6093  handler, but stat code that depends on thread specific data for gtid fails
6094  because that data becomes unavailable at some point during the shutdown, so
6095  we call __kmp_internal_end_thread instead. We should eventually remove the
6096  dependency on __kmp_get_specific_gtid in the stat code and use
6097  __kmp_internal_end_library to cleanly shutdown the library.
6098 
6099  // TODO: Can some of this comment about GVS be removed?
6100  I suspect that the offending stat code is executed when the calling thread
6101  tries to clean up a dead root thread's data structures, resulting in GVS
6102  code trying to close the GVS structures for that thread, but since the stat
6103  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6104  the calling thread is cleaning up itself instead of another thread, it gets
6105  confused. This happens because allowing a thread to unregister and clean up
6106  another thread is a recent modification for addressing an issue.
6107  Based on the current design (20050722), a thread may end up
6108  trying to unregister another thread only if thread death does not trigger
6109  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6110  thread specific data destructor function to detect thread death. For
6111  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6112  is nothing. Thus, the workaround is applicable only for Windows static
6113  stat library. */
6114  __kmp_internal_end_library(-1);
6115 #if KMP_OS_WINDOWS
6116  __kmp_close_console();
6117 #endif
6118 }
6119 
6120 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6121  // It is assumed __kmp_forkjoin_lock is acquired.
6122 
6123  int gtid;
6124 
6125  KMP_DEBUG_ASSERT(thread != NULL);
6126 
6127  gtid = thread->th.th_info.ds.ds_gtid;
6128 
6129  if (!is_root) {
6130  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6131  /* Assume the threads are at the fork barrier here */
6132  KA_TRACE(
6133  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6134  gtid));
6135  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6136  while (
6137  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6138  KMP_CPU_PAUSE();
6139  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6140  } else {
6141  /* Need release fence here to prevent seg faults for tree forkjoin
6142  barrier (GEH) */
6143  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6144  thread);
6145  __kmp_release_64(&flag);
6146  }
6147  }
6148 
6149  // Terminate OS thread.
6150  __kmp_reap_worker(thread);
6151 
6152  // The thread was killed asynchronously. If it was actively
6153  // spinning in the thread pool, decrement the global count.
6154  //
6155  // There is a small timing hole here - if the worker thread was just waking
6156  // up after sleeping in the pool and had reset its th_active_in_pool flag but
6157  // had not yet decremented the global counter __kmp_thread_pool_active_nth, then
6158  // the global counter might not get updated.
6159  //
6160  // Currently, this can only happen as the library is unloaded,
6161  // so there are no harmful side effects.
6162  if (thread->th.th_active_in_pool) {
6163  thread->th.th_active_in_pool = FALSE;
6164  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6165  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6166  }
6167  }
6168 
6169  __kmp_free_implicit_task(thread);
6170 
6171 // Free the fast memory for tasking
6172 #if USE_FAST_MEMORY
6173  __kmp_free_fast_memory(thread);
6174 #endif /* USE_FAST_MEMORY */
6175 
6176  __kmp_suspend_uninitialize_thread(thread);
6177 
6178  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6179  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6180 
6181  --__kmp_all_nth;
6182  // __kmp_nth was decremented when the thread was added to the pool.
6183 
6184 #ifdef KMP_ADJUST_BLOCKTIME
6185  /* Adjust blocktime back to user setting or default if necessary */
6186  /* Middle initialization might never have occurred */
6187  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6188  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6189  if (__kmp_nth <= __kmp_avail_proc) {
6190  __kmp_zero_bt = FALSE;
6191  }
6192  }
6193 #endif /* KMP_ADJUST_BLOCKTIME */
6194 
6195  /* free the memory being used */
6196  if (__kmp_env_consistency_check) {
6197  if (thread->th.th_cons) {
6198  __kmp_free_cons_stack(thread->th.th_cons);
6199  thread->th.th_cons = NULL;
6200  }
6201  }
6202 
6203  if (thread->th.th_pri_common != NULL) {
6204  __kmp_free(thread->th.th_pri_common);
6205  thread->th.th_pri_common = NULL;
6206  }
6207 
6208  if (thread->th.th_task_state_memo_stack != NULL) {
6209  __kmp_free(thread->th.th_task_state_memo_stack);
6210  thread->th.th_task_state_memo_stack = NULL;
6211  }
6212 
6213 #if KMP_USE_BGET
6214  if (thread->th.th_local.bget_data != NULL) {
6215  __kmp_finalize_bget(thread);
6216  }
6217 #endif
6218 
6219 #if KMP_AFFINITY_SUPPORTED
6220  if (thread->th.th_affin_mask != NULL) {
6221  KMP_CPU_FREE(thread->th.th_affin_mask);
6222  thread->th.th_affin_mask = NULL;
6223  }
6224 #endif /* KMP_AFFINITY_SUPPORTED */
6225 
6226 #if KMP_USE_HIER_SCHED
6227  if (thread->th.th_hier_bar_data != NULL) {
6228  __kmp_free(thread->th.th_hier_bar_data);
6229  thread->th.th_hier_bar_data = NULL;
6230  }
6231 #endif
6232 
6233  __kmp_reap_team(thread->th.th_serial_team);
6234  thread->th.th_serial_team = NULL;
6235  __kmp_free(thread);
6236 
6237  KMP_MB();
6238 
6239 } // __kmp_reap_thread
6240 
6241 static void __kmp_itthash_clean(kmp_info_t *th) {
6242 #if USE_ITT_NOTIFY
6243  if (__kmp_itt_region_domains.count > 0) {
6244  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6245  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6246  while (bucket) {
6247  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6248  __kmp_thread_free(th, bucket);
6249  bucket = next;
6250  }
6251  }
6252  }
6253  if (__kmp_itt_barrier_domains.count > 0) {
6254  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6255  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6256  while (bucket) {
6257  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6258  __kmp_thread_free(th, bucket);
6259  bucket = next;
6260  }
6261  }
6262  }
6263 #endif
6264 }
6265 
6266 static void __kmp_internal_end(void) {
6267  int i;
6268 
6269  /* First, unregister the library */
6270  __kmp_unregister_library();
6271 
6272 #if KMP_OS_WINDOWS
6273  /* In Win static library, we can't tell when a root actually dies, so we
6274  reclaim the data structures for any root threads that have died but not
6275  unregistered themselves, in order to shut down cleanly.
6276  In Win dynamic library we also can't tell when a thread dies. */
6277  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6278 // dead roots
6279 #endif
6280 
6281  for (i = 0; i < __kmp_threads_capacity; i++)
6282  if (__kmp_root[i])
6283  if (__kmp_root[i]->r.r_active)
6284  break;
6285  KMP_MB(); /* Flush all pending memory write invalidates. */
6286  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6287 
6288  if (i < __kmp_threads_capacity) {
6289 #if KMP_USE_MONITOR
6290  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6291  KMP_MB(); /* Flush all pending memory write invalidates. */
6292 
6293  // Need to check that monitor was initialized before reaping it. If we are
6294  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6295  // __kmp_monitor will appear to contain valid data, but it is only valid in
6296  // the parent process, not the child.
6297  // New behavior (201008): instead of keying off of the flag
6298  // __kmp_init_parallel, the monitor thread creation is keyed off
6299  // of the new flag __kmp_init_monitor.
6300  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6301  if (TCR_4(__kmp_init_monitor)) {
6302  __kmp_reap_monitor(&__kmp_monitor);
6303  TCW_4(__kmp_init_monitor, 0);
6304  }
6305  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6306  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6307 #endif // KMP_USE_MONITOR
6308  } else {
6309 /* TODO move this to cleanup code */
6310 #ifdef KMP_DEBUG
6311  /* make sure that everything has properly ended */
6312  for (i = 0; i < __kmp_threads_capacity; i++) {
6313  if (__kmp_root[i]) {
6314  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6315  // there can be uber threads alive here
6316  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6317  }
6318  }
6319 #endif
6320 
6321  KMP_MB();
6322 
6323  // Reap the worker threads.
6324  // This is valid for now, but be careful if threads are reaped sooner.
6325  while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool.
6326  // Get the next thread from the pool.
6327  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6328  __kmp_thread_pool = thread->th.th_next_pool;
6329  // Reap it.
6330  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6331  thread->th.th_next_pool = NULL;
6332  thread->th.th_in_pool = FALSE;
6333  __kmp_reap_thread(thread, 0);
6334  }
6335  __kmp_thread_pool_insert_pt = NULL;
6336 
6337  // Reap teams.
6338  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6339  // Get the next team from the pool.
6340  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6341  __kmp_team_pool = team->t.t_next_pool;
6342  // Reap it.
6343  team->t.t_next_pool = NULL;
6344  __kmp_reap_team(team);
6345  }
6346 
6347  __kmp_reap_task_teams();
6348 
6349 #if KMP_OS_UNIX
6350  // Threads that are not reaped should not access any resources since they
6351  // are going to be deallocated soon, so the shutdown sequence should wait
6352  // until all threads either exit the final spin-waiting loop or begin
6353  // sleeping after the given blocktime.
6354  for (i = 0; i < __kmp_threads_capacity; i++) {
6355  kmp_info_t *thr = __kmp_threads[i];
6356  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6357  KMP_CPU_PAUSE();
6358  }
6359 #endif
6360 
6361  for (i = 0; i < __kmp_threads_capacity; ++i) {
6362  // TBD: Add some checking...
6363  // Something like KMP_DEBUG_ASSERT( __kmp_threads[ i ] == NULL );
6364  }
6365 
6366  /* Make sure all threadprivate destructors get run by joining with all
6367  worker threads before resetting this flag */
6368  TCW_SYNC_4(__kmp_init_common, FALSE);
6369 
6370  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6371  KMP_MB();
6372 
6373 #if KMP_USE_MONITOR
6374  // See note above: One of the possible fixes for CQ138434 / CQ140126
6375  //
6376  // FIXME: push both code fragments down and CSE them?
6377  // push them into __kmp_cleanup() ?
6378  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6379  if (TCR_4(__kmp_init_monitor)) {
6380  __kmp_reap_monitor(&__kmp_monitor);
6381  TCW_4(__kmp_init_monitor, 0);
6382  }
6383  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6384  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6385 #endif
6386  } /* else !__kmp_global.t_active */
6387  TCW_4(__kmp_init_gtid, FALSE);
6388  KMP_MB(); /* Flush all pending memory write invalidates. */
6389 
6390  __kmp_cleanup();
6391 #if OMPT_SUPPORT
6392  ompt_fini();
6393 #endif
6394 }
6395 
6396 void __kmp_internal_end_library(int gtid_req) {
6397  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6398  /* this shouldn't be a race condition because __kmp_internal_end() is the
6399  only place to clear __kmp_init_serial */
6400  /* we'll check this later too, after we get the lock */
6401  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6402  // redundant, because the next check will work in any case.
6403  if (__kmp_global.g.g_abort) {
6404  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6405  /* TODO abort? */
6406  return;
6407  }
6408  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6409  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6410  return;
6411  }
6412 
6413  // If hidden helper team has been initialized, we need to deinit it
6414  if (TCR_4(__kmp_init_hidden_helper) &&
6415  !TCR_4(__kmp_hidden_helper_team_done)) {
6416  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6417  // First release the main thread to let it continue its work
6418  __kmp_hidden_helper_main_thread_release();
6419  // Wait until the hidden helper team has been destroyed
6420  __kmp_hidden_helper_threads_deinitz_wait();
6421  }
6422 
6423  KMP_MB(); /* Flush all pending memory write invalidates. */
6424  /* find out who we are and what we should do */
6425  {
6426  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6427  KA_TRACE(
6428  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6429  if (gtid == KMP_GTID_SHUTDOWN) {
6430  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6431  "already shutdown\n"));
6432  return;
6433  } else if (gtid == KMP_GTID_MONITOR) {
6434  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6435  "registered, or system shutdown\n"));
6436  return;
6437  } else if (gtid == KMP_GTID_DNE) {
6438  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6439  "shutdown\n"));
6440  /* we don't know who we are, but we may still shutdown the library */
6441  } else if (KMP_UBER_GTID(gtid)) {
6442  /* unregister ourselves as an uber thread. gtid is no longer valid */
6443  if (__kmp_root[gtid]->r.r_active) {
6444  __kmp_global.g.g_abort = -1;
6445  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6446  __kmp_unregister_library();
6447  KA_TRACE(10,
6448  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6449  gtid));
6450  return;
6451  } else {
6452  __kmp_itthash_clean(__kmp_threads[gtid]);
6453  KA_TRACE(
6454  10,
6455  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6456  __kmp_unregister_root_current_thread(gtid);
6457  }
6458  } else {
6459 /* worker threads may call this function through the atexit handler, if they
6460  * call exit() */
6461 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6462  TODO: do a thorough shutdown instead */
6463 #ifdef DUMP_DEBUG_ON_EXIT
6464  if (__kmp_debug_buf)
6465  __kmp_dump_debug_buffer();
6466 #endif
6467  // An unregister-library call was added here when we switched to shared memory
6468  // on Linux; without it, lots of stale files would be left in /dev/shm.
6469  // Clean up the shared memory file before exiting.
6470  __kmp_unregister_library();
6471  return;
6472  }
6473  }
6474  /* synchronize the termination process */
6475  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6476 
6477  /* have we already finished */
6478  if (__kmp_global.g.g_abort) {
6479  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6480  /* TODO abort? */
6481  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6482  return;
6483  }
6484  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6485  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6486  return;
6487  }
6488 
6489  /* We need this lock to enforce mutual exclusion between this reading of
6490  __kmp_threads_capacity and the writing by __kmp_register_root.
6491  Alternatively, we can use a counter of roots that is atomically updated by
6492  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6493  __kmp_internal_end_*. */
6494  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6495 
6496  /* now we can safely conduct the actual termination */
6497  __kmp_internal_end();
6498 
6499  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6500  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6501 
6502  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6503 
6504 #ifdef DUMP_DEBUG_ON_EXIT
6505  if (__kmp_debug_buf)
6506  __kmp_dump_debug_buffer();
6507 #endif
6508 
6509 #if KMP_OS_WINDOWS
6510  __kmp_close_console();
6511 #endif
6512 
6513  __kmp_fini_allocator();
6514 
6515 } // __kmp_internal_end_library
6516 
6517 void __kmp_internal_end_thread(int gtid_req) {
6518  int i;
6519 
6520  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6521  /* this shouldn't be a race condition because __kmp_internal_end() is the
6522  * only place to clear __kmp_init_serial */
6523  /* we'll check this later too, after we get the lock */
6524  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6525  // redundant, because the next check will work in any case.
6526  if (__kmp_global.g.g_abort) {
6527  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6528  /* TODO abort? */
6529  return;
6530  }
6531  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6532  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6533  return;
6534  }
6535 
6536  // If hidden helper team has been initialized, we need to deinit it
6537  if (TCR_4(__kmp_init_hidden_helper) &&
6538  !TCR_4(__kmp_hidden_helper_team_done)) {
6539  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6540  // First release the main thread to let it continue its work
6541  __kmp_hidden_helper_main_thread_release();
6542  // Wait until the hidden helper team has been destroyed
6543  __kmp_hidden_helper_threads_deinitz_wait();
6544  }
6545 
6546  KMP_MB(); /* Flush all pending memory write invalidates. */
6547 
6548  /* find out who we are and what we should do */
6549  {
6550  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6551  KA_TRACE(10,
6552  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6553  if (gtid == KMP_GTID_SHUTDOWN) {
6554  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6555  "already shutdown\n"));
6556  return;
6557  } else if (gtid == KMP_GTID_MONITOR) {
6558  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6559  "registered, or system shutdown\n"));
6560  return;
6561  } else if (gtid == KMP_GTID_DNE) {
6562  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6563  "shutdown\n"));
6564  return;
6565  /* we don't know who we are */
6566  } else if (KMP_UBER_GTID(gtid)) {
6567  /* unregister ourselves as an uber thread. gtid is no longer valid */
6568  if (__kmp_root[gtid]->r.r_active) {
6569  __kmp_global.g.g_abort = -1;
6570  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6571  KA_TRACE(10,
6572  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6573  gtid));
6574  return;
6575  } else {
6576  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6577  gtid));
6578  __kmp_unregister_root_current_thread(gtid);
6579  }
6580  } else {
6581  /* just a worker thread, let's leave */
6582  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6583 
6584  if (gtid >= 0) {
6585  __kmp_threads[gtid]->th.th_task_team = NULL;
6586  }
6587 
6588  KA_TRACE(10,
6589  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6590  gtid));
6591  return;
6592  }
6593  }
6594 #if KMP_DYNAMIC_LIB
6595  if (__kmp_pause_status != kmp_hard_paused)
6596  // AC: let's not shut down the dynamic library at the exit of an uber thread;
6597  // it is better to shut down later, in the library destructor.
6598  {
6599  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6600  return;
6601  }
6602 #endif
6603  /* synchronize the termination process */
6604  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6605 
6606  /* have we already finished */
6607  if (__kmp_global.g.g_abort) {
6608  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6609  /* TODO abort? */
6610  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6611  return;
6612  }
6613  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6614  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6615  return;
6616  }
6617 
6618  /* We need this lock to enforce mutual exclusion between this reading of
6619  __kmp_threads_capacity and the writing by __kmp_register_root.
6620  Alternatively, we can use a counter of roots that is atomically updated by
6621  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6622  __kmp_internal_end_*. */
6623 
6624  /* should we finish the run-time? are all siblings done? */
6625  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6626 
6627  for (i = 0; i < __kmp_threads_capacity; ++i) {
6628  if (KMP_UBER_GTID(i)) {
6629  KA_TRACE(
6630  10,
6631  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6632  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6633  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6634  return;
6635  }
6636  }
6637 
6638  /* now we can safely conduct the actual termination */
6639 
6640  __kmp_internal_end();
6641 
6642  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6643  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6644 
6645  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6646 
6647 #ifdef DUMP_DEBUG_ON_EXIT
6648  if (__kmp_debug_buf)
6649  __kmp_dump_debug_buffer();
6650 #endif
6651 } // __kmp_internal_end_thread
6652 
6653 // -----------------------------------------------------------------------------
6654 // Library registration stuff.
6655 
6656 static long __kmp_registration_flag = 0;
6657 // Random value used to indicate library initialization.
6658 static char *__kmp_registration_str = NULL;
6659 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6660 
6661 static inline char *__kmp_reg_status_name() {
6662 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6663  each thread. If registration and unregistration go in different threads
6664  (omp_misc_other_root_exit.cpp test case), the name of the registered_lib_env
6665  env var cannot be found, because the name will contain a different pid. */
6666 // macOS* complains about the name being too long with the additional getuid()
6667 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6668  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6669  (int)getuid());
6670 #else
6671  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6672 #endif
6673 } // __kmp_reg_status_name
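// For example (illustrative pid/uid values): a dynamic-library build on Linux* OS
// with pid 4242 and uid 1000 yields the name "__KMP_REGISTERED_LIB_4242_1000",
// while the other configurations yield "__KMP_REGISTERED_LIB_4242".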
6674 
6675 void __kmp_register_library_startup(void) {
6676 
6677  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6678  int done = 0;
6679  union {
6680  double dtime;
6681  long ltime;
6682  } time;
6683 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6684  __kmp_initialize_system_tick();
6685 #endif
6686  __kmp_read_system_time(&time.dtime);
6687  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6688  __kmp_registration_str =
6689  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6690  __kmp_registration_flag, KMP_LIBRARY_FILE);
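  // The value combines the address of __kmp_registration_flag, the flag itself
  // (a 0xCAFE tag plus the low 16 bits of the current time), and the library
  // file name, e.g. "0x7f3c5a2b1020-cafe1234-libomp.so" (illustrative values).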
6691 
6692  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6693  __kmp_registration_str));
6694 
6695  while (!done) {
6696 
6697  char *value = NULL; // Actual value of the environment variable.
6698 
6699 #if defined(KMP_USE_SHM)
6700  char *shm_name = __kmp_str_format("/%s", name);
6701  int shm_preexist = 0;
6702  char *data1;
6703  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
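  // O_CREAT | O_EXCL makes shm_open() fail with EEXIST if the segment already
  // exists, which is how a previously registered copy of the runtime is detected.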
6704  if ((fd1 == -1) && (errno == EEXIST)) {
6705  // file didn't open because it already exists.
6706  // try opening existing file
6707  fd1 = shm_open(shm_name, O_RDWR, 0666);
6708  if (fd1 == -1) { // file didn't open
6709  // error out here
6710  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6711  __kmp_msg_null);
6712  } else {
6713  // able to open existing file
6714  shm_preexist = 1;
6715  }
6716  } else if (fd1 == -1) { // SHM didn't open due to an error other than
6717  // "already exists"; error out here.
6719  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6720  __kmp_msg_null);
6721  }
6722  if (shm_preexist == 0) {
6723  // we created the SHM; now set its size
6724  if (ftruncate(fd1, SHM_SIZE) == -1) {
6725  // an error occurred setting the size
6726  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6727  KMP_ERR(errno), __kmp_msg_null);
6728  }
6729  }
6730  data1 =
6731  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6732  if (data1 == MAP_FAILED) {
6733  // failed to map shared memory
6734  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6735  __kmp_msg_null);
6736  }
6737  if (shm_preexist == 0) { // set data to SHM, set value
6738  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6739  }
6740  // Read value from either what we just wrote or existing file.
6741  value = __kmp_str_format("%s", data1); // read value from SHM
6742  munmap(data1, SHM_SIZE);
6743  close(fd1);
6744 #else // Windows and unix with static library
6745  // Set the environment variable, but do not overwrite it if it already exists.
6746  __kmp_env_set(name, __kmp_registration_str, 0);
6747  // read value to see if it got set
6748  value = __kmp_env_get(name);
6749 #endif
6750 
6751  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6752  done = 1; // Ok, environment variable set successfully, exit the loop.
6753  } else {
6754  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6755  // Check whether it is alive or dead.
6756  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6757  char *tail = value;
6758  char *flag_addr_str = NULL;
6759  char *flag_val_str = NULL;
6760  char const *file_name = NULL;
6761  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6762  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6763  file_name = tail;
6764  if (tail != NULL) {
6765  unsigned long *flag_addr = 0;
6766  unsigned long flag_val = 0;
6767  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6768  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6769  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6770  // First, check whether environment-encoded address is mapped into
6771  // addr space.
6772  // If so, dereference it to see if it still has the right value.
6773  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6774  neighbor = 1;
6775  } else {
6776  // If not, then we know the other copy of the library is no longer
6777  // running.
6778  neighbor = 2;
6779  }
6780  }
6781  }
6782  switch (neighbor) {
6783  case 0: // Cannot parse environment variable -- neighbor status unknown.
6784  // Assume it is the incompatible format of a future version of the
6785  // library. Assume the other library is alive.
6786  // WARN( ... ); // TODO: Issue a warning.
6787  file_name = "unknown library";
6788  KMP_FALLTHROUGH();
6789  // Attention! Falling through to the next case. That's intentional.
6790  case 1: { // Neighbor is alive.
6791  // Check it is allowed.
6792  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6793  if (!__kmp_str_match_true(duplicate_ok)) {
6794  // That's not allowed. Issue fatal error.
6795  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6796  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6797  }
6798  KMP_INTERNAL_FREE(duplicate_ok);
6799  __kmp_duplicate_library_ok = 1;
6800  done = 1; // Exit the loop.
6801  } break;
6802  case 2: { // Neighbor is dead.
6803 
6804 #if defined(KMP_USE_SHM)
6805  // close shared memory.
6806  shm_unlink(shm_name); // this removes file in /dev/shm
6807 #else
6808  // Clear the variable and try to register library again.
6809  __kmp_env_unset(name);
6810 #endif
6811  } break;
6812  default: {
6813  KMP_DEBUG_ASSERT(0);
6814  } break;
6815  }
6816  }
6817  KMP_INTERNAL_FREE((void *)value);
6818 #if defined(KMP_USE_SHM)
6819  KMP_INTERNAL_FREE((void *)shm_name);
6820 #endif
6821  } // while
6822  KMP_INTERNAL_FREE((void *)name);
6823 
6824 } // func __kmp_register_library_startup
6825 
6826 void __kmp_unregister_library(void) {
6827 
6828  char *name = __kmp_reg_status_name();
6829  char *value = NULL;
6830 
6831 #if defined(KMP_USE_SHM)
6832  char *shm_name = __kmp_str_format("/%s", name);
6833  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6834  if (fd1 == -1) {
6835  // file did not open. return.
6836  return;
6837  }
6838  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6839  if (data1 != MAP_FAILED) {
6840  value = __kmp_str_format("%s", data1); // read value from SHM
6841  munmap(data1, SHM_SIZE);
6842  }
6843  close(fd1);
6844 #else
6845  value = __kmp_env_get(name);
6846 #endif
6847 
6848  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6849  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6850  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6851 // Ok, this is our variable. Delete it.
6852 #if defined(KMP_USE_SHM)
6853  shm_unlink(shm_name); // this removes file in /dev/shm
6854 #else
6855  __kmp_env_unset(name);
6856 #endif
6857  }
6858 
6859 #if defined(KMP_USE_SHM)
6860  KMP_INTERNAL_FREE(shm_name);
6861 #endif
6862 
6863  KMP_INTERNAL_FREE(__kmp_registration_str);
6864  KMP_INTERNAL_FREE(value);
6865  KMP_INTERNAL_FREE(name);
6866 
6867  __kmp_registration_flag = 0;
6868  __kmp_registration_str = NULL;
6869 
6870 } // __kmp_unregister_library
6871 
6872 // End of Library registration stuff.
6873 // -----------------------------------------------------------------------------
6874 
6875 #if KMP_MIC_SUPPORTED
6876 
6877 static void __kmp_check_mic_type() {
6878  kmp_cpuid_t cpuid_state = {0};
6879  kmp_cpuid_t *cs_p = &cpuid_state;
6880  __kmp_x86_cpuid(1, 0, cs_p);
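  // CPUID leaf 1 encodes family in EAX[11:8], model in EAX[7:4] and extended
  // model in EAX[19:16]; the masks below match family 0x0B / model 1 (KNC) and
  // family 6 / model 0x57 (KNL).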
6881  // We don't support mic1 at the moment
6882  if ((cs_p->eax & 0xff0) == 0xB10) {
6883  __kmp_mic_type = mic2;
6884  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6885  __kmp_mic_type = mic3;
6886  } else {
6887  __kmp_mic_type = non_mic;
6888  }
6889 }
6890 
6891 #endif /* KMP_MIC_SUPPORTED */
6892 
6893 #if KMP_HAVE_UMWAIT
6894 static void __kmp_user_level_mwait_init() {
6895  struct kmp_cpuid buf;
6896  __kmp_x86_cpuid(7, 0, &buf);
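  // CPUID leaf 7, sub-leaf 0: ECX bit 5 is the WAITPKG feature bit
  // (umonitor/umwait/tpause), tested below.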
6897  __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6898  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6899  __kmp_umwait_enabled));
6900 }
6901 #elif KMP_HAVE_MWAIT
6902 #ifndef AT_INTELPHIUSERMWAIT
6903 // Spurious, non-existent value that should always fail to return anything.
6904 // Will be replaced with the correct value once it is known.
6905 #define AT_INTELPHIUSERMWAIT 10000
6906 #endif
6907 // The getauxval() function is available in RHEL7 and SLES12. If a system with an
6908 // earlier OS is used to build the RTL, we'll use the following internal
6909 // function when the entry is not found.
6910 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6911 unsigned long getauxval(unsigned long) { return 0; }
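// The weak definition above always returns 0, so on systems without a real
// getauxval() the check below effectively depends on KMP_USER_LEVEL_MWAIT alone.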
6912 
6913 static void __kmp_user_level_mwait_init() {
6914  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are available,
6915  // use them to find out whether user-level mwait is enabled. Otherwise, forcibly
6916  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
6917  // KMP_USER_LEVEL_MWAIT was set to TRUE.
6918  if (__kmp_mic_type == mic3) {
6919  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6920  if ((res & 0x1) || __kmp_user_level_mwait) {
6921  __kmp_mwait_enabled = TRUE;
6922  if (__kmp_user_level_mwait) {
6923  KMP_INFORM(EnvMwaitWarn);
6924  }
6925  } else {
6926  __kmp_mwait_enabled = FALSE;
6927  }
6928  }
6929  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6930  "__kmp_mwait_enabled = %d\n",
6931  __kmp_mic_type, __kmp_mwait_enabled));
6932 }
6933 #endif /* KMP_HAVE_UMWAIT */
6934 
6935 static void __kmp_do_serial_initialize(void) {
6936  int i, gtid;
6937  size_t size;
6938 
6939  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6940 
6941  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6942  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6943  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6944  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6945  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6946 
6947 #if OMPT_SUPPORT
6948  ompt_pre_init();
6949 #endif
6950 #if OMPD_SUPPORT
6951  __kmp_env_dump();
6952  ompd_init();
6953 #endif
6954 
6955  __kmp_validate_locks();
6956 
6957  /* Initialize internal memory allocator */
6958  __kmp_init_allocator();
6959 
6960  /* Register the library startup via an environment variable and check to see
6961  whether another copy of the library is already registered. */
6962 
6963  __kmp_register_library_startup();
6964 
6965  /* TODO reinitialization of library */
6966  if (TCR_4(__kmp_global.g.g_done)) {
6967  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6968  }
6969 
6970  __kmp_global.g.g_abort = 0;
6971  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6972 
6973 /* initialize the locks */
6974 #if KMP_USE_ADAPTIVE_LOCKS
6975 #if KMP_DEBUG_ADAPTIVE_LOCKS
6976  __kmp_init_speculative_stats();
6977 #endif
6978 #endif
6979 #if KMP_STATS_ENABLED
6980  __kmp_stats_init();
6981 #endif
6982  __kmp_init_lock(&__kmp_global_lock);
6983  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6984  __kmp_init_lock(&__kmp_debug_lock);
6985  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6986  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6987  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6988  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6989  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6990  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6991  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6992  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6993  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6994  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6995  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6996  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6997  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6998  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6999  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7000 #if KMP_USE_MONITOR
7001  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7002 #endif
7003  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7004 
7005  /* conduct initialization and initial setup of configuration */
7006 
7007  __kmp_runtime_initialize();
7008 
7009 #if KMP_MIC_SUPPORTED
7010  __kmp_check_mic_type();
7011 #endif
7012 
7013 // Some global variable initialization moved here from kmp_env_initialize()
7014 #ifdef KMP_DEBUG
7015  kmp_diag = 0;
7016 #endif
7017  __kmp_abort_delay = 0;
7018 
7019  // From __kmp_init_dflt_team_nth()
7020  /* assume the entire machine will be used */
7021  __kmp_dflt_team_nth_ub = __kmp_xproc;
7022  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7023  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7024  }
7025  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7026  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7027  }
7028  __kmp_max_nth = __kmp_sys_max_nth;
7029  __kmp_cg_max_nth = __kmp_sys_max_nth;
7030  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7031  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7032  __kmp_teams_max_nth = __kmp_sys_max_nth;
7033  }
7034 
7035  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7036  // part
7037  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7038 #if KMP_USE_MONITOR
7039  __kmp_monitor_wakeups =
7040  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7041  __kmp_bt_intervals =
7042  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7043 #endif
7044  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7045  __kmp_library = library_throughput;
7046  // From KMP_SCHEDULE initialization
7047  __kmp_static = kmp_sch_static_balanced;
7048 // AC: do not use analytical here, because it is non-monotonic
7049 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7050 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7051 // need to repeat assignment
7052 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7053 // bit control and barrier method control parts
7054 #if KMP_FAST_REDUCTION_BARRIER
7055 #define kmp_reduction_barrier_gather_bb ((int)1)
7056 #define kmp_reduction_barrier_release_bb ((int)1)
7057 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7058 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7059 #endif // KMP_FAST_REDUCTION_BARRIER
7060  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7061  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7062  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7063  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7064  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7065 #if KMP_FAST_REDUCTION_BARRIER
7066  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7067  // lin_64 ): hyper,1
7068  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7069  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7070  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7071  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7072  }
7073 #endif // KMP_FAST_REDUCTION_BARRIER
7074  }
7075 #if KMP_FAST_REDUCTION_BARRIER
7076 #undef kmp_reduction_barrier_release_pat
7077 #undef kmp_reduction_barrier_gather_pat
7078 #undef kmp_reduction_barrier_release_bb
7079 #undef kmp_reduction_barrier_gather_bb
7080 #endif // KMP_FAST_REDUCTION_BARRIER
7081 #if KMP_MIC_SUPPORTED
7082  if (__kmp_mic_type == mic2) { // KNC
7083  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7084  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7085  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7086  1; // forkjoin release
7087  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7088  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7089  }
7090 #if KMP_FAST_REDUCTION_BARRIER
7091  if (__kmp_mic_type == mic2) { // KNC
7092  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7093  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7094  }
7095 #endif // KMP_FAST_REDUCTION_BARRIER
7096 #endif // KMP_MIC_SUPPORTED
7097 
7098 // From KMP_CHECKS initialization
7099 #ifdef KMP_DEBUG
7100  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7101 #else
7102  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7103 #endif
7104 
7105  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7106  __kmp_foreign_tp = TRUE;
7107 
7108  __kmp_global.g.g_dynamic = FALSE;
7109  __kmp_global.g.g_dynamic_mode = dynamic_default;
7110 
7111  __kmp_init_nesting_mode();
7112 
7113  __kmp_env_initialize(NULL);
7114 
7115 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7116  __kmp_user_level_mwait_init();
7117 #endif
7118 // Print all messages in message catalog for testing purposes.
7119 #ifdef KMP_DEBUG
7120  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7121  if (__kmp_str_match_true(val)) {
7122  kmp_str_buf_t buffer;
7123  __kmp_str_buf_init(&buffer);
7124  __kmp_i18n_dump_catalog(&buffer);
7125  __kmp_printf("%s", buffer.str);
7126  __kmp_str_buf_free(&buffer);
7127  }
7128  __kmp_env_free(&val);
7129 #endif
7130 
7131  __kmp_threads_capacity =
7132  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7133  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7134  __kmp_tp_capacity = __kmp_default_tp_capacity(
7135  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7136 
7137  // If the library is shut down properly, both pools must be NULL. Just in
7138  // case, set them to NULL -- some memory may leak, but subsequent code will
7139  // work even if pools are not freed.
7140  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7141  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7142  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7143  __kmp_thread_pool = NULL;
7144  __kmp_thread_pool_insert_pt = NULL;
7145  __kmp_team_pool = NULL;
7146 
7147  /* Allocate all of the variable sized records */
7148  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7149  * expandable */
7150  /* Since allocation is cache-aligned, just add extra padding at the end */
7151  size =
7152  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7153  CACHE_LINE;
7154  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7155  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7156  sizeof(kmp_info_t *) * __kmp_threads_capacity);
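  // Both arrays live in a single cache-aligned allocation laid out as
  // [__kmp_threads_capacity kmp_info_t*][__kmp_threads_capacity kmp_root_t*]
  // followed by CACHE_LINE bytes of padding (see the size computation above).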
7157 
7158  /* init thread counts */
7159  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7160  0); // Asserts fail if the library is reinitializing and
7161  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7162  __kmp_all_nth = 0;
7163  __kmp_nth = 0;
7164 
7165  /* setup the uber master thread and hierarchy */
7166  gtid = __kmp_register_root(TRUE);
7167  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7168  KMP_ASSERT(KMP_UBER_GTID(gtid));
7169  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7170 
7171  KMP_MB(); /* Flush all pending memory write invalidates. */
7172 
7173  __kmp_common_initialize();
7174 
7175 #if KMP_OS_UNIX
7176  /* invoke the child fork handler */
7177  __kmp_register_atfork();
7178 #endif
7179 
7180 #if !KMP_DYNAMIC_LIB
7181  {
7182  /* Invoke the exit handler when the program finishes, only for static
7183  library. For dynamic library, we already have _fini and DllMain. */
7184  int rc = atexit(__kmp_internal_end_atexit);
7185  if (rc != 0) {
7186  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7187  __kmp_msg_null);
7188  }
7189  }
7190 #endif
7191 
7192 #if KMP_HANDLE_SIGNALS
7193 #if KMP_OS_UNIX
7194  /* NOTE: make sure that this is called before the user installs their own
7195  signal handlers so that the user handlers are called first. this way they
7196  can return false, not call our handler, avoid terminating the library, and
7197  continue execution where they left off. */
7198  __kmp_install_signals(FALSE);
7199 #endif /* KMP_OS_UNIX */
7200 #if KMP_OS_WINDOWS
7201  __kmp_install_signals(TRUE);
7202 #endif /* KMP_OS_WINDOWS */
7203 #endif
7204 
7205  /* we have finished the serial initialization */
7206  __kmp_init_counter++;
7207 
7208  __kmp_init_serial = TRUE;
7209 
7210  if (__kmp_settings) {
7211  __kmp_env_print();
7212  }
7213 
7214  if (__kmp_display_env || __kmp_display_env_verbose) {
7215  __kmp_env_print_2();
7216  }
7217 
7218 #if OMPT_SUPPORT
7219  ompt_post_init();
7220 #endif
7221 
7222  KMP_MB();
7223 
7224  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7225 }
7226 
7227 void __kmp_serial_initialize(void) {
7228  if (__kmp_init_serial) {
7229  return;
7230  }
7231  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7232  if (__kmp_init_serial) {
7233  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7234  return;
7235  }
7236  __kmp_do_serial_initialize();
7237  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7238 }
7239 
7240 static void __kmp_do_middle_initialize(void) {
7241  int i, j;
7242  int prev_dflt_team_nth;
7243 
7244  if (!__kmp_init_serial) {
7245  __kmp_do_serial_initialize();
7246  }
7247 
7248  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7249 
7250  // Save the previous value for the __kmp_dflt_team_nth so that
7251  // we can avoid some reinitialization if it hasn't changed.
7252  prev_dflt_team_nth = __kmp_dflt_team_nth;
7253 
7254 #if KMP_AFFINITY_SUPPORTED
7255  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7256  // number of cores on the machine.
7257  __kmp_affinity_initialize();
7258 
7259 #endif /* KMP_AFFINITY_SUPPORTED */
7260 
7261  KMP_ASSERT(__kmp_xproc > 0);
7262  if (__kmp_avail_proc == 0) {
7263  __kmp_avail_proc = __kmp_xproc;
7264  }
7265 
7266  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7267  // correct them now
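  // Illustrative example: with OMP_NUM_THREADS=",,2,3" and __kmp_avail_proc == 8,
  // the two leading empty slots are filled with 8, so the list becomes 8,8,2,3.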
7268  j = 0;
7269  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7270  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7271  __kmp_avail_proc;
7272  j++;
7273  }
7274 
7275  if (__kmp_dflt_team_nth == 0) {
7276 #ifdef KMP_DFLT_NTH_CORES
7277  // Default #threads = #cores
7278  __kmp_dflt_team_nth = __kmp_ncores;
7279  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7280  "__kmp_ncores (%d)\n",
7281  __kmp_dflt_team_nth));
7282 #else
7283  // Default #threads = #available OS procs
7284  __kmp_dflt_team_nth = __kmp_avail_proc;
7285  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7286  "__kmp_avail_proc(%d)\n",
7287  __kmp_dflt_team_nth));
7288 #endif /* KMP_DFLT_NTH_CORES */
7289  }
7290 
7291  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7292  __kmp_dflt_team_nth = KMP_MIN_NTH;
7293  }
7294  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7295  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7296  }
7297 
7298  if (__kmp_nesting_mode > 0)
7299  __kmp_set_nesting_mode_threads();
7300 
7301  // There's no harm in continuing if the following check fails,
7302  // but it indicates an error in the previous logic.
7303  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7304 
7305  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7306  // Run through the __kmp_threads array and set the num threads icv for each
7307  // root thread that is currently registered with the RTL (which has not
7308  // already explicitly set its nthreads-var with a call to
7309  // omp_set_num_threads()).
7310  for (i = 0; i < __kmp_threads_capacity; i++) {
7311  kmp_info_t *thread = __kmp_threads[i];
7312  if (thread == NULL)
7313  continue;
7314  if (thread->th.th_current_task->td_icvs.nproc != 0)
7315  continue;
7316 
7317  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7318  }
7319  }
7320  KA_TRACE(
7321  20,
7322  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7323  __kmp_dflt_team_nth));
7324 
7325 #ifdef KMP_ADJUST_BLOCKTIME
7326  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7327  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7328  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7329  if (__kmp_nth > __kmp_avail_proc) {
7330  __kmp_zero_bt = TRUE;
7331  }
7332  }
7333 #endif /* KMP_ADJUST_BLOCKTIME */
7334 
7335  /* we have finished middle initialization */
7336  TCW_SYNC_4(__kmp_init_middle, TRUE);
7337 
7338  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7339 }
7340 
7341 void __kmp_middle_initialize(void) {
7342  if (__kmp_init_middle) {
7343  return;
7344  }
7345  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7346  if (__kmp_init_middle) {
7347  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7348  return;
7349  }
7350  __kmp_do_middle_initialize();
7351  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7352 }
7353 
7354 void __kmp_parallel_initialize(void) {
7355  int gtid = __kmp_entry_gtid(); // this might be a new root
7356 
7357  /* synchronize parallel initialization (for sibling) */
7358  if (TCR_4(__kmp_init_parallel))
7359  return;
7360  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7361  if (TCR_4(__kmp_init_parallel)) {
7362  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7363  return;
7364  }
7365 
7366  /* TODO reinitialization after we have already shut down */
7367  if (TCR_4(__kmp_global.g.g_done)) {
7368  KA_TRACE(
7369  10,
7370  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7371  __kmp_infinite_loop();
7372  }
7373 
7374  /* jc: The lock __kmp_initz_lock is already held, so calling
7375  __kmp_serial_initialize would cause a deadlock. So we call
7376  __kmp_do_serial_initialize directly. */
7377  if (!__kmp_init_middle) {
7378  __kmp_do_middle_initialize();
7379  }
7380  __kmp_assign_root_init_mask();
7381  __kmp_resume_if_hard_paused();
7382 
7383  /* begin initialization */
7384  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7385  KMP_ASSERT(KMP_UBER_GTID(gtid));
7386 
7387 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7388  // Save the FP control regs.
7389  // Worker threads will set theirs to these values at thread startup.
7390  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7391  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7392  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7393 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7394 
7395 #if KMP_OS_UNIX
7396 #if KMP_HANDLE_SIGNALS
7397  /* must be after __kmp_serial_initialize */
7398  __kmp_install_signals(TRUE);
7399 #endif
7400 #endif
7401 
7402  __kmp_suspend_initialize();
7403 
7404 #if defined(USE_LOAD_BALANCE)
7405  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7406  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7407  }
7408 #else
7409  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7410  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7411  }
7412 #endif
7413 
7414  if (__kmp_version) {
7415  __kmp_print_version_2();
7416  }
7417 
7418  /* we have finished parallel initialization */
7419  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7420 
7421  KMP_MB();
7422  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7423 
7424  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7425 }
7426 
7427 void __kmp_hidden_helper_initialize() {
7428  if (TCR_4(__kmp_init_hidden_helper))
7429  return;
7430 
7431  // __kmp_parallel_initialize is required before we initialize hidden helper
7432  if (!TCR_4(__kmp_init_parallel))
7433  __kmp_parallel_initialize();
7434 
7435  // Double check. Note that this double check should not be placed before
7436  // __kmp_parallel_initialize, as that would cause a deadlock.
7437  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7438  if (TCR_4(__kmp_init_hidden_helper)) {
7439  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7440  return;
7441  }
7442 
7443  // Set the count of hidden helper tasks to be executed to zero
7444  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7445 
7446  // Set the global variable indicating that we're initializing hidden helper
7447  // team/threads
7448  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7449 
7450  // Platform independent initialization
7451  __kmp_do_initialize_hidden_helper_threads();
7452 
7453  // Wait here for the finish of initialization of hidden helper teams
7454  __kmp_hidden_helper_threads_initz_wait();
7455 
7456  // We have finished hidden helper initialization
7457  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7458 
7459  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7460 }
7461 
7462 /* ------------------------------------------------------------------------ */
7463 
7464 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7465  kmp_team_t *team) {
7466  kmp_disp_t *dispatch;
7467 
7468  KMP_MB();
7469 
7470  /* none of the threads have encountered any constructs, yet. */
7471  this_thr->th.th_local.this_construct = 0;
7472 #if KMP_CACHE_MANAGE
7473  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7474 #endif /* KMP_CACHE_MANAGE */
7475  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7476  KMP_DEBUG_ASSERT(dispatch);
7477  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7478  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7479  // this_thr->th.th_info.ds.ds_tid ] );
7480 
7481  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7482  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7483  if (__kmp_env_consistency_check)
7484  __kmp_push_parallel(gtid, team->t.t_ident);
7485 
7486  KMP_MB(); /* Flush all pending memory write invalidates. */
7487 }
7488 
7489 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7490  kmp_team_t *team) {
7491  if (__kmp_env_consistency_check)
7492  __kmp_pop_parallel(gtid, team->t.t_ident);
7493 
7494  __kmp_finish_implicit_task(this_thr);
7495 }
7496 
7497 int __kmp_invoke_task_func(int gtid) {
7498  int rc;
7499  int tid = __kmp_tid_from_gtid(gtid);
7500  kmp_info_t *this_thr = __kmp_threads[gtid];
7501  kmp_team_t *team = this_thr->th.th_team;
7502 
7503  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7504 #if USE_ITT_BUILD
7505  if (__itt_stack_caller_create_ptr) {
7506  // inform ittnotify about entering user's code
7507  if (team->t.t_stack_id != NULL) {
7508  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7509  } else {
7510  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7511  __kmp_itt_stack_callee_enter(
7512  (__itt_caller)team->t.t_parent->t.t_stack_id);
7513  }
7514  }
7515 #endif /* USE_ITT_BUILD */
7516 #if INCLUDE_SSC_MARKS
7517  SSC_MARK_INVOKING();
7518 #endif
7519 
7520 #if OMPT_SUPPORT
7521  void *dummy;
7522  void **exit_frame_p;
7523  ompt_data_t *my_task_data;
7524  ompt_data_t *my_parallel_data;
7525  int ompt_team_size;
7526 
7527  if (ompt_enabled.enabled) {
7528  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7529  .ompt_task_info.frame.exit_frame.ptr);
7530  } else {
7531  exit_frame_p = &dummy;
7532  }
7533 
7534  my_task_data =
7535  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7536  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7537  if (ompt_enabled.ompt_callback_implicit_task) {
7538  ompt_team_size = team->t.t_nproc;
7539  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7540  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7541  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7542  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7543  }
7544 #endif
7545 
7546 #if KMP_STATS_ENABLED
7547  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7548  if (previous_state == stats_state_e::TEAMS_REGION) {
7549  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7550  } else {
7551  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7552  }
7553  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7554 #endif
7555 
7556  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7557  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7558 #if OMPT_SUPPORT
7559  ,
7560  exit_frame_p
7561 #endif
7562  );
7563 #if OMPT_SUPPORT
7564  *exit_frame_p = NULL;
7565  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7566 #endif
7567 
7568 #if KMP_STATS_ENABLED
7569  if (previous_state == stats_state_e::TEAMS_REGION) {
7570  KMP_SET_THREAD_STATE(previous_state);
7571  }
7572  KMP_POP_PARTITIONED_TIMER();
7573 #endif
7574 
7575 #if USE_ITT_BUILD
7576  if (__itt_stack_caller_create_ptr) {
7577  // inform ittnotify about leaving user's code
7578  if (team->t.t_stack_id != NULL) {
7579  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7580  } else {
7581  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7582  __kmp_itt_stack_callee_leave(
7583  (__itt_caller)team->t.t_parent->t.t_stack_id);
7584  }
7585  }
7586 #endif /* USE_ITT_BUILD */
7587  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7588 
7589  return rc;
7590 }
7591 
7592 void __kmp_teams_master(int gtid) {
7593  // This routine is called by all primary threads in teams construct
7594  kmp_info_t *thr = __kmp_threads[gtid];
7595  kmp_team_t *team = thr->th.th_team;
7596  ident_t *loc = team->t.t_ident;
7597  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7598  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7599  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7600  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7601  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7602 
7603  // This thread is a new CG root. Set up the proper variables.
7604  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7605  tmp->cg_root = thr; // Make thr the CG root
7606  // Init to thread limit stored when league primary threads were forked
7607  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7608  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7609  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7610  " cg_nthreads to 1\n",
7611  thr, tmp));
7612  tmp->up = thr->th.th_cg_roots;
7613  thr->th.th_cg_roots = tmp;
7614 
7615 // Launch the league of teams now, but do not let the workers execute yet
7616 // (they hang on the fork barrier until the next parallel region)
7617 #if INCLUDE_SSC_MARKS
7618  SSC_MARK_FORKING();
7619 #endif
7620  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7621  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7622  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7623 #if INCLUDE_SSC_MARKS
7624  SSC_MARK_JOINING();
7625 #endif
7626  // If the team size was reduced from the limit, set it to the new size
7627  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7628  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7629  // AC: last parameter "1" eliminates join barrier which won't work because
7630  // worker threads are in a fork barrier waiting for more parallel regions
7631  __kmp_join_call(loc, gtid
7632 #if OMPT_SUPPORT
7633  ,
7634  fork_context_intel
7635 #endif
7636  ,
7637  1);
7638 }
7639 
7640 int __kmp_invoke_teams_master(int gtid) {
7641  kmp_info_t *this_thr = __kmp_threads[gtid];
7642  kmp_team_t *team = this_thr->th.th_team;
7643 #if KMP_DEBUG
7644  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7645  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7646  (void *)__kmp_teams_master);
7647 #endif
7648  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7649 #if OMPT_SUPPORT
7650  int tid = __kmp_tid_from_gtid(gtid);
7651  ompt_data_t *task_data =
7652  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7653  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7654  if (ompt_enabled.ompt_callback_implicit_task) {
7655  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7656  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7657  ompt_task_initial);
7658  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7659  }
7660 #endif
7661  __kmp_teams_master(gtid);
7662 #if OMPT_SUPPORT
7663  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7664 #endif
7665  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7666  return 1;
7667 }
7668 
7669 /* This sets the requested number of threads for the next parallel region
7670  encountered by this team. Since this should be enclosed in the fork/join
7671  critical section, it should avoid race conditions with asymmetrical nested
7672  parallelism. */
7673 
7674 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7675  kmp_info_t *thr = __kmp_threads[gtid];
7676 
7677  if (num_threads > 0)
7678  thr->th.th_set_nproc = num_threads;
7679 }
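// Editorial note: the compiler-facing wrapper __kmpc_push_num_threads()
// (kmp_csupport.cpp) forwards to __kmp_push_num_threads() above, so a
// num_threads clause becomes a "push" immediately followed by the fork.
// Rough, editor-added sketch of the emitted sequence (the outlined body name
// and the exact call sequence are assumptions; they vary by compiler):
#if 0
void compiler_emitted_parallel(ident_t *loc) {
  kmp_int32 gtid = __kmpc_global_thread_num(loc);
  __kmpc_push_num_threads(loc, gtid, 4); // lands in __kmp_push_num_threads()
  __kmpc_fork_call(loc, /*argc=*/0,
                   (kmpc_micro)outlined_parallel_body); // hypothetical body
}
#endif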
7680 
7681 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7682  int num_threads) {
7683  KMP_DEBUG_ASSERT(thr);
7684  // Remember the number of threads for inner parallel regions
7685  if (!TCR_4(__kmp_init_middle))
7686  __kmp_middle_initialize(); // get internal globals calculated
7687  __kmp_assign_root_init_mask();
7688  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7689  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7690 
7691  if (num_threads == 0) {
7692  if (__kmp_teams_thread_limit > 0) {
7693  num_threads = __kmp_teams_thread_limit;
7694  } else {
7695  num_threads = __kmp_avail_proc / num_teams;
7696  }
7697  // adjust num_threads w/o warning as it is not a user setting
7698  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7699  // no thread_limit clause specified - do not change thread-limit-var ICV
7700  if (num_threads > __kmp_dflt_team_nth) {
7701  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7702  }
7703  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7704  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7705  } // prevent team size from exceeding thread-limit-var
7706  if (num_teams * num_threads > __kmp_teams_max_nth) {
7707  num_threads = __kmp_teams_max_nth / num_teams;
7708  }
7709  if (num_threads == 0) {
7710  num_threads = 1;
7711  }
7712  } else {
7713  if (num_threads < 0) {
7714  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7715  __kmp_msg_null);
7716  num_threads = 1;
7717  }
7718  // This thread will be the primary thread of the league's primary threads
7719  // Store new thread limit; old limit is saved in th_cg_roots list
7720  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7721  // num_threads = min(num_threads, nthreads-var)
7722  if (num_threads > __kmp_dflt_team_nth) {
7723  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7724  }
7725  if (num_teams * num_threads > __kmp_teams_max_nth) {
7726  int new_threads = __kmp_teams_max_nth / num_teams;
7727  if (new_threads == 0) {
7728  new_threads = 1;
7729  }
7730  if (new_threads != num_threads) {
7731  if (!__kmp_reserve_warn) { // user asked for too many threads
7732  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7733  __kmp_msg(kmp_ms_warning,
7734  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7735  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7736  }
7737  }
7738  num_threads = new_threads;
7739  }
7740  }
7741  thr->th.th_teams_size.nth = num_threads;
7742 }
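// Editorial worked example for the no-thread_limit path above (values are
// illustrative assumptions): with __kmp_avail_proc == 16, num_teams == 4 and
// no KMP_TEAMS_THREAD_LIMIT, num_threads starts at 16 / 4 = 4 and is then
// clamped by nthreads-var (__kmp_dflt_team_nth), by thread-limit-var, and by
// __kmp_teams_max_nth / num_teams; the surviving value is recorded in
// thr->th.th_teams_size.nth for the parallel regions inside the teams region.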
7743 
7744 /* this sets the requested number of teams for the teams region and/or
7745  the number of threads for the next parallel region encountered */
7746 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7747  int num_threads) {
7748  kmp_info_t *thr = __kmp_threads[gtid];
7749  if (num_teams < 0) {
7750  // OpenMP specification requires requested values to be positive,
7751  // but people can send us any value, so we'd better check
7752  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7753  __kmp_msg_null);
7754  num_teams = 1;
7755  }
7756  if (num_teams == 0) {
7757  if (__kmp_nteams > 0) {
7758  num_teams = __kmp_nteams;
7759  } else {
7760  num_teams = 1; // default number of teams is 1.
7761  }
7762  }
7763  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7764  if (!__kmp_reserve_warn) {
7765  __kmp_reserve_warn = 1;
7766  __kmp_msg(kmp_ms_warning,
7767  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7768  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7769  }
7770  num_teams = __kmp_teams_max_nth;
7771  }
7772  // Set number of teams (number of threads in the outer "parallel" of the
7773  // teams)
7774  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7775 
7776  __kmp_push_thread_limit(thr, num_teams, num_threads);
7777 }
7778 
7779 /* This sets the requested number of teams for the teams region and/or
7780  the number of threads for the next parallel region encountered */
7781 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7782  int num_teams_ub, int num_threads) {
7783  kmp_info_t *thr = __kmp_threads[gtid];
7784  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7785  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7786  KMP_DEBUG_ASSERT(num_threads >= 0);
7787 
7788  if (num_teams_lb > num_teams_ub) {
7789  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7790  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7791  }
7792 
7793  int num_teams = 1; // default number of teams is 1.
7794 
7795  if (num_teams_lb == 0 && num_teams_ub > 0)
7796  num_teams_lb = num_teams_ub;
7797 
7798  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7799  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7800  if (num_teams > __kmp_teams_max_nth) {
7801  if (!__kmp_reserve_warn) {
7802  __kmp_reserve_warn = 1;
7803  __kmp_msg(kmp_ms_warning,
7804  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7805  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7806  }
7807  num_teams = __kmp_teams_max_nth;
7808  }
7809  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7810  num_teams = num_teams_ub;
7811  } else { // num_teams_lb <= num_teams <= num_teams_ub
7812  if (num_threads <= 0) {
7813  if (num_teams_ub > __kmp_teams_max_nth) {
7814  num_teams = num_teams_lb;
7815  } else {
7816  num_teams = num_teams_ub;
7817  }
7818  } else {
7819  num_teams = (num_threads > __kmp_teams_max_nth)
7820  ? num_teams
7821  : __kmp_teams_max_nth / num_threads;
7822  if (num_teams < num_teams_lb) {
7823  num_teams = num_teams_lb;
7824  } else if (num_teams > num_teams_ub) {
7825  num_teams = num_teams_ub;
7826  }
7827  }
7828  }
7829  // Set number of teams (number of threads in the outer "parallel" of the
7830  // teams)
7831  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7832 
7833  __kmp_push_thread_limit(thr, num_teams, num_threads);
7834 }
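// Editorial worked example for the lb/ub path above (illustrative numbers):
// with num_teams_lb == 2, num_teams_ub == 8, num_threads == 4 and
// __kmp_teams_max_nth == 16, the candidate is 16 / 4 = 4 teams, which already
// lies inside [2, 8], so 4 is kept; if the division had produced a value below
// 2 or above 8 it would have been clamped to the nearest bound. When
// num_threads exceeds __kmp_teams_max_nth, num_teams stays at the default 1.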
7835 
7836 // Set the proc_bind var to use in the following parallel region.
7837 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7838  kmp_info_t *thr = __kmp_threads[gtid];
7839  thr->th.th_set_proc_bind = proc_bind;
7840 }
7841 
7842 /* Launch the worker threads into the microtask. */
7843 
7844 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7845  kmp_info_t *this_thr = __kmp_threads[gtid];
7846 
7847 #ifdef KMP_DEBUG
7848  int f;
7849 #endif /* KMP_DEBUG */
7850 
7851  KMP_DEBUG_ASSERT(team);
7852  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7853  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7854  KMP_MB(); /* Flush all pending memory write invalidates. */
7855 
7856  team->t.t_construct = 0; /* no single directives seen yet */
7857  team->t.t_ordered.dt.t_value =
7858  0; /* thread 0 enters the ordered section first */
7859 
7860  /* Reset the identifiers on the dispatch buffer */
7861  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7862  if (team->t.t_max_nproc > 1) {
7863  int i;
7864  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7865  team->t.t_disp_buffer[i].buffer_index = i;
7866  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7867  }
7868  } else {
7869  team->t.t_disp_buffer[0].buffer_index = 0;
7870  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7871  }
7872 
7873  KMP_MB(); /* Flush all pending memory write invalidates. */
7874  KMP_ASSERT(this_thr->th.th_team == team);
7875 
7876 #ifdef KMP_DEBUG
7877  for (f = 0; f < team->t.t_nproc; f++) {
7878  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7879  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7880  }
7881 #endif /* KMP_DEBUG */
7882 
7883  /* release the worker threads so they may begin working */
7884  __kmp_fork_barrier(gtid, 0);
7885 }
7886 
7887 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7888  kmp_info_t *this_thr = __kmp_threads[gtid];
7889 
7890  KMP_DEBUG_ASSERT(team);
7891  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7892  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7893  KMP_MB(); /* Flush all pending memory write invalidates. */
7894 
7895  /* Join barrier after fork */
7896 
7897 #ifdef KMP_DEBUG
7898  if (__kmp_threads[gtid] &&
7899  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7900  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7901  __kmp_threads[gtid]);
7902  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7903  "team->t.t_nproc=%d\n",
7904  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7905  team->t.t_nproc);
7906  __kmp_print_structure();
7907  }
7908  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7909  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7910 #endif /* KMP_DEBUG */
7911 
7912  __kmp_join_barrier(gtid); /* wait for everyone */
7913 #if OMPT_SUPPORT
7914  if (ompt_enabled.enabled &&
7915  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7916  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7917  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7918  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7919 #if OMPT_OPTIONAL
7920  void *codeptr = NULL;
7921  if (KMP_MASTER_TID(ds_tid) &&
7922  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7923  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7924  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7925 
7926  if (ompt_enabled.ompt_callback_sync_region_wait) {
7927  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7928  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7929  codeptr);
7930  }
7931  if (ompt_enabled.ompt_callback_sync_region) {
7932  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7933  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7934  codeptr);
7935  }
7936 #endif
7937  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7938  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7939  ompt_scope_end, NULL, task_data, 0, ds_tid,
7940  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7941  }
7942  }
7943 #endif
7944 
7945  KMP_MB(); /* Flush all pending memory write invalidates. */
7946  KMP_ASSERT(this_thr->th.th_team == team);
7947 }
7948 
7949 /* ------------------------------------------------------------------------ */
7950 
7951 #ifdef USE_LOAD_BALANCE
7952 
7953 // Return the number of worker threads actively spinning in the hot team,
7954 // if we are at the outermost level of parallelism. Otherwise, return 0.
7955 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7956  int i;
7957  int retval;
7958  kmp_team_t *hot_team;
7959 
7960  if (root->r.r_active) {
7961  return 0;
7962  }
7963  hot_team = root->r.r_hot_team;
7964  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7965  return hot_team->t.t_nproc - 1; // Don't count primary thread
7966  }
7967 
7968  // Skip the primary thread - it is accounted for elsewhere.
7969  retval = 0;
7970  for (i = 1; i < hot_team->t.t_nproc; i++) {
7971  if (hot_team->t.t_threads[i]->th.th_active) {
7972  retval++;
7973  }
7974  }
7975  return retval;
7976 }
7977 
7978 // Perform an automatic adjustment to the number of
7979 // threads used by the next parallel region.
7980 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7981  int retval;
7982  int pool_active;
7983  int hot_team_active;
7984  int team_curr_active;
7985  int system_active;
7986 
7987  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7988  set_nproc));
7989  KMP_DEBUG_ASSERT(root);
7990  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7991  ->th.th_current_task->td_icvs.dynamic == TRUE);
7992  KMP_DEBUG_ASSERT(set_nproc > 1);
7993 
7994  if (set_nproc == 1) {
7995  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7996  return 1;
7997  }
7998 
7999  // Threads that are active in the thread pool, active in the hot team for this
8000  // particular root (if we are at the outer par level), and the currently
8001  // executing thread (to become the primary thread) are available to add to the
8002  // new team, but are currently contributing to the system load, and must be
8003  // accounted for.
8004  pool_active = __kmp_thread_pool_active_nth;
8005  hot_team_active = __kmp_active_hot_team_nproc(root);
8006  team_curr_active = pool_active + hot_team_active + 1;
8007 
8008  // Check the system load.
8009  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8010  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8011  "hot team active = %d\n",
8012  system_active, pool_active, hot_team_active));
8013 
8014  if (system_active < 0) {
8015  // There was an error reading the necessary info from /proc, so use the
8016  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8017  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8018  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8019  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8020 
8021  // Make this call behave like the thread limit algorithm.
8022  retval = __kmp_avail_proc - __kmp_nth +
8023  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8024  if (retval > set_nproc) {
8025  retval = set_nproc;
8026  }
8027  if (retval < KMP_MIN_NTH) {
8028  retval = KMP_MIN_NTH;
8029  }
8030 
8031  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8032  retval));
8033  return retval;
8034  }
8035 
8036 // There is a slight delay in the load balance algorithm in detecting new
8037 // running procs. The real system load at this instant should be at least as
8038 // large as the number of active OMP threads available to add to the team.
8039  if (system_active < team_curr_active) {
8040  system_active = team_curr_active;
8041  }
8042  retval = __kmp_avail_proc - system_active + team_curr_active;
8043  if (retval > set_nproc) {
8044  retval = set_nproc;
8045  }
8046  if (retval < KMP_MIN_NTH) {
8047  retval = KMP_MIN_NTH;
8048  }
8049 
8050  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8051  return retval;
8052 } // __kmp_load_balance_nproc()
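// Editorial worked example for the common path above (numbers are
// illustrative): with __kmp_avail_proc == 16, pool_active == 2 and
// hot_team_active == 0, the team currently contributes team_curr_active == 3
// threads; if the measured system_active is 10, then
// retval = 16 - 10 + 3 = 9, which is finally clamped into
// [KMP_MIN_NTH, set_nproc].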
8053 
8054 #endif /* USE_LOAD_BALANCE */
8055 
8056 /* ------------------------------------------------------------------------ */
8057 
8058 /* NOTE: this is called with the __kmp_init_lock held */
8059 void __kmp_cleanup(void) {
8060  int f;
8061 
8062  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8063 
8064  if (TCR_4(__kmp_init_parallel)) {
8065 #if KMP_HANDLE_SIGNALS
8066  __kmp_remove_signals();
8067 #endif
8068  TCW_4(__kmp_init_parallel, FALSE);
8069  }
8070 
8071  if (TCR_4(__kmp_init_middle)) {
8072 #if KMP_AFFINITY_SUPPORTED
8073  __kmp_affinity_uninitialize();
8074 #endif /* KMP_AFFINITY_SUPPORTED */
8075  __kmp_cleanup_hierarchy();
8076  TCW_4(__kmp_init_middle, FALSE);
8077  }
8078 
8079  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8080 
8081  if (__kmp_init_serial) {
8082  __kmp_runtime_destroy();
8083  __kmp_init_serial = FALSE;
8084  }
8085 
8086  __kmp_cleanup_threadprivate_caches();
8087 
8088  for (f = 0; f < __kmp_threads_capacity; f++) {
8089  if (__kmp_root[f] != NULL) {
8090  __kmp_free(__kmp_root[f]);
8091  __kmp_root[f] = NULL;
8092  }
8093  }
8094  __kmp_free(__kmp_threads);
8095 // __kmp_threads and __kmp_root were allocated together as a single block, so
8096 // there is no need to free __kmp_root separately.
8097  __kmp_threads = NULL;
8098  __kmp_root = NULL;
8099  __kmp_threads_capacity = 0;
8100 
8101 #if KMP_USE_DYNAMIC_LOCK
8102  __kmp_cleanup_indirect_user_locks();
8103 #else
8104  __kmp_cleanup_user_locks();
8105 #endif
8106 #if OMPD_SUPPORT
8107  if (ompd_state) {
8108  __kmp_free(ompd_env_block);
8109  ompd_env_block = NULL;
8110  ompd_env_block_size = 0;
8111  }
8112 #endif
8113 
8114 #if KMP_AFFINITY_SUPPORTED
8115  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8116  __kmp_cpuinfo_file = NULL;
8117 #endif /* KMP_AFFINITY_SUPPORTED */
8118 
8119 #if KMP_USE_ADAPTIVE_LOCKS
8120 #if KMP_DEBUG_ADAPTIVE_LOCKS
8121  __kmp_print_speculative_stats();
8122 #endif
8123 #endif
8124  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8125  __kmp_nested_nth.nth = NULL;
8126  __kmp_nested_nth.size = 0;
8127  __kmp_nested_nth.used = 0;
8128  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8129  __kmp_nested_proc_bind.bind_types = NULL;
8130  __kmp_nested_proc_bind.size = 0;
8131  __kmp_nested_proc_bind.used = 0;
8132  if (__kmp_affinity_format) {
8133  KMP_INTERNAL_FREE(__kmp_affinity_format);
8134  __kmp_affinity_format = NULL;
8135  }
8136 
8137  __kmp_i18n_catclose();
8138 
8139 #if KMP_USE_HIER_SCHED
8140  __kmp_hier_scheds.deallocate();
8141 #endif
8142 
8143 #if KMP_STATS_ENABLED
8144  __kmp_stats_fini();
8145 #endif
8146 
8147  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8148 }
8149 
8150 /* ------------------------------------------------------------------------ */
8151 
8152 int __kmp_ignore_mppbeg(void) {
8153  char *env;
8154 
8155  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8156  if (__kmp_str_match_false(env))
8157  return FALSE;
8158  }
8159  // By default __kmpc_begin() is no-op.
8160  return TRUE;
8161 }
8162 
8163 int __kmp_ignore_mppend(void) {
8164  char *env;
8165 
8166  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8167  if (__kmp_str_match_false(env))
8168  return FALSE;
8169  }
8170  // By default __kmpc_end() is no-op.
8171  return TRUE;
8172 }
8173 
8174 void __kmp_internal_begin(void) {
8175  int gtid;
8176  kmp_root_t *root;
8177 
8178  /* this is a very important step as it will register new sibling threads
8179  and assign these new uber threads a new gtid */
8180  gtid = __kmp_entry_gtid();
8181  root = __kmp_threads[gtid]->th.th_root;
8182  KMP_ASSERT(KMP_UBER_GTID(gtid));
8183 
8184  if (root->r.r_begin)
8185  return;
8186  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8187  if (root->r.r_begin) {
8188  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8189  return;
8190  }
8191 
8192  root->r.r_begin = TRUE;
8193 
8194  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8195 }
8196 
8197 /* ------------------------------------------------------------------------ */
8198 
8199 void __kmp_user_set_library(enum library_type arg) {
8200  int gtid;
8201  kmp_root_t *root;
8202  kmp_info_t *thread;
8203 
8204  /* first, make sure we are initialized so we can get our gtid */
8205 
8206  gtid = __kmp_entry_gtid();
8207  thread = __kmp_threads[gtid];
8208 
8209  root = thread->th.th_root;
8210 
8211  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8212  library_serial));
8213  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8214  thread */
8215  KMP_WARNING(SetLibraryIncorrectCall);
8216  return;
8217  }
8218 
8219  switch (arg) {
8220  case library_serial:
8221  thread->th.th_set_nproc = 0;
8222  set__nproc(thread, 1);
8223  break;
8224  case library_turnaround:
8225  thread->th.th_set_nproc = 0;
8226  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8227  : __kmp_dflt_team_nth_ub);
8228  break;
8229  case library_throughput:
8230  thread->th.th_set_nproc = 0;
8231  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8232  : __kmp_dflt_team_nth_ub);
8233  break;
8234  default:
8235  KMP_FATAL(UnknownLibraryType, arg);
8236  }
8237 
8238  __kmp_aux_set_library(arg);
8239 }
8240 
8241 void __kmp_aux_set_stacksize(size_t arg) {
8242  if (!__kmp_init_serial)
8243  __kmp_serial_initialize();
8244 
8245 #if KMP_OS_DARWIN
8246  if (arg & (0x1000 - 1)) {
8247  arg &= ~(0x1000 - 1);
8248  if (arg + 0x1000) /* check for overflow if we round up */
8249  arg += 0x1000;
8250  }
8251 #endif
8252  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8253 
8254  /* only change the default stacksize before the first parallel region */
8255  if (!TCR_4(__kmp_init_parallel)) {
8256  size_t value = arg; /* argument is in bytes */
8257 
8258  if (value < __kmp_sys_min_stksize)
8259  value = __kmp_sys_min_stksize;
8260  else if (value > KMP_MAX_STKSIZE)
8261  value = KMP_MAX_STKSIZE;
8262 
8263  __kmp_stksize = value;
8264 
8265  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8266  }
8267 
8268  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8269 }
8270 
8271 /* set the behaviour of the runtime library */
8272 /* TODO this can cause some odd behaviour with sibling parallelism... */
8273 void __kmp_aux_set_library(enum library_type arg) {
8274  __kmp_library = arg;
8275 
8276  switch (__kmp_library) {
8277  case library_serial: {
8278  KMP_INFORM(LibraryIsSerial);
8279  } break;
8280  case library_turnaround:
8281  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8282  __kmp_use_yield = 2; // only yield when oversubscribed
8283  break;
8284  case library_throughput:
8285  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8286  __kmp_dflt_blocktime = 200;
8287  break;
8288  default:
8289  KMP_FATAL(UnknownLibraryType, arg);
8290  }
8291 }
8292 
8293 /* Getting team information common for all team API */
8294 // Returns NULL if not in teams construct
8295 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8296  kmp_info_t *thr = __kmp_entry_thread();
8297  teams_serialized = 0;
8298  if (thr->th.th_teams_microtask) {
8299  kmp_team_t *team = thr->th.th_team;
8300  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8301  int ii = team->t.t_level;
8302  teams_serialized = team->t.t_serialized;
8303  int level = tlevel + 1;
8304  KMP_DEBUG_ASSERT(ii >= tlevel);
8305  while (ii > level) {
8306  for (teams_serialized = team->t.t_serialized;
8307  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8308  }
8309  if (team->t.t_serialized && (!teams_serialized)) {
8310  team = team->t.t_parent;
8311  continue;
8312  }
8313  if (ii > level) {
8314  team = team->t.t_parent;
8315  ii--;
8316  }
8317  }
8318  return team;
8319  }
8320  return NULL;
8321 }
8322 
8323 int __kmp_aux_get_team_num() {
8324  int serialized;
8325  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8326  if (team) {
8327  if (serialized > 1) {
8328  return 0; // teams region is serialized ( 1 team of 1 thread ).
8329  } else {
8330  return team->t.t_master_tid;
8331  }
8332  }
8333  return 0;
8334 }
8335 
8336 int __kmp_aux_get_num_teams() {
8337  int serialized;
8338  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8339  if (team) {
8340  if (serialized > 1) {
8341  return 1;
8342  } else {
8343  return team->t.t_parent->t.t_nproc;
8344  }
8345  }
8346  return 1;
8347 }
8348 
8349 /* ------------------------------------------------------------------------ */
8350 
8351 /*
8352  * Affinity Format Parser
8353  *
8354  * Field is in form of: %[[[0].]size]type
8355  * % and type are required (%% means print a literal '%')
8356  * type is either single char or long name surrounded by {},
8357  * e.g., N or {num_threads}
8358  * 0 => leading zeros
8359  * . => right justified when size is specified
8360  * by default output is left justified
8361  * size is the *minimum* field length
8362  * All other characters are printed as is
8363  *
8364  * Available field types:
8365  * t {team_num} - omp_get_team_num()
8366  * T {num_teams} - omp_get_num_teams()
8367  * L {nesting_level} - omp_get_level()
8368  * n {thread_num} - omp_get_thread_num()
8369  * N {num_threads} - omp_get_num_threads()
8370  * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8371  * H {host} - name of host machine
8372  * P {process_id} - process id (integer), i {native_thread_id} - native thread id
8373  * A {thread_affinity} - comma separated list of integers or integer ranges (affinity mask values)
8374  *
8375  * Implementation-specific field types can be added
8376  * If a type is unknown, print "undefined"
8377  */
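// Editorial example of the grammar above (hostname is an assumption): with
// the format "tid:%0.4n host:%{host}", a thread whose omp_get_thread_num()
// is 3 on host "node01" produces "tid:0003 host:node01" -- '0' pads with
// zeros, '.' right-justifies, 4 is the minimum width, and {host} is the long
// name for the 'H' field.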
8378 
8379 // Structure holding the short name, long name, and corresponding data type
8380 // for snprintf. A table of these will represent the entire valid keyword
8381 // field types.
8382 typedef struct kmp_affinity_format_field_t {
8383  char short_name; // from spec e.g., L -> thread level
8384  const char *long_name; // from spec thread_level -> thread level
8385  char field_format; // data type for snprintf (typically 'd' or 's'
8386  // for integer or string)
8387 } kmp_affinity_format_field_t;
8388 
8389 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8390 #if KMP_AFFINITY_SUPPORTED
8391  {'A', "thread_affinity", 's'},
8392 #endif
8393  {'t', "team_num", 'd'},
8394  {'T', "num_teams", 'd'},
8395  {'L', "nesting_level", 'd'},
8396  {'n', "thread_num", 'd'},
8397  {'N', "num_threads", 'd'},
8398  {'a', "ancestor_tnum", 'd'},
8399  {'H', "host", 's'},
8400  {'P', "process_id", 'd'},
8401  {'i', "native_thread_id", 'd'}};
8402 
8403 // Return the number of characters it takes to hold field
8404 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8405  const char **ptr,
8406  kmp_str_buf_t *field_buffer) {
8407  int rc, format_index, field_value;
8408  const char *width_left, *width_right;
8409  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8410  static const int FORMAT_SIZE = 20;
8411  char format[FORMAT_SIZE] = {0};
8412  char absolute_short_name = 0;
8413 
8414  KMP_DEBUG_ASSERT(gtid >= 0);
8415  KMP_DEBUG_ASSERT(th);
8416  KMP_DEBUG_ASSERT(**ptr == '%');
8417  KMP_DEBUG_ASSERT(field_buffer);
8418 
8419  __kmp_str_buf_clear(field_buffer);
8420 
8421  // Skip the initial %
8422  (*ptr)++;
8423 
8424  // Check for %% first
8425  if (**ptr == '%') {
8426  __kmp_str_buf_cat(field_buffer, "%", 1);
8427  (*ptr)++; // skip over the second %
8428  return 1;
8429  }
8430 
8431  // Parse field modifiers if they are present
8432  pad_zeros = false;
8433  if (**ptr == '0') {
8434  pad_zeros = true;
8435  (*ptr)++; // skip over 0
8436  }
8437  right_justify = false;
8438  if (**ptr == '.') {
8439  right_justify = true;
8440  (*ptr)++; // skip over .
8441  }
8442  // Parse width of field: [width_left, width_right)
8443  width_left = width_right = NULL;
8444  if (**ptr >= '0' && **ptr <= '9') {
8445  width_left = *ptr;
8446  SKIP_DIGITS(*ptr);
8447  width_right = *ptr;
8448  }
8449 
8450  // Create the format for KMP_SNPRINTF based on flags parsed above
8451  format_index = 0;
8452  format[format_index++] = '%';
8453  if (!right_justify)
8454  format[format_index++] = '-';
8455  if (pad_zeros)
8456  format[format_index++] = '0';
8457  if (width_left && width_right) {
8458  int i = 0;
8459  // Only allow 8 digit number widths.
8460  // This also prevents overflowing format variable
8461  while (i < 8 && width_left < width_right) {
8462  format[format_index++] = *width_left;
8463  width_left++;
8464  i++;
8465  }
8466  }
8467 
8468  // Parse a name (long or short)
8469  // Canonicalize the name into absolute_short_name
8470  found_valid_name = false;
8471  parse_long_name = (**ptr == '{');
8472  if (parse_long_name)
8473  (*ptr)++; // skip initial left brace
8474  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8475  sizeof(__kmp_affinity_format_table[0]);
8476  ++i) {
8477  char short_name = __kmp_affinity_format_table[i].short_name;
8478  const char *long_name = __kmp_affinity_format_table[i].long_name;
8479  char field_format = __kmp_affinity_format_table[i].field_format;
8480  if (parse_long_name) {
8481  size_t length = KMP_STRLEN(long_name);
8482  if (strncmp(*ptr, long_name, length) == 0) {
8483  found_valid_name = true;
8484  (*ptr) += length; // skip the long name
8485  }
8486  } else if (**ptr == short_name) {
8487  found_valid_name = true;
8488  (*ptr)++; // skip the short name
8489  }
8490  if (found_valid_name) {
8491  format[format_index++] = field_format;
8492  format[format_index++] = '\0';
8493  absolute_short_name = short_name;
8494  break;
8495  }
8496  }
8497  if (parse_long_name) {
8498  if (**ptr != '}') {
8499  absolute_short_name = 0;
8500  } else {
8501  (*ptr)++; // skip over the right brace
8502  }
8503  }
8504 
8505  // Attempt to fill the buffer with the requested
8506  // value using snprintf within __kmp_str_buf_print()
8507  switch (absolute_short_name) {
8508  case 't':
8509  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8510  break;
8511  case 'T':
8512  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8513  break;
8514  case 'L':
8515  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8516  break;
8517  case 'n':
8518  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8519  break;
8520  case 'H': {
8521  static const int BUFFER_SIZE = 256;
8522  char buf[BUFFER_SIZE];
8523  __kmp_expand_host_name(buf, BUFFER_SIZE);
8524  rc = __kmp_str_buf_print(field_buffer, format, buf);
8525  } break;
8526  case 'P':
8527  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8528  break;
8529  case 'i':
8530  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8531  break;
8532  case 'N':
8533  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8534  break;
8535  case 'a':
8536  field_value =
8537  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8538  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8539  break;
8540 #if KMP_AFFINITY_SUPPORTED
8541  case 'A': {
8542  kmp_str_buf_t buf;
8543  __kmp_str_buf_init(&buf);
8544  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8545  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8546  __kmp_str_buf_free(&buf);
8547  } break;
8548 #endif
8549  default:
8550  // According to the spec, if an implementation does not have info for a field
8551  // type, then "undefined" is printed
8552  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8553  // Skip the field
8554  if (parse_long_name) {
8555  SKIP_TOKEN(*ptr);
8556  if (**ptr == '}')
8557  (*ptr)++;
8558  } else {
8559  (*ptr)++;
8560  }
8561  }
8562 
8563  KMP_ASSERT(format_index <= FORMAT_SIZE);
8564  return rc;
8565 }
8566 
8567 /*
8568  * Return number of characters needed to hold the affinity string
8569  * (not including null byte character)
8570  * The resultant string is printed to buffer, which the caller can then
8571  * handle afterwards
8572  */
8573 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8574  kmp_str_buf_t *buffer) {
8575  const char *parse_ptr;
8576  size_t retval;
8577  const kmp_info_t *th;
8578  kmp_str_buf_t field;
8579 
8580  KMP_DEBUG_ASSERT(buffer);
8581  KMP_DEBUG_ASSERT(gtid >= 0);
8582 
8583  __kmp_str_buf_init(&field);
8584  __kmp_str_buf_clear(buffer);
8585 
8586  th = __kmp_threads[gtid];
8587  retval = 0;
8588 
8589  // If format is NULL or zero-length string, then we use
8590  // affinity-format-var ICV
8591  parse_ptr = format;
8592  if (parse_ptr == NULL || *parse_ptr == '\0') {
8593  parse_ptr = __kmp_affinity_format;
8594  }
8595  KMP_DEBUG_ASSERT(parse_ptr);
8596 
8597  while (*parse_ptr != '\0') {
8598  // Parse a field
8599  if (*parse_ptr == '%') {
8600  // Put field in the buffer
8601  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8602  __kmp_str_buf_catbuf(buffer, &field);
8603  retval += rc;
8604  } else {
8605  // Put literal character in buffer
8606  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8607  retval++;
8608  parse_ptr++;
8609  }
8610  }
8611  __kmp_str_buf_free(&field);
8612  return retval;
8613 }
8614 
8615 // Displays the affinity string to stdout
8616 void __kmp_aux_display_affinity(int gtid, const char *format) {
8617  kmp_str_buf_t buf;
8618  __kmp_str_buf_init(&buf);
8619  __kmp_aux_capture_affinity(gtid, format, &buf);
8620  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8621  __kmp_str_buf_free(&buf);
8622 }
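// Editorial sketch of user-level code that ultimately reaches the two helpers
// above via the OpenMP 5.0 affinity-display API (assumes <omp.h>; the format
// string is illustrative):
#if 0
#include <omp.h>
#include <stdio.h>
int main(void) {
#pragma omp parallel
  {
    char buf[256];
    // Returns the number of characters needed, not counting the terminator.
    size_t n = omp_capture_affinity(buf, sizeof(buf), "tid:%0.4n host:%H");
    if (n < sizeof(buf))
      printf("%s\n", buf); // same text omp_display_affinity() would print
  }
  return 0;
}
#endif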
8623 
8624 /* ------------------------------------------------------------------------ */
8625 
8626 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8627  int blocktime = arg; /* argument is in milliseconds */
8628 #if KMP_USE_MONITOR
8629  int bt_intervals;
8630 #endif
8631  kmp_int8 bt_set;
8632 
8633  __kmp_save_internal_controls(thread);
8634 
8635  /* Normalize and set blocktime for the teams */
8636  if (blocktime < KMP_MIN_BLOCKTIME)
8637  blocktime = KMP_MIN_BLOCKTIME;
8638  else if (blocktime > KMP_MAX_BLOCKTIME)
8639  blocktime = KMP_MAX_BLOCKTIME;
8640 
8641  set__blocktime_team(thread->th.th_team, tid, blocktime);
8642  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8643 
8644 #if KMP_USE_MONITOR
8645  /* Calculate and set blocktime intervals for the teams */
8646  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8647 
8648  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8649  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8650 #endif
8651 
8652  /* Set whether blocktime has been set to "TRUE" */
8653  bt_set = TRUE;
8654 
8655  set__bt_set_team(thread->th.th_team, tid, bt_set);
8656  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8657 #if KMP_USE_MONITOR
8658  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8659  "bt_intervals=%d, monitor_updates=%d\n",
8660  __kmp_gtid_from_tid(tid, thread->th.th_team),
8661  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8662  __kmp_monitor_wakeups));
8663 #else
8664  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8665  __kmp_gtid_from_tid(tid, thread->th.th_team),
8666  thread->th.th_team->t.t_id, tid, blocktime));
8667 #endif
8668 }
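// Editorial note: this path is normally reached via the KMP_BLOCKTIME
// environment variable or the kmp_set_blocktime() extension; the argument is
// in milliseconds and is clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME].
// A minimal user-side sketch (assumes the kmp_* extensions in <omp.h>):
#if 0
#include <omp.h>
int main(void) {
  kmp_set_blocktime(0); // idle threads sleep immediately after a region
#pragma omp parallel
  { /* ... */ }
  return 0;
}
#endif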
8669 
8670 void __kmp_aux_set_defaults(char const *str, size_t len) {
8671  if (!__kmp_init_serial) {
8672  __kmp_serial_initialize();
8673  }
8674  __kmp_env_initialize(str);
8675 
8676  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8677  __kmp_env_print();
8678  }
8679 } // __kmp_aux_set_defaults
8680 
8681 /* ------------------------------------------------------------------------ */
8682 /* internal fast reduction routines */
8683 
8684 PACKED_REDUCTION_METHOD_T
8685 __kmp_determine_reduction_method(
8686  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8687  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8688  kmp_critical_name *lck) {
8689 
8690  // Default reduction method: critical construct ( lck != NULL, like in current
8691  // PAROPT )
8692  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8693  // can be selected by RTL
8694  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8695  // can be selected by RTL
8696  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8697  // among generated by PAROPT.
8698 
8699  PACKED_REDUCTION_METHOD_T retval;
8700 
8701  int team_size;
8702 
8703  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8704  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8705 
8706 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8707  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8708 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8709 
8710  retval = critical_reduce_block;
8711 
8712  // another choice of getting the team size (with one dynamic dereference) is slower
8713  team_size = __kmp_get_team_num_threads(global_tid);
8714  if (team_size == 1) {
8715 
8716  retval = empty_reduce_block;
8717 
8718  } else {
8719 
8720  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8721 
8722 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8723  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8724 
8725 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8726  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8727 
8728  int teamsize_cutoff = 4;
8729 
8730 #if KMP_MIC_SUPPORTED
8731  if (__kmp_mic_type != non_mic) {
8732  teamsize_cutoff = 8;
8733  }
8734 #endif
8735  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8736  if (tree_available) {
8737  if (team_size <= teamsize_cutoff) {
8738  if (atomic_available) {
8739  retval = atomic_reduce_block;
8740  }
8741  } else {
8742  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8743  }
8744  } else if (atomic_available) {
8745  retval = atomic_reduce_block;
8746  }
8747 #else
8748 #error "Unknown or unsupported OS"
8749 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8750  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8751 
8752 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8753 
8754 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8755 
8756  // basic tuning
8757 
8758  if (atomic_available) {
8759  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8760  retval = atomic_reduce_block;
8761  }
8762  } // otherwise: use critical section
8763 
8764 #elif KMP_OS_DARWIN
8765 
8766  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8767  if (atomic_available && (num_vars <= 3)) {
8768  retval = atomic_reduce_block;
8769  } else if (tree_available) {
8770  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8771  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8772  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8773  }
8774  } // otherwise: use critical section
8775 
8776 #else
8777 #error "Unknown or unsupported OS"
8778 #endif
8779 
8780 #else
8781 #error "Unknown or unsupported architecture"
8782 #endif
8783  }
8784 
8785  // KMP_FORCE_REDUCTION
8786 
8787  // If the team is serialized (team_size == 1), ignore the forced reduction
8788  // method and stay with the unsynchronized method (empty_reduce_block)
8789  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8790  team_size != 1) {
8791 
8792  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8793 
8794  int atomic_available, tree_available;
8795 
8796  switch ((forced_retval = __kmp_force_reduction_method)) {
8797  case critical_reduce_block:
8798  KMP_ASSERT(lck); // lck should be != 0
8799  break;
8800 
8801  case atomic_reduce_block:
8802  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8803  if (!atomic_available) {
8804  KMP_WARNING(RedMethodNotSupported, "atomic");
8805  forced_retval = critical_reduce_block;
8806  }
8807  break;
8808 
8809  case tree_reduce_block:
8810  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8811  if (!tree_available) {
8812  KMP_WARNING(RedMethodNotSupported, "tree");
8813  forced_retval = critical_reduce_block;
8814  } else {
8815 #if KMP_FAST_REDUCTION_BARRIER
8816  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8817 #endif
8818  }
8819  break;
8820 
8821  default:
8822  KMP_ASSERT(0); // "unsupported method specified"
8823  }
8824 
8825  retval = forced_retval;
8826  }
8827 
8828  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8829 
8830 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8831 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8832 
8833  return (retval);
8834 }
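// Editorial recap of the selection order implemented above: a serialized team
// (team_size == 1) always uses empty_reduce_block; otherwise a forced method
// from KMP_FORCE_REDUCTION wins (falling back to critical_reduce_block when
// the forced method was not generated by the compiler); otherwise the
// architecture/OS tuning picks among critical, atomic and tree reduction
// based on what was generated, the team size and the reduction data size.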
8835 // this function is for testing set/get/determine reduce method
8836 kmp_int32 __kmp_get_reduce_method(void) {
8837  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8838 }
8839 
8840 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8841 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8842 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8843 
8844 // Hard pause shuts down the runtime completely. Resume happens naturally when
8845 // OpenMP is used subsequently.
8846 void __kmp_hard_pause() {
8847  __kmp_pause_status = kmp_hard_paused;
8848  __kmp_internal_end_thread(-1);
8849 }
8850 
8851 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8852 void __kmp_resume_if_soft_paused() {
8853  if (__kmp_pause_status == kmp_soft_paused) {
8854  __kmp_pause_status = kmp_not_paused;
8855 
8856  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8857  kmp_info_t *thread = __kmp_threads[gtid];
8858  if (thread) { // Wake it if sleeping
8859  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8860  thread);
8861  if (fl.is_sleeping())
8862  fl.resume(gtid);
8863  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8864  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8865  } else { // thread holds the lock and may sleep soon
8866  do { // until either the thread sleeps, or we can get the lock
8867  if (fl.is_sleeping()) {
8868  fl.resume(gtid);
8869  break;
8870  } else if (__kmp_try_suspend_mx(thread)) {
8871  __kmp_unlock_suspend_mx(thread);
8872  break;
8873  }
8874  } while (1);
8875  }
8876  }
8877  }
8878  }
8879 }
8880 
8881 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8882 // TODO: add warning messages
8883 int __kmp_pause_resource(kmp_pause_status_t level) {
8884  if (level == kmp_not_paused) { // requesting resume
8885  if (__kmp_pause_status == kmp_not_paused) {
8886  // error message about runtime not being paused, so can't resume
8887  return 1;
8888  } else {
8889  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8890  __kmp_pause_status == kmp_hard_paused);
8891  __kmp_pause_status = kmp_not_paused;
8892  return 0;
8893  }
8894  } else if (level == kmp_soft_paused) { // requesting soft pause
8895  if (__kmp_pause_status != kmp_not_paused) {
8896  // error message about already being paused
8897  return 1;
8898  } else {
8899  __kmp_soft_pause();
8900  return 0;
8901  }
8902  } else if (level == kmp_hard_paused) { // requesting hard pause
8903  if (__kmp_pause_status != kmp_not_paused) {
8904  // error message about already being paused
8905  return 1;
8906  } else {
8907  __kmp_hard_pause();
8908  return 0;
8909  }
8910  } else {
8911  // error message about invalid level
8912  return 1;
8913  }
8914 }
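// Editorial sketch: __kmp_pause_resource() backs the OpenMP 5.0 pause API.
// A user-side example (assumes <omp.h>; a zero return value means success):
#if 0
#include <omp.h>
int main(void) {
#pragma omp parallel
  { /* ... */ }
  // Soft pause: worker threads go to sleep but the runtime state is kept.
  if (omp_pause_resource_all(omp_pause_soft) != 0) {
    /* pause request was rejected */
  }
#pragma omp parallel // using OpenMP again resumes the runtime
  { /* ... */ }
  return 0;
}
#endif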
8915 
8916 void __kmp_omp_display_env(int verbose) {
8917  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8918  if (__kmp_init_serial == 0)
8919  __kmp_do_serial_initialize();
8920  __kmp_display_env_impl(!verbose, verbose);
8921  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8922 }
8923 
8924 // The team size is changing, so distributed barrier must be modified
8925 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
8926  int new_nthreads) {
8927  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
8928  bp_dist_bar);
8929  kmp_info_t **other_threads = team->t.t_threads;
8930 
8931  // We want all the workers to stop waiting on the barrier while we adjust the
8932  // size of the team.
8933  for (int f = 1; f < old_nthreads; ++f) {
8934  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
8935  // Ignore threads that are already inactive or not present in the team
8936  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
8937  // teams construct causes thread_limit to get passed in, and some of
8938  // those could be inactive; just ignore them
8939  continue;
8940  }
8941  // If thread is transitioning still to in_use state, wait for it
8942  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
8943  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
8944  KMP_CPU_PAUSE();
8945  }
8946  // The thread should be in_use now
8947  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
8948  // Transition to unused state
8949  team->t.t_threads[f]->th.th_used_in_team.store(2);
8950  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
8951  }
8952  // Release all the workers
8953  kmp_uint64 new_value; // new value for go
8954  new_value = team->t.b->go_release();
8955 
8956  KMP_MFENCE();
8957 
8958  // Workers should see transition status 2 and move to 0; but may need to be
8959  // woken up first
8960  size_t my_go_index;
8961  int count = old_nthreads - 1;
8962  while (count > 0) {
8963  count = old_nthreads - 1;
8964  for (int f = 1; f < old_nthreads; ++f) {
8965  my_go_index = f / team->t.b->threads_per_go;
8966  if (other_threads[f]->th.th_used_in_team.load() != 0) {
8967  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
8968  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
8969  void *, other_threads[f]->th.th_sleep_loc);
8970  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
8971  }
8972  } else {
8973  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
8974  count--;
8975  }
8976  }
8977  }
8978  // Now update the barrier size
8979  team->t.b->update_num_threads(new_nthreads);
8980  team->t.b->go_reset();
8981 }
8982 
8983 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
8984  // Add the threads back to the team
8985  KMP_DEBUG_ASSERT(team);
8986  // Threads were paused and pointed at th_used_in_team temporarily during a
8987  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
8988  // the thread that it should transition itself back into the team. Then, if
8989  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
8990  // to wake it up.
8991  for (int f = 1; f < new_nthreads; ++f) {
8992  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
8993  KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
8994  3);
8995  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
8996  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
8997  (kmp_flag_32<false, false> *)NULL);
8998  }
8999  }
9000  // The threads should be transitioning to the team; when they are done, they
9001  // should have set th_used_in_team to 1. This loop forces the primary thread to
9002  // wait until all threads have moved into the team and are waiting in the barrier.
9003  int count = new_nthreads - 1;
9004  while (count > 0) {
9005  count = new_nthreads - 1;
9006  for (int f = 1; f < new_nthreads; ++f) {
9007  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9008  count--;
9009  }
9010  }
9011  }
9012 }
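// Editorial summary of the th_used_in_team values used by the two routines
// above (derived from the code; not an official state-machine description):
//   0 - thread is not part of the team (parked on the fork barrier)
//   1 - thread is an active member of the team
//   2 - resize in progress: thread has been told to leave (it moves 2 -> 0)
//   3 - thread has been told to rejoin the team (it moves 3 -> 1)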
9013 
9014 // Globals and functions for hidden helper task
9015 kmp_info_t **__kmp_hidden_helper_threads;
9016 kmp_info_t *__kmp_hidden_helper_main_thread;
9017 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9018 #if KMP_OS_LINUX
9019 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9020 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9021 #else
9022 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9023 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9024 #endif
9025 
9026 namespace {
9027 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9028 
9029 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9030  // This is an explicit synchronization of all hidden helper threads, in case
9031  // a regular thread pushes a hidden helper task to a hidden helper thread
9032  // that has not yet been awakened since the main thread released the helpers
9033  // after creating the team.
9034  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9035  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9036  __kmp_hidden_helper_threads_num)
9037  ;
9038 
9039  // If main thread, then wait for signal
9040  if (__kmpc_master(nullptr, *gtid)) {
9041  // First, unset the initial state and release the initial thread
9042  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9043  __kmp_hidden_helper_initz_release();
9044  __kmp_hidden_helper_main_thread_wait();
9045  // Now wake up all worker threads
9046  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9047  __kmp_hidden_helper_worker_thread_signal();
9048  }
9049  }
9050 }
9051 } // namespace
9052 
9053 void __kmp_hidden_helper_threads_initz_routine() {
9054  // Create a new root for hidden helper team/threads
9055  const int gtid = __kmp_register_root(TRUE);
9056  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9057  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9058  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9059  __kmp_hidden_helper_threads_num;
9060 
9061  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9062 
9063  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9064 
9065  // Set the initialization flag to FALSE
9066  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9067 
9068  __kmp_hidden_helper_threads_deinitz_release();
9069 }
9070 
9071 /* Nesting Mode:
9072  Set via KMP_NESTING_MODE, which takes an integer.
9073  Note: we skip duplicate topology levels, and skip levels with only
9074  one entity.
9075  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9076  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9077  in the topology, and initializes the number of threads at each of those
9078  levels to the number of entities at each level, respectively, below the
9079  entity at the parent level.
9080  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9081  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9082  the user to turn nesting on explicitly. This is an even more experimental
9083  option to this experimental feature, and may change or go away in the
9084  future.
9085 */
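// Editorial example for KMP_NESTING_MODE=1 (topology numbers are assumptions):
// on a machine detected as 2 sockets x 8 cores x 2 hw-threads, the code below
// records nesting levels with 2, 8 and 2 threads respectively (levels whose
// ratio is 1 are skipped), so an unannotated nest of parallel regions uses the
// socket, core and hw-thread counts as its per-level defaults.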
9086 
9087 // Allocate space to store nesting levels
9088 void __kmp_init_nesting_mode() {
9089  int levels = KMP_HW_LAST;
9090  __kmp_nesting_mode_nlevels = levels;
9091  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9092  for (int i = 0; i < levels; ++i)
9093  __kmp_nesting_nth_level[i] = 0;
9094  if (__kmp_nested_nth.size < levels) {
9095  __kmp_nested_nth.nth =
9096  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9097  __kmp_nested_nth.size = levels;
9098  }
9099 }
9100 
9101 // Set # threads for top levels of nesting; must be called after topology set
9102 void __kmp_set_nesting_mode_threads() {
9103  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9104 
9105  if (__kmp_nesting_mode == 1)
9106  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9107  else if (__kmp_nesting_mode > 1)
9108  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9109 
9110  if (__kmp_topology) { // use topology info
9111  int loc, hw_level;
9112  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9113  loc < __kmp_nesting_mode_nlevels;
9114  loc++, hw_level++) {
9115  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9116  if (__kmp_nesting_nth_level[loc] == 1)
9117  loc--;
9118  }
9119  // Make sure all cores are used
9120  if (__kmp_nesting_mode > 1 && loc > 1) {
9121  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9122  int num_cores = __kmp_topology->get_count(core_level);
9123  int upper_levels = 1;
9124  for (int level = 0; level < loc - 1; ++level)
9125  upper_levels *= __kmp_nesting_nth_level[level];
9126  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9127  __kmp_nesting_nth_level[loc - 1] =
9128  num_cores / __kmp_nesting_nth_level[loc - 2];
9129  }
9130  __kmp_nesting_mode_nlevels = loc;
9131  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9132  } else { // no topology info available; provide a reasonable guesstimation
9133  if (__kmp_avail_proc >= 4) {
9134  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9135  __kmp_nesting_nth_level[1] = 2;
9136  __kmp_nesting_mode_nlevels = 2;
9137  } else {
9138  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9139  __kmp_nesting_mode_nlevels = 1;
9140  }
9141  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9142  }
9143  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9144  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9145  }
9146  set__nproc(thread, __kmp_nesting_nth_level[0]);
9147  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9148  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9149  if (get__max_active_levels(thread) > 1) {
9150  // if max levels was set, set nesting mode levels to same
9151  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9152  }
9153  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9154  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9155 }
9156 }