1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #include "kmp_utils.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 #if OMPD_SUPPORT
36 #include "ompd-specific.h"
37 #endif
38 
39 #if OMP_PROFILING_SUPPORT
40 #include "llvm/Support/TimeProfiler.h"
41 static char *ProfileTraceFile = nullptr;
42 #endif
43 
44 /* these are temporary issues to be dealt with */
45 #define KMP_USE_PRCTL 0
46 
47 #if KMP_OS_WINDOWS
48 #include <process.h>
49 #endif
50 
51 #ifndef KMP_USE_SHM
52 // Windows and WASI do not need these include files as they don't use shared
53 // memory.
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63  KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71  KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87  int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89  kmp_internal_control_t *new_icvs,
90  ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93  int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99  kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113  int new_nthreads);
114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115 
116 static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
117  int level) {
118  kmp_nested_nthreads_t *new_nested_nth =
119  (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
120  sizeof(kmp_nested_nthreads_t));
121  int new_size = level + thr->th.th_set_nested_nth_sz;
122  new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
123  for (int i = 0; i < level + 1; ++i)
124  new_nested_nth->nth[i] = 0;
125  for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
126  new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
127  new_nested_nth->size = new_nested_nth->used = new_size;
128  return new_nested_nth;
129 }
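// Illustrative sketch (not part of the runtime): the nested nthreads list that
// the helper above merges is what a user supplies via a comma-separated
// OMP_NUM_THREADS (e.g. OMP_NUM_THREADS=4,3). The sketch below only shows the
// user-visible effect; the list values are assumptions.
#if 0
#include <omp.h>
#include <cstdio>

int main() {
  omp_set_max_active_levels(2); // allow one level of nested parallelism
#pragma omp parallel // outer level uses the first list entry (4, assumed)
  {
#pragma omp parallel // inner level uses the second list entry (3, assumed)
    {
      if (omp_get_thread_num() == 0)
        std::printf("inner team of %d inside outer thread %d\n",
                    omp_get_num_threads(), omp_get_ancestor_thread_num(1));
    }
  }
  return 0;
}
#endif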
130 
131 /* Calculate the identifier of the current thread */
132 /* fast (and somewhat portable) way to get unique identifier of executing
133  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
134 int __kmp_get_global_thread_id() {
135  int i;
136  kmp_info_t **other_threads;
137  size_t stack_data;
138  char *stack_addr;
139  size_t stack_size;
140  char *stack_base;
141 
142  KA_TRACE(
143  1000,
144  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
145  __kmp_nth, __kmp_all_nth));
146 
147  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
148  to a parallel region, this returns KMP_GTID_DNE to force serial_initialize
149  by the caller. Callers must handle KMP_GTID_DNE, or else __kmp_init_gtid
150  must be guaranteed for this to work. */
151 
152  if (!TCR_4(__kmp_init_gtid))
153  return KMP_GTID_DNE;
154 
155 #ifdef KMP_TDATA_GTID
156  if (TCR_4(__kmp_gtid_mode) >= 3) {
157  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
158  return __kmp_gtid;
159  }
160 #endif
161  if (TCR_4(__kmp_gtid_mode) >= 2) {
162  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
163  return __kmp_gtid_get_specific();
164  }
165  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
166 
167  stack_addr = (char *)&stack_data;
168  other_threads = __kmp_threads;
169 
170  /* ATT: The code below is a source of potential bugs due to unsynchronized
171  access to __kmp_threads array. For example:
172  1. Current thread loads other_threads[i] to thr and checks it, it is
173  non-NULL.
174  2. Current thread is suspended by OS.
175  3. Another thread unregisters and finishes (debug versions of free()
176  may fill memory with something like 0xEF).
177  4. Current thread is resumed.
178  5. Current thread reads junk from *thr.
179  TODO: Fix it. --ln */
180 
181  for (i = 0; i < __kmp_threads_capacity; i++) {
182 
183  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
184  if (!thr)
185  continue;
186 
187  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
188  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
189 
190  /* stack grows down -- search through all of the active threads */
191 
192  if (stack_addr <= stack_base) {
193  size_t stack_diff = stack_base - stack_addr;
194 
195  if (stack_diff <= stack_size) {
196  /* The only way we can be closer than the allocated */
197  /* stack size is if we are running on this thread. */
198  // __kmp_gtid_get_specific can return negative value because this
199  // function can be called by thread destructor. However, before the
200  // thread destructor is called, the value of the corresponding
201  // thread-specific data will be reset to NULL.
202  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
203  __kmp_gtid_get_specific() == i);
204  return i;
205  }
206  }
207  }
208 
209  /* get specific to try and determine our gtid */
210  KA_TRACE(1000,
211  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
212  "thread, using TLS\n"));
213  i = __kmp_gtid_get_specific();
214 
215  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
216 
217  /* if we haven't been assigned a gtid, return the error code */
218  if (i < 0)
219  return i;
220 
221  // other_threads[i] can be nullptr at this point because the corresponding
222  // thread could have already been destructed. This can happen when this
223  // function is called from the library shutdown routine.
224  if (!TCR_SYNC_PTR(other_threads[i]))
225  return i;
226 
227  /* dynamically updated stack window for uber threads to avoid get_specific
228  call */
229  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
230  KMP_FATAL(StackOverflow, i);
231  }
232 
233  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
234  if (stack_addr > stack_base) {
235  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
236  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
237  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
238  stack_base);
239  } else {
240  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
241  stack_base - stack_addr);
242  }
243 
244  /* Reprint stack bounds for ubermaster since they have been refined */
245  if (__kmp_storage_map) {
246  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
247  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
248  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
249  other_threads[i]->th.th_info.ds.ds_stacksize,
250  "th_%d stack (refinement)", i);
251  }
252  return i;
253 }
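// Illustrative sketch (not part of the runtime): the "internal alg." above
// identifies the executing thread by testing whether the address of a local
// variable lies inside a thread's recorded stack window (stacks grow down).
// The struct and helper below are hypothetical, minimal stand-ins.
#if 0
#include <cstddef>

struct stack_window {
  char *base;  // highest address of the thread's stack
  size_t size; // known extent of the stack below 'base'
};

static int find_owning_thread(const stack_window *w, int nthreads) {
  char local; // lives on the calling thread's stack
  char *addr = &local;
  for (int i = 0; i < nthreads; ++i) {
    if (addr <= w[i].base && (size_t)(w[i].base - addr) <= w[i].size)
      return i; // our stack address falls inside window i
  }
  return -1; // analogous to KMP_GTID_DNE
}
#endif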
254 
255 int __kmp_get_global_thread_id_reg() {
256  int gtid;
257 
258  if (!__kmp_init_serial) {
259  gtid = KMP_GTID_DNE;
260  } else
261 #ifdef KMP_TDATA_GTID
262  if (TCR_4(__kmp_gtid_mode) >= 3) {
263  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
264  gtid = __kmp_gtid;
265  } else
266 #endif
267  if (TCR_4(__kmp_gtid_mode) >= 2) {
268  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
269  gtid = __kmp_gtid_get_specific();
270  } else {
271  KA_TRACE(1000,
272  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
273  gtid = __kmp_get_global_thread_id();
274  }
275 
276  /* we must be a new uber master sibling thread */
277  if (gtid == KMP_GTID_DNE) {
278  KA_TRACE(10,
279  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
280  "Registering a new gtid.\n"));
281  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
282  if (!__kmp_init_serial) {
283  __kmp_do_serial_initialize();
284  gtid = __kmp_gtid_get_specific();
285  } else {
286  gtid = __kmp_register_root(FALSE);
287  }
288  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
289  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
290  }
291 
292  KMP_DEBUG_ASSERT(gtid >= 0);
293 
294  return gtid;
295 }
296 
297 /* caller must hold forkjoin_lock */
298 void __kmp_check_stack_overlap(kmp_info_t *th) {
299  int f;
300  char *stack_beg = NULL;
301  char *stack_end = NULL;
302  int gtid;
303 
304  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
305  if (__kmp_storage_map) {
306  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
307  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
308 
309  gtid = __kmp_gtid_from_thread(th);
310 
311  if (gtid == KMP_GTID_MONITOR) {
312  __kmp_print_storage_map_gtid(
313  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
314  "th_%s stack (%s)", "mon",
315  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
316  } else {
317  __kmp_print_storage_map_gtid(
318  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
319  "th_%d stack (%s)", gtid,
320  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
321  }
322  }
323 
324  /* No point in checking ubermaster threads since they use refinement and
325  * cannot overlap */
326  gtid = __kmp_gtid_from_thread(th);
327  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
328  KA_TRACE(10,
329  ("__kmp_check_stack_overlap: performing extensive checking\n"));
330  if (stack_beg == NULL) {
331  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
332  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
333  }
334 
335  for (f = 0; f < __kmp_threads_capacity; f++) {
336  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
337 
338  if (f_th && f_th != th) {
339  char *other_stack_end =
340  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
341  char *other_stack_beg =
342  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
343  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
344  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
345 
346  /* Print the other stack values before the abort */
347  if (__kmp_storage_map)
348  __kmp_print_storage_map_gtid(
349  -1, other_stack_beg, other_stack_end,
350  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
351  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
352 
353  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
354  __kmp_msg_null);
355  }
356  }
357  }
358  }
359  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
360 }
361 
362 /* ------------------------------------------------------------------------ */
363 
364 void __kmp_infinite_loop(void) {
365  static int done = FALSE;
366 
367  while (!done) {
368  KMP_YIELD(TRUE);
369  }
370 }
371 
372 #define MAX_MESSAGE 512
373 
374 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
375  char const *format, ...) {
376  char buffer[MAX_MESSAGE];
377  va_list ap;
378 
379  va_start(ap, format);
380  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
381  p2, (unsigned long)size, format);
382  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
383  __kmp_vprintf(kmp_err, buffer, ap);
384 #if KMP_PRINT_DATA_PLACEMENT
385  int node;
386  if (gtid >= 0) {
387  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
388  if (__kmp_storage_map_verbose) {
389  node = __kmp_get_host_node(p1);
390  if (node < 0) /* doesn't work, so don't try this next time */
391  __kmp_storage_map_verbose = FALSE;
392  else {
393  char *last;
394  int lastNode;
395  int localProc = __kmp_get_cpu_from_gtid(gtid);
396 
397  const int page_size = KMP_GET_PAGE_SIZE();
398 
399  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
400  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
401  if (localProc >= 0)
402  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
403  localProc >> 1);
404  else
405  __kmp_printf_no_lock(" GTID %d\n", gtid);
406 #if KMP_USE_PRCTL
407  /* The more elaborate format is disabled for now because of the prctl
408  * hanging bug. */
409  do {
410  last = p1;
411  lastNode = node;
412  /* This loop collates adjacent pages with the same host node. */
413  do {
414  p1 = (char *)p1 + page_size;
415  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
416  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
417  lastNode);
418  } while (p1 <= p2);
419 #else
420  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
421  (char *)p1 + (page_size - 1),
422  __kmp_get_host_node(p1));
423  if (p1 < p2) {
424  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
425  (char *)p2 + (page_size - 1),
426  __kmp_get_host_node(p2));
427  }
428 #endif
429  }
430  }
431  } else
432  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
433  }
434 #endif /* KMP_PRINT_DATA_PLACEMENT */
435  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
436 
437  va_end(ap);
438 }
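// Illustrative sketch (not part of the runtime): the routine above first embeds
// the caller's format string into a larger one, then expands the caller's
// va_list against that combined format. The helper below is a hypothetical,
// stripped-down rendering of that two-step pattern.
#if 0
#include <cstdarg>
#include <cstdio>

static void print_map(void *p1, void *p2, size_t size, const char *fmt, ...) {
  char buffer[512];
  // Step 1: prepend the fixed fields; 'fmt' is copied in unexpanded.
  std::snprintf(buffer, sizeof(buffer), "storage map: %p %p %8lu %s\n", p1, p2,
                (unsigned long)size, fmt);
  // Step 2: expand the caller's arguments against the combined format.
  va_list ap;
  va_start(ap, fmt);
  std::vfprintf(stderr, buffer, ap);
  va_end(ap);
}
#endif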
439 
440 void __kmp_warn(char const *format, ...) {
441  char buffer[MAX_MESSAGE];
442  va_list ap;
443 
444  if (__kmp_generate_warnings == kmp_warnings_off) {
445  return;
446  }
447 
448  va_start(ap, format);
449 
450  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
451  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
452  __kmp_vprintf(kmp_err, buffer, ap);
453  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
454 
455  va_end(ap);
456 }
457 
458 void __kmp_abort_process() {
459  // Later threads may stall here, but that's ok because abort() will kill them.
460  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
461 
462  if (__kmp_debug_buf) {
463  __kmp_dump_debug_buffer();
464  }
465 
466 #if KMP_OS_WINDOWS
467  // Let other threads know of abnormal termination and prevent deadlock
468  // if abort happened during library initialization or shutdown
469  __kmp_global.g.g_abort = SIGABRT;
470 
471  /* On Windows* OS, abort() by default raises a pop-up error box, which
472  stalls nightly testing. Unfortunately, we cannot reliably suppress pop-up
473  error boxes. _set_abort_behavior() works well, but this function is not
474  available in VS7 (this is not a problem for the DLL, but it is a problem
475  for the static OpenMP RTL). SetErrorMode (and so the timelimit utility)
476  does not help, at least in some versions of the MS C RTL.
477
478  The following sequence seems to be the only way to simulate abort() and
479  avoid the pop-up error box. */
480  raise(SIGABRT);
481  _exit(3); // Just in case, if signal ignored, exit anyway.
482 #else
483  __kmp_unregister_library();
484  abort();
485 #endif
486 
487  __kmp_infinite_loop();
488  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
489 
490 } // __kmp_abort_process
491 
492 void __kmp_abort_thread(void) {
493  // TODO: Eliminate g_abort global variable and this function.
494  // In case of abort just call abort(), it will kill all the threads.
495  __kmp_infinite_loop();
496 } // __kmp_abort_thread
497 
498 /* Print out the storage map for the major kmp_info_t thread data structures
499  that are allocated together. */
500 
501 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
502  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
503  gtid);
504 
505  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
506  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
507 
508  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
509  sizeof(kmp_local_t), "th_%d.th_local", gtid);
510 
511  __kmp_print_storage_map_gtid(
512  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
513  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
514 
515  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
516  &thr->th.th_bar[bs_plain_barrier + 1],
517  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
518  gtid);
519 
520  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
521  &thr->th.th_bar[bs_forkjoin_barrier + 1],
522  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
523  gtid);
524 
525 #if KMP_FAST_REDUCTION_BARRIER
526  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
527  &thr->th.th_bar[bs_reduction_barrier + 1],
528  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
529  gtid);
530 #endif // KMP_FAST_REDUCTION_BARRIER
531 }
532 
533 /* Print out the storage map for the major kmp_team_t team data structures
534  that are allocated together. */
535 
536 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
537  int team_id, int num_thr) {
538  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
539  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
540  header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
543  &team->t.t_bar[bs_last_barrier],
544  sizeof(kmp_balign_team_t) * bs_last_barrier,
545  "%s_%d.t_bar", header, team_id);
546 
547  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
548  &team->t.t_bar[bs_plain_barrier + 1],
549  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
550  header, team_id);
551 
552  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
553  &team->t.t_bar[bs_forkjoin_barrier + 1],
554  sizeof(kmp_balign_team_t),
555  "%s_%d.t_bar[forkjoin]", header, team_id);
556 
557 #if KMP_FAST_REDUCTION_BARRIER
558  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
559  &team->t.t_bar[bs_reduction_barrier + 1],
560  sizeof(kmp_balign_team_t),
561  "%s_%d.t_bar[reduction]", header, team_id);
562 #endif // KMP_FAST_REDUCTION_BARRIER
563 
564  __kmp_print_storage_map_gtid(
565  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
566  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
567 
568  __kmp_print_storage_map_gtid(
569  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
570  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
571 
572  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
573  &team->t.t_disp_buffer[num_disp_buff],
574  sizeof(dispatch_shared_info_t) * num_disp_buff,
575  "%s_%d.t_disp_buffer", header, team_id);
576 }
577 
578 static void __kmp_init_allocator() {
579  __kmp_init_memkind();
580  __kmp_init_target_mem();
581 }
582 static void __kmp_fini_allocator() {
583  __kmp_fini_target_mem();
584  __kmp_fini_memkind();
585 }
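// Illustrative sketch (not part of the runtime): the memkind/target-memory
// setup above backs the OpenMP 5.x memory allocator API. A user-side usage,
// with hypothetical sizes, might look like:
#if 0
#include <omp.h>

void fill_hbw_buffer(size_t n) {
  // Predefined allocators fall back to default memory if the requested kind
  // (e.g. high-bandwidth memory) is unavailable.
  double *buf =
      (double *)omp_alloc(n * sizeof(double), omp_high_bw_mem_alloc);
  if (buf != nullptr) {
#pragma omp parallel for
    for (size_t i = 0; i < n; ++i)
      buf[i] = 0.0;
    omp_free(buf, omp_high_bw_mem_alloc);
  }
}
#endif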
586 
587 /* ------------------------------------------------------------------------ */
588 
589 #if ENABLE_LIBOMPTARGET
590 static void __kmp_init_omptarget() {
591  __kmp_init_target_task();
592 }
593 #endif
594 
595 /* ------------------------------------------------------------------------ */
596 
597 #if KMP_DYNAMIC_LIB
598 #if KMP_OS_WINDOWS
599 
600 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
601  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
602 
603  switch (fdwReason) {
604 
605  case DLL_PROCESS_ATTACH:
606  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
607 
608  return TRUE;
609 
610  case DLL_PROCESS_DETACH:
611  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
612 
613  // According to Windows* documentation for DllMain entry point:
614  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
615  // lpReserved == NULL when FreeLibrary() is called,
616  // lpReserved != NULL when the process is terminated.
617  // When FreeLibrary() is called, worker threads remain alive. So the
618  // runtime's state is consistent and executing proper shutdown is OK.
619  // When the process is terminated, worker threads have exited or been
620  // forcefully terminated by the OS and only the shutdown thread remains.
621  // This can leave the runtime in an inconsistent state.
622  // Hence, only attempt proper cleanup when FreeLibrary() is called.
623  // Otherwise, rely on OS to reclaim resources.
624  if (lpReserved == NULL)
625  __kmp_internal_end_library(__kmp_gtid_get_specific());
626 
627  return TRUE;
628 
629  case DLL_THREAD_ATTACH:
630  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
631 
632  /* if we want to register new siblings all the time here call
633  * __kmp_get_gtid(); */
634  return TRUE;
635 
636  case DLL_THREAD_DETACH:
637  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
638 
639  __kmp_internal_end_thread(__kmp_gtid_get_specific());
640  return TRUE;
641  }
642 
643  return TRUE;
644 }
645 
646 #endif /* KMP_OS_WINDOWS */
647 #endif /* KMP_DYNAMIC_LIB */
648 
649 /* __kmp_parallel_deo -- Wait until it's our turn. */
650 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
651  int gtid = *gtid_ref;
652 #ifdef BUILD_PARALLEL_ORDERED
653  kmp_team_t *team = __kmp_team_from_gtid(gtid);
654 #endif /* BUILD_PARALLEL_ORDERED */
655 
656  if (__kmp_env_consistency_check) {
657  if (__kmp_threads[gtid]->th.th_root->r.r_active)
658 #if KMP_USE_DYNAMIC_LOCK
659  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
660 #else
661  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
662 #endif
663  }
664 #ifdef BUILD_PARALLEL_ORDERED
665  if (!team->t.t_serialized) {
666  KMP_MB();
667  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
668  NULL);
669  KMP_MB();
670  }
671 #endif /* BUILD_PARALLEL_ORDERED */
672 }
673 
674 /* __kmp_parallel_dxo -- Signal the next task. */
675 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
676  int gtid = *gtid_ref;
677 #ifdef BUILD_PARALLEL_ORDERED
678  int tid = __kmp_tid_from_gtid(gtid);
679  kmp_team_t *team = __kmp_team_from_gtid(gtid);
680 #endif /* BUILD_PARALLEL_ORDERED */
681 
682  if (__kmp_env_consistency_check) {
683  if (__kmp_threads[gtid]->th.th_root->r.r_active)
684  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
685  }
686 #ifdef BUILD_PARALLEL_ORDERED
687  if (!team->t.t_serialized) {
688  KMP_MB(); /* Flush all pending memory write invalidates. */
689 
690  /* use the tid of the next thread in this team */
691  /* TODO replace with general release procedure */
692  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
693 
694  KMP_MB(); /* Flush all pending memory write invalidates. */
695  }
696 #endif /* BUILD_PARALLEL_ORDERED */
697 }
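// Illustrative sketch (not part of the runtime): __kmp_parallel_deo/_dxo form
// the wait/release pair behind the "ordered" construct, which serializes a
// block inside a worksharing loop in iteration order:
#if 0
#include <omp.h>
#include <cstdio>

void print_in_order(int n) {
#pragma omp parallel for ordered schedule(static, 1)
  for (int i = 0; i < n; ++i) {
    // ... unordered work may run concurrently here ...
#pragma omp ordered
    std::printf("iteration %d done by thread %d\n", i, omp_get_thread_num());
  }
}
#endif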
698 
699 /* ------------------------------------------------------------------------ */
700 /* The BARRIER for a SINGLE process section is always explicit */
701 
702 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
703  int status;
704  kmp_info_t *th;
705  kmp_team_t *team;
706 
707  if (!TCR_4(__kmp_init_parallel))
708  __kmp_parallel_initialize();
709  __kmp_resume_if_soft_paused();
710 
711  th = __kmp_threads[gtid];
712  team = th->th.th_team;
713  status = 0;
714 
715  th->th.th_ident = id_ref;
716 
717  if (team->t.t_serialized) {
718  status = 1;
719  } else {
720  kmp_int32 old_this = th->th.th_local.this_construct;
721 
722  ++th->th.th_local.this_construct;
723  /* try to set team count to thread count--success means thread got the
724  single block */
725  /* TODO: Should this be acquire or release? */
726  if (team->t.t_construct == old_this) {
727  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
728  th->th.th_local.this_construct);
729  }
730 #if USE_ITT_BUILD
731  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
732  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
733  team->t.t_active_level == 1) {
734  // Only report metadata by primary thread of active team at level 1
735  __kmp_itt_metadata_single(id_ref);
736  }
737 #endif /* USE_ITT_BUILD */
738  }
739 
740  if (__kmp_env_consistency_check) {
741  if (status && push_ws) {
742  __kmp_push_workshare(gtid, ct_psingle, id_ref);
743  } else {
744  __kmp_check_workshare(gtid, ct_psingle, id_ref);
745  }
746  }
747 #if USE_ITT_BUILD
748  if (status) {
749  __kmp_itt_single_start(gtid);
750  }
751 #endif /* USE_ITT_BUILD */
752  return status;
753 }
754 
755 void __kmp_exit_single(int gtid) {
756 #if USE_ITT_BUILD
757  __kmp_itt_single_end(gtid);
758 #endif /* USE_ITT_BUILD */
759  if (__kmp_env_consistency_check)
760  __kmp_pop_workshare(gtid, ct_psingle, NULL);
761 }
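// Illustrative sketch (not part of the runtime): __kmp_enter_single and
// __kmp_exit_single implement the winner selection behind the "single"
// construct:
#if 0
#include <omp.h>
#include <cstdio>

void initialize_once() {
#pragma omp parallel
  {
#pragma omp single
    {
      // Exactly one thread of the team executes this block; the others wait
      // at the implicit barrier at the end of the construct (unless nowait).
      std::printf("initialized by thread %d\n", omp_get_thread_num());
    }
  }
}
#endif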
762 
763 /* Determine if we can go parallel or must use a serialized parallel region,
764  * and how many threads we can use.
765  * set_nthreads is the number of threads requested for the team.
766  * Returns 1 if we should serialize or only use one thread,
767  * otherwise the number of threads to use.
768  * The forkjoin lock is held by the caller. */
769 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
770  int master_tid, int set_nthreads,
771  int enter_teams) {
772  int capacity;
773  int new_nthreads;
774  KMP_DEBUG_ASSERT(__kmp_init_serial);
775  KMP_DEBUG_ASSERT(root && parent_team);
776  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
777 
778  // If dyn-var is set, dynamically adjust the number of desired threads,
779  // according to the method specified by dynamic_mode.
780  new_nthreads = set_nthreads;
781  if (!get__dynamic_2(parent_team, master_tid)) {
782  ;
783  }
784 #ifdef USE_LOAD_BALANCE
785  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
786  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
787  if (new_nthreads == 1) {
788  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
789  "reservation to 1 thread\n",
790  master_tid));
791  return 1;
792  }
793  if (new_nthreads < set_nthreads) {
794  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795  "reservation to %d threads\n",
796  master_tid, new_nthreads));
797  }
798  }
799 #endif /* USE_LOAD_BALANCE */
800  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
801  new_nthreads = __kmp_avail_proc - __kmp_nth +
802  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
803  if (new_nthreads <= 1) {
804  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
805  "reservation to 1 thread\n",
806  master_tid));
807  return 1;
808  }
809  if (new_nthreads < set_nthreads) {
810  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811  "reservation to %d threads\n",
812  master_tid, new_nthreads));
813  } else {
814  new_nthreads = set_nthreads;
815  }
816  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
817  if (set_nthreads > 2) {
818  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
819  new_nthreads = (new_nthreads % set_nthreads) + 1;
820  if (new_nthreads == 1) {
821  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
822  "reservation to 1 thread\n",
823  master_tid));
824  return 1;
825  }
826  if (new_nthreads < set_nthreads) {
827  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828  "reservation to %d threads\n",
829  master_tid, new_nthreads));
830  }
831  }
832  } else {
833  KMP_ASSERT(0);
834  }
835 
836  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
837  if (__kmp_nth + new_nthreads -
838  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
839  __kmp_max_nth) {
840  int tl_nthreads = __kmp_max_nth - __kmp_nth +
841  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
842  if (tl_nthreads <= 0) {
843  tl_nthreads = 1;
844  }
845 
846  // If dyn-var is false, emit a 1-time warning.
847  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
848  __kmp_reserve_warn = 1;
849  __kmp_msg(kmp_ms_warning,
850  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
851  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
852  }
853  if (tl_nthreads == 1) {
854  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
855  "reduced reservation to 1 thread\n",
856  master_tid));
857  return 1;
858  }
859  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
860  "reservation to %d threads\n",
861  master_tid, tl_nthreads));
862  new_nthreads = tl_nthreads;
863  }
864 
865  // Respect OMP_THREAD_LIMIT
866  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
867  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
868  if (cg_nthreads + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870  max_cg_threads) {
871  int tl_nthreads = max_cg_threads - cg_nthreads +
872  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
873  if (tl_nthreads <= 0) {
874  tl_nthreads = 1;
875  }
876 
877  // If dyn-var is false, emit a 1-time warning.
878  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
879  __kmp_reserve_warn = 1;
880  __kmp_msg(kmp_ms_warning,
881  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
882  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
883  }
884  if (tl_nthreads == 1) {
885  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
886  "reduced reservation to 1 thread\n",
887  master_tid));
888  return 1;
889  }
890  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
891  "reservation to %d threads\n",
892  master_tid, tl_nthreads));
893  new_nthreads = tl_nthreads;
894  }
895 
896  // Check if the threads array is large enough, or needs expanding.
897  // See comment in __kmp_register_root() about the adjustment if
898  // __kmp_threads[0] == NULL.
899  capacity = __kmp_threads_capacity;
900  if (TCR_PTR(__kmp_threads[0]) == NULL) {
901  --capacity;
902  }
903  // If it is not for initializing the hidden helper team, we need to take
904  // __kmp_hidden_helper_threads_num out of the capacity because it is included
905  // in __kmp_threads_capacity.
906  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
907  capacity -= __kmp_hidden_helper_threads_num;
908  }
909  if (__kmp_nth + new_nthreads -
910  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911  capacity) {
912  // Expand the threads array.
913  int slotsRequired = __kmp_nth + new_nthreads -
914  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915  capacity;
916  int slotsAdded = __kmp_expand_threads(slotsRequired);
917  if (slotsAdded < slotsRequired) {
918  // The threads array was not expanded enough.
919  new_nthreads -= (slotsRequired - slotsAdded);
920  KMP_ASSERT(new_nthreads >= 1);
921 
922  // If dyn-var is false, emit a 1-time warning.
923  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924  __kmp_reserve_warn = 1;
925  if (__kmp_tp_cached) {
926  __kmp_msg(kmp_ms_warning,
927  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930  } else {
931  __kmp_msg(kmp_ms_warning,
932  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934  }
935  }
936  }
937  }
938 
939 #ifdef KMP_DEBUG
940  if (new_nthreads == 1) {
941  KC_TRACE(10,
942  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943  "dead roots and rechecking; requested %d threads\n",
944  __kmp_get_gtid(), set_nthreads));
945  } else {
946  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947  " %d threads\n",
948  __kmp_get_gtid(), new_nthreads, set_nthreads));
949  }
950 #endif // KMP_DEBUG
951 
952  if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
953  __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
954  this_thr->th.th_nt_msg);
955  }
956  return new_nthreads;
957 }
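// Illustrative sketch (not part of the runtime): the clipping performed above
// is what a program observes when dyn-var (omp_set_dynamic / OMP_DYNAMIC) or a
// thread limit (OMP_THREAD_LIMIT, KMP_DEVICE_THREAD_LIMIT) is in effect. The
// request of 64 threads below is an arbitrary assumption.
#if 0
#include <omp.h>
#include <cstdio>

int main() {
  omp_set_dynamic(1); // allow the runtime to grant fewer threads than requested
#pragma omp parallel num_threads(64)
  {
#pragma omp single
    std::printf("requested 64, got %d (thread-limit-var = %d)\n",
                omp_get_num_threads(), omp_get_thread_limit());
  }
  return 0;
}
#endif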
958 
959 /* Allocate threads from the thread pool and assign them to the new team. We
960  are assured that there are enough threads available, because we checked on
961  that earlier while holding the forkjoin lock. */
962 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
963  kmp_info_t *master_th, int master_gtid,
964  int fork_teams_workers) {
965  int i;
966  int use_hot_team;
967 
968  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
969  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
970  KMP_MB();
971 
972  /* first, let's setup the primary thread */
973  master_th->th.th_info.ds.ds_tid = 0;
974  master_th->th.th_team = team;
975  master_th->th.th_team_nproc = team->t.t_nproc;
976  master_th->th.th_team_master = master_th;
977  master_th->th.th_team_serialized = FALSE;
978  master_th->th.th_dispatch = &team->t.t_dispatch[0];
979 
980 /* make sure we are not the optimized hot team */
981 #if KMP_NESTED_HOT_TEAMS
982  use_hot_team = 0;
983  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
984  if (hot_teams) { // hot teams array is not allocated if
985  // KMP_HOT_TEAMS_MAX_LEVEL=0
986  int level = team->t.t_active_level - 1; // index in array of hot teams
987  if (master_th->th.th_teams_microtask) { // are we inside the teams?
988  if (master_th->th.th_teams_size.nteams > 1) {
989  ++level; // level was not increased in teams construct for
990  // team_of_masters
991  }
992  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
993  master_th->th.th_teams_level == team->t.t_level) {
994  ++level; // level was not increased in teams construct for
995  // team_of_workers before the parallel
996  } // team->t.t_level will be increased inside parallel
997  }
998  if (level < __kmp_hot_teams_max_level) {
999  if (hot_teams[level].hot_team) {
1000  // hot team has already been allocated for given level
1001  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1002  use_hot_team = 1; // the team is ready to use
1003  } else {
1004  use_hot_team = 0; // AC: threads are not allocated yet
1005  hot_teams[level].hot_team = team; // remember new hot team
1006  hot_teams[level].hot_team_nth = team->t.t_nproc;
1007  }
1008  } else {
1009  use_hot_team = 0;
1010  }
1011  }
1012 #else
1013  use_hot_team = team == root->r.r_hot_team;
1014 #endif
1015  if (!use_hot_team) {
1016 
1017  /* install the primary thread */
1018  team->t.t_threads[0] = master_th;
1019  __kmp_initialize_info(master_th, team, 0, master_gtid);
1020 
1021  /* now, install the worker threads */
1022  for (i = 1; i < team->t.t_nproc; i++) {
1023 
1024  /* fork or reallocate a new thread and install it in team */
1025  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1026  team->t.t_threads[i] = thr;
1027  KMP_DEBUG_ASSERT(thr);
1028  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1029  /* align team and thread arrived states */
1030  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1031  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1032  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1033  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1034  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1035  team->t.t_bar[bs_plain_barrier].b_arrived));
1036  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1037  thr->th.th_teams_level = master_th->th.th_teams_level;
1038  thr->th.th_teams_size = master_th->th.th_teams_size;
1039  { // Initialize threads' barrier data.
1040  int b;
1041  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1042  for (b = 0; b < bs_last_barrier; ++b) {
1043  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1044  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1045 #if USE_DEBUGGER
1046  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1047 #endif
1048  }
1049  }
1050  }
1051 
1052 #if KMP_AFFINITY_SUPPORTED
1053  // Do not partition the places list for teams construct workers who
1054  // haven't actually been forked to do real work yet. This partitioning
1055  // will take place in the parallel region nested within the teams construct.
1056  if (!fork_teams_workers) {
1057  __kmp_partition_places(team);
1058  }
1059 #endif
1060 
1061  if (team->t.t_nproc > 1 &&
1062  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1063  team->t.b->update_num_threads(team->t.t_nproc);
1064  __kmp_add_threads_to_team(team, team->t.t_nproc);
1065  }
1066  }
1067 
1068  // Take care of primary thread's task state
1069  if (__kmp_tasking_mode != tskm_immediate_exec) {
1070  if (use_hot_team) {
1071  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
1072  KA_TRACE(
1073  20,
1074  ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
1075  "%p, new task_team %p / team %p\n",
1076  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
1077  team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
1078  team));
1079 
1080  // Store primary thread's current task state on new team
1081  KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1082  master_th->th.th_task_state);
1083 
1084  // Restore primary thread's task state to hot team's state
1085  // by using thread 1's task state
1086  if (team->t.t_nproc > 1) {
1087  KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
1088  team->t.t_threads[1]->th.th_task_state == 1);
1089  KMP_CHECK_UPDATE(master_th->th.th_task_state,
1090  team->t.t_threads[1]->th.th_task_state);
1091  } else {
1092  master_th->th.th_task_state = 0;
1093  }
1094  } else {
1095  // Store primary thread's current task_state on new team
1096  KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1097  master_th->th.th_task_state);
1098  // We are not using a hot team, so set the task state to 0.
1099  master_th->th.th_task_state = 0;
1100  }
1101  }
1102 
1103  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1104  for (i = 0; i < team->t.t_nproc; i++) {
1105  kmp_info_t *thr = team->t.t_threads[i];
1106  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1107  thr->th.th_prev_level != team->t.t_level) {
1108  team->t.t_display_affinity = 1;
1109  break;
1110  }
1111  }
1112  }
1113 
1114  KMP_MB();
1115 }
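// Illustrative sketch (not part of the runtime): with nested hot teams enabled
// (e.g. KMP_HOT_TEAMS_MAX_LEVEL=2, an assumption), the inner teams forked in
// the second iteration below reuse the worker threads kept alive from the
// first iteration instead of allocating them from scratch.
#if 0
#include <omp.h>

void repeated_nested_regions() {
  omp_set_max_active_levels(2);
  for (int rep = 0; rep < 2; ++rep) {
#pragma omp parallel num_threads(2)
    {
#pragma omp parallel num_threads(4)
      {
        // ... nested work ...
      }
    }
  }
}
#endif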
1116 
1117 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1118 // Propagate any changes to the floating point control registers out to the team.
1119 // We try to avoid unnecessary writes to the relevant cache line in the team
1120 // structure, so we don't make changes unless they are needed.
1121 inline static void propagateFPControl(kmp_team_t *team) {
1122  if (__kmp_inherit_fp_control) {
1123  kmp_int16 x87_fpu_control_word;
1124  kmp_uint32 mxcsr;
1125 
1126  // Get primary thread's values of FPU control flags (both X87 and vector)
1127  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128  __kmp_store_mxcsr(&mxcsr);
1129  mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131  // There is no point looking at t_fp_control_saved here.
1132  // If it is TRUE, we still have to update the values if they are different
1133  // from those we now have. If it is FALSE we didn't save anything yet, but
1134  // our objective is the same. We have to ensure that the values in the team
1135  // are the same as those we have.
1136  // So, this code achieves what we need whether or not t_fp_control_saved is
1137  // true. By checking whether the value needs updating we avoid unnecessary
1138  // writes that would put the cache-line into a written state, causing all
1139  // threads in the team to have to read it again.
1140  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1141  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1142  // Although we don't use this value, other code in the runtime wants to know
1143  // whether it should restore them. So we must ensure it is correct.
1144  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1145  } else {
1146  // Similarly here. Don't write to this cache-line in the team structure
1147  // unless we have to.
1148  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1149  }
1150 }
1151 
1152 // Do the opposite, setting the hardware registers to the updated values from
1153 // the team.
1154 inline static void updateHWFPControl(kmp_team_t *team) {
1155  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1156  // Only reset the fp control regs if they have been changed in the team by
1157  // the parallel region that we are exiting.
1158  kmp_int16 x87_fpu_control_word;
1159  kmp_uint32 mxcsr;
1160  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1161  __kmp_store_mxcsr(&mxcsr);
1162  mxcsr &= KMP_X86_MXCSR_MASK;
1163 
1164  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1165  __kmp_clear_x87_fpu_status_word();
1166  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1167  }
1168 
1169  if (team->t.t_mxcsr != mxcsr) {
1170  __kmp_load_mxcsr(&team->t.t_mxcsr);
1171  }
1172  }
1173 }
1174 #else
1175 #define propagateFPControl(x) ((void)0)
1176 #define updateHWFPControl(x) ((void)0)
1177 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
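// Illustrative sketch (not part of the runtime): when fp-control inheritance is
// enabled (KMP_INHERIT_FP_CONTROL), control-register settings made by the
// primary thread before the fork are propagated to the workers, e.g.:
#if 0
#include <cfenv>
#include <omp.h>

double upward_rounded_sum(const double *a, int n) {
  std::fesetround(FE_UPWARD); // primary thread changes the rounding mode
  double s = 0.0;
#pragma omp parallel for reduction(+ : s)
  for (int i = 0; i < n; ++i)
    s += a[i]; // workers observe the same rounding mode
  std::fesetround(FE_TONEAREST);
  return s;
}
#endif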
1178 
1179 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1180  int realloc); // forward declaration
1181 
1182 /* Run a parallel region that has been serialized, so it runs only in a team
1183  of the single primary thread. */
1184 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1185  kmp_info_t *this_thr;
1186  kmp_team_t *serial_team;
1187 
1188  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1189 
1190  /* Skip all this code for autopar serialized loops since it results in
1191  unacceptable overhead */
1192  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1193  return;
1194 
1195  if (!TCR_4(__kmp_init_parallel))
1196  __kmp_parallel_initialize();
1197  __kmp_resume_if_soft_paused();
1198 
1199  this_thr = __kmp_threads[global_tid];
1200  serial_team = this_thr->th.th_serial_team;
1201 
1202  /* utilize the serialized team held by this thread */
1203  KMP_DEBUG_ASSERT(serial_team);
1204  KMP_MB();
1205 
1206  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1207  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1208  proc_bind = proc_bind_false;
1209  } else if (proc_bind == proc_bind_default) {
1210  // No proc_bind clause was specified, so use the current value
1211  // of proc-bind-var for this parallel region.
1212  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1213  }
1214  // Reset for next parallel region
1215  this_thr->th.th_set_proc_bind = proc_bind_default;
1216 
1217  // Reset num_threads for next parallel region
1218  this_thr->th.th_set_nproc = 0;
1219 
1220 #if OMPT_SUPPORT
1221  ompt_data_t ompt_parallel_data = ompt_data_none;
1222  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1223  if (ompt_enabled.enabled &&
1224  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1225 
1226  ompt_task_info_t *parent_task_info;
1227  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1228 
1229  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1230  if (ompt_enabled.ompt_callback_parallel_begin) {
1231  int team_size = 1;
1232 
1233  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1234  &(parent_task_info->task_data), &(parent_task_info->frame),
1235  &ompt_parallel_data, team_size,
1236  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1237  }
1238  }
1239 #endif // OMPT_SUPPORT
1240 
1241  if (this_thr->th.th_team != serial_team) {
1242  // Nested level will be an index in the nested nthreads array
1243  int level = this_thr->th.th_team->t.t_level;
1244 
1245  if (serial_team->t.t_serialized) {
1246  /* this serial team was already used
1247  TODO increase performance by making these locks more specific */
1248  kmp_team_t *new_team;
1249 
1250  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1251 
1252  new_team =
1253  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1254 #if OMPT_SUPPORT
1255  ompt_parallel_data,
1256 #endif
1257  proc_bind, &this_thr->th.th_current_task->td_icvs,
1258  0 USE_NESTED_HOT_ARG(NULL));
1259  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1260  KMP_ASSERT(new_team);
1261 
1262  /* setup new serialized team and install it */
1263  new_team->t.t_threads[0] = this_thr;
1264  new_team->t.t_parent = this_thr->th.th_team;
1265  serial_team = new_team;
1266  this_thr->th.th_serial_team = serial_team;
1267 
1268  KF_TRACE(
1269  10,
1270  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1271  global_tid, serial_team));
1272 
1273  /* TODO the above breaks the requirement that if we run out of resources,
1274  then we can still guarantee that serialized teams are ok, since we may
1275  need to allocate a new one */
1276  } else {
1277  KF_TRACE(
1278  10,
1279  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1280  global_tid, serial_team));
1281  }
1282 
1283  /* we have to initialize this serial team */
1284  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1285  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1286  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1287  serial_team->t.t_ident = loc;
1288  serial_team->t.t_serialized = 1;
1289  serial_team->t.t_nproc = 1;
1290  serial_team->t.t_parent = this_thr->th.th_team;
1291  if (this_thr->th.th_team->t.t_nested_nth)
1292  serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
1293  else
1294  serial_team->t.t_nested_nth = &__kmp_nested_nth;
1295  // Save previous team's task state on serial team structure
1296  serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
1297  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1298  this_thr->th.th_team = serial_team;
1299  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1300 
1301  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1302  this_thr->th.th_current_task));
1303  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1304  this_thr->th.th_current_task->td_flags.executing = 0;
1305 
1306  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1307 
1308  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1309  implicit task for each serialized task represented by
1310  team->t.t_serialized? */
1311  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1312  &this_thr->th.th_current_task->td_parent->td_icvs);
1313 
1314  // Thread value exists in the nested nthreads array for the next nested
1315  // level
1316  kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1317  if (this_thr->th.th_team->t.t_nested_nth)
1318  nested_nth = this_thr->th.th_team->t.t_nested_nth;
1319  if (nested_nth->used && (level + 1 < nested_nth->used)) {
1320  this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1321  }
1322 
1323  if (__kmp_nested_proc_bind.used &&
1324  (level + 1 < __kmp_nested_proc_bind.used)) {
1325  this_thr->th.th_current_task->td_icvs.proc_bind =
1326  __kmp_nested_proc_bind.bind_types[level + 1];
1327  }
1328 
1329 #if USE_DEBUGGER
1330  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1331 #endif
1332  this_thr->th.th_info.ds.ds_tid = 0;
1333 
1334  /* set thread cache values */
1335  this_thr->th.th_team_nproc = 1;
1336  this_thr->th.th_team_master = this_thr;
1337  this_thr->th.th_team_serialized = 1;
1338  this_thr->th.th_task_team = NULL;
1339  this_thr->th.th_task_state = 0;
1340 
1341  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1342  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1343  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1344 
1345  propagateFPControl(serial_team);
1346 
1347  /* check if we need to allocate dispatch buffers stack */
1348  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1349  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1350  serial_team->t.t_dispatch->th_disp_buffer =
1351  (dispatch_private_info_t *)__kmp_allocate(
1352  sizeof(dispatch_private_info_t));
1353  }
1354  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1355 
1356  KMP_MB();
1357 
1358  } else {
1359  /* this serialized team is already being used,
1360  * that's fine, just add another nested level */
1361  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1362  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1363  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1364  ++serial_team->t.t_serialized;
1365  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1366 
1367  // Nested level will be an index in the nested nthreads array
1368  int level = this_thr->th.th_team->t.t_level;
1369  // Thread value exists in the nested nthreads array for the next nested
1370  // level
1371 
1372  kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1373  if (serial_team->t.t_nested_nth)
1374  nested_nth = serial_team->t.t_nested_nth;
1375  if (nested_nth->used && (level + 1 < nested_nth->used)) {
1376  this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1377  }
1378 
1379  serial_team->t.t_level++;
1380  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1381  "of serial team %p to %d\n",
1382  global_tid, serial_team, serial_team->t.t_level));
1383 
1384  /* allocate/push dispatch buffers stack */
1385  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1386  {
1387  dispatch_private_info_t *disp_buffer =
1388  (dispatch_private_info_t *)__kmp_allocate(
1389  sizeof(dispatch_private_info_t));
1390  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1391  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1392  }
1393  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1394 
1395  /* allocate/push task team stack */
1396  __kmp_push_task_team_node(this_thr, serial_team);
1397 
1398  KMP_MB();
1399  }
1400  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1401 
1402  // Perform the display affinity functionality for
1403  // serialized parallel regions
1404  if (__kmp_display_affinity) {
1405  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1406  this_thr->th.th_prev_num_threads != 1) {
1407  // NULL means use the affinity-format-var ICV
1408  __kmp_aux_display_affinity(global_tid, NULL);
1409  this_thr->th.th_prev_level = serial_team->t.t_level;
1410  this_thr->th.th_prev_num_threads = 1;
1411  }
1412  }
1413 
1414  if (__kmp_env_consistency_check)
1415  __kmp_push_parallel(global_tid, NULL);
1416 #if OMPT_SUPPORT
1417  serial_team->t.ompt_team_info.master_return_address = codeptr;
1418  if (ompt_enabled.enabled &&
1419  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1420  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1421  OMPT_GET_FRAME_ADDRESS(0);
1422 
1423  ompt_lw_taskteam_t lw_taskteam;
1424  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1425  &ompt_parallel_data, codeptr);
1426 
1427  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1428  // don't use lw_taskteam after linking. Content was swapped.
1429 
1430  /* OMPT implicit task begin */
1431  if (ompt_enabled.ompt_callback_implicit_task) {
1432  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1433  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1434  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1435  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1436  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1437  __kmp_tid_from_gtid(global_tid);
1438  }
1439 
1440  /* OMPT state */
1441  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1442  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1443  OMPT_GET_FRAME_ADDRESS(0);
1444  }
1445 #endif
1446 }
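// Illustrative sketch (not part of the runtime): a parallel region is executed
// by a serialized team of one when, for example, its if() clause evaluates to
// false or only a single thread is requested. The threshold below is an
// arbitrary assumption.
#if 0
#include <omp.h>
#include <cstdio>

void maybe_parallel(int n) {
#pragma omp parallel if (n > 1000) num_threads(4)
  {
#pragma omp single
    std::printf("team size: %d\n", omp_get_num_threads()); // 1 when n <= 1000
  }
}
#endif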
1447 
1448 // Test if this fork is for a team closely nested in a teams construct
1449 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1450  microtask_t microtask, int level,
1451  int teams_level, kmp_va_list ap) {
1452  return (master_th->th.th_teams_microtask && ap &&
1453  microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1454 }
1455 
1456 // Test if this fork is for the teams construct, i.e. to form the outer league
1457 // of teams
1458 static inline bool __kmp_is_entering_teams(int active_level, int level,
1459  int teams_level, kmp_va_list ap) {
1460  return ((ap == NULL && active_level == 0) ||
1461  (ap && teams_level > 0 && teams_level == level));
1462 }
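// Illustrative sketch (not part of the runtime): the two tests above separate
// forming the outer league (the teams directive itself) from a parallel region
// nested inside one of the teams. The num_teams/thread_limit values below are
// assumptions.
#if 0
#include <omp.h>
#include <cstdio>

void league_then_parallel() {
#pragma omp teams num_teams(2) thread_limit(4) // entering teams: outer league
  {
#pragma omp parallel // fork nested in teams: this team's workers join in
    {
      if (omp_get_thread_num() == 0)
        std::printf("team %d runs %d threads\n", omp_get_team_num(),
                    omp_get_num_threads());
    }
  }
}
#endif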
1463 
1464 // AC: This is the start of a parallel region nested inside a teams construct.
1465 // The team is actual (hot); all workers are ready at the fork barrier.
1466 // No lock is needed to initialize the team a bit, then release the workers.
1467 static inline int
1468 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1469  kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1470  enum fork_context_e call_context, microtask_t microtask,
1471  launch_t invoker, int master_set_numthreads, int level,
1472 #if OMPT_SUPPORT
1473  ompt_data_t ompt_parallel_data, void *return_address,
1474 #endif
1475  kmp_va_list ap) {
1476  void **argv;
1477  int i;
1478 
1479  parent_team->t.t_ident = loc;
1480  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1481  parent_team->t.t_argc = argc;
1482  argv = (void **)parent_team->t.t_argv;
1483  for (i = argc - 1; i >= 0; --i) {
1484  *argv++ = va_arg(kmp_va_deref(ap), void *);
1485  }
1486  // Increment our nested depth level, but do not increase the serialization
1487  if (parent_team == master_th->th.th_serial_team) {
1488  // AC: we are in serialized parallel
1489  __kmpc_serialized_parallel(loc, gtid);
1490  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1491 
1492  if (call_context == fork_context_gnu) {
1493  // AC: need to decrement t_serialized for enquiry functions to work
1494  // correctly, will restore at join time
1495  parent_team->t.t_serialized--;
1496  return TRUE;
1497  }
1498 
1499 #if OMPD_SUPPORT
1500  parent_team->t.t_pkfn = microtask;
1501 #endif
1502 
1503 #if OMPT_SUPPORT
1504  void *dummy;
1505  void **exit_frame_p;
1506  ompt_data_t *implicit_task_data;
1507  ompt_lw_taskteam_t lw_taskteam;
1508 
1509  if (ompt_enabled.enabled) {
1510  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1511  &ompt_parallel_data, return_address);
1512  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1513 
1514  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1515  // Don't use lw_taskteam after linking. Content was swapped.
1516 
1517  /* OMPT implicit task begin */
1518  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1519  if (ompt_enabled.ompt_callback_implicit_task) {
1520  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1521  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1522  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1523  1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1524  }
1525 
1526  /* OMPT state */
1527  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1528  } else {
1529  exit_frame_p = &dummy;
1530  }
1531 #endif
1532 
1533  // AC: need to decrement t_serialized for enquiry functions to work
1534  // correctly, will restore at join time
1535  parent_team->t.t_serialized--;
1536 
1537  {
1538  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1539  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1540  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1541 #if OMPT_SUPPORT
1542  ,
1543  exit_frame_p
1544 #endif
1545  );
1546  }
1547 
1548 #if OMPT_SUPPORT
1549  if (ompt_enabled.enabled) {
1550  *exit_frame_p = NULL;
1551  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1552  if (ompt_enabled.ompt_callback_implicit_task) {
1553  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1554  ompt_scope_end, NULL, implicit_task_data, 1,
1555  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1556  }
1557  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1558  __ompt_lw_taskteam_unlink(master_th);
1559  if (ompt_enabled.ompt_callback_parallel_end) {
1560  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1561  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1562  OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1563  }
1564  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1565  }
1566 #endif
1567  return TRUE;
1568  }
1569 
1570  parent_team->t.t_pkfn = microtask;
1571  parent_team->t.t_invoke = invoker;
1572  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1573  parent_team->t.t_active_level++;
1574  parent_team->t.t_level++;
1575  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1576 
1577  // If the threads allocated to the team are less than the thread limit, update
1578  // the thread limit here. th_teams_size.nth is specific to this team nested
1579  // in a teams construct, the team is fully created, and we're about to do
1580  // the actual fork. Best to do this here so that the subsequent uses below
1581  // and in the join have the correct value.
1582  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1583 
1584 #if OMPT_SUPPORT
1585  if (ompt_enabled.enabled) {
1586  ompt_lw_taskteam_t lw_taskteam;
1587  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1588  return_address);
1589  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1590  }
1591 #endif
1592 
1593  /* Change number of threads in the team if requested */
1594  if (master_set_numthreads) { // The parallel has num_threads clause
1595  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1596  // AC: the number of threads can only be reduced dynamically here, not increased
1597  kmp_info_t **other_threads = parent_team->t.t_threads;
1598  // NOTE: if using distributed barrier, we need to run this code block
1599  // even when the team size appears not to have changed from the max.
1600  int old_proc = master_th->th.th_teams_size.nth;
1601  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1602  __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1603  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1604  }
1605  parent_team->t.t_nproc = master_set_numthreads;
1606  for (i = 0; i < master_set_numthreads; ++i) {
1607  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1608  }
1609  }
1610  // Keep extra threads hot in the team for possible next parallels
1611  master_th->th.th_set_nproc = 0;
1612  }
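// For example (illustrative): under "#pragma omp teams thread_limit(8)", a nested
// "parallel num_threads(4)" shrinks this hot team to 4 threads, whereas
// num_threads(16) is not honored here because the team cannot grow beyond
// th_teams_size.nth.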
1613 
1614 #if USE_DEBUGGER
1615  if (__kmp_debugging) { // Let debugger override number of threads.
1616  int nth = __kmp_omp_num_threads(loc);
1617  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1618  master_set_numthreads = nth;
1619  }
1620  }
1621 #endif
1622 
1623  // Figure out the proc_bind policy for the nested parallel within teams
1624  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1625  // proc_bind_default means don't update
1626  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1627  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1628  proc_bind = proc_bind_false;
1629  } else {
1630  // No proc_bind clause specified; use current proc-bind-var
1631  if (proc_bind == proc_bind_default) {
1632  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1633  }
1634  /* else: The proc_bind policy was specified explicitly on parallel clause.
1635  This overrides proc-bind-var for this parallel region, but does not
1636  change proc-bind-var. */
1637  // Figure the value of proc-bind-var for the child threads.
1638  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1639  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1640  master_th->th.th_current_task->td_icvs.proc_bind)) {
1641  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1642  }
1643  }
1644  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1645  // Need to change the bind-var ICV to correct value for each implicit task
1646  if (proc_bind_icv != proc_bind_default &&
1647  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1648  kmp_info_t **other_threads = parent_team->t.t_threads;
1649  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1650  other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1651  }
1652  }
1653  // Reset for next parallel region
1654  master_th->th.th_set_proc_bind = proc_bind_default;
1655 
1656 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1657  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1658  KMP_ITT_DEBUG) &&
1659  __kmp_forkjoin_frames_mode == 3 &&
1660  parent_team->t.t_active_level == 1 // only report frames at level 1
1661  && master_th->th.th_teams_size.nteams == 1) {
1662  kmp_uint64 tmp_time = __itt_get_timestamp();
1663  master_th->th.th_frame_time = tmp_time;
1664  parent_team->t.t_region_time = tmp_time;
1665  }
1666  if (__itt_stack_caller_create_ptr) {
1667  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1668  // create new stack stitching id before entering fork barrier
1669  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1670  }
1671 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1672 #if KMP_AFFINITY_SUPPORTED
1673  __kmp_partition_places(parent_team);
1674 #endif
1675 
1676  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1677  "master_th=%p, gtid=%d\n",
1678  root, parent_team, master_th, gtid));
1679  __kmp_internal_fork(loc, gtid, parent_team);
1680  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1681  "master_th=%p, gtid=%d\n",
1682  root, parent_team, master_th, gtid));
1683 
1684  if (call_context == fork_context_gnu)
1685  return TRUE;
1686 
1687  /* Invoke microtask for PRIMARY thread */
1688  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1689  parent_team->t.t_id, parent_team->t.t_pkfn));
1690 
1691  if (!parent_team->t.t_invoke(gtid)) {
1692  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1693  }
1694  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1695  parent_team->t.t_id, parent_team->t.t_pkfn));
1696  KMP_MB(); /* Flush all pending memory write invalidates. */
1697 
1698  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1699 
1700  return TRUE;
1701 }
1702 
1703 // Create a serialized parallel region
1704 static inline int
1705 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1706  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1707  kmp_info_t *master_th, kmp_team_t *parent_team,
1708 #if OMPT_SUPPORT
1709  ompt_data_t *ompt_parallel_data, void **return_address,
1710  ompt_data_t **parent_task_data,
1711 #endif
1712  kmp_va_list ap) {
1713  kmp_team_t *team;
1714  int i;
1715  void **argv;
1716 
1717 /* josh todo: hypothetical question: what do we do for OS X*? */
1718 #if KMP_OS_LINUX && \
1719  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1720  SimpleVLA<void *> args(argc);
1721 #else
1722  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1723 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1724  KMP_ARCH_AARCH64) */
1725 
1726  KA_TRACE(
1727  20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1728 
1729  __kmpc_serialized_parallel(loc, gtid);
1730 
1731 #if OMPD_SUPPORT
1732  master_th->th.th_serial_team->t.t_pkfn = microtask;
1733 #endif
1734 
1735  if (call_context == fork_context_intel) {
1736  /* TODO this sucks, use the compiler itself to pass args! :) */
1737  master_th->th.th_serial_team->t.t_ident = loc;
1738  if (!ap) {
1739  // revert change made in __kmpc_serialized_parallel()
1740  master_th->th.th_serial_team->t.t_level--;
1741 // Get args from parent team for teams construct
1742 
1743 #if OMPT_SUPPORT
1744  void *dummy;
1745  void **exit_frame_p;
1746  ompt_task_info_t *task_info;
1747  ompt_lw_taskteam_t lw_taskteam;
1748 
1749  if (ompt_enabled.enabled) {
1750  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1751  ompt_parallel_data, *return_address);
1752 
1753  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1754  // don't use lw_taskteam after linking. content was swapped
1755  task_info = OMPT_CUR_TASK_INFO(master_th);
1756  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1757  if (ompt_enabled.ompt_callback_implicit_task) {
1758  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1759  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1760  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1761  &(task_info->task_data), 1,
1762  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1763  }
1764 
1765  /* OMPT state */
1766  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1767  } else {
1768  exit_frame_p = &dummy;
1769  }
1770 #endif
1771 
1772  {
1773  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1774  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1775  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1776 #if OMPT_SUPPORT
1777  ,
1778  exit_frame_p
1779 #endif
1780  );
1781  }
1782 
1783 #if OMPT_SUPPORT
1784  if (ompt_enabled.enabled) {
1785  *exit_frame_p = NULL;
1786  if (ompt_enabled.ompt_callback_implicit_task) {
1787  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1788  ompt_scope_end, NULL, &(task_info->task_data), 1,
1789  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1790  }
1791  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1792  __ompt_lw_taskteam_unlink(master_th);
1793  if (ompt_enabled.ompt_callback_parallel_end) {
1794  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1795  ompt_parallel_data, *parent_task_data,
1796  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1797  }
1798  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1799  }
1800 #endif
1801  } else if (microtask == (microtask_t)__kmp_teams_master) {
1802  KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1803  team = master_th->th.th_team;
1804  // team->t.t_pkfn = microtask;
1805  team->t.t_invoke = invoker;
1806  __kmp_alloc_argv_entries(argc, team, TRUE);
1807  team->t.t_argc = argc;
1808  argv = (void **)team->t.t_argv;
1809  for (i = argc - 1; i >= 0; --i)
1810  *argv++ = va_arg(kmp_va_deref(ap), void *);
1811  // AC: revert change made in __kmpc_serialized_parallel()
1812  // because initial code in teams should have level=0
1813  team->t.t_level--;
1814  // AC: call special invoker for outer "parallel" of teams construct
1815  invoker(gtid);
1816 #if OMPT_SUPPORT
1817  if (ompt_enabled.enabled) {
1818  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1819  if (ompt_enabled.ompt_callback_implicit_task) {
1820  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1821  ompt_scope_end, NULL, &(task_info->task_data), 0,
1822  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1823  }
1824  if (ompt_enabled.ompt_callback_parallel_end) {
1825  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1826  ompt_parallel_data, *parent_task_data,
1827  OMPT_INVOKER(call_context) | ompt_parallel_league,
1828  *return_address);
1829  }
1830  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1831  }
1832 #endif
1833  } else {
1834  argv = args;
1835  for (i = argc - 1; i >= 0; --i)
1836  *argv++ = va_arg(kmp_va_deref(ap), void *);
1837  KMP_MB();
1838 
1839 #if OMPT_SUPPORT
1840  void *dummy;
1841  void **exit_frame_p;
1842  ompt_task_info_t *task_info;
1843  ompt_lw_taskteam_t lw_taskteam;
1844  ompt_data_t *implicit_task_data;
1845 
1846  if (ompt_enabled.enabled) {
1847  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1848  ompt_parallel_data, *return_address);
1849  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1850  // don't use lw_taskteam after linking. content was swapped
1851  task_info = OMPT_CUR_TASK_INFO(master_th);
1852  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1853 
1854  /* OMPT implicit task begin */
1855  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1856  if (ompt_enabled.ompt_callback_implicit_task) {
1857  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1858  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1859  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1860  ompt_task_implicit);
1861  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1862  }
1863 
1864  /* OMPT state */
1865  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1866  } else {
1867  exit_frame_p = &dummy;
1868  }
1869 #endif
1870 
1871  {
1872  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1873  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1874  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1875 #if OMPT_SUPPORT
1876  ,
1877  exit_frame_p
1878 #endif
1879  );
1880  }
1881 
1882 #if OMPT_SUPPORT
1883  if (ompt_enabled.enabled) {
1884  *exit_frame_p = NULL;
1885  if (ompt_enabled.ompt_callback_implicit_task) {
1886  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1887  ompt_scope_end, NULL, &(task_info->task_data), 1,
1888  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1889  }
1890 
1891  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1892  __ompt_lw_taskteam_unlink(master_th);
1893  if (ompt_enabled.ompt_callback_parallel_end) {
1894  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1895  ompt_parallel_data, *parent_task_data,
1896  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1897  }
1898  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1899  }
1900 #endif
1901  }
1902  } else if (call_context == fork_context_gnu) {
1903 #if OMPT_SUPPORT
1904  if (ompt_enabled.enabled) {
1905  ompt_lw_taskteam_t lwt;
1906  __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1907  *return_address);
1908 
1909  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1910  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1911  }
1912 // don't use lw_taskteam after linking. content was swapped
1913 #endif
1914 
1915  // we were called from GNU native code
1916  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1917  return FALSE;
1918  } else {
1919  KMP_ASSERT2(call_context < fork_context_last,
1920  "__kmp_serial_fork_call: unknown fork_context parameter");
1921  }
1922 
1923  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1924  KMP_MB();
1925  return FALSE;
1926 }
1927 
1928 /* most of the work for a fork */
1929 /* return true if we really went parallel, false if serialized */
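// Rough flow of the implementation below: (1) a parallel nested directly inside
// a teams construct is handed off to __kmp_fork_in_teams; (2) otherwise the
// number of threads is determined and, if it comes out as 1, the region is
// serialized via __kmp_serial_fork_call; (3) otherwise a team is allocated (or a
// hot team reused), ICVs and proc-bind are propagated, workers are released at
// the fork barrier, and the primary thread invokes the microtask itself.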
1930 int __kmp_fork_call(ident_t *loc, int gtid,
1931  enum fork_context_e call_context, // Intel, GNU, ...
1932  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1933  kmp_va_list ap) {
1934  void **argv;
1935  int i;
1936  int master_tid;
1937  int master_this_cons;
1938  kmp_team_t *team;
1939  kmp_team_t *parent_team;
1940  kmp_info_t *master_th;
1941  kmp_root_t *root;
1942  int nthreads;
1943  int master_active;
1944  int master_set_numthreads;
1945  int task_thread_limit = 0;
1946  int level;
1947  int active_level;
1948  int teams_level;
1949 #if KMP_NESTED_HOT_TEAMS
1950  kmp_hot_team_ptr_t **p_hot_teams;
1951 #endif
1952  { // KMP_TIME_BLOCK
1953  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1954  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1955 
1956  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1957  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1958  /* Some systems prefer the stack for the root thread(s) to start with */
1959  /* some gap from the parent stack to prevent false sharing. */
1960  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1961  /* These 2 lines below are so this does not get optimized out */
1962  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1963  __kmp_stkpadding += (short)((kmp_int64)dummy);
1964  }
1965 
1966  /* initialize if needed */
1967  KMP_DEBUG_ASSERT(
1968  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1969  if (!TCR_4(__kmp_init_parallel))
1970  __kmp_parallel_initialize();
1971  __kmp_resume_if_soft_paused();
1972 
1973  /* setup current data */
1974  // AC: potentially unsafe, not in sync with library shutdown,
1975  // __kmp_threads can be freed
1976  master_th = __kmp_threads[gtid];
1977 
1978  parent_team = master_th->th.th_team;
1979  master_tid = master_th->th.th_info.ds.ds_tid;
1980  master_this_cons = master_th->th.th_local.this_construct;
1981  root = master_th->th.th_root;
1982  master_active = root->r.r_active;
1983  master_set_numthreads = master_th->th.th_set_nproc;
1984  task_thread_limit =
1985  master_th->th.th_current_task->td_icvs.task_thread_limit;
1986 
1987 #if OMPT_SUPPORT
1988  ompt_data_t ompt_parallel_data = ompt_data_none;
1989  ompt_data_t *parent_task_data = NULL;
1990  ompt_frame_t *ompt_frame = NULL;
1991  void *return_address = NULL;
1992 
1993  if (ompt_enabled.enabled) {
1994  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1995  NULL, NULL);
1996  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1997  }
1998 #endif
1999 
2000  // Assign affinity to root thread if it hasn't happened yet
2001  __kmp_assign_root_init_mask();
2002 
2003  // Nested level will be an index in the nested nthreads array
2004  level = parent_team->t.t_level;
2005  // used to launch non-serial teams even if nested is not allowed
2006  active_level = parent_team->t.t_active_level;
2007  // needed to check nesting inside the teams
2008  teams_level = master_th->th.th_teams_level;
2009 #if KMP_NESTED_HOT_TEAMS
2010  p_hot_teams = &master_th->th.th_hot_teams;
2011  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2012  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2013  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2014  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2015  // it is either actual or not needed (when active_level > 0)
2016  (*p_hot_teams)[0].hot_team_nth = 1;
2017  }
2018 #endif
2019 
2020 #if OMPT_SUPPORT
2021  if (ompt_enabled.enabled) {
2022  if (ompt_enabled.ompt_callback_parallel_begin) {
2023  int team_size = master_set_numthreads
2024  ? master_set_numthreads
2025  : get__nproc_2(parent_team, master_tid);
2026  int flags = OMPT_INVOKER(call_context) |
2027  ((microtask == (microtask_t)__kmp_teams_master)
2028  ? ompt_parallel_league
2029  : ompt_parallel_team);
2030  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2031  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2032  return_address);
2033  }
2034  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2035  }
2036 #endif
2037 
2038  master_th->th.th_ident = loc;
2039 
2040  // Parallel closely nested in teams construct:
2041  if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2042  return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2043  call_context, microtask, invoker,
2044  master_set_numthreads, level,
2045 #if OMPT_SUPPORT
2046  ompt_parallel_data, return_address,
2047 #endif
2048  ap);
2049  } // End parallel closely nested in teams construct
2050 
2051  // Need this to happen before we determine the number of threads, not while
2052  // we are allocating the team
2053  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2054 
2055  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2056 
2057  // Determine the number of threads
2058  int enter_teams =
2059  __kmp_is_entering_teams(active_level, level, teams_level, ap);
2060  if ((!enter_teams &&
2061  (parent_team->t.t_active_level >=
2062  master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2063  (__kmp_library == library_serial)) {
2064  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2065  nthreads = 1;
2066  } else {
2067  nthreads = master_set_numthreads
2068  ? master_set_numthreads
2069  // TODO: get nproc directly from current task
2070  : get__nproc_2(parent_team, master_tid);
2071  // Use the thread_limit set for the current target task if exists, else go
2072  // with the deduced nthreads
2073  nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2074  ? task_thread_limit
2075  : nthreads;
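// For example (illustrative): with OMP_NUM_THREADS=8 and a target task whose
// thread_limit ICV is 4, nthreads is clamped from 8 down to 4 here; a
// non-positive (unset) task_thread_limit leaves the deduced value untouched.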
2076  // Check whether we need to take the forkjoin lock (no need for a serialized
2077  // parallel region outside of a teams construct).
2078  if (nthreads > 1) {
2079  /* determine how many new threads we can use */
2080  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2081  /* AC: If we execute teams from parallel region (on host), then teams
2082  should be created but each can only have 1 thread if nesting is
2083  disabled. If teams called from serial region, then teams and their
2084  threads should be created regardless of the nesting setting. */
2085  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2086  nthreads, enter_teams);
2087  if (nthreads == 1) {
2088  // Free lock for single thread execution here; for multi-thread
2089  // execution it will be freed later after team of threads created
2090  // and initialized
2091  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2092  }
2093  }
2094  }
2095  KMP_DEBUG_ASSERT(nthreads > 0);
2096 
2097  // If we temporarily changed the set number of threads then restore it now
2098  master_th->th.th_set_nproc = 0;
2099 
2100  if (nthreads == 1) {
2101  return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2102  invoker, master_th, parent_team,
2103 #if OMPT_SUPPORT
2104  &ompt_parallel_data, &return_address,
2105  &parent_task_data,
2106 #endif
2107  ap);
2108  } // if (nthreads == 1)
2109 
2110  // GEH: only modify the executing flag in the case when not serialized
2111  // serialized case is handled in kmpc_serialized_parallel
2112  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2113  "curtask=%p, curtask_max_aclevel=%d\n",
2114  parent_team->t.t_active_level, master_th,
2115  master_th->th.th_current_task,
2116  master_th->th.th_current_task->td_icvs.max_active_levels));
2117  // TODO: GEH - cannot do this assertion because root thread not set up as
2118  // executing
2119  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2120  master_th->th.th_current_task->td_flags.executing = 0;
2121 
2122  if (!master_th->th.th_teams_microtask || level > teams_level) {
2123  /* Increment our nested depth level */
2124  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2125  }
2126 
2127  // See if we need to make a copy of the ICVs.
2128  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2129  kmp_nested_nthreads_t *nested_nth = NULL;
2130  if (!master_th->th.th_set_nested_nth &&
2131  (level + 1 < parent_team->t.t_nested_nth->used) &&
2132  (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2133  nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2134  } else if (master_th->th.th_set_nested_nth) {
2135  nested_nth = __kmp_override_nested_nth(master_th, level);
2136  if ((level + 1 < nested_nth->used) &&
2137  (nested_nth->nth[level + 1] != nthreads_icv))
2138  nthreads_icv = nested_nth->nth[level + 1];
2139  else
2140  nthreads_icv = 0; // don't update
2141  } else {
2142  nthreads_icv = 0; // don't update
2143  }
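// For example (illustrative): with OMP_NUM_THREADS=4,2 the nested-nth list has
// nth[1] == 2, so a parallel region forked at level 0 propagates an nproc ICV of
// 2 to the implicit tasks of the new team for the next nesting level.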
2144 
2145  // Figure out the proc_bind_policy for the new team.
2146  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2147  // proc_bind_default means don't update
2148  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2149  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2150  proc_bind = proc_bind_false;
2151  } else {
2152  // No proc_bind clause specified; use current proc-bind-var for this
2153  // parallel region
2154  if (proc_bind == proc_bind_default) {
2155  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2156  }
2157  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2158  if (master_th->th.th_teams_microtask &&
2159  microtask == (microtask_t)__kmp_teams_master) {
2160  proc_bind = __kmp_teams_proc_bind;
2161  }
2162  /* else: The proc_bind policy was specified explicitly on parallel clause.
2163  This overrides proc-bind-var for this parallel region, but does not
2164  change proc-bind-var. */
2165  // Figure the value of proc-bind-var for the child threads.
2166  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2167  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2168  master_th->th.th_current_task->td_icvs.proc_bind)) {
2169  // Do not modify the proc bind icv for the two teams construct forks
2170  // They just let the proc bind icv pass through
2171  if (!master_th->th.th_teams_microtask ||
2172  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2173  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2174  }
2175  }
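// For example (illustrative): with OMP_PROC_BIND=spread,close and no proc_bind
// clause, this region binds its threads with "spread" while the child threads
// inherit "close" as their bind-var for the next nesting level.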
2176 
2177  // Reset for next parallel region
2178  master_th->th.th_set_proc_bind = proc_bind_default;
2179 
2180  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2181  kmp_internal_control_t new_icvs;
2182  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2183  new_icvs.next = NULL;
2184  if (nthreads_icv > 0) {
2185  new_icvs.nproc = nthreads_icv;
2186  }
2187  if (proc_bind_icv != proc_bind_default) {
2188  new_icvs.proc_bind = proc_bind_icv;
2189  }
2190 
2191  /* allocate a new parallel team */
2192  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2193  team = __kmp_allocate_team(root, nthreads, nthreads,
2194 #if OMPT_SUPPORT
2195  ompt_parallel_data,
2196 #endif
2197  proc_bind, &new_icvs,
2198  argc USE_NESTED_HOT_ARG(master_th));
2199  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2200  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2201  } else {
2202  /* allocate a new parallel team */
2203  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2204  team = __kmp_allocate_team(root, nthreads, nthreads,
2205 #if OMPT_SUPPORT
2206  ompt_parallel_data,
2207 #endif
2208  proc_bind,
2209  &master_th->th.th_current_task->td_icvs,
2210  argc USE_NESTED_HOT_ARG(master_th));
2211  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2212  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2213  &master_th->th.th_current_task->td_icvs);
2214  }
2215  KF_TRACE(
2216  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2217 
2218  /* setup the new team */
2219  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2220  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2221  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2222  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2223  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2224 #if OMPT_SUPPORT
2225  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2226  return_address);
2227 #endif
2228  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2229  // TODO: parent_team->t.t_level == INT_MAX ???
2230  if (!master_th->th.th_teams_microtask || level > teams_level) {
2231  int new_level = parent_team->t.t_level + 1;
2232  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2233  new_level = parent_team->t.t_active_level + 1;
2234  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2235  } else {
2236  // AC: Do not increase parallel level at start of the teams construct
2237  int new_level = parent_team->t.t_level;
2238  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2239  new_level = parent_team->t.t_active_level;
2240  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2241  }
2242  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2243  // set primary thread's schedule as new run-time schedule
2244  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2245 
2246  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2247  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2248 
2249  // Check if hot team has potentially outdated list, and if so, free it
2250  if (team->t.t_nested_nth &&
2251  team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2252  KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2253  KMP_INTERNAL_FREE(team->t.t_nested_nth);
2254  team->t.t_nested_nth = NULL;
2255  }
2256  team->t.t_nested_nth = parent_team->t.t_nested_nth;
2257  if (master_th->th.th_set_nested_nth) {
2258  if (!nested_nth)
2259  nested_nth = __kmp_override_nested_nth(master_th, level);
2260  team->t.t_nested_nth = nested_nth;
2261  KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2262  master_th->th.th_set_nested_nth = NULL;
2263  master_th->th.th_set_nested_nth_sz = 0;
2264  master_th->th.th_nt_strict = false;
2265  }
2266 
2267  // Update the floating point rounding in the team if required.
2268  propagateFPControl(team);
2269 #if OMPD_SUPPORT
2270  if (ompd_state & OMPD_ENABLE_BP)
2271  ompd_bp_parallel_begin();
2272 #endif
2273 
2274  KA_TRACE(
2275  20,
2276  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2277  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2278  team->t.t_nproc));
2279  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2280  (team->t.t_master_tid == 0 &&
2281  (team->t.t_parent == root->r.r_root_team ||
2282  team->t.t_parent->t.t_serialized)));
2283  KMP_MB();
2284 
2285  /* now, setup the arguments */
2286  argv = (void **)team->t.t_argv;
2287  if (ap) {
2288  for (i = argc - 1; i >= 0; --i) {
2289  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2290  KMP_CHECK_UPDATE(*argv, new_argv);
2291  argv++;
2292  }
2293  } else {
2294  for (i = 0; i < argc; ++i) {
2295  // Get args from parent team for teams construct
2296  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2297  }
2298  }
2299 
2300  /* now actually fork the threads */
2301  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2302  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2303  root->r.r_active = TRUE;
2304 
2305  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2306  __kmp_setup_icv_copy(team, nthreads,
2307  &master_th->th.th_current_task->td_icvs, loc);
2308 
2309 #if OMPT_SUPPORT
2310  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2311 #endif
2312 
2313  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2314 
2315 #if USE_ITT_BUILD
2316  if (team->t.t_active_level == 1 // only report frames at level 1
2317  && !master_th->th.th_teams_microtask) { // not in teams construct
2318 #if USE_ITT_NOTIFY
2319  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2320  (__kmp_forkjoin_frames_mode == 3 ||
2321  __kmp_forkjoin_frames_mode == 1)) {
2322  kmp_uint64 tmp_time = 0;
2323  if (__itt_get_timestamp_ptr)
2324  tmp_time = __itt_get_timestamp();
2325  // Internal fork - report frame begin
2326  master_th->th.th_frame_time = tmp_time;
2327  if (__kmp_forkjoin_frames_mode == 3)
2328  team->t.t_region_time = tmp_time;
2329  } else
2330 // only one notification scheme (either "submit" or "forking/joined", not both)
2331 #endif /* USE_ITT_NOTIFY */
2332  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2333  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2334  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2335  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2336  }
2337  }
2338 #endif /* USE_ITT_BUILD */
2339 
2340  /* now go on and do the work */
2341  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2342  KMP_MB();
2343  KF_TRACE(10,
2344  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2345  root, team, master_th, gtid));
2346 
2347 #if USE_ITT_BUILD
2348  if (__itt_stack_caller_create_ptr) {
2349  // create new stack stitching id before entering fork barrier
2350  if (!enter_teams) {
2351  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2352  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2353  } else if (parent_team->t.t_serialized) {
2354  // keep stack stitching id in the serialized parent_team;
2355  // current team will be used for parallel inside the teams;
2356  // if parent_team is active, then it already keeps stack stitching id
2357  // for the league of teams
2358  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2359  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2360  }
2361  }
2362 #endif /* USE_ITT_BUILD */
2363 
2364  // AC: skip __kmp_internal_fork at teams construct, let only primary
2365  // threads execute
2366  if (ap) {
2367  __kmp_internal_fork(loc, gtid, team);
2368  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2369  "master_th=%p, gtid=%d\n",
2370  root, team, master_th, gtid));
2371  }
2372 
2373  if (call_context == fork_context_gnu) {
2374  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2375  return TRUE;
2376  }
2377 
2378  /* Invoke microtask for PRIMARY thread */
2379  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2380  team->t.t_id, team->t.t_pkfn));
2381  } // END of timer KMP_fork_call block
2382 
2383 #if KMP_STATS_ENABLED
2384  // If beginning a teams construct, then change thread state
2385  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2386  if (!ap) {
2387  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2388  }
2389 #endif
2390 
2391  if (!team->t.t_invoke(gtid)) {
2392  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2393  }
2394 
2395 #if KMP_STATS_ENABLED
2396  // If was beginning of a teams construct, then reset thread state
2397  if (!ap) {
2398  KMP_SET_THREAD_STATE(previous_state);
2399  }
2400 #endif
2401 
2402  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2403  team->t.t_id, team->t.t_pkfn));
2404  KMP_MB(); /* Flush all pending memory write invalidates. */
2405 
2406  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2407 #if OMPT_SUPPORT
2408  if (ompt_enabled.enabled) {
2409  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2410  }
2411 #endif
2412 
2413  return TRUE;
2414 }
2415 
2416 #if OMPT_SUPPORT
2417 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2418  kmp_team_t *team) {
2419  // restore state outside the region
2420  thread->th.ompt_thread_info.state =
2421  ((team->t.t_serialized) ? ompt_state_work_serial
2422  : ompt_state_work_parallel);
2423 }
2424 
2425 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2426  kmp_team_t *team, ompt_data_t *parallel_data,
2427  int flags, void *codeptr) {
2428  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2429  if (ompt_enabled.ompt_callback_parallel_end) {
2430  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2431  parallel_data, &(task_info->task_data), flags, codeptr);
2432  }
2433 
2434  task_info->frame.enter_frame = ompt_data_none;
2435  __kmp_join_restore_state(thread, team);
2436 }
2437 #endif
2438 
2439 void __kmp_join_call(ident_t *loc, int gtid
2440 #if OMPT_SUPPORT
2441  ,
2442  enum fork_context_e fork_context
2443 #endif
2444  ,
2445  int exit_teams) {
2446  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2447  kmp_team_t *team;
2448  kmp_team_t *parent_team;
2449  kmp_info_t *master_th;
2450  kmp_root_t *root;
2451  int master_active;
2452 
2453  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2454 
2455  /* setup current data */
2456  master_th = __kmp_threads[gtid];
2457  root = master_th->th.th_root;
2458  team = master_th->th.th_team;
2459  parent_team = team->t.t_parent;
2460 
2461  master_th->th.th_ident = loc;
2462 
2463 #if OMPT_SUPPORT
2464  void *team_microtask = (void *)team->t.t_pkfn;
2465  // For GOMP interface with serialized parallel, need the
2466  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2467  // and end-parallel events.
2468  if (ompt_enabled.enabled &&
2469  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2470  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2471  }
2472 #endif
2473 
2474 #if KMP_DEBUG
2475  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2476  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2477  "th_task_team = %p\n",
2478  __kmp_gtid_from_thread(master_th), team,
2479  team->t.t_task_team[master_th->th.th_task_state],
2480  master_th->th.th_task_team));
2481  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2482  }
2483 #endif
2484 
2485  if (team->t.t_serialized) {
2486  if (master_th->th.th_teams_microtask) {
2487  // We are in teams construct
2488  int level = team->t.t_level;
2489  int tlevel = master_th->th.th_teams_level;
2490  if (level == tlevel) {
2491  // AC: we haven't incremented it earlier at start of teams construct,
2492  // so do it here - at the end of teams construct
2493  team->t.t_level++;
2494  } else if (level == tlevel + 1) {
2495  // AC: we are exiting parallel inside teams, need to increment
2496  // serialization in order to restore it in the next call to
2497  // __kmpc_end_serialized_parallel
2498  team->t.t_serialized++;
2499  }
2500  }
2501  __kmpc_end_serialized_parallel(loc, gtid);
2502 
2503 #if OMPT_SUPPORT
2504  if (ompt_enabled.enabled) {
2505  if (fork_context == fork_context_gnu) {
2506  __ompt_lw_taskteam_unlink(master_th);
2507  }
2508  __kmp_join_restore_state(master_th, parent_team);
2509  }
2510 #endif
2511 
2512  return;
2513  }
2514 
2515  master_active = team->t.t_master_active;
2516 
2517  if (!exit_teams) {
2518  // AC: No barrier for internal teams at exit from teams construct.
2519  // But there is barrier for external team (league).
2520  __kmp_internal_join(loc, gtid, team);
2521 #if USE_ITT_BUILD
2522  if (__itt_stack_caller_create_ptr) {
2523  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2524  // destroy the stack stitching id after join barrier
2525  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2526  team->t.t_stack_id = NULL;
2527  }
2528 #endif
2529  } else {
2530  master_th->th.th_task_state =
2531  0; // AC: no tasking in teams (outside of any parallel region)
2532 #if USE_ITT_BUILD
2533  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2534  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2535  // destroy the stack stitching id on exit from the teams construct
2536  // if parent_team is active, then the id will be destroyed later on
2537  // by master of the league of teams
2538  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2539  parent_team->t.t_stack_id = NULL;
2540  }
2541 #endif
2542  }
2543 
2544  KMP_MB();
2545 
2546 #if OMPT_SUPPORT
2547  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2548  void *codeptr = team->t.ompt_team_info.master_return_address;
2549 #endif
2550 
2551 #if USE_ITT_BUILD
2552  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2553  if (team->t.t_active_level == 1 &&
2554  (!master_th->th.th_teams_microtask || /* not in teams construct */
2555  master_th->th.th_teams_size.nteams == 1)) {
2556  master_th->th.th_ident = loc;
2557  // only one notification scheme (either "submit" or "forking/joined", not
2558  // both)
2559  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2560  __kmp_forkjoin_frames_mode == 3)
2561  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2562  master_th->th.th_frame_time, 0, loc,
2563  master_th->th.th_team_nproc, 1);
2564  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2565  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2566  __kmp_itt_region_joined(gtid);
2567  } // active_level == 1
2568 #endif /* USE_ITT_BUILD */
2569 
2570 #if KMP_AFFINITY_SUPPORTED
2571  if (!exit_teams) {
2572  // Restore master thread's partition.
2573  master_th->th.th_first_place = team->t.t_first_place;
2574  master_th->th.th_last_place = team->t.t_last_place;
2575  }
2576 #endif // KMP_AFFINITY_SUPPORTED
2577 
2578  if (master_th->th.th_teams_microtask && !exit_teams &&
2579  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2580  team->t.t_level == master_th->th.th_teams_level + 1) {
2581 // AC: We need to leave the team structure intact at the end of parallel
2582 // inside the teams construct, so that at the next parallel same (hot) team
2583 // works, only adjust nesting levels
2584 #if OMPT_SUPPORT
2585  ompt_data_t ompt_parallel_data = ompt_data_none;
2586  if (ompt_enabled.enabled) {
2587  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2588  if (ompt_enabled.ompt_callback_implicit_task) {
2589  int ompt_team_size = team->t.t_nproc;
2590  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2591  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2592  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2593  }
2594  task_info->frame.exit_frame = ompt_data_none;
2595  task_info->task_data = ompt_data_none;
2596  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2597  __ompt_lw_taskteam_unlink(master_th);
2598  }
2599 #endif
2600  /* Decrement our nested depth level */
2601  team->t.t_level--;
2602  team->t.t_active_level--;
2603  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2604 
2605  // Restore number of threads in the team if needed. This code relies on
2606  // the proper adjustment of th_teams_size.nth after the fork in
2607  // __kmp_teams_master on each teams primary thread in the case that
2608  // __kmp_reserve_threads reduced it.
2609  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2610  int old_num = master_th->th.th_team_nproc;
2611  int new_num = master_th->th.th_teams_size.nth;
2612  kmp_info_t **other_threads = team->t.t_threads;
2613  team->t.t_nproc = new_num;
2614  for (int i = 0; i < old_num; ++i) {
2615  other_threads[i]->th.th_team_nproc = new_num;
2616  }
2617  // Adjust states of non-used threads of the team
2618  for (int i = old_num; i < new_num; ++i) {
2619  // Re-initialize thread's barrier data.
2620  KMP_DEBUG_ASSERT(other_threads[i]);
2621  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2622  for (int b = 0; b < bs_last_barrier; ++b) {
2623  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2624  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2625 #if USE_DEBUGGER
2626  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2627 #endif
2628  }
2629  if (__kmp_tasking_mode != tskm_immediate_exec) {
2630  // Synchronize thread's task state
2631  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2632  }
2633  }
2634  }
2635 
2636 #if OMPT_SUPPORT
2637  if (ompt_enabled.enabled) {
2638  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2639  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2640  }
2641 #endif
2642 
2643  return;
2644  }
2645 
2646  /* do cleanup and restore the parent team */
2647  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2648  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2649 
2650  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2651 
2652  /* jc: The following lock has instructions with REL and ACQ semantics,
2653  separating the parallel user code called in this parallel region
2654  from the serial user code called after this function returns. */
2655  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2656 
2657  if (!master_th->th.th_teams_microtask ||
2658  team->t.t_level > master_th->th.th_teams_level) {
2659  /* Decrement our nested depth level */
2660  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2661  }
2662  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2663 
2664 #if OMPT_SUPPORT
2665  if (ompt_enabled.enabled) {
2666  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2667  if (ompt_enabled.ompt_callback_implicit_task) {
2668  int flags = (team_microtask == (void *)__kmp_teams_master)
2669  ? ompt_task_initial
2670  : ompt_task_implicit;
2671  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2672  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2673  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2674  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2675  }
2676  task_info->frame.exit_frame = ompt_data_none;
2677  task_info->task_data = ompt_data_none;
2678  }
2679 #endif
2680 
2681  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2682  master_th, team));
2683  __kmp_pop_current_task_from_thread(master_th);
2684 
2685  master_th->th.th_def_allocator = team->t.t_def_allocator;
2686 
2687 #if OMPD_SUPPORT
2688  if (ompd_state & OMPD_ENABLE_BP)
2689  ompd_bp_parallel_end();
2690 #endif
2691  updateHWFPControl(team);
2692 
2693  if (root->r.r_active != master_active)
2694  root->r.r_active = master_active;
2695 
2696  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2697  master_th)); // this will free worker threads
2698 
2699  /* this race was fun to find. make sure the following is in the critical
2700  region otherwise assertions may fail occasionally since the old team may be
2701  reallocated and the hierarchy appears inconsistent. it is actually safe to
2702  run and won't cause any bugs, but will cause those assertion failures. it's
2703  only one deref&assign so might as well put this in the critical region */
2704  master_th->th.th_team = parent_team;
2705  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2706  master_th->th.th_team_master = parent_team->t.t_threads[0];
2707  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2708 
2709  /* restore serialized team, if need be */
2710  if (parent_team->t.t_serialized &&
2711  parent_team != master_th->th.th_serial_team &&
2712  parent_team != root->r.r_root_team) {
2713  __kmp_free_team(root,
2714  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2715  master_th->th.th_serial_team = parent_team;
2716  }
2717 
2718  if (__kmp_tasking_mode != tskm_immediate_exec) {
2719  // Restore primary thread's task state from team structure
2720  KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2721  team->t.t_primary_task_state == 1);
2722  master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2723 
2724  // Copy the task team from the parent team to the primary thread
2725  master_th->th.th_task_team =
2726  parent_team->t.t_task_team[master_th->th.th_task_state];
2727  KA_TRACE(20,
2728  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2729  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2730  parent_team));
2731  }
2732 
2733  // TODO: GEH - cannot do this assertion because root thread not set up as
2734  // executing
2735  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2736  master_th->th.th_current_task->td_flags.executing = 1;
2737 
2738  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2739 
2740 #if KMP_AFFINITY_SUPPORTED
2741  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2742  __kmp_reset_root_init_mask(gtid);
2743  }
2744 #endif
2745 #if OMPT_SUPPORT
2746  int flags =
2747  OMPT_INVOKER(fork_context) |
2748  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2749  : ompt_parallel_team);
2750  if (ompt_enabled.enabled) {
2751  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2752  codeptr);
2753  }
2754 #endif
2755 
2756  KMP_MB();
2757  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2758 }
2759 
2760 /* Check whether we should push an internal control record onto the
2761  serial team stack. If so, do it. */
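// A record is pushed at most once per serialized nesting level (keyed by
// serial_nesting_level == t_serialized), so ICV changes made inside a serialized
// region can be restored when that nesting level ends.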
2762 void __kmp_save_internal_controls(kmp_info_t *thread) {
2763 
2764  if (thread->th.th_team != thread->th.th_serial_team) {
2765  return;
2766  }
2767  if (thread->th.th_team->t.t_serialized > 1) {
2768  int push = 0;
2769 
2770  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2771  push = 1;
2772  } else {
2773  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2774  thread->th.th_team->t.t_serialized) {
2775  push = 1;
2776  }
2777  }
2778  if (push) { /* push a record on the serial team's stack */
2779  kmp_internal_control_t *control =
2780  (kmp_internal_control_t *)__kmp_allocate(
2781  sizeof(kmp_internal_control_t));
2782 
2783  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2784 
2785  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2786 
2787  control->next = thread->th.th_team->t.t_control_stack_top;
2788  thread->th.th_team->t.t_control_stack_top = control;
2789  }
2790  }
2791 }
2792 
2793 /* Changes set_nproc */
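// Reached from the user-level omp_set_num_threads(); e.g. (illustrative):
//
//   omp_set_num_threads(2);  // updates the nproc ICV, may shrink an idle hot team
//   #pragma omp parallel     // forks with 2 threads unless num_threads overrides
//   { /* ... */ }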
2794 void __kmp_set_num_threads(int new_nth, int gtid) {
2795  kmp_info_t *thread;
2796  kmp_root_t *root;
2797 
2798  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2799  KMP_DEBUG_ASSERT(__kmp_init_serial);
2800 
2801  if (new_nth < 1)
2802  new_nth = 1;
2803  else if (new_nth > __kmp_max_nth)
2804  new_nth = __kmp_max_nth;
2805 
2806  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2807  thread = __kmp_threads[gtid];
2808  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2809  return; // nothing to do
2810 
2811  __kmp_save_internal_controls(thread);
2812 
2813  set__nproc(thread, new_nth);
2814 
2815  // If this omp_set_num_threads() call will cause the hot team size to be
2816  // reduced (in the absence of a num_threads clause), then reduce it now,
2817  // rather than waiting for the next parallel region.
2818  root = thread->th.th_root;
2819  if (__kmp_init_parallel && (!root->r.r_active) &&
2820  (root->r.r_hot_team->t.t_nproc > new_nth)
2821 #if KMP_NESTED_HOT_TEAMS
2822  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2823 #endif
2824  ) {
2825  kmp_team_t *hot_team = root->r.r_hot_team;
2826  int f;
2827 
2828  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2829 
2830  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2831  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2832  }
2833  // Release the extra threads we don't need any more.
2834  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2835  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2836  if (__kmp_tasking_mode != tskm_immediate_exec) {
2837  // When decreasing team size, threads no longer in the team should unref
2838  // task team.
2839  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2840  }
2841  __kmp_free_thread(hot_team->t.t_threads[f]);
2842  hot_team->t.t_threads[f] = NULL;
2843  }
2844  hot_team->t.t_nproc = new_nth;
2845 #if KMP_NESTED_HOT_TEAMS
2846  if (thread->th.th_hot_teams) {
2847  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2848  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2849  }
2850 #endif
2851 
2852  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2853  hot_team->t.b->update_num_threads(new_nth);
2854  __kmp_add_threads_to_team(hot_team, new_nth);
2855  }
2856 
2857  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2858 
2859  // Update the t_nproc field in the threads that are still active.
2860  for (f = 0; f < new_nth; f++) {
2861  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2862  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2863  }
2864  // Special flag in case omp_set_num_threads() call
2865  // Special flag marking that the hot team size was changed by an omp_set_num_threads() call
2866  }
2867 }
2868 
2869 /* Changes max_active_levels */
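// Reached from omp_set_max_active_levels(); note that a value of 0 is accepted
// and effectively serializes every parallel region, since no nesting level may
// then be active.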
2870 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2871  kmp_info_t *thread;
2872 
2873  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2874  "%d = (%d)\n",
2875  gtid, max_active_levels));
2876  KMP_DEBUG_ASSERT(__kmp_init_serial);
2877 
2878  // validate max_active_levels
2879  if (max_active_levels < 0) {
2880  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2881  // We ignore this call if the user has specified a negative value.
2882  // The current setting won't be changed. The last valid setting will be
2883  // used. A warning will be issued (if warnings are allowed as controlled by
2884  // the KMP_WARNINGS env var).
2885  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2886  "max_active_levels for thread %d = (%d)\n",
2887  gtid, max_active_levels));
2888  return;
2889  }
2890  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2891  // it's OK, the max_active_levels is within the valid range: [ 0;
2892  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2893  // We allow a zero value. (implementation defined behavior)
2894  } else {
2895  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2896  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2897  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2898  // Current upper limit is MAX_INT. (implementation defined behavior)
2899  // If the input exceeds the upper limit, we correct the input to be the
2900  // upper limit. (implementation defined behavior)
2901  // Actually, the flow should never get here as long as the limit is MAX_INT.
2902  }
2903  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2904  "max_active_levels for thread %d = (%d)\n",
2905  gtid, max_active_levels));
2906 
2907  thread = __kmp_threads[gtid];
2908 
2909  __kmp_save_internal_controls(thread);
2910 
2911  set__max_active_levels(thread, max_active_levels);
2912 }
2913 
2914 /* Gets max_active_levels */
2915 int __kmp_get_max_active_levels(int gtid) {
2916  kmp_info_t *thread;
2917 
2918  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2919  KMP_DEBUG_ASSERT(__kmp_init_serial);
2920 
2921  thread = __kmp_threads[gtid];
2922  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2923  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2924  "curtask_maxaclevel=%d\n",
2925  gtid, thread->th.th_current_task,
2926  thread->th.th_current_task->td_icvs.max_active_levels));
2927  return thread->th.th_current_task->td_icvs.max_active_levels;
2928 }
2929 
2930 // nteams-var per-device ICV
2931 void __kmp_set_num_teams(int num_teams) {
2932  if (num_teams > 0)
2933  __kmp_nteams = num_teams;
2934 }
2935 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2936 // teams-thread-limit-var per-device ICV
2937 void __kmp_set_teams_thread_limit(int limit) {
2938  if (limit > 0)
2939  __kmp_teams_thread_limit = limit;
2940 }
2941 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2942 
2943 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2944 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2945 
2946 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
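// Reached from omp_set_schedule(); e.g. (illustrative) setting (dynamic, 0)
// stores kmp_sch_dynamic_chunked with the default chunk, which a later
// omp_get_schedule() reports back as (dynamic, default chunk).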
2947 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2948  kmp_info_t *thread;
2949  kmp_sched_t orig_kind;
2950  // kmp_team_t *team;
2951 
2952  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2953  gtid, (int)kind, chunk));
2954  KMP_DEBUG_ASSERT(__kmp_init_serial);
2955 
2956  // Check if the kind parameter is valid, correct if needed.
2957  // Valid parameters should fit in one of two intervals - standard or extended:
2958  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2959  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2960  orig_kind = kind;
2961  kind = __kmp_sched_without_mods(kind);
2962 
2963  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2964  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2965  // TODO: Hint needs attention in case we change the default schedule.
2966  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2967  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2968  __kmp_msg_null);
2969  kind = kmp_sched_default;
2970  chunk = 0; // ignore chunk value in case of bad kind
2971  }
2972 
2973  thread = __kmp_threads[gtid];
2974 
2975  __kmp_save_internal_controls(thread);
2976 
2977  if (kind < kmp_sched_upper_std) {
2978  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2979  // differentiate static chunked vs. unchunked: the chunk should be invalid to
2980  // indicate an unchunked schedule (which is the default)
2981  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2982  } else {
2983  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2984  __kmp_sch_map[kind - kmp_sched_lower - 1];
2985  }
2986  } else {
2987  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2988  // kmp_sched_lower - 2 ];
2989  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2990  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2991  kmp_sched_lower - 2];
2992  }
2993  __kmp_sched_apply_mods_intkind(
2994  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2995  if (kind == kmp_sched_auto || chunk < 1) {
2996  // ignore parameter chunk for schedule auto
2997  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2998  } else {
2999  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
3000  }
3001 }
3002 
3003 /* Gets def_sched_var ICV values */
3004 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
3005  kmp_info_t *thread;
3006  enum sched_type th_type;
3007 
3008  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
3009  KMP_DEBUG_ASSERT(__kmp_init_serial);
3010 
3011  thread = __kmp_threads[gtid];
3012 
3013  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3014  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3015  case kmp_sch_static:
3016  case kmp_sch_static_greedy:
3017  case kmp_sch_static_balanced:
3018  *kind = kmp_sched_static;
3019  __kmp_sched_apply_mods_stdkind(kind, th_type);
3020  *chunk = 0; // chunk was not set, try to show this fact via zero value
3021  return;
3022  case kmp_sch_static_chunked:
3023  *kind = kmp_sched_static;
3024  break;
3025  case kmp_sch_dynamic_chunked:
3026  *kind = kmp_sched_dynamic;
3027  break;
3028  case kmp_sch_guided_chunked:
3029  case kmp_sch_guided_iterative_chunked:
3030  case kmp_sch_guided_analytical_chunked:
3031  *kind = kmp_sched_guided;
3032  break;
3033  case kmp_sch_auto:
3034  *kind = kmp_sched_auto;
3035  break;
3036  case kmp_sch_trapezoidal:
3037  *kind = kmp_sched_trapezoidal;
3038  break;
3039 #if KMP_STATIC_STEAL_ENABLED
3040  case kmp_sch_static_steal:
3041  *kind = kmp_sched_static_steal;
3042  break;
3043 #endif
3044  default:
3045  KMP_FATAL(UnknownSchedulingType, th_type);
3046  }
3047 
3048  __kmp_sched_apply_mods_stdkind(kind, th_type);
3049  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3050 }
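
// A hedged usage sketch for the run-time schedule ICV handled by
// __kmp_set_schedule()/__kmp_get_schedule(), assuming the standard
// omp_set_schedule()/omp_get_schedule() API reaches these internals through
// the entry-point layer (not shown in this file). A chunk below 1, and the
// "auto" kind, both fall back to the default chunk.
//
//   #include <omp.h>
//   int main(void) {
//     omp_sched_t kind;
//     int chunk;
//     omp_set_schedule(omp_sched_dynamic, 4);
//     omp_get_schedule(&kind, &chunk); // kind == omp_sched_dynamic, chunk == 4
//     omp_set_schedule(omp_sched_auto, 16);
//     omp_get_schedule(&kind, &chunk); // kind == omp_sched_auto; the 16 is ignored
//     return 0;
//   }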
3051 
3052 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3053 
3054  int ii, dd;
3055  kmp_team_t *team;
3056  kmp_info_t *thr;
3057 
3058  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3059  KMP_DEBUG_ASSERT(__kmp_init_serial);
3060 
3061  // validate level
3062  if (level == 0)
3063  return 0;
3064  if (level < 0)
3065  return -1;
3066  thr = __kmp_threads[gtid];
3067  team = thr->th.th_team;
3068  ii = team->t.t_level;
3069  if (level > ii)
3070  return -1;
3071 
3072  if (thr->th.th_teams_microtask) {
3073  // AC: we are in a teams region where multiple nested teams have the same level
3074  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3075  if (level <=
3076  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3077  KMP_DEBUG_ASSERT(ii >= tlevel);
3078  // AC: As we need to pass by the teams league, we need to artificially
3079  // increase ii
3080  if (ii == tlevel) {
3081  ii += 2; // three teams have same level
3082  } else {
3083  ii++; // two teams have same level
3084  }
3085  }
3086  }
3087 
3088  if (ii == level)
3089  return __kmp_tid_from_gtid(gtid);
3090 
3091  dd = team->t.t_serialized;
3092  level++;
3093  while (ii > level) {
3094  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3095  }
3096  if ((team->t.t_serialized) && (!dd)) {
3097  team = team->t.t_parent;
3098  continue;
3099  }
3100  if (ii > level) {
3101  team = team->t.t_parent;
3102  dd = team->t.t_serialized;
3103  ii--;
3104  }
3105  }
3106 
3107  return (dd > 1) ? (0) : (team->t.t_master_tid);
3108 }
3109 
3110 int __kmp_get_team_size(int gtid, int level) {
3111 
3112  int ii, dd;
3113  kmp_team_t *team;
3114  kmp_info_t *thr;
3115 
3116  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3117  KMP_DEBUG_ASSERT(__kmp_init_serial);
3118 
3119  // validate level
3120  if (level == 0)
3121  return 1;
3122  if (level < 0)
3123  return -1;
3124  thr = __kmp_threads[gtid];
3125  team = thr->th.th_team;
3126  ii = team->t.t_level;
3127  if (level > ii)
3128  return -1;
3129 
3130  if (thr->th.th_teams_microtask) {
3131  // AC: we are in a teams region where multiple nested teams have the same level
3132  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3133  if (level <=
3134  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3135  KMP_DEBUG_ASSERT(ii >= tlevel);
3136  // AC: As we need to pass by the teams league, we need to artificially
3137  // increase ii
3138  if (ii == tlevel) {
3139  ii += 2; // three teams have same level
3140  } else {
3141  ii++; // two teams have same level
3142  }
3143  }
3144  }
3145 
3146  while (ii > level) {
3147  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3148  }
3149  if (team->t.t_serialized && (!dd)) {
3150  team = team->t.t_parent;
3151  continue;
3152  }
3153  if (ii > level) {
3154  team = team->t.t_parent;
3155  ii--;
3156  }
3157  }
3158 
3159  return team->t.t_nproc;
3160 }
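
// Both __kmp_get_ancestor_thread_num() and __kmp_get_team_size() walk the
// t_parent chain, skipping serialized levels, to answer level-relative
// queries. A hedged user-level view (standard API assumed, nested parallelism
// enabled via the max-active-levels ICV):
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     omp_set_max_active_levels(2);
//     #pragma omp parallel num_threads(2)
//     #pragma omp parallel num_threads(3)
//     {
//       printf("outer size=%d, ancestor tid=%d, inner size=%d\n",
//              omp_get_team_size(1), omp_get_ancestor_thread_num(1),
//              omp_get_team_size(2));
//     }
//     return 0;
//   }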
3161 
3162 kmp_r_sched_t __kmp_get_schedule_global() {
3163  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3164  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3165  // independently, so the updated schedule can be obtained here.
3166 
3167  kmp_r_sched_t r_sched;
3168 
3169  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3170  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3171  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3172  // different roots (even in OMP 2.5)
3173  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3174  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3175  if (s == kmp_sch_static) {
3176  // replace STATIC with more detailed schedule (balanced or greedy)
3177  r_sched.r_sched_type = __kmp_static;
3178  } else if (s == kmp_sch_guided_chunked) {
3179  // replace GUIDED with more detailed schedule (iterative or analytical)
3180  r_sched.r_sched_type = __kmp_guided;
3181  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3182  r_sched.r_sched_type = __kmp_sched;
3183  }
3184  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3185 
3186  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3187  // __kmp_chunk may be wrong here (if it was not ever set)
3188  r_sched.chunk = KMP_DEFAULT_CHUNK;
3189  } else {
3190  r_sched.chunk = __kmp_chunk;
3191  }
3192 
3193  return r_sched;
3194 }
3195 
3196 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3197  at least argc *t_argv entries for the requested team. */
3198 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3199 
3200  KMP_DEBUG_ASSERT(team);
3201  if (!realloc || argc > team->t.t_max_argc) {
3202 
3203  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3204  "current entries=%d\n",
3205  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3206  /* if previously allocated heap space for args, free them */
3207  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3208  __kmp_free((void *)team->t.t_argv);
3209 
3210  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3211  /* use unused space in the cache line for arguments */
3212  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3213  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3214  "argv entries\n",
3215  team->t.t_id, team->t.t_max_argc));
3216  team->t.t_argv = &team->t.t_inline_argv[0];
3217  if (__kmp_storage_map) {
3218  __kmp_print_storage_map_gtid(
3219  -1, &team->t.t_inline_argv[0],
3220  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3221  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3222  team->t.t_id);
3223  }
3224  } else {
3225  /* allocate space for arguments in the heap */
3226  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3227  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3228  : 2 * argc;
3229  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3230  "argv entries\n",
3231  team->t.t_id, team->t.t_max_argc));
3232  team->t.t_argv =
3233  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3234  if (__kmp_storage_map) {
3235  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3236  &team->t.t_argv[team->t.t_max_argc],
3237  sizeof(void *) * team->t.t_max_argc,
3238  "team_%d.t_argv", team->t.t_id);
3239  }
3240  }
3241  }
3242 }
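
// The sizing policy used by __kmp_alloc_argv_entries() above, restated as a
// standalone helper for clarity (helper name hypothetical): small argument
// counts reuse the inline cache-line slots, larger ones get a heap block with
// headroom so repeated reallocation stays rare.
//
//   static int argv_capacity(int argc) {
//     if (argc <= KMP_INLINE_ARGV_ENTRIES)
//       return KMP_INLINE_ARGV_ENTRIES; // arguments fit in the inline array
//     return (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
//                ? KMP_MIN_MALLOC_ARGV_ENTRIES // small requests: minimum block
//                : 2 * argc;                   // otherwise twice the request
//   }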
3243 
3244 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3245  int i;
3246  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3247  team->t.t_threads =
3248  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3249  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3250  sizeof(dispatch_shared_info_t) * num_disp_buff);
3251  team->t.t_dispatch =
3252  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3253  team->t.t_implicit_task_taskdata =
3254  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3255  team->t.t_max_nproc = max_nth;
3256 
3257  /* setup dispatch buffers */
3258  for (i = 0; i < num_disp_buff; ++i) {
3259  team->t.t_disp_buffer[i].buffer_index = i;
3260  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3261  }
3262 }
3263 
3264 static void __kmp_free_team_arrays(kmp_team_t *team) {
3265  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3266  int i;
3267  for (i = 0; i < team->t.t_max_nproc; ++i) {
3268  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3269  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3270  team->t.t_dispatch[i].th_disp_buffer = NULL;
3271  }
3272  }
3273 #if KMP_USE_HIER_SCHED
3274  __kmp_dispatch_free_hierarchies(team);
3275 #endif
3276  __kmp_free(team->t.t_threads);
3277  __kmp_free(team->t.t_disp_buffer);
3278  __kmp_free(team->t.t_dispatch);
3279  __kmp_free(team->t.t_implicit_task_taskdata);
3280  team->t.t_threads = NULL;
3281  team->t.t_disp_buffer = NULL;
3282  team->t.t_dispatch = NULL;
3283  team->t.t_implicit_task_taskdata = 0;
3284 }
3285 
3286 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3287  kmp_info_t **oldThreads = team->t.t_threads;
3288 
3289  __kmp_free(team->t.t_disp_buffer);
3290  __kmp_free(team->t.t_dispatch);
3291  __kmp_free(team->t.t_implicit_task_taskdata);
3292  __kmp_allocate_team_arrays(team, max_nth);
3293 
3294  KMP_MEMCPY(team->t.t_threads, oldThreads,
3295  team->t.t_nproc * sizeof(kmp_info_t *));
3296 
3297  __kmp_free(oldThreads);
3298 }
3299 
3300 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3301 
3302  kmp_r_sched_t r_sched =
3303  __kmp_get_schedule_global(); // get current state of scheduling globals
3304 
3305  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3306 
3307  kmp_internal_control_t g_icvs = {
3308  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3309  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3310  // adjustment of threads (per thread)
3311  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3312  // whether blocktime is explicitly set
3313  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3314 #if KMP_USE_MONITOR
3315  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3316 // intervals
3317 #endif
3318  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3319  // next parallel region (per thread)
3320  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3321  __kmp_cg_max_nth, // int thread_limit;
3322  __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3323  // on task. This is used in the case of target thread_limit
3324  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3325  // for max_active_levels
3326  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3327  // {sched,chunk} pair
3328  __kmp_nested_proc_bind.bind_types[0],
3329  __kmp_default_device,
3330  NULL // struct kmp_internal_control *next;
3331  };
3332 
3333  return g_icvs;
3334 }
3335 
3336 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3337 
3338  kmp_internal_control_t gx_icvs;
3339  gx_icvs.serial_nesting_level =
3340  0; // probably =team->t.t_serial like in save_inter_controls
3341  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3342  gx_icvs.next = NULL;
3343 
3344  return gx_icvs;
3345 }
3346 
3347 static void __kmp_initialize_root(kmp_root_t *root) {
3348  int f;
3349  kmp_team_t *root_team;
3350  kmp_team_t *hot_team;
3351  int hot_team_max_nth;
3352  kmp_r_sched_t r_sched =
3353  __kmp_get_schedule_global(); // get current state of scheduling globals
3354  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3355  KMP_DEBUG_ASSERT(root);
3356  KMP_ASSERT(!root->r.r_begin);
3357 
3358  /* setup the root state structure */
3359  __kmp_init_lock(&root->r.r_begin_lock);
3360  root->r.r_begin = FALSE;
3361  root->r.r_active = FALSE;
3362  root->r.r_in_parallel = 0;
3363  root->r.r_blocktime = __kmp_dflt_blocktime;
3364 #if KMP_AFFINITY_SUPPORTED
3365  root->r.r_affinity_assigned = FALSE;
3366 #endif
3367 
3368  /* setup the root team for this task */
3369  /* allocate the root team structure */
3370  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3371 
3372  root_team =
3373  __kmp_allocate_team(root,
3374  1, // new_nproc
3375  1, // max_nproc
3376 #if OMPT_SUPPORT
3377  ompt_data_none, // root parallel id
3378 #endif
3379  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3380  0 // argc
3381  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3382  );
3383 #if USE_DEBUGGER
3384  // Non-NULL value should be assigned to make the debugger display the root
3385  // team.
3386  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3387 #endif
3388 
3389  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3390 
3391  root->r.r_root_team = root_team;
3392  root_team->t.t_control_stack_top = NULL;
3393 
3394  /* initialize root team */
3395  root_team->t.t_threads[0] = NULL;
3396  root_team->t.t_nproc = 1;
3397  root_team->t.t_serialized = 1;
3398  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3399  root_team->t.t_sched.sched = r_sched.sched;
3400  root_team->t.t_nested_nth = &__kmp_nested_nth;
3401  KA_TRACE(
3402  20,
3403  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3404  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3405 
3406  /* setup the hot team for this task */
3407  /* allocate the hot team structure */
3408  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3409 
3410  hot_team =
3411  __kmp_allocate_team(root,
3412  1, // new_nproc
3413  __kmp_dflt_team_nth_ub * 2, // max_nproc
3414 #if OMPT_SUPPORT
3415  ompt_data_none, // root parallel id
3416 #endif
3417  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3418  0 // argc
3419  USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3420  );
3421  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3422 
3423  root->r.r_hot_team = hot_team;
3424  root_team->t.t_control_stack_top = NULL;
3425 
3426  /* first-time initialization */
3427  hot_team->t.t_parent = root_team;
3428 
3429  /* initialize hot team */
3430  hot_team_max_nth = hot_team->t.t_max_nproc;
3431  for (f = 0; f < hot_team_max_nth; ++f) {
3432  hot_team->t.t_threads[f] = NULL;
3433  }
3434  hot_team->t.t_nproc = 1;
3435  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3436  hot_team->t.t_sched.sched = r_sched.sched;
3437  hot_team->t.t_size_changed = 0;
3438  hot_team->t.t_nested_nth = &__kmp_nested_nth;
3439 }
3440 
3441 #ifdef KMP_DEBUG
3442 
3443 typedef struct kmp_team_list_item {
3444  kmp_team_p const *entry;
3445  struct kmp_team_list_item *next;
3446 } kmp_team_list_item_t;
3447 typedef kmp_team_list_item_t *kmp_team_list_t;
3448 
3449 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3450  kmp_team_list_t list, // List of teams.
3451  kmp_team_p const *team // Team to add.
3452 ) {
3453 
3454  // List must terminate with item where both entry and next are NULL.
3455  // Team is added to the list only once.
3456  // List is sorted in ascending order by team id.
3457  // Team id is *not* a key.
3458 
3459  kmp_team_list_t l;
3460 
3461  KMP_DEBUG_ASSERT(list != NULL);
3462  if (team == NULL) {
3463  return;
3464  }
3465 
3466  __kmp_print_structure_team_accum(list, team->t.t_parent);
3467  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3468 
3469  // Search list for the team.
3470  l = list;
3471  while (l->next != NULL && l->entry != team) {
3472  l = l->next;
3473  }
3474  if (l->next != NULL) {
3475  return; // Team has been added before, exit.
3476  }
3477 
3478  // Team is not found. Search list again for insertion point.
3479  l = list;
3480  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3481  l = l->next;
3482  }
3483 
3484  // Insert team.
3485  {
3486  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3487  sizeof(kmp_team_list_item_t));
3488  *item = *l;
3489  l->entry = team;
3490  l->next = item;
3491  }
3492 }
3493 
3494 static void __kmp_print_structure_team(char const *title,
3495                                        kmp_team_p const *team) {
3496 
3497  __kmp_printf("%s", title);
3498  if (team != NULL) {
3499  __kmp_printf("%2x %p\n", team->t.t_id, team);
3500  } else {
3501  __kmp_printf(" - (nil)\n");
3502  }
3503 }
3504 
3505 static void __kmp_print_structure_thread(char const *title,
3506  kmp_info_p const *thread) {
3507  __kmp_printf("%s", title);
3508  if (thread != NULL) {
3509  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3510  } else {
3511  __kmp_printf(" - (nil)\n");
3512  }
3513 }
3514 
3515 void __kmp_print_structure(void) {
3516 
3517  kmp_team_list_t list;
3518 
3519  // Initialize list of teams.
3520  list =
3521  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3522  list->entry = NULL;
3523  list->next = NULL;
3524 
3525  __kmp_printf("\n------------------------------\nGlobal Thread "
3526  "Table\n------------------------------\n");
3527  {
3528  int gtid;
3529  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3530  __kmp_printf("%2d", gtid);
3531  if (__kmp_threads != NULL) {
3532  __kmp_printf(" %p", __kmp_threads[gtid]);
3533  }
3534  if (__kmp_root != NULL) {
3535  __kmp_printf(" %p", __kmp_root[gtid]);
3536  }
3537  __kmp_printf("\n");
3538  }
3539  }
3540 
3541  // Print out __kmp_threads array.
3542  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3543  "----------\n");
3544  if (__kmp_threads != NULL) {
3545  int gtid;
3546  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3547  kmp_info_t const *thread = __kmp_threads[gtid];
3548  if (thread != NULL) {
3549  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3550  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3551  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3552  __kmp_print_structure_team(" Serial Team: ",
3553  thread->th.th_serial_team);
3554  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3555  __kmp_print_structure_thread(" Primary: ",
3556  thread->th.th_team_master);
3557  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3558  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3559  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3560  __kmp_print_structure_thread(" Next in pool: ",
3561  thread->th.th_next_pool);
3562  __kmp_printf("\n");
3563  __kmp_print_structure_team_accum(list, thread->th.th_team);
3564  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3565  }
3566  }
3567  } else {
3568  __kmp_printf("Threads array is not allocated.\n");
3569  }
3570 
3571  // Print out __kmp_root array.
3572  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3573  "--------\n");
3574  if (__kmp_root != NULL) {
3575  int gtid;
3576  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3577  kmp_root_t const *root = __kmp_root[gtid];
3578  if (root != NULL) {
3579  __kmp_printf("GTID %2d %p:\n", gtid, root);
3580  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3581  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3582  __kmp_print_structure_thread(" Uber Thread: ",
3583  root->r.r_uber_thread);
3584  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3585  __kmp_printf(" In Parallel: %2d\n",
3586  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3587  __kmp_printf("\n");
3588  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3589  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3590  }
3591  }
3592  } else {
3593  __kmp_printf("Ubers array is not allocated.\n");
3594  }
3595 
3596  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3597  "--------\n");
3598  while (list->next != NULL) {
3599  kmp_team_p const *team = list->entry;
3600  int i;
3601  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3602  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3603  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3604  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3605  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3606  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3607  for (i = 0; i < team->t.t_nproc; ++i) {
3608  __kmp_printf(" Thread %2d: ", i);
3609  __kmp_print_structure_thread("", team->t.t_threads[i]);
3610  }
3611  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3612  __kmp_printf("\n");
3613  list = list->next;
3614  }
3615 
3616  // Print out __kmp_thread_pool and __kmp_team_pool.
3617  __kmp_printf("\n------------------------------\nPools\n----------------------"
3618  "--------\n");
3619  __kmp_print_structure_thread("Thread pool: ",
3620  CCAST(kmp_info_t *, __kmp_thread_pool));
3621  __kmp_print_structure_team("Team pool: ",
3622  CCAST(kmp_team_t *, __kmp_team_pool));
3623  __kmp_printf("\n");
3624 
3625  // Free team list.
3626  while (list != NULL) {
3627  kmp_team_list_item_t *item = list;
3628  list = list->next;
3629  KMP_INTERNAL_FREE(item);
3630  }
3631 }
3632 
3633 #endif
3634 
3635 //---------------------------------------------------------------------------
3636 // Stuff for per-thread fast random number generator
3637 // Table of primes
3638 static const unsigned __kmp_primes[] = {
3639  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3640  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3641  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3642  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3643  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3644  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3645  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3646  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3647  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3648  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3649  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3650 
3651 //---------------------------------------------------------------------------
3652 // __kmp_get_random: Get a random number using a linear congruential method.
3653 unsigned short __kmp_get_random(kmp_info_t *thread) {
3654  unsigned x = thread->th.th_x;
3655  unsigned short r = (unsigned short)(x >> 16);
3656 
3657  thread->th.th_x = x * thread->th.th_a + 1;
3658 
3659  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3660  thread->th.th_info.ds.ds_tid, r));
3661 
3662  return r;
3663 }
3664 //--------------------------------------------------------
3665 // __kmp_init_random: Initialize a random number generator
3666 void __kmp_init_random(kmp_info_t *thread) {
3667  unsigned seed = thread->th.th_info.ds.ds_tid;
3668 
3669  thread->th.th_a =
3670  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3671  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3672  KA_TRACE(30,
3673  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3674 }
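
// The per-thread generator above is a plain 32-bit linear congruential
// recurrence x_{n+1} = a * x_n + 1 (mod 2^32), with the multiplier a chosen
// per thread from __kmp_primes and only the high 16 bits of the state
// returned. A standalone model (function name hypothetical):
//
//   static unsigned short lcg_next(unsigned *x, unsigned a) {
//     unsigned short r = (unsigned short)(*x >> 16); // high bits mix best
//     *x = *x * a + 1;                               // advance the state
//     return r;
//   }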
3675 
3676 #if KMP_OS_WINDOWS
3677 /* reclaim array entries for root threads that are already dead, returns number
3678  * reclaimed */
3679 static int __kmp_reclaim_dead_roots(void) {
3680  int i, r = 0;
3681 
3682  for (i = 0; i < __kmp_threads_capacity; ++i) {
3683  if (KMP_UBER_GTID(i) &&
3684  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3685  !__kmp_root[i]
3686  ->r.r_active) { // AC: reclaim only roots died in non-active state
3687  r += __kmp_unregister_root_other_thread(i);
3688  }
3689  }
3690  return r;
3691 }
3692 #endif
3693 
3694 /* This function attempts to create free entries in __kmp_threads and
3695  __kmp_root, and returns the number of free entries generated.
3696 
3697  For Windows* OS static library, the first mechanism used is to reclaim array
3698  entries for root threads that are already dead.
3699 
3700  On all platforms, expansion is attempted on the arrays __kmp_threads and
3701  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3702  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3703  threadprivate cache array has been created. Synchronization with
3704  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3705 
3706  After any dead root reclamation, if the clipping value allows array expansion
3707  to result in the generation of a total of nNeed free slots, the function does
3708  that expansion. If not, nothing is done beyond the possible initial root
3709  thread reclamation.
3710 
3711  If any argument is negative, the behavior is undefined. */
3712 static int __kmp_expand_threads(int nNeed) {
3713  int added = 0;
3714  int minimumRequiredCapacity;
3715  int newCapacity;
3716  kmp_info_t **newThreads;
3717  kmp_root_t **newRoot;
3718 
3719  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3720  // resizing __kmp_threads does not need additional protection if foreign
3721  // threads are present
3722 
3723 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3724  /* only for Windows static library */
3725  /* reclaim array entries for root threads that are already dead */
3726  added = __kmp_reclaim_dead_roots();
3727 
3728  if (nNeed) {
3729  nNeed -= added;
3730  if (nNeed < 0)
3731  nNeed = 0;
3732  }
3733 #endif
3734  if (nNeed <= 0)
3735  return added;
3736 
3737  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3738  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3739  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3740  // > __kmp_max_nth in one of two ways:
3741  //
3742  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3743  // may not be reused by another thread, so we may need to increase
3744  // __kmp_threads_capacity to __kmp_max_nth + 1.
3745  //
3746  // 2) New foreign root(s) are encountered. We always register new foreign
3747  // roots. This may cause a smaller # of threads to be allocated at
3748  // subsequent parallel regions, but the worker threads hang around (and
3749  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3750  //
3751  // Anyway, that is the reason for moving the check to see if
3752  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3753  // instead of having it performed here. -BB
3754 
3755  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3756 
3757  /* compute expansion headroom to check if we can expand */
3758  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3759  /* possible expansion too small -- give up */
3760  return added;
3761  }
3762  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3763 
3764  newCapacity = __kmp_threads_capacity;
3765  do {
3766  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3767  : __kmp_sys_max_nth;
3768  } while (newCapacity < minimumRequiredCapacity);
3769  newThreads = (kmp_info_t **)__kmp_allocate(
3770  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3771  newRoot =
3772  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3773  KMP_MEMCPY(newThreads, __kmp_threads,
3774  __kmp_threads_capacity * sizeof(kmp_info_t *));
3775  KMP_MEMCPY(newRoot, __kmp_root,
3776  __kmp_threads_capacity * sizeof(kmp_root_t *));
3777  // Put old __kmp_threads array on a list. Any ongoing references to the old
3778  // list will be valid. This list is cleaned up at library shutdown.
3779  kmp_old_threads_list_t *node =
3780  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3781  node->threads = __kmp_threads;
3782  node->next = __kmp_old_threads_list;
3783  __kmp_old_threads_list = node;
3784 
3785  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3786  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3787  added += newCapacity - __kmp_threads_capacity;
3788  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3789 
3790  if (newCapacity > __kmp_tp_capacity) {
3791  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3792  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3793  __kmp_threadprivate_resize_cache(newCapacity);
3794  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3795  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3796  }
3797  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3798  }
3799 
3800  return added;
3801 }
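
// The capacity growth rule used above, isolated for clarity (function name
// hypothetical): keep doubling until the request fits, clipping each step at
// the system-wide maximum.
//
//   static int next_capacity(int current, int required, int sys_max) {
//     int cap = current;
//     do {
//       cap = (cap <= (sys_max >> 1)) ? (cap << 1) : sys_max;
//     } while (cap < required);
//     return cap;
//   }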
3802 
3803 /* Register the current thread as a root thread and obtain our gtid. We must
3804  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3805  thread that calls from __kmp_do_serial_initialize() */
3806 int __kmp_register_root(int initial_thread) {
3807  kmp_info_t *root_thread;
3808  kmp_root_t *root;
3809  int gtid;
3810  int capacity;
3811  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3812  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3813  KMP_MB();
3814 
3815  /* 2007-03-02:
3816  If the initial thread has not invoked the OpenMP RTL yet and this thread is
3817  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3818  does not work as expected -- it may return false (meaning there is at least
3819  one empty slot in the __kmp_threads array), but it is possible that the only
3820  free slot is #0, which is reserved for the initial thread and so cannot be
3821  used for this one. The following code works around this bug.
3822 
3823  However, the right solution seems to be not to reserve slot #0 for the
3824  initial thread, because:
3825  (1) there is no magic in slot #0,
3826  (2) we cannot detect the initial thread reliably (the first thread that does
3827  serial initialization may not be the real initial thread).
3828  */
3829  capacity = __kmp_threads_capacity;
3830  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3831  --capacity;
3832  }
3833 
3834  // If it is not for initializing the hidden helper team, we need to take
3835  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3836  // in __kmp_threads_capacity.
3837  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3838  capacity -= __kmp_hidden_helper_threads_num;
3839  }
3840 
3841  /* see if there are too many threads */
3842  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3843  if (__kmp_tp_cached) {
3844  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3845  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3846  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3847  } else {
3848  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3849  __kmp_msg_null);
3850  }
3851  }
3852 
3853  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3854  // 0: initial thread, also a regular OpenMP thread.
3855  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3856  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3857  // regular OpenMP threads.
3858  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3859  // Find an available thread slot for hidden helper thread. Slots for hidden
3860  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3861  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3862  gtid <= __kmp_hidden_helper_threads_num;
3863  gtid++)
3864  ;
3865  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3866  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3867  "hidden helper thread: T#%d\n",
3868  gtid));
3869  } else {
3870  /* find an available thread slot */
3871  // Don't reassign the zero slot since we need that to only be used by
3872  // initial thread. Slots for hidden helper threads should also be skipped.
3873  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3874  gtid = 0;
3875  } else {
3876  for (gtid = __kmp_hidden_helper_threads_num + 1;
3877  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3878  ;
3879  }
3880  KA_TRACE(
3881  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3882  KMP_ASSERT(gtid < __kmp_threads_capacity);
3883  }
3884 
3885  /* update global accounting */
3886  __kmp_all_nth++;
3887  TCW_4(__kmp_nth, __kmp_nth + 1);
3888 
3889  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3890  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3891  if (__kmp_adjust_gtid_mode) {
3892  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3893  if (TCR_4(__kmp_gtid_mode) != 2) {
3894  TCW_4(__kmp_gtid_mode, 2);
3895  }
3896  } else {
3897  if (TCR_4(__kmp_gtid_mode) != 1) {
3898  TCW_4(__kmp_gtid_mode, 1);
3899  }
3900  }
3901  }
3902 
3903 #ifdef KMP_ADJUST_BLOCKTIME
3904  /* Adjust blocktime to zero if necessary */
3905  /* Middle initialization might not have occurred yet */
3906  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3907  if (__kmp_nth > __kmp_avail_proc) {
3908  __kmp_zero_bt = TRUE;
3909  }
3910  }
3911 #endif /* KMP_ADJUST_BLOCKTIME */
3912 
3913  /* setup this new hierarchy */
3914  if (!(root = __kmp_root[gtid])) {
3915  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3916  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3917  }
3918 
3919 #if KMP_STATS_ENABLED
3920  // Initialize stats as soon as possible (right after gtid assignment).
3921  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3922  __kmp_stats_thread_ptr->startLife();
3923  KMP_SET_THREAD_STATE(SERIAL_REGION);
3924  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3925 #endif
3926  __kmp_initialize_root(root);
3927 
3928  /* setup new root thread structure */
3929  if (root->r.r_uber_thread) {
3930  root_thread = root->r.r_uber_thread;
3931  } else {
3932  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3933  if (__kmp_storage_map) {
3934  __kmp_print_thread_storage_map(root_thread, gtid);
3935  }
3936  root_thread->th.th_info.ds.ds_gtid = gtid;
3937 #if OMPT_SUPPORT
3938  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3939 #endif
3940  root_thread->th.th_root = root;
3941  if (__kmp_env_consistency_check) {
3942  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3943  }
3944 #if USE_FAST_MEMORY
3945  __kmp_initialize_fast_memory(root_thread);
3946 #endif /* USE_FAST_MEMORY */
3947 
3948 #if KMP_USE_BGET
3949  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3950  __kmp_initialize_bget(root_thread);
3951 #endif
3952  __kmp_init_random(root_thread); // Initialize random number generator
3953  }
3954 
3955  /* setup the serial team held in reserve by the root thread */
3956  if (!root_thread->th.th_serial_team) {
3957  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3958  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3959  root_thread->th.th_serial_team = __kmp_allocate_team(
3960  root, 1, 1,
3961 #if OMPT_SUPPORT
3962  ompt_data_none, // root parallel id
3963 #endif
3964  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3965  }
3966  KMP_ASSERT(root_thread->th.th_serial_team);
3967  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3968  root_thread->th.th_serial_team));
3969 
3970  /* drop root_thread into place */
3971  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3972 
3973  root->r.r_root_team->t.t_threads[0] = root_thread;
3974  root->r.r_hot_team->t.t_threads[0] = root_thread;
3975  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3976  // AC: the team is created in reserve, not for execution (it is unused for now).
3977  root_thread->th.th_serial_team->t.t_serialized = 0;
3978  root->r.r_uber_thread = root_thread;
3979 
3980  /* initialize the thread, get it ready to go */
3981  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3982  TCW_4(__kmp_init_gtid, TRUE);
3983 
3984  /* prepare the primary thread for get_gtid() */
3985  __kmp_gtid_set_specific(gtid);
3986 
3987 #if USE_ITT_BUILD
3988  __kmp_itt_thread_name(gtid);
3989 #endif /* USE_ITT_BUILD */
3990 
3991 #ifdef KMP_TDATA_GTID
3992  __kmp_gtid = gtid;
3993 #endif
3994  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3995  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3996 
3997  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3998  "plain=%u\n",
3999  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
4000  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
4001  KMP_INIT_BARRIER_STATE));
4002  { // Initialize barrier data.
4003  int b;
4004  for (b = 0; b < bs_last_barrier; ++b) {
4005  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
4006 #if USE_DEBUGGER
4007  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
4008 #endif
4009  }
4010  }
4011  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
4012  KMP_INIT_BARRIER_STATE);
4013 
4014 #if KMP_AFFINITY_SUPPORTED
4015  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4016  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4017  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4018  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4019 #endif /* KMP_AFFINITY_SUPPORTED */
4020  root_thread->th.th_def_allocator = __kmp_def_allocator;
4021  root_thread->th.th_prev_level = 0;
4022  root_thread->th.th_prev_num_threads = 1;
4023 
4024  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4025  tmp->cg_root = root_thread;
4026  tmp->cg_thread_limit = __kmp_cg_max_nth;
4027  tmp->cg_nthreads = 1;
4028  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4029  " cg_nthreads init to 1\n",
4030  root_thread, tmp));
4031  tmp->up = NULL;
4032  root_thread->th.th_cg_roots = tmp;
4033 
4034  __kmp_root_counter++;
4035 
4036 #if OMPT_SUPPORT
4037  if (ompt_enabled.enabled) {
4038 
4039  kmp_info_t *root_thread = ompt_get_thread();
4040 
4041  ompt_set_thread_state(root_thread, ompt_state_overhead);
4042 
4043  if (ompt_enabled.ompt_callback_thread_begin) {
4044  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4045  ompt_thread_initial, __ompt_get_thread_data_internal());
4046  }
4047  ompt_data_t *task_data;
4048  ompt_data_t *parallel_data;
4049  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4050  NULL);
4051  if (ompt_enabled.ompt_callback_implicit_task) {
4052  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4053  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4054  }
4055 
4056  ompt_set_thread_state(root_thread, ompt_state_work_serial);
4057  }
4058 #endif
4059 #if OMPD_SUPPORT
4060  if (ompd_state & OMPD_ENABLE_BP)
4061  ompd_bp_thread_begin();
4062 #endif
4063 
4064  KMP_MB();
4065  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4066 
4067  return gtid;
4068 }
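
// Slot selection in __kmp_register_root(), summarized as a small helper
// (function name hypothetical): gtid 0 is reserved for the initial thread,
// gtids 1..__kmp_hidden_helper_threads_num for hidden helper threads, and any
// other root scans for the first free slot past the helper range.
//
//   static int first_candidate_gtid(bool initial_thread,
//                                   bool registering_helpers,
//                                   int hidden_helpers) {
//     if (registering_helpers)
//       return 1;                // helpers occupy gtids [1, hidden_helpers]
//     if (initial_thread)
//       return 0;                // the initial thread may claim slot 0
//     return hidden_helpers + 1; // regular roots start after the helper range
//   }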
4069 
4070 #if KMP_NESTED_HOT_TEAMS
4071 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4072  const int max_level) {
4073  int i, n, nth;
4074  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4075  if (!hot_teams || !hot_teams[level].hot_team) {
4076  return 0;
4077  }
4078  KMP_DEBUG_ASSERT(level < max_level);
4079  kmp_team_t *team = hot_teams[level].hot_team;
4080  nth = hot_teams[level].hot_team_nth;
4081  n = nth - 1; // primary thread is not freed
4082  if (level < max_level - 1) {
4083  for (i = 0; i < nth; ++i) {
4084  kmp_info_t *th = team->t.t_threads[i];
4085  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4086  if (i > 0 && th->th.th_hot_teams) {
4087  __kmp_free(th->th.th_hot_teams);
4088  th->th.th_hot_teams = NULL;
4089  }
4090  }
4091  }
4092  __kmp_free_team(root, team, NULL);
4093  return n;
4094 }
4095 #endif
4096 
4097 // Resets a root thread and clears its root and hot teams.
4098 // Returns the number of __kmp_threads entries directly and indirectly freed.
4099 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4100  kmp_team_t *root_team = root->r.r_root_team;
4101  kmp_team_t *hot_team = root->r.r_hot_team;
4102  int n = hot_team->t.t_nproc;
4103  int i;
4104 
4105  KMP_DEBUG_ASSERT(!root->r.r_active);
4106 
4107  root->r.r_root_team = NULL;
4108  root->r.r_hot_team = NULL;
4109  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4110  // before call to __kmp_free_team().
4111  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4112 #if KMP_NESTED_HOT_TEAMS
4113  if (__kmp_hot_teams_max_level >
4114  0) { // need to free nested hot teams and their threads if any
4115  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4116  kmp_info_t *th = hot_team->t.t_threads[i];
4117  if (__kmp_hot_teams_max_level > 1) {
4118  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4119  }
4120  if (th->th.th_hot_teams) {
4121  __kmp_free(th->th.th_hot_teams);
4122  th->th.th_hot_teams = NULL;
4123  }
4124  }
4125  }
4126 #endif
4127  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4128 
4129  // Before we can reap the thread, we need to make certain that all other
4130  // threads in the teams that had this root as ancestor have stopped trying to
4131  // steal tasks.
4132  if (__kmp_tasking_mode != tskm_immediate_exec) {
4133  __kmp_wait_to_unref_task_teams();
4134  }
4135 
4136 #if KMP_OS_WINDOWS
4137  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4138  KA_TRACE(
4139  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4140  "\n",
4141  (LPVOID) & (root->r.r_uber_thread->th),
4142  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4143  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4144 #endif /* KMP_OS_WINDOWS */
4145 
4146 #if OMPD_SUPPORT
4147  if (ompd_state & OMPD_ENABLE_BP)
4148  ompd_bp_thread_end();
4149 #endif
4150 
4151 #if OMPT_SUPPORT
4152  ompt_data_t *task_data;
4153  ompt_data_t *parallel_data;
4154  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4155  NULL);
4156  if (ompt_enabled.ompt_callback_implicit_task) {
4157  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4158  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4159  }
4160  if (ompt_enabled.ompt_callback_thread_end) {
4161  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4162  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4163  }
4164 #endif
4165 
4166  TCW_4(__kmp_nth,
4167  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4168  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4169  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4170  " to %d\n",
4171  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4172  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4173  if (i == 1) {
4174  // need to free contention group structure
4175  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4176  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4177  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4178  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4179  root->r.r_uber_thread->th.th_cg_roots = NULL;
4180  }
4181  __kmp_reap_thread(root->r.r_uber_thread, 1);
4182 
4183  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4184  // it instead of freeing it.
4185  root->r.r_uber_thread = NULL;
4186  /* mark root as no longer in use */
4187  root->r.r_begin = FALSE;
4188 
4189  return n;
4190 }
4191 
4192 void __kmp_unregister_root_current_thread(int gtid) {
4193  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4194  /* this lock should be ok, since unregister_root_current_thread is never
4195  called during an abort, only during a normal close. furthermore, if you
4196  have the forkjoin lock, you should never try to get the initz lock */
4197  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4198  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4199  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4200  "exiting T#%d\n",
4201  gtid));
4202  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4203  return;
4204  }
4205  kmp_root_t *root = __kmp_root[gtid];
4206 
4207  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4208  KMP_ASSERT(KMP_UBER_GTID(gtid));
4209  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4210  KMP_ASSERT(root->r.r_active == FALSE);
4211 
4212  KMP_MB();
4213 
4214  kmp_info_t *thread = __kmp_threads[gtid];
4215  kmp_team_t *team = thread->th.th_team;
4216  kmp_task_team_t *task_team = thread->th.th_task_team;
4217 
4218  // we need to wait for the proxy tasks before finishing the thread
4219  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4220  task_team->tt.tt_hidden_helper_task_encountered)) {
4221 #if OMPT_SUPPORT
4222  // the runtime is shutting down so we won't report any events
4223  thread->th.ompt_thread_info.state = ompt_state_undefined;
4224 #endif
4225  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4226  }
4227 
4228  __kmp_reset_root(gtid, root);
4229 
4230  KMP_MB();
4231  KC_TRACE(10,
4232  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4233 
4234  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4235 }
4236 
4237 #if KMP_OS_WINDOWS
4238 /* __kmp_forkjoin_lock must be already held
4239  Unregisters a root thread that is not the current thread. Returns the number
4240  of __kmp_threads entries freed as a result. */
4241 static int __kmp_unregister_root_other_thread(int gtid) {
4242  kmp_root_t *root = __kmp_root[gtid];
4243  int r;
4244 
4245  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4246  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4247  KMP_ASSERT(KMP_UBER_GTID(gtid));
4248  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4249  KMP_ASSERT(root->r.r_active == FALSE);
4250 
4251  r = __kmp_reset_root(gtid, root);
4252  KC_TRACE(10,
4253  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4254  return r;
4255 }
4256 #endif
4257 
4258 #if KMP_DEBUG
4259 void __kmp_task_info() {
4260 
4261  kmp_int32 gtid = __kmp_entry_gtid();
4262  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4263  kmp_info_t *this_thr = __kmp_threads[gtid];
4264  kmp_team_t *steam = this_thr->th.th_serial_team;
4265  kmp_team_t *team = this_thr->th.th_team;
4266 
4267  __kmp_printf(
4268  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4269  "ptask=%p\n",
4270  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4271  team->t.t_implicit_task_taskdata[tid].td_parent);
4272 }
4273 #endif // KMP_DEBUG
4274 
4275 /* TODO optimize with one big memclr, take out what isn't needed, split
4276  responsibility to workers as much as possible, and delay initialization of
4277  features as much as possible */
4278 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4279  int tid, int gtid) {
4280  /* this_thr->th.th_info.ds.ds_gtid is setup in
4281  kmp_allocate_thread/create_worker.
4282  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4283  KMP_DEBUG_ASSERT(this_thr != NULL);
4284  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4285  KMP_DEBUG_ASSERT(team);
4286  KMP_DEBUG_ASSERT(team->t.t_threads);
4287  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4288  kmp_info_t *master = team->t.t_threads[0];
4289  KMP_DEBUG_ASSERT(master);
4290  KMP_DEBUG_ASSERT(master->th.th_root);
4291 
4292  KMP_MB();
4293 
4294  TCW_SYNC_PTR(this_thr->th.th_team, team);
4295 
4296  this_thr->th.th_info.ds.ds_tid = tid;
4297  this_thr->th.th_set_nproc = 0;
4298  if (__kmp_tasking_mode != tskm_immediate_exec)
4299  // When tasking is possible, threads are not safe to reap until they are
4300  // done tasking; this will be set when tasking code is exited in wait
4301  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4302  else // no tasking --> always safe to reap
4303  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4304  this_thr->th.th_set_proc_bind = proc_bind_default;
4305 
4306 #if KMP_AFFINITY_SUPPORTED
4307  this_thr->th.th_new_place = this_thr->th.th_current_place;
4308 #endif
4309  this_thr->th.th_root = master->th.th_root;
4310 
4311  /* setup the thread's cache of the team structure */
4312  this_thr->th.th_team_nproc = team->t.t_nproc;
4313  this_thr->th.th_team_master = master;
4314  this_thr->th.th_team_serialized = team->t.t_serialized;
4315 
4316  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4317 
4318  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4319  tid, gtid, this_thr, this_thr->th.th_current_task));
4320 
4321  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4322  team, tid, TRUE);
4323 
4324  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4325  tid, gtid, this_thr, this_thr->th.th_current_task));
4326  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4327  // __kmp_initialize_team()?
4328 
4329  /* TODO no worksharing in speculative threads */
4330  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4331 
4332  this_thr->th.th_local.this_construct = 0;
4333 
4334  if (!this_thr->th.th_pri_common) {
4335  this_thr->th.th_pri_common =
4336  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4337  if (__kmp_storage_map) {
4338  __kmp_print_storage_map_gtid(
4339  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4340  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4341  }
4342  this_thr->th.th_pri_head = NULL;
4343  }
4344 
4345  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4346  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4347  // Make new thread's CG root same as primary thread's
4348  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4349  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4350  if (tmp) {
4351  // worker changes CG, need to check if old CG should be freed
4352  int i = tmp->cg_nthreads--;
4353  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4354  " on node %p of thread %p to %d\n",
4355  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4356  if (i == 1) {
4357  __kmp_free(tmp); // last thread left CG --> free it
4358  }
4359  }
4360  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4361  // Increment new thread's CG root's counter to add the new thread
4362  this_thr->th.th_cg_roots->cg_nthreads++;
4363  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4364  " node %p of thread %p to %d\n",
4365  this_thr, this_thr->th.th_cg_roots,
4366  this_thr->th.th_cg_roots->cg_root,
4367  this_thr->th.th_cg_roots->cg_nthreads));
4368  this_thr->th.th_current_task->td_icvs.thread_limit =
4369  this_thr->th.th_cg_roots->cg_thread_limit;
4370  }
4371 
4372  /* Initialize dynamic dispatch */
4373  {
4374  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4375  // Use team max_nproc since this will never change for the team.
4376  size_t disp_size =
4377  sizeof(dispatch_private_info_t) *
4378  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4379  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4380  team->t.t_max_nproc));
4381  KMP_ASSERT(dispatch);
4382  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4383  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4384 
4385  dispatch->th_disp_index = 0;
4386  dispatch->th_doacross_buf_idx = 0;
4387  if (!dispatch->th_disp_buffer) {
4388  dispatch->th_disp_buffer =
4389  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4390 
4391  if (__kmp_storage_map) {
4392  __kmp_print_storage_map_gtid(
4393  gtid, &dispatch->th_disp_buffer[0],
4394  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4395  ? 1
4396  : __kmp_dispatch_num_buffers],
4397  disp_size,
4398  "th_%d.th_dispatch.th_disp_buffer "
4399  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4400  gtid, team->t.t_id, gtid);
4401  }
4402  } else {
4403  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4404  }
4405 
4406  dispatch->th_dispatch_pr_current = 0;
4407  dispatch->th_dispatch_sh_current = 0;
4408 
4409  dispatch->th_deo_fcn = 0; /* ORDERED */
4410  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4411  }
4412 
4413  this_thr->th.th_next_pool = NULL;
4414 
4415  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4416  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4417 
4418  KMP_MB();
4419 }
4420 
4421 /* Allocate a new thread for the requesting team. This is only called from
4422  within a forkjoin critical section. We will first try to get an available
4423  thread from the thread pool; if none is available, we will fork a new one,
4424  assuming we are able to create one. This should be assured, as the caller
4425  should check on this first. */
4426 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4427  int new_tid) {
4428  kmp_team_t *serial_team;
4429  kmp_info_t *new_thr;
4430  int new_gtid;
4431 
4432  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4433  KMP_DEBUG_ASSERT(root && team);
4434 #if !KMP_NESTED_HOT_TEAMS
4435  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4436 #endif
4437  KMP_MB();
4438 
4439  /* first, try to get one from the thread pool unless the allocating thread is
4440  * the main hidden helper thread. The hidden helper team should always
4441  * allocate new OS threads. */
4442  if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4443  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4444  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4445  if (new_thr == __kmp_thread_pool_insert_pt) {
4446  __kmp_thread_pool_insert_pt = NULL;
4447  }
4448  TCW_4(new_thr->th.th_in_pool, FALSE);
4449  __kmp_suspend_initialize_thread(new_thr);
4450  __kmp_lock_suspend_mx(new_thr);
4451  if (new_thr->th.th_active_in_pool == TRUE) {
4452  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4453  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4454  new_thr->th.th_active_in_pool = FALSE;
4455  }
4456  __kmp_unlock_suspend_mx(new_thr);
4457 
4458  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4459  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4460  KMP_ASSERT(!new_thr->th.th_team);
4461  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4462 
4463  /* setup the thread structure */
4464  __kmp_initialize_info(new_thr, team, new_tid,
4465  new_thr->th.th_info.ds.ds_gtid);
4466  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4467 
4468  TCW_4(__kmp_nth, __kmp_nth + 1);
4469 
4470  new_thr->th.th_task_state = 0;
4471 
4472  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4473  // Make sure pool thread has transitioned to waiting on own thread struct
4474  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4475  // Thread activated in __kmp_allocate_team when increasing team size
4476  }
4477 
4478 #ifdef KMP_ADJUST_BLOCKTIME
4479  /* Adjust blocktime back to zero if necessary */
4480  /* Middle initialization might not have occurred yet */
4481  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4482  if (__kmp_nth > __kmp_avail_proc) {
4483  __kmp_zero_bt = TRUE;
4484  }
4485  }
4486 #endif /* KMP_ADJUST_BLOCKTIME */
4487 
4488 #if KMP_DEBUG
4489  // If the thread entered the pool via __kmp_free_thread, wait_flag should not
4490  // be KMP_BARRIER_PARENT_FLAG.
4491  int b;
4492  kmp_balign_t *balign = new_thr->th.th_bar;
4493  for (b = 0; b < bs_last_barrier; ++b)
4494  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4495 #endif
4496 
4497  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4498  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4499 
4500  KMP_MB();
4501  return new_thr;
4502  }
4503 
4504  /* no, we'll fork a new one */
4505  KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4506  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4507 
4508 #if KMP_USE_MONITOR
4509  // If this is the first worker thread the RTL is creating, then also
4510  // launch the monitor thread. We try to do this as early as possible.
4511  if (!TCR_4(__kmp_init_monitor)) {
4512  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4513  if (!TCR_4(__kmp_init_monitor)) {
4514  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4515  TCW_4(__kmp_init_monitor, 1);
4516  __kmp_create_monitor(&__kmp_monitor);
4517  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4518 #if KMP_OS_WINDOWS
4519  // AC: wait until the monitor has started. This is a fix for CQ232808.
4520  // The reason is that if the library is loaded/unloaded in a loop with
4521  // small (parallel) work in between, then there is a high probability that
4522  // the monitor thread starts only after the library shutdown. At shutdown
4523  // it is too late to cope with the problem, because when the primary thread
4524  // is in DllMain (process detach) the monitor has no chance to start (it is
4525  // blocked), and the primary thread has no means to inform the monitor that
4526  // the library has gone, because all the memory which the monitor can
4527  // access is going to be released/reset.
4528  while (TCR_4(__kmp_init_monitor) < 2) {
4529  KMP_YIELD(TRUE);
4530  }
4531  KF_TRACE(10, ("after monitor thread has started\n"));
4532 #endif
4533  }
4534  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4535  }
4536 #endif
4537 
4538  KMP_MB();
4539 
4540  {
4541  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4542  ? 1
4543  : __kmp_hidden_helper_threads_num + 1;
4544 
4545  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4546  ++new_gtid) {
4547  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4548  }
4549 
4550  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4551  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4552  }
4553  }
4554 
4555  /* allocate space for it. */
4556  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4557 
4558  new_thr->th.th_nt_strict = false;
4559  new_thr->th.th_nt_loc = NULL;
4560  new_thr->th.th_nt_sev = severity_fatal;
4561  new_thr->th.th_nt_msg = NULL;
4562 
4563  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4564 
4565 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4566  // suppress race conditions detection on synchronization flags in debug mode
4567  // this helps to analyze library internals eliminating false positives
4568  __itt_suppress_mark_range(
4569  __itt_suppress_range, __itt_suppress_threading_errors,
4570  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4571  __itt_suppress_mark_range(
4572  __itt_suppress_range, __itt_suppress_threading_errors,
4573  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4574 #if KMP_OS_WINDOWS
4575  __itt_suppress_mark_range(
4576  __itt_suppress_range, __itt_suppress_threading_errors,
4577  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4578 #else
4579  __itt_suppress_mark_range(__itt_suppress_range,
4580  __itt_suppress_threading_errors,
4581  &new_thr->th.th_suspend_init_count,
4582  sizeof(new_thr->th.th_suspend_init_count));
4583 #endif
4584  // TODO: check if we need to also suppress b_arrived flags
4585  __itt_suppress_mark_range(__itt_suppress_range,
4586  __itt_suppress_threading_errors,
4587  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4588  sizeof(new_thr->th.th_bar[0].bb.b_go));
4589  __itt_suppress_mark_range(__itt_suppress_range,
4590  __itt_suppress_threading_errors,
4591  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4592  sizeof(new_thr->th.th_bar[1].bb.b_go));
4593  __itt_suppress_mark_range(__itt_suppress_range,
4594  __itt_suppress_threading_errors,
4595  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4596  sizeof(new_thr->th.th_bar[2].bb.b_go));
4597 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4598  if (__kmp_storage_map) {
4599  __kmp_print_thread_storage_map(new_thr, new_gtid);
4600  }
4601 
4602  // add the reserve serialized team, initialized from the team's primary thread
4603  {
4604  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4605  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4606  new_thr->th.th_serial_team = serial_team =
4607  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4608 #if OMPT_SUPPORT
4609  ompt_data_none, // root parallel id
4610 #endif
4611  proc_bind_default, &r_icvs,
4612  0 USE_NESTED_HOT_ARG(NULL));
4613  }
4614  KMP_ASSERT(serial_team);
4615  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4616  // execution (it is unused for now).
4617  serial_team->t.t_threads[0] = new_thr;
4618  KF_TRACE(10,
4619  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4620  new_thr));
4621 
4622  /* setup the thread structures */
4623  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4624 
4625 #if USE_FAST_MEMORY
4626  __kmp_initialize_fast_memory(new_thr);
4627 #endif /* USE_FAST_MEMORY */
4628 
4629 #if KMP_USE_BGET
4630  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4631  __kmp_initialize_bget(new_thr);
4632 #endif
4633 
4634  __kmp_init_random(new_thr); // Initialize random number generator
4635 
4636  /* Initialize these only once when thread is grabbed for a team allocation */
4637  KA_TRACE(20,
4638  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4639  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4640 
4641  int b;
4642  kmp_balign_t *balign = new_thr->th.th_bar;
4643  for (b = 0; b < bs_last_barrier; ++b) {
4644  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4645  balign[b].bb.team = NULL;
4646  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4647  balign[b].bb.use_oncore_barrier = 0;
4648  }
4649 
4650  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4651  new_thr->th.th_sleep_loc_type = flag_unset;
4652 
4653  new_thr->th.th_spin_here = FALSE;
4654  new_thr->th.th_next_waiting = 0;
4655 #if KMP_OS_UNIX
4656  new_thr->th.th_blocking = false;
4657 #endif
4658 
4659 #if KMP_AFFINITY_SUPPORTED
4660  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4661  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4662  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4663  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4664 #endif
4665  new_thr->th.th_def_allocator = __kmp_def_allocator;
4666  new_thr->th.th_prev_level = 0;
4667  new_thr->th.th_prev_num_threads = 1;
4668 
4669  TCW_4(new_thr->th.th_in_pool, FALSE);
4670  new_thr->th.th_active_in_pool = FALSE;
4671  TCW_4(new_thr->th.th_active, TRUE);
4672 
4673  new_thr->th.th_set_nested_nth = NULL;
4674  new_thr->th.th_set_nested_nth_sz = 0;
4675 
4676  /* adjust the global counters */
4677  __kmp_all_nth++;
4678  __kmp_nth++;
4679 
4680  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4681  // numbers of procs, and method #2 (keyed API call) for higher numbers.
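// For example (illustrative, assuming __kmp_tls_gtid_min == 20): gtid mode
// stays at 1 (stack-pointer search) while fewer than 20 threads exist in
// total, and switches to 2 (keyed/TLS lookup) once __kmp_all_nth reaches 20,
// which is exactly the check performed below.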
4682  if (__kmp_adjust_gtid_mode) {
4683  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4684  if (TCR_4(__kmp_gtid_mode) != 2) {
4685  TCW_4(__kmp_gtid_mode, 2);
4686  }
4687  } else {
4688  if (TCR_4(__kmp_gtid_mode) != 1) {
4689  TCW_4(__kmp_gtid_mode, 1);
4690  }
4691  }
4692  }
4693 
4694 #ifdef KMP_ADJUST_BLOCKTIME
4695  /* Adjust blocktime back to zero if necessary */
4696  /* Middle initialization might not have occurred yet */
4697  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4698  if (__kmp_nth > __kmp_avail_proc) {
4699  __kmp_zero_bt = TRUE;
4700  }
4701  }
4702 #endif /* KMP_ADJUST_BLOCKTIME */
4703 
4704 #if KMP_AFFINITY_SUPPORTED
4705  // Set the affinity and topology information for new thread
4706  __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4707 #endif
4708 
4709  /* actually fork it and create the new worker thread */
4710  KF_TRACE(
4711  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4712  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4713  KF_TRACE(10,
4714  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4715 
4716  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4717  new_gtid));
4718  KMP_MB();
4719  return new_thr;
4720 }
4721 
4722 /* Reinitialize team for reuse.
4723  The hot team code calls this routine at every fork barrier, so the EPCC
4724  barrier tests are extremely sensitive to changes in it, especially writes
4725  to the team struct, which cause a cache invalidation in all threads.
4726  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4727 static void __kmp_reinitialize_team(kmp_team_t *team,
4728  kmp_internal_control_t *new_icvs,
4729  ident_t *loc) {
4730  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4731  team->t.t_threads[0], team));
4732  KMP_DEBUG_ASSERT(team && new_icvs);
4733  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4734  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4735 
4736  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4737  // Copy ICVs to the primary thread's implicit taskdata
4738  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4739  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4740 
4741  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4742  team->t.t_threads[0], team));
4743 }
4744 
4745 /* Initialize the team data structure.
4746  This assumes the t_threads and t_max_nproc are already set.
4747  Also, we don't touch the arguments */
4748 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4749  kmp_internal_control_t *new_icvs,
4750  ident_t *loc) {
4751  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4752 
4753  /* verify */
4754  KMP_DEBUG_ASSERT(team);
4755  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4756  KMP_DEBUG_ASSERT(team->t.t_threads);
4757  KMP_MB();
4758 
4759  team->t.t_master_tid = 0; /* not needed */
4760  /* team->t.t_master_bar; not needed */
4761  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4762  team->t.t_nproc = new_nproc;
4763 
4764  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4765  team->t.t_next_pool = NULL;
4766  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4767  * up hot team */
4768 
4769  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4770  team->t.t_invoke = NULL; /* not needed */
4771 
4772  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4773  team->t.t_sched.sched = new_icvs->sched.sched;
4774 
4775 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4776  team->t.t_fp_control_saved = FALSE; /* not needed */
4777  team->t.t_x87_fpu_control_word = 0; /* not needed */
4778  team->t.t_mxcsr = 0; /* not needed */
4779 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4780 
4781  team->t.t_construct = 0;
4782 
4783  team->t.t_ordered.dt.t_value = 0;
4784  team->t.t_master_active = FALSE;
4785 
4786 #ifdef KMP_DEBUG
4787  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4788 #endif
4789 #if KMP_OS_WINDOWS
4790  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4791 #endif
4792 
4793  team->t.t_control_stack_top = NULL;
4794 
4795  __kmp_reinitialize_team(team, new_icvs, loc);
4796 
4797  KMP_MB();
4798  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4799 }
4800 
4801 #if KMP_AFFINITY_SUPPORTED
4802 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4803  int first, int last, int newp) {
4804  th->th.th_first_place = first;
4805  th->th.th_last_place = last;
4806  th->th.th_new_place = newp;
4807  if (newp != th->th.th_current_place) {
4808  if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4809  team->t.t_display_affinity = 1;
4810  // Copy topology information associated with the new place
4811  th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4812  th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4813  }
4814 }
4815 
4816 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4817 // It calculates the worker + primary thread's partition based upon the parent
4818 // thread's partition, and binds each worker to a thread in their partition.
4819 // The primary thread's partition should already include its current binding.
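/* Worked example (illustrative): assume OMP_PLACES yields 4 places P0..P3,
   the primary thread is bound to P0, and its partition is [P0,P3].
     proc_bind_primary, 4 threads: every worker is placed at P0 (the
       partition [P0,P3] is kept).
     proc_bind_close,   4 threads: workers take the next places in order,
       i.e. T1->P1, T2->P2, T3->P3 (wrapping around if needed).
     proc_bind_spread,  2 threads: the partition is split into subpartitions,
       T0->[P0,P1] bound at P0 and T1->[P2,P3] bound at P2. */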
4820 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4821  // Do not partition places for the hidden helper team
4822  if (KMP_HIDDEN_HELPER_TEAM(team))
4823  return;
4824  // Copy the primary thread's place partition to the team struct
4825  kmp_info_t *master_th = team->t.t_threads[0];
4826  KMP_DEBUG_ASSERT(master_th != NULL);
4827  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4828  int first_place = master_th->th.th_first_place;
4829  int last_place = master_th->th.th_last_place;
4830  int masters_place = master_th->th.th_current_place;
4831  int num_masks = __kmp_affinity.num_masks;
4832  team->t.t_first_place = first_place;
4833  team->t.t_last_place = last_place;
4834 
4835  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4836  "bound to place %d partition = [%d,%d]\n",
4837  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4838  team->t.t_id, masters_place, first_place, last_place));
4839 
4840  switch (proc_bind) {
4841 
4842  case proc_bind_default:
4843  // Serial teams might have the proc_bind policy set to proc_bind_default.
4844  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4845  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4846  break;
4847 
4848  case proc_bind_primary: {
4849  int f;
4850  int n_th = team->t.t_nproc;
4851  for (f = 1; f < n_th; f++) {
4852  kmp_info_t *th = team->t.t_threads[f];
4853  KMP_DEBUG_ASSERT(th != NULL);
4854  __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4855 
4856  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4857  "partition = [%d,%d]\n",
4858  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4859  f, masters_place, first_place, last_place));
4860  }
4861  } break;
4862 
4863  case proc_bind_close: {
4864  int f;
4865  int n_th = team->t.t_nproc;
4866  int n_places;
4867  if (first_place <= last_place) {
4868  n_places = last_place - first_place + 1;
4869  } else {
4870  n_places = num_masks - first_place + last_place + 1;
4871  }
4872  if (n_th <= n_places) {
4873  int place = masters_place;
4874  for (f = 1; f < n_th; f++) {
4875  kmp_info_t *th = team->t.t_threads[f];
4876  KMP_DEBUG_ASSERT(th != NULL);
4877 
4878  if (place == last_place) {
4879  place = first_place;
4880  } else if (place == (num_masks - 1)) {
4881  place = 0;
4882  } else {
4883  place++;
4884  }
4885  __kmp_set_thread_place(team, th, first_place, last_place, place);
4886 
4887  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4888  "partition = [%d,%d]\n",
4889  __kmp_gtid_from_thread(team->t.t_threads[f]),
4890  team->t.t_id, f, place, first_place, last_place));
4891  }
4892  } else {
4893  int S, rem, gap, s_count;
4894  S = n_th / n_places;
4895  s_count = 0;
4896  rem = n_th - (S * n_places);
4897  gap = rem > 0 ? n_places / rem : n_places;
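      // E.g. (illustrative): 10 threads over 4 places gives S = 2, rem = 2,
      // gap = 2, so the places receive 3, 2, 3, 2 threads respectively (an
      // extra thread is added every 'gap'-th place until 'rem' is exhausted).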
4898  int place = masters_place;
4899  int gap_ct = gap;
4900  for (f = 0; f < n_th; f++) {
4901  kmp_info_t *th = team->t.t_threads[f];
4902  KMP_DEBUG_ASSERT(th != NULL);
4903 
4904  __kmp_set_thread_place(team, th, first_place, last_place, place);
4905  s_count++;
4906 
4907  if ((s_count == S) && rem && (gap_ct == gap)) {
4908  // do nothing, add an extra thread to place on next iteration
4909  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4910  // we added an extra thread to this place; move to next place
4911  if (place == last_place) {
4912  place = first_place;
4913  } else if (place == (num_masks - 1)) {
4914  place = 0;
4915  } else {
4916  place++;
4917  }
4918  s_count = 0;
4919  gap_ct = 1;
4920  rem--;
4921  } else if (s_count == S) { // place full; don't add extra
4922  if (place == last_place) {
4923  place = first_place;
4924  } else if (place == (num_masks - 1)) {
4925  place = 0;
4926  } else {
4927  place++;
4928  }
4929  gap_ct++;
4930  s_count = 0;
4931  }
4932 
4933  KA_TRACE(100,
4934  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4935  "partition = [%d,%d]\n",
4936  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4937  th->th.th_new_place, first_place, last_place));
4938  }
4939  KMP_DEBUG_ASSERT(place == masters_place);
4940  }
4941  } break;
4942 
4943  case proc_bind_spread: {
4944  int f;
4945  int n_th = team->t.t_nproc;
4946  int n_places;
4947  int thidx;
4948  if (first_place <= last_place) {
4949  n_places = last_place - first_place + 1;
4950  } else {
4951  n_places = num_masks - first_place + last_place + 1;
4952  }
4953  if (n_th <= n_places) {
4954  int place = -1;
4955 
4956  if (n_places != num_masks) {
4957  int S = n_places / n_th;
4958  int s_count, rem, gap, gap_ct;
4959 
4960  place = masters_place;
4961  rem = n_places - n_th * S;
4962  gap = rem ? n_th / rem : 1;
4963  gap_ct = gap;
4964  thidx = n_th;
4965  if (update_master_only == 1)
4966  thidx = 1;
4967  for (f = 0; f < thidx; f++) {
4968  kmp_info_t *th = team->t.t_threads[f];
4969  KMP_DEBUG_ASSERT(th != NULL);
4970 
4971  int fplace = place, nplace = place;
4972  s_count = 1;
4973  while (s_count < S) {
4974  if (place == last_place) {
4975  place = first_place;
4976  } else if (place == (num_masks - 1)) {
4977  place = 0;
4978  } else {
4979  place++;
4980  }
4981  s_count++;
4982  }
4983  if (rem && (gap_ct == gap)) {
4984  if (place == last_place) {
4985  place = first_place;
4986  } else if (place == (num_masks - 1)) {
4987  place = 0;
4988  } else {
4989  place++;
4990  }
4991  rem--;
4992  gap_ct = 0;
4993  }
4994  __kmp_set_thread_place(team, th, fplace, place, nplace);
4995  gap_ct++;
4996 
4997  if (place == last_place) {
4998  place = first_place;
4999  } else if (place == (num_masks - 1)) {
5000  place = 0;
5001  } else {
5002  place++;
5003  }
5004 
5005  KA_TRACE(100,
5006  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5007  "partition = [%d,%d], num_masks: %u\n",
5008  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
5009  f, th->th.th_new_place, th->th.th_first_place,
5010  th->th.th_last_place, num_masks));
5011  }
5012  } else {
5013  /* Having a uniform space of available computation places, we can create
5014  T partitions of round(P/T) size and put threads into the first
5015  place of each partition. */
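      /* E.g. (illustrative): with n_places = 8 and n_th = 3, spacing =
         (8 + 1) / 3 = 3.0, so starting at the primary's place the
         subpartitions are the place ranges [0,2], [3,5] and [6,7] (the last
         one clipped to n_places - 1), and each thread is bound to the first
         place of its range. */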
5016  double current = static_cast<double>(masters_place);
5017  double spacing =
5018  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5019  int first, last;
5020  kmp_info_t *th;
5021 
5022  thidx = n_th + 1;
5023  if (update_master_only == 1)
5024  thidx = 1;
5025  for (f = 0; f < thidx; f++) {
5026  first = static_cast<int>(current);
5027  last = static_cast<int>(current + spacing) - 1;
5028  KMP_DEBUG_ASSERT(last >= first);
5029  if (first >= n_places) {
5030  if (masters_place) {
5031  first -= n_places;
5032  last -= n_places;
5033  if (first == (masters_place + 1)) {
5034  KMP_DEBUG_ASSERT(f == n_th);
5035  first--;
5036  }
5037  if (last == masters_place) {
5038  KMP_DEBUG_ASSERT(f == (n_th - 1));
5039  last--;
5040  }
5041  } else {
5042  KMP_DEBUG_ASSERT(f == n_th);
5043  first = 0;
5044  last = 0;
5045  }
5046  }
5047  if (last >= n_places) {
5048  last = (n_places - 1);
5049  }
5050  place = first;
5051  current += spacing;
5052  if (f < n_th) {
5053  KMP_DEBUG_ASSERT(0 <= first);
5054  KMP_DEBUG_ASSERT(n_places > first);
5055  KMP_DEBUG_ASSERT(0 <= last);
5056  KMP_DEBUG_ASSERT(n_places > last);
5057  KMP_DEBUG_ASSERT(last_place >= first_place);
5058  th = team->t.t_threads[f];
5059  KMP_DEBUG_ASSERT(th);
5060  __kmp_set_thread_place(team, th, first, last, place);
5061  KA_TRACE(100,
5062  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5063  "partition = [%d,%d], spacing = %.4f\n",
5064  __kmp_gtid_from_thread(team->t.t_threads[f]),
5065  team->t.t_id, f, th->th.th_new_place,
5066  th->th.th_first_place, th->th.th_last_place, spacing));
5067  }
5068  }
5069  }
5070  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5071  } else {
5072  int S, rem, gap, s_count;
5073  S = n_th / n_places;
5074  s_count = 0;
5075  rem = n_th - (S * n_places);
5076  gap = rem > 0 ? n_places / rem : n_places;
5077  int place = masters_place;
5078  int gap_ct = gap;
5079  thidx = n_th;
5080  if (update_master_only == 1)
5081  thidx = 1;
5082  for (f = 0; f < thidx; f++) {
5083  kmp_info_t *th = team->t.t_threads[f];
5084  KMP_DEBUG_ASSERT(th != NULL);
5085 
5086  __kmp_set_thread_place(team, th, place, place, place);
5087  s_count++;
5088 
5089  if ((s_count == S) && rem && (gap_ct == gap)) {
5090  // do nothing, add an extra thread to place on next iteration
5091  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5092  // we added an extra thread to this place; move on to next place
5093  if (place == last_place) {
5094  place = first_place;
5095  } else if (place == (num_masks - 1)) {
5096  place = 0;
5097  } else {
5098  place++;
5099  }
5100  s_count = 0;
5101  gap_ct = 1;
5102  rem--;
5103  } else if (s_count == S) { // place is full; don't add extra thread
5104  if (place == last_place) {
5105  place = first_place;
5106  } else if (place == (num_masks - 1)) {
5107  place = 0;
5108  } else {
5109  place++;
5110  }
5111  gap_ct++;
5112  s_count = 0;
5113  }
5114 
5115  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5116  "partition = [%d,%d]\n",
5117  __kmp_gtid_from_thread(team->t.t_threads[f]),
5118  team->t.t_id, f, th->th.th_new_place,
5119  th->th.th_first_place, th->th.th_last_place));
5120  }
5121  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5122  }
5123  } break;
5124 
5125  default:
5126  break;
5127  }
5128 
5129  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5130 }
5131 
5132 #endif // KMP_AFFINITY_SUPPORTED
5133 
5134 /* Allocate a new team data structure to use; take one off of the free pool
5135  if available. */
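/* In outline (illustrative summary of the code below):
     1. if the root is not active (and more than one thread is requested),
        reuse the (possibly nested) hot team, resizing it when the requested
        number of threads has changed;
     2. otherwise take a team with t_max_nproc >= max_nproc from
        __kmp_team_pool, reaping undersized pool entries along the way;
     3. otherwise allocate and initialize a brand-new kmp_team_t. */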
5136 kmp_team_t *
5137 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5138 #if OMPT_SUPPORT
5139  ompt_data_t ompt_parallel_data,
5140 #endif
5141  kmp_proc_bind_t new_proc_bind,
5142  kmp_internal_control_t *new_icvs,
5143  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5144  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5145  int f;
5146  kmp_team_t *team;
5147  int use_hot_team = !root->r.r_active;
5148  int level = 0;
5149  int do_place_partition = 1;
5150 
5151  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5152  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5153  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5154  KMP_MB();
5155 
5156 #if KMP_NESTED_HOT_TEAMS
5157  kmp_hot_team_ptr_t *hot_teams;
5158  if (master) {
5159  team = master->th.th_team;
5160  level = team->t.t_active_level;
5161  if (master->th.th_teams_microtask) { // in teams construct?
5162  if (master->th.th_teams_size.nteams > 1 &&
5163  ( // #teams > 1
5164  team->t.t_pkfn ==
5165  (microtask_t)__kmp_teams_master || // inner fork of the teams
5166  master->th.th_teams_level <
5167  team->t.t_level)) { // or nested parallel inside the teams
5168  ++level; // do not increment if #teams==1 or for the outer fork of the
5169  // teams; increment otherwise
5170  }
5171  // Do not perform the place partition for an inner fork of the teams
5172  // construct; wait until a nested parallel region is encountered inside it
5173  if ((master->th.th_teams_size.nteams == 1 &&
5174  master->th.th_teams_level >= team->t.t_level) ||
5175  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5176  do_place_partition = 0;
5177  }
5178  hot_teams = master->th.th_hot_teams;
5179  if (level < __kmp_hot_teams_max_level && hot_teams &&
5180  hot_teams[level].hot_team) {
5181  // hot team has already been allocated for given level
5182  use_hot_team = 1;
5183  } else {
5184  use_hot_team = 0;
5185  }
5186  } else {
5187  // check we won't access uninitialized hot_teams, just in case
5188  KMP_DEBUG_ASSERT(new_nproc == 1);
5189  }
5190 #endif
5191  // Optimization to use a "hot" team
5192  if (use_hot_team && new_nproc > 1) {
5193  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5194 #if KMP_NESTED_HOT_TEAMS
5195  team = hot_teams[level].hot_team;
5196 #else
5197  team = root->r.r_hot_team;
5198 #endif
5199 #if KMP_DEBUG
5200  if (__kmp_tasking_mode != tskm_immediate_exec) {
5201  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5202  "task_team[1] = %p before reinit\n",
5203  team->t.t_task_team[0], team->t.t_task_team[1]));
5204  }
5205 #endif
5206 
5207  if (team->t.t_nproc != new_nproc &&
5208  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5209  // Distributed barrier may need a resize
5210  int old_nthr = team->t.t_nproc;
5211  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5212  }
5213 
5214  // If not doing the place partition, then reset the team's proc bind
5215  // to indicate that partitioning of all threads still needs to take place
5216  if (do_place_partition == 0)
5217  team->t.t_proc_bind = proc_bind_default;
5218  // Has the number of threads changed?
5219  /* Let's assume the most common case is that the number of threads is
5220  unchanged, and put that case first. */
5221  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5222  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5223  // This case can mean that omp_set_num_threads() was called and the hot
5224  // team size was already reduced, so we check the special flag
5225  if (team->t.t_size_changed == -1) {
5226  team->t.t_size_changed = 1;
5227  } else {
5228  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5229  }
5230 
5231  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5232  kmp_r_sched_t new_sched = new_icvs->sched;
5233  // set primary thread's schedule as new run-time schedule
5234  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5235 
5236  __kmp_reinitialize_team(team, new_icvs,
5237  root->r.r_uber_thread->th.th_ident);
5238 
5239  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5240  team->t.t_threads[0], team));
5241  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5242 
5243 #if KMP_AFFINITY_SUPPORTED
5244  if ((team->t.t_size_changed == 0) &&
5245  (team->t.t_proc_bind == new_proc_bind)) {
5246  if (new_proc_bind == proc_bind_spread) {
5247  if (do_place_partition) {
5248  // add flag to update only master for spread
5249  __kmp_partition_places(team, 1);
5250  }
5251  }
5252  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5253  "proc_bind = %d, partition = [%d,%d]\n",
5254  team->t.t_id, new_proc_bind, team->t.t_first_place,
5255  team->t.t_last_place));
5256  } else {
5257  if (do_place_partition) {
5258  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5259  __kmp_partition_places(team);
5260  }
5261  }
5262 #else
5263  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5264 #endif /* KMP_AFFINITY_SUPPORTED */
5265  } else if (team->t.t_nproc > new_nproc) {
5266  KA_TRACE(20,
5267  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5268  new_nproc));
5269 
5270  team->t.t_size_changed = 1;
5271  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5272  // Barrier size already reduced earlier in this function
5273  // Activate team threads via th_used_in_team
5274  __kmp_add_threads_to_team(team, new_nproc);
5275  }
5276  // When decreasing team size, threads no longer in the team should
5277  // unref task team.
5278  if (__kmp_tasking_mode != tskm_immediate_exec) {
5279  for (f = new_nproc; f < team->t.t_nproc; f++) {
5280  kmp_info_t *th = team->t.t_threads[f];
5281  KMP_DEBUG_ASSERT(th);
5282  th->th.th_task_team = NULL;
5283  }
5284  }
5285 #if KMP_NESTED_HOT_TEAMS
5286  if (__kmp_hot_teams_mode == 0) {
5287  // AC: saved number of threads should correspond to team's value in this
5288  // mode, can be bigger in mode 1, when hot team has threads in reserve
5289  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5290  hot_teams[level].hot_team_nth = new_nproc;
5291 #endif // KMP_NESTED_HOT_TEAMS
5292  /* release the extra threads we don't need any more */
5293  for (f = new_nproc; f < team->t.t_nproc; f++) {
5294  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5295  __kmp_free_thread(team->t.t_threads[f]);
5296  team->t.t_threads[f] = NULL;
5297  }
5298 #if KMP_NESTED_HOT_TEAMS
5299  } // (__kmp_hot_teams_mode == 0)
5300  else {
5301  // When keeping extra threads in team, switch threads to wait on own
5302  // b_go flag
5303  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5304  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5305  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5306  for (int b = 0; b < bs_last_barrier; ++b) {
5307  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5308  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5309  }
5310  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5311  }
5312  }
5313  }
5314 #endif // KMP_NESTED_HOT_TEAMS
5315  team->t.t_nproc = new_nproc;
5316  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5317  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5318  __kmp_reinitialize_team(team, new_icvs,
5319  root->r.r_uber_thread->th.th_ident);
5320 
5321  // Update remaining threads
5322  for (f = 0; f < new_nproc; ++f) {
5323  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5324  }
5325 
5326  // restore the current task state of the primary thread: should be the
5327  // implicit task
5328  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5329  team->t.t_threads[0], team));
5330 
5331  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5332 
5333 #ifdef KMP_DEBUG
5334  for (f = 0; f < team->t.t_nproc; f++) {
5335  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5336  team->t.t_threads[f]->th.th_team_nproc ==
5337  team->t.t_nproc);
5338  }
5339 #endif
5340 
5341  if (do_place_partition) {
5342  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5343 #if KMP_AFFINITY_SUPPORTED
5344  __kmp_partition_places(team);
5345 #endif
5346  }
5347  } else { // team->t.t_nproc < new_nproc
5348 
5349  KA_TRACE(20,
5350  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5351  new_nproc));
5352  int old_nproc = team->t.t_nproc; // save old value and use to update only
5353  team->t.t_size_changed = 1;
5354 
5355 #if KMP_NESTED_HOT_TEAMS
5356  int avail_threads = hot_teams[level].hot_team_nth;
5357  if (new_nproc < avail_threads)
5358  avail_threads = new_nproc;
5359  kmp_info_t **other_threads = team->t.t_threads;
5360  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5361  // Adjust barrier data of reserved threads (if any) of the team
5362  // Other data will be set in __kmp_initialize_info() below.
5363  int b;
5364  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5365  for (b = 0; b < bs_last_barrier; ++b) {
5366  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5367  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5368 #if USE_DEBUGGER
5369  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5370 #endif
5371  }
5372  }
5373  if (hot_teams[level].hot_team_nth >= new_nproc) {
5374  // we have all needed threads in reserve, no need to allocate any
5375  // this is only possible in mode 1; there cannot be reserved threads in mode 0
5376  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5377  team->t.t_nproc = new_nproc; // just get reserved threads involved
5378  } else {
5379  // We may have some threads in reserve, but not enough;
5380  // get reserved threads involved if any.
5381  team->t.t_nproc = hot_teams[level].hot_team_nth;
5382  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5383 #endif // KMP_NESTED_HOT_TEAMS
5384  if (team->t.t_max_nproc < new_nproc) {
5385  /* reallocate larger arrays */
5386  __kmp_reallocate_team_arrays(team, new_nproc);
5387  __kmp_reinitialize_team(team, new_icvs, NULL);
5388  }
5389 
5390 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5391  KMP_AFFINITY_SUPPORTED
5392  /* Temporarily set full mask for primary thread before creation of
5393  workers. The reason is that workers inherit the affinity from the
5394  primary thread, so if a lot of workers are created quickly on a single
5395  core, they don't get a chance to set their own affinity for
5396  a long time. */
5397  kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5398 #endif
5399 
5400  /* allocate new threads for the hot team */
5401  for (f = team->t.t_nproc; f < new_nproc; f++) {
5402  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5403  KMP_DEBUG_ASSERT(new_worker);
5404  team->t.t_threads[f] = new_worker;
5405 
5406  KA_TRACE(20,
5407  ("__kmp_allocate_team: team %d init T#%d arrived: "
5408  "join=%llu, plain=%llu\n",
5409  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5410  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5411  team->t.t_bar[bs_plain_barrier].b_arrived));
5412 
5413  { // Initialize barrier data for new threads.
5414  int b;
5415  kmp_balign_t *balign = new_worker->th.th_bar;
5416  for (b = 0; b < bs_last_barrier; ++b) {
5417  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5418  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5419  KMP_BARRIER_PARENT_FLAG);
5420 #if USE_DEBUGGER
5421  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5422 #endif
5423  }
5424  }
5425  }
5426 
5427 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5428  KMP_AFFINITY_SUPPORTED
5429  /* Restore initial primary thread's affinity mask */
5430  new_temp_affinity.restore();
5431 #endif
5432 #if KMP_NESTED_HOT_TEAMS
5433  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5434 #endif // KMP_NESTED_HOT_TEAMS
5435  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5436  // Barrier size already increased earlier in this function
5437  // Activate team threads via th_used_in_team
5438  __kmp_add_threads_to_team(team, new_nproc);
5439  }
5440  /* make sure everyone is synchronized */
5441  // the new threads are (re)initialized below
5442  __kmp_initialize_team(team, new_nproc, new_icvs,
5443  root->r.r_uber_thread->th.th_ident);
5444 
5445  /* reinitialize the threads */
5446  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5447  for (f = 0; f < team->t.t_nproc; ++f)
5448  __kmp_initialize_info(team->t.t_threads[f], team, f,
5449  __kmp_gtid_from_tid(f, team));
5450 
5451  // set th_task_state for new threads in hot team with older thread's state
5452  kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5453  for (f = old_nproc; f < team->t.t_nproc; ++f)
5454  team->t.t_threads[f]->th.th_task_state = old_state;
5455 
5456 #ifdef KMP_DEBUG
5457  for (f = 0; f < team->t.t_nproc; ++f) {
5458  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5459  team->t.t_threads[f]->th.th_team_nproc ==
5460  team->t.t_nproc);
5461  }
5462 #endif
5463 
5464  if (do_place_partition) {
5465  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5466 #if KMP_AFFINITY_SUPPORTED
5467  __kmp_partition_places(team);
5468 #endif
5469  }
5470  } // Check changes in number of threads
5471 
5472  if (master->th.th_teams_microtask) {
5473  for (f = 1; f < new_nproc; ++f) {
5474  // propagate teams construct specific info to workers
5475  kmp_info_t *thr = team->t.t_threads[f];
5476  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5477  thr->th.th_teams_level = master->th.th_teams_level;
5478  thr->th.th_teams_size = master->th.th_teams_size;
5479  }
5480  }
5481 #if KMP_NESTED_HOT_TEAMS
5482  if (level) {
5483  // Sync barrier state for nested hot teams, not needed for outermost hot
5484  // team.
5485  for (f = 1; f < new_nproc; ++f) {
5486  kmp_info_t *thr = team->t.t_threads[f];
5487  int b;
5488  kmp_balign_t *balign = thr->th.th_bar;
5489  for (b = 0; b < bs_last_barrier; ++b) {
5490  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5491  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5492 #if USE_DEBUGGER
5493  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5494 #endif
5495  }
5496  }
5497  }
5498 #endif // KMP_NESTED_HOT_TEAMS
5499 
5500  /* reallocate space for arguments if necessary */
5501  __kmp_alloc_argv_entries(argc, team, TRUE);
5502  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5503  // The hot team re-uses the previous task team,
5504  // if untouched during the previous release->gather phase.
5505 
5506  KF_TRACE(10, (" hot_team = %p\n", team));
5507 
5508 #if KMP_DEBUG
5509  if (__kmp_tasking_mode != tskm_immediate_exec) {
5510  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5511  "task_team[1] = %p after reinit\n",
5512  team->t.t_task_team[0], team->t.t_task_team[1]));
5513  }
5514 #endif
5515 
5516 #if OMPT_SUPPORT
5517  __ompt_team_assign_id(team, ompt_parallel_data);
5518 #endif
5519 
5520  KMP_MB();
5521 
5522  return team;
5523  }
5524 
5525  /* next, let's try to take one from the team pool */
5526  KMP_MB();
5527  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5528  /* TODO: consider resizing undersized teams instead of reaping them, now
5529  that we have a resizing mechanism */
5530  if (team->t.t_max_nproc >= max_nproc) {
5531  /* take this team from the team pool */
5532  __kmp_team_pool = team->t.t_next_pool;
5533 
5534  if (max_nproc > 1 &&
5535  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5536  if (!team->t.b) { // Allocate barrier structure
5537  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5538  }
5539  }
5540 
5541  /* setup the team for fresh use */
5542  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5543 
5544  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5545  "task_team[1] %p to NULL\n",
5546  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5547  team->t.t_task_team[0] = NULL;
5548  team->t.t_task_team[1] = NULL;
5549 
5550  /* reallocate space for arguments if necessary */
5551  __kmp_alloc_argv_entries(argc, team, TRUE);
5552  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5553 
5554  KA_TRACE(
5555  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5556  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5557  { // Initialize barrier data.
5558  int b;
5559  for (b = 0; b < bs_last_barrier; ++b) {
5560  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5561 #if USE_DEBUGGER
5562  team->t.t_bar[b].b_master_arrived = 0;
5563  team->t.t_bar[b].b_team_arrived = 0;
5564 #endif
5565  }
5566  }
5567 
5568  team->t.t_proc_bind = new_proc_bind;
5569 
5570  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5571  team->t.t_id));
5572 
5573 #if OMPT_SUPPORT
5574  __ompt_team_assign_id(team, ompt_parallel_data);
5575 #endif
5576 
5577  team->t.t_nested_nth = NULL;
5578 
5579  KMP_MB();
5580 
5581  return team;
5582  }
5583 
5584  /* reap team if it is too small, then loop back and check the next one */
5585  // Not sure if this is wise, but it will be redone during the hot-teams
5586  // rewrite.
5587  /* TODO: Use technique to find the right size hot-team, don't reap them */
5588  team = __kmp_reap_team(team);
5589  __kmp_team_pool = team;
5590  }
5591 
5592  /* nothing available in the pool, no matter, make a new team! */
5593  KMP_MB();
5594  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5595 
5596  /* and set it up */
5597  team->t.t_max_nproc = max_nproc;
5598  if (max_nproc > 1 &&
5599  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5600  // Allocate barrier structure
5601  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5602  }
5603 
5604  /* NOTE well: for some reason, allocating one big buffer and dividing it up
5605  seems to hurt performance significantly on the P4, so let's not use this. */
5606  __kmp_allocate_team_arrays(team, max_nproc);
5607 
5608  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5609  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5610 
5611  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5612  "%p to NULL\n",
5613  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5614  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5615  // memory, no need to duplicate
5616  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5617  // memory, no need to duplicate
5618 
5619  if (__kmp_storage_map) {
5620  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5621  }
5622 
5623  /* allocate space for arguments */
5624  __kmp_alloc_argv_entries(argc, team, FALSE);
5625  team->t.t_argc = argc;
5626 
5627  KA_TRACE(20,
5628  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5629  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5630  { // Initialize barrier data.
5631  int b;
5632  for (b = 0; b < bs_last_barrier; ++b) {
5633  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5634 #if USE_DEBUGGER
5635  team->t.t_bar[b].b_master_arrived = 0;
5636  team->t.t_bar[b].b_team_arrived = 0;
5637 #endif
5638  }
5639  }
5640 
5641  team->t.t_proc_bind = new_proc_bind;
5642 
5643 #if OMPT_SUPPORT
5644  __ompt_team_assign_id(team, ompt_parallel_data);
5645  team->t.ompt_serialized_team_info = NULL;
5646 #endif
5647 
5648  KMP_MB();
5649 
5650  team->t.t_nested_nth = NULL;
5651 
5652  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5653  team->t.t_id));
5654 
5655  return team;
5656 }
5657 
5658 /* TODO implement hot-teams at all levels */
5659 /* TODO implement lazy thread release on demand (disband request) */
5660 
5661 /* Free the team: return it to the team pool and release all the threads
5662  * associated with it. */
5663 void __kmp_free_team(kmp_root_t *root,
5664  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5665  int f;
5666  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5667  team->t.t_id));
5668 
5669  /* verify state */
5670  KMP_DEBUG_ASSERT(root);
5671  KMP_DEBUG_ASSERT(team);
5672  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5673  KMP_DEBUG_ASSERT(team->t.t_threads);
5674 
5675  int use_hot_team = team == root->r.r_hot_team;
5676 #if KMP_NESTED_HOT_TEAMS
5677  int level;
5678  if (master) {
5679  level = team->t.t_active_level - 1;
5680  if (master->th.th_teams_microtask) { // in teams construct?
5681  if (master->th.th_teams_size.nteams > 1) {
5682  ++level; // level was not increased in teams construct for
5683  // team_of_masters
5684  }
5685  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5686  master->th.th_teams_level == team->t.t_level) {
5687  ++level; // level was not increased in teams construct for
5688  // team_of_workers before the parallel
5689  } // team->t.t_level will be increased inside parallel
5690  }
5691 #if KMP_DEBUG
5692  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5693 #endif
5694  if (level < __kmp_hot_teams_max_level) {
5695  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5696  use_hot_team = 1;
5697  }
5698  }
5699 #endif // KMP_NESTED_HOT_TEAMS
5700 
5701  /* team is done working */
5702  TCW_SYNC_PTR(team->t.t_pkfn,
5703  NULL); // Important for Debugging Support Library.
5704 #if KMP_OS_WINDOWS
5705  team->t.t_copyin_counter = 0; // init counter for possible reuse
5706 #endif
5707  // Do not reset pointer to parent team to NULL for hot teams.
5708 
5709  /* if we are non-hot team, release our threads */
5710  if (!use_hot_team) {
5711  if (__kmp_tasking_mode != tskm_immediate_exec) {
5712  // Wait for threads to reach reapable state
5713  for (f = 1; f < team->t.t_nproc; ++f) {
5714  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5715  kmp_info_t *th = team->t.t_threads[f];
5716  volatile kmp_uint32 *state = &th->th.th_reap_state;
5717  while (*state != KMP_SAFE_TO_REAP) {
5718 #if KMP_OS_WINDOWS
5719  // On Windows a thread can be killed at any time, check this
5720  DWORD ecode;
5721  if (!__kmp_is_thread_alive(th, &ecode)) {
5722  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5723  break;
5724  }
5725 #endif
5726  // first check if thread is sleeping
5727  if (th->th.th_sleep_loc)
5728  __kmp_null_resume_wrapper(th);
5729  KMP_CPU_PAUSE();
5730  }
5731  }
5732 
5733  // Delete task teams
5734  int tt_idx;
5735  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5736  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5737  if (task_team != NULL) {
5738  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5739  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5740  team->t.t_threads[f]->th.th_task_team = NULL;
5741  }
5742  KA_TRACE(
5743  20,
5744  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5745  __kmp_get_gtid(), task_team, team->t.t_id));
5746 #if KMP_NESTED_HOT_TEAMS
5747  __kmp_free_task_team(master, task_team);
5748 #endif
5749  team->t.t_task_team[tt_idx] = NULL;
5750  }
5751  }
5752  }
5753 
5754  // Before clearing parent pointer, check if nested_nth list should be freed
5755  if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5756  team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5757  KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5758  KMP_INTERNAL_FREE(team->t.t_nested_nth);
5759  }
5760  team->t.t_nested_nth = NULL;
5761 
5762  // Reset pointer to parent team only for non-hot teams.
5763  team->t.t_parent = NULL;
5764  team->t.t_level = 0;
5765  team->t.t_active_level = 0;
5766 
5767  /* free the worker threads */
5768  for (f = 1; f < team->t.t_nproc; ++f) {
5769  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5770  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5771  (void)KMP_COMPARE_AND_STORE_ACQ32(
5772  &(team->t.t_threads[f]->th.th_used_in_team), 1, 2);
5773  }
5774  __kmp_free_thread(team->t.t_threads[f]);
5775  }
5776 
5777  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5778  if (team->t.b) {
5779  // wake up thread at old location
5780  team->t.b->go_release();
5781  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5782  for (f = 1; f < team->t.t_nproc; ++f) {
5783  if (team->t.b->sleep[f].sleep) {
5784  __kmp_atomic_resume_64(
5785  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5786  (kmp_atomic_flag_64<> *)NULL);
5787  }
5788  }
5789  }
5790  // Wait for threads to be removed from team
5791  for (int f = 1; f < team->t.t_nproc; ++f) {
5792  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5793  KMP_CPU_PAUSE();
5794  }
5795  }
5796  }
5797 
5798  for (f = 1; f < team->t.t_nproc; ++f) {
5799  team->t.t_threads[f] = NULL;
5800  }
5801 
5802  if (team->t.t_max_nproc > 1 &&
5803  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5804  distributedBarrier::deallocate(team->t.b);
5805  team->t.b = NULL;
5806  }
5807  /* put the team back in the team pool */
5808  /* TODO limit size of team pool, call reap_team if pool too large */
5809  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5810  __kmp_team_pool = (volatile kmp_team_t *)team;
5811  } else { // Check if team was created for primary threads in teams construct
5812  // See if first worker is a CG root
5813  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5814  team->t.t_threads[1]->th.th_cg_roots);
5815  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5816  // Clean up the CG root nodes on workers so that this team can be re-used
5817  for (f = 1; f < team->t.t_nproc; ++f) {
5818  kmp_info_t *thr = team->t.t_threads[f];
5819  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5820  thr->th.th_cg_roots->cg_root == thr);
5821  // Pop current CG root off list
5822  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5823  thr->th.th_cg_roots = tmp->up;
5824  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5825  " up to node %p. cg_nthreads was %d\n",
5826  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5827  int i = tmp->cg_nthreads--;
5828  if (i == 1) {
5829  __kmp_free(tmp); // free CG if we are the last thread in it
5830  }
5831  // Restore current task's thread_limit from CG root
5832  if (thr->th.th_cg_roots)
5833  thr->th.th_current_task->td_icvs.thread_limit =
5834  thr->th.th_cg_roots->cg_thread_limit;
5835  }
5836  }
5837  }
5838 
5839  KMP_MB();
5840 }
5841 
5842 /* reap the team. destroy it, reclaim all its resources and free its memory */
5843 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5844  kmp_team_t *next_pool = team->t.t_next_pool;
5845 
5846  KMP_DEBUG_ASSERT(team);
5847  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5848  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5849  KMP_DEBUG_ASSERT(team->t.t_threads);
5850  KMP_DEBUG_ASSERT(team->t.t_argv);
5851 
5852  /* TODO clean the threads that are a part of this? */
5853 
5854  /* free stuff */
5855  __kmp_free_team_arrays(team);
5856  if (team->t.t_argv != &team->t.t_inline_argv[0])
5857  __kmp_free((void *)team->t.t_argv);
5858  __kmp_free(team);
5859 
5860  KMP_MB();
5861  return next_pool;
5862 }
5863 
5864 // Free the thread. Don't reap it, just place it on the pool of available
5865 // threads.
5866 //
5867 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5868 // binding for the affinity mechanism to be useful.
5869 //
5870 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5871 // However, we want to avoid a potential performance problem by always
5872 // scanning through the list to find the correct point at which to insert
5873 // the thread (potential N**2 behavior). To do this we keep track of the
5874 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5875 // With single-level parallelism, threads will always be added to the tail
5876 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5877 // parallelism, all bets are off and we may need to scan through the entire
5878 // free list.
5879 //
5880 // This change also has a potentially large performance benefit for some
5881 // applications. Previously, as threads were freed from the hot team, they
5882 // would be placed back on the free list in inverse order. If the hot team
5883 // grew back to its original size, then the freed threads would be placed
5884 // back on the hot team in reverse order. This could cause bad cache
5885 // locality problems on programs where the size of the hot team regularly
5886 // grew and shrank.
5887 //
5888 // Now, for single-level parallelism, the OMP tid is always == gtid.
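// A simplified sketch of the sorted insertion performed below (illustrative
// only, not a replacement for the real code):
//
//   kmp_info_t **scan = __kmp_thread_pool_insert_pt
//                           ? &__kmp_thread_pool_insert_pt->th.th_next_pool
//                           : CCAST(kmp_info_t **, &__kmp_thread_pool);
//   while (*scan && (*scan)->th.th_info.ds.ds_gtid < gtid)
//     scan = &(*scan)->th.th_next_pool;        // find the sorted position
//   this_th->th.th_next_pool = *scan;          // splice the thread in
//   *scan = __kmp_thread_pool_insert_pt = this_th;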
5889 void __kmp_free_thread(kmp_info_t *this_th) {
5890  int gtid;
5891  kmp_info_t **scan;
5892 
5893  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5894  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5895 
5896  KMP_DEBUG_ASSERT(this_th);
5897 
5898  // When moving a thread to the pool, switch it to wait on its own b_go flag,
5899  // and set its team pointer to uninitialized (NULL team).
5900  int b;
5901  kmp_balign_t *balign = this_th->th.th_bar;
5902  for (b = 0; b < bs_last_barrier; ++b) {
5903  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5904  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5905  balign[b].bb.team = NULL;
5906  balign[b].bb.leaf_kids = 0;
5907  }
5908  this_th->th.th_task_state = 0;
5909  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5910 
5911  /* put thread back on the free pool */
5912  TCW_PTR(this_th->th.th_team, NULL);
5913  TCW_PTR(this_th->th.th_root, NULL);
5914  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5915 
5916  while (this_th->th.th_cg_roots) {
5917  this_th->th.th_cg_roots->cg_nthreads--;
5918  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5919  " %p of thread %p to %d\n",
5920  this_th, this_th->th.th_cg_roots,
5921  this_th->th.th_cg_roots->cg_root,
5922  this_th->th.th_cg_roots->cg_nthreads));
5923  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5924  if (tmp->cg_root == this_th) { // Thread is a cg_root
5925  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5926  KA_TRACE(
5927  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5928  this_th->th.th_cg_roots = tmp->up;
5929  __kmp_free(tmp);
5930  } else { // Worker thread
5931  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5932  __kmp_free(tmp);
5933  }
5934  this_th->th.th_cg_roots = NULL;
5935  break;
5936  }
5937  }
5938 
5939  /* If the implicit task assigned to this thread can be used by other threads,
5940  * then multiple threads can share the data and try to free the task at
5941  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5942  * with higher probability when the hot team is disabled, but it can occur
5943  * even when the hot team is enabled. */
5944  __kmp_free_implicit_task(this_th);
5945  this_th->th.th_current_task = NULL;
5946 
5947  // If the __kmp_thread_pool_insert_pt is already past the new insert
5948  // point, then we need to re-scan the entire list.
5949  gtid = this_th->th.th_info.ds.ds_gtid;
5950  if (__kmp_thread_pool_insert_pt != NULL) {
5951  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5952  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5953  __kmp_thread_pool_insert_pt = NULL;
5954  }
5955  }
5956 
5957  // Scan down the list to find the place to insert the thread.
5958  // scan is the address of a link in the list, possibly the address of
5959  // __kmp_thread_pool itself.
5960  //
5961  // In the absence of nested parallelism, the for loop will have 0 iterations.
5962  if (__kmp_thread_pool_insert_pt != NULL) {
5963  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5964  } else {
5965  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5966  }
5967  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5968  scan = &((*scan)->th.th_next_pool))
5969  ;
5970 
5971  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5972  // to its address.
5973  TCW_PTR(this_th->th.th_next_pool, *scan);
5974  __kmp_thread_pool_insert_pt = *scan = this_th;
5975  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5976  (this_th->th.th_info.ds.ds_gtid <
5977  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5978  TCW_4(this_th->th.th_in_pool, TRUE);
5979  __kmp_suspend_initialize_thread(this_th);
5980  __kmp_lock_suspend_mx(this_th);
5981  if (this_th->th.th_active == TRUE) {
5982  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5983  this_th->th.th_active_in_pool = TRUE;
5984  }
5985 #if KMP_DEBUG
5986  else {
5987  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5988  }
5989 #endif
5990  __kmp_unlock_suspend_mx(this_th);
5991 
5992  TCW_4(__kmp_nth, __kmp_nth - 1);
5993 
5994 #ifdef KMP_ADJUST_BLOCKTIME
5995  /* Adjust blocktime back to user setting or default if necessary */
5996  /* Middle initialization might never have occurred */
5997  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5998  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5999  if (__kmp_nth <= __kmp_avail_proc) {
6000  __kmp_zero_bt = FALSE;
6001  }
6002  }
6003 #endif /* KMP_ADJUST_BLOCKTIME */
6004 
6005  KMP_MB();
6006 }
6007 
6008 /* ------------------------------------------------------------------------ */
6009 
6010 void *__kmp_launch_thread(kmp_info_t *this_thr) {
6011 #if OMP_PROFILING_SUPPORT
6012  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
6013  // TODO: add a configuration option for time granularity
6014  if (ProfileTraceFile)
6015  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6016 #endif
6017 
6018  int gtid = this_thr->th.th_info.ds.ds_gtid;
6019  /* void *stack_data;*/
6020  kmp_team_t **volatile pteam;
6021 
6022  KMP_MB();
6023  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6024 
6025  if (__kmp_env_consistency_check) {
6026  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6027  }
6028 
6029 #if OMPD_SUPPORT
6030  if (ompd_state & OMPD_ENABLE_BP)
6031  ompd_bp_thread_begin();
6032 #endif
6033 
6034 #if OMPT_SUPPORT
6035  ompt_data_t *thread_data = nullptr;
6036  if (ompt_enabled.enabled) {
6037  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6038  *thread_data = ompt_data_none;
6039 
6040  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6041  this_thr->th.ompt_thread_info.wait_id = 0;
6042  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6043  this_thr->th.ompt_thread_info.parallel_flags = 0;
6044  if (ompt_enabled.ompt_callback_thread_begin) {
6045  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6046  ompt_thread_worker, thread_data);
6047  }
6048  this_thr->th.ompt_thread_info.state = ompt_state_idle;
6049  }
6050 #endif
6051 
6052  /* This is the place where threads wait for work */
6053  while (!TCR_4(__kmp_global.g.g_done)) {
6054  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6055  KMP_MB();
6056 
6057  /* wait for work to do */
6058  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6059 
6060  /* No tid yet since not part of a team */
6061  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6062 
6063 #if OMPT_SUPPORT
6064  if (ompt_enabled.enabled) {
6065  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6066  }
6067 #endif
6068 
6069  pteam = &this_thr->th.th_team;
6070 
6071  /* have we been allocated? */
6072  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6073  /* we were just woken up, so run our new task */
6074  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6075  int rc;
6076  KA_TRACE(20,
6077  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6078  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6079  (*pteam)->t.t_pkfn));
6080 
6081  updateHWFPControl(*pteam);
6082 
6083 #if OMPT_SUPPORT
6084  if (ompt_enabled.enabled) {
6085  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6086  }
6087 #endif
6088 
6089  rc = (*pteam)->t.t_invoke(gtid);
6090  KMP_ASSERT(rc);
6091 
6092  KMP_MB();
6093  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6094  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6095  (*pteam)->t.t_pkfn));
6096  }
6097 #if OMPT_SUPPORT
6098  if (ompt_enabled.enabled) {
6099  /* no frame set while outside task */
6100  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6101 
6102  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6103  }
6104 #endif
6105  /* join barrier after parallel region */
6106  __kmp_join_barrier(gtid);
6107  }
6108  }
6109 
6110 #if OMPD_SUPPORT
6111  if (ompd_state & OMPD_ENABLE_BP)
6112  ompd_bp_thread_end();
6113 #endif
6114 
6115 #if OMPT_SUPPORT
6116  if (ompt_enabled.ompt_callback_thread_end) {
6117  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6118  }
6119 #endif
6120 
6121  this_thr->th.th_task_team = NULL;
6122  /* run the destructors for the threadprivate data for this thread */
6123  __kmp_common_destroy_gtid(gtid);
6124 
6125  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6126  KMP_MB();
6127 
6128 #if OMP_PROFILING_SUPPORT
6129  llvm::timeTraceProfilerFinishThread();
6130 #endif
6131  return this_thr;
6132 }
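
/* [Editor's note: __kmp_launch_thread above is the worker's whole lifetime:
   it parks in the fork barrier until it is handed a team, invokes the
   microtask via t_invoke, passes through the join barrier, and loops until
   g_done is set. Below is a generic standard-C++ rendering of that
   wait-for-work loop -- it uses a mutex/condvar pair where the runtime uses
   its own barriers, and all names are illustrative, not runtime APIs.

     #include <condition_variable>
     #include <cstdio>
     #include <functional>
     #include <mutex>
     #include <thread>

     struct worker_slot {
       std::mutex m;
       std::condition_variable cv;
       std::function<void()> task;  // stands in for (*pteam)->t.t_pkfn
       bool done = false;           // stands in for __kmp_global.g.g_done
     };

     static void launch_worker(worker_slot *slot) {
       for (;;) {
         std::unique_lock<std::mutex> lk(slot->m);
         // "fork barrier": sleep until there is work or we are shutting down
         slot->cv.wait(lk, [&] { return slot->task || slot->done; });
         if (slot->task) {
           std::function<void()> work = std::move(slot->task);
           slot->task = nullptr;
           lk.unlock();
           work();                  // "invoke microtask"
           continue;                // the real code enters the join barrier here
         }
         break;                     // done and no work left
       }
     }

     int main() {
       worker_slot slot;
       std::thread t(launch_worker, &slot);
       {
         std::lock_guard<std::mutex> lk(slot.m);
         slot.task = [] { std::puts("parallel region body"); };
       }
       slot.cv.notify_one();
       {
         std::lock_guard<std::mutex> lk(slot.m);
         slot.done = true;
       }
       slot.cv.notify_one();
       t.join();
       return 0;
     }
*/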
6133 
6134 /* ------------------------------------------------------------------------ */
6135 
6136 void __kmp_internal_end_dest(void *specific_gtid) {
6137  // Make sure no significant bits are lost
6138  int gtid;
6139  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6140 
6141  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6142  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
6143  * because 0 is reserved for the nothing-stored case */
6144 
6145  __kmp_internal_end_thread(gtid);
6146 }
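
/* [Editor's note: as the comment above says, the gtid is stored in
   thread-local storage biased by +1 so that the value 0 can mean "nothing
   stored". A minimal sketch of that encoding (helper names are illustrative,
   not runtime APIs):

     #include <cassert>
     #include <cstdint>

     static void *encode_gtid(int gtid) {
       return reinterpret_cast<void *>(static_cast<intptr_t>(gtid) + 1);
     }
     static int decode_gtid(void *specific) {
       return static_cast<int>(reinterpret_cast<intptr_t>(specific) - 1);
     }

     int main() {
       assert(encode_gtid(0) != nullptr); // gtid 0 is distinct from "empty"
       assert(decode_gtid(encode_gtid(7)) == 7);
       return 0;
     }
*/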
6147 
6148 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6149 
6150 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6151  __kmp_internal_end_atexit();
6152 }
6153 
6154 #endif
6155 
6156 /* [Windows] josh: when the atexit handler is called, there may still be more
6157  than one thread alive */
6158 void __kmp_internal_end_atexit(void) {
6159  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6160  /* [Windows]
6161  josh: ideally, we want to completely shut down the library in this atexit
6162  handler, but stat code that depends on thread specific data for gtid fails
6163  because that data becomes unavailable at some point during the shutdown, so
6164  we call __kmp_internal_end_thread instead. We should eventually remove the
6165  dependency on __kmp_get_specific_gtid in the stat code and use
6166  __kmp_internal_end_library to cleanly shut down the library.
6167 
6168  // TODO: Can some of this comment about GVS be removed?
6169  I suspect that the offending stat code is executed when the calling thread
6170  tries to clean up a dead root thread's data structures, resulting in GVS
6171  code trying to close the GVS structures for that thread, but since the stat
6172  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6173  the calling thread is cleaning up itself rather than another thread, it gets
6174  confused. This happens because allowing a thread to unregister and clean up
6175  another thread is a recent modification for addressing an issue.
6176  Based on the current design (20050722), a thread may end up
6177  trying to unregister another thread only if thread death does not trigger
6178  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6179  thread specific data destructor function to detect thread death. For
6180  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6181  is nothing. Thus, the workaround is applicable only for Windows static
6182  stat library. */
6183  __kmp_internal_end_library(-1);
6184 #if KMP_OS_WINDOWS
6185  __kmp_close_console();
6186 #endif
6187 }
6188 
6189 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6190  // It is assumed __kmp_forkjoin_lock is acquired.
6191 
6192  int gtid;
6193 
6194  KMP_DEBUG_ASSERT(thread != NULL);
6195 
6196  gtid = thread->th.th_info.ds.ds_gtid;
6197 
6198  if (!is_root) {
6199  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6200  /* Assume the threads are at the fork barrier here */
6201  KA_TRACE(
6202  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6203  gtid));
6204  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6205  while (
6206  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6207  KMP_CPU_PAUSE();
6208  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6209  } else {
6210  /* Need release fence here to prevent seg faults for tree forkjoin
6211  barrier (GEH) */
6212  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6213  thread);
6214  __kmp_release_64(&flag);
6215  }
6216  }
6217 
6218  // Terminate OS thread.
6219  __kmp_reap_worker(thread);
6220 
6221  // The thread was killed asynchronously. If it was actively
6222  // spinning in the thread pool, decrement the global count.
6223  //
6224  // There is a small timing hole here: if the worker thread was just waking
6225  // up after sleeping in the pool and had reset its th_active_in_pool flag but
6226  // had not yet decremented the global counter __kmp_thread_pool_active_nth,
6227  // then the global counter might not get updated.
6228  //
6229  // Currently, this can only happen as the library is unloaded,
6230  // so there are no harmful side effects.
6231  if (thread->th.th_active_in_pool) {
6232  thread->th.th_active_in_pool = FALSE;
6233  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6234  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6235  }
6236  }
6237 
6238  __kmp_free_implicit_task(thread);
6239 
6240 // Free the fast memory for tasking
6241 #if USE_FAST_MEMORY
6242  __kmp_free_fast_memory(thread);
6243 #endif /* USE_FAST_MEMORY */
6244 
6245  __kmp_suspend_uninitialize_thread(thread);
6246 
6247  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6248  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6249 
6250  --__kmp_all_nth;
6251  // __kmp_nth was decremented when thread is added to the pool.
6252 
6253 #ifdef KMP_ADJUST_BLOCKTIME
6254  /* Adjust blocktime back to user setting or default if necessary */
6255  /* Middle initialization might never have occurred */
6256  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6257  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6258  if (__kmp_nth <= __kmp_avail_proc) {
6259  __kmp_zero_bt = FALSE;
6260  }
6261  }
6262 #endif /* KMP_ADJUST_BLOCKTIME */
6263 
6264  /* free the memory being used */
6265  if (__kmp_env_consistency_check) {
6266  if (thread->th.th_cons) {
6267  __kmp_free_cons_stack(thread->th.th_cons);
6268  thread->th.th_cons = NULL;
6269  }
6270  }
6271 
6272  if (thread->th.th_pri_common != NULL) {
6273  __kmp_free(thread->th.th_pri_common);
6274  thread->th.th_pri_common = NULL;
6275  }
6276 
6277 #if KMP_USE_BGET
6278  if (thread->th.th_local.bget_data != NULL) {
6279  __kmp_finalize_bget(thread);
6280  }
6281 #endif
6282 
6283 #if KMP_AFFINITY_SUPPORTED
6284  if (thread->th.th_affin_mask != NULL) {
6285  KMP_CPU_FREE(thread->th.th_affin_mask);
6286  thread->th.th_affin_mask = NULL;
6287  }
6288 #endif /* KMP_AFFINITY_SUPPORTED */
6289 
6290 #if KMP_USE_HIER_SCHED
6291  if (thread->th.th_hier_bar_data != NULL) {
6292  __kmp_free(thread->th.th_hier_bar_data);
6293  thread->th.th_hier_bar_data = NULL;
6294  }
6295 #endif
6296 
6297  __kmp_reap_team(thread->th.th_serial_team);
6298  thread->th.th_serial_team = NULL;
6299  __kmp_free(thread);
6300 
6301  KMP_MB();
6302 
6303 } // __kmp_reap_thread
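
/* [Editor's note: before reaping, a sleeping worker first has to be released
   from the fork barrier. On the distributed-barrier path this is done by
   spinning until th_used_in_team can be moved from 0 to 3 and then resuming
   the thread; on the other barrier patterns a 64-bit go flag is released.
   A generic std::atomic sketch of the 0 -> 3 transition (illustrative names,
   not the runtime's primitives):

     #include <atomic>
     #include <thread>

     static std::atomic<int> used_in_team{0};

     static void mark_for_reap() {
       int expected = 0;
       while (!used_in_team.compare_exchange_strong(
           expected, 3, std::memory_order_acquire)) {
         expected = 0;              // retry until the 0 -> 3 transition wins
         std::this_thread::yield(); // stands in for KMP_CPU_PAUSE()
       }
       // ...the runtime would now resume the sleeping worker so it can exit...
     }

     int main() {
       mark_for_reap();
       return used_in_team.load() == 3 ? 0 : 1;
     }
*/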
6304 
6305 static void __kmp_itthash_clean(kmp_info_t *th) {
6306 #if USE_ITT_NOTIFY
6307  if (__kmp_itt_region_domains.count > 0) {
6308  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6309  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6310  while (bucket) {
6311  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6312  __kmp_thread_free(th, bucket);
6313  bucket = next;
6314  }
6315  }
6316  }
6317  if (__kmp_itt_barrier_domains.count > 0) {
6318  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6319  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6320  while (bucket) {
6321  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6322  __kmp_thread_free(th, bucket);
6323  bucket = next;
6324  }
6325  }
6326  }
6327 #endif
6328 }
6329 
6330 static void __kmp_internal_end(void) {
6331  int i;
6332 
6333  /* First, unregister the library */
6334  __kmp_unregister_library();
6335 
6336 #if KMP_OS_WINDOWS
6337  /* In Win static library, we can't tell when a root actually dies, so we
6338  reclaim the data structures for any root threads that have died but not
6339  unregistered themselves, in order to shut down cleanly.
6340  In Win dynamic library we also can't tell when a thread dies. */
6341  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6342 // dead roots
6343 #endif
6344 
6345  for (i = 0; i < __kmp_threads_capacity; i++)
6346  if (__kmp_root[i])
6347  if (__kmp_root[i]->r.r_active)
6348  break;
6349  KMP_MB(); /* Flush all pending memory write invalidates. */
6350  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6351 
6352  if (i < __kmp_threads_capacity) {
6353 #if KMP_USE_MONITOR
6354  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6355  KMP_MB(); /* Flush all pending memory write invalidates. */
6356 
6357  // Need to check that monitor was initialized before reaping it. If we are
6358  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6359  // __kmp_monitor will appear to contain valid data, but it is only valid in
6360  // the parent process, not the child.
6361  // New behavior (201008): instead of keying off of the flag
6362  // __kmp_init_parallel, the monitor thread creation is keyed off
6363  // of the new flag __kmp_init_monitor.
6364  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6365  if (TCR_4(__kmp_init_monitor)) {
6366  __kmp_reap_monitor(&__kmp_monitor);
6367  TCW_4(__kmp_init_monitor, 0);
6368  }
6369  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6370  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6371 #endif // KMP_USE_MONITOR
6372  } else {
6373 /* TODO move this to cleanup code */
6374 #ifdef KMP_DEBUG
6375  /* make sure that everything has properly ended */
6376  for (i = 0; i < __kmp_threads_capacity; i++) {
6377  if (__kmp_root[i]) {
6378  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6379  // there can be uber threads alive here
6380  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6381  }
6382  }
6383 #endif
6384 
6385  KMP_MB();
6386 
6387  // Reap the worker threads.
6388  // This is valid for now, but be careful if threads are reaped sooner.
6389  while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6390  // Get the next thread from the pool.
6391  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6392  __kmp_thread_pool = thread->th.th_next_pool;
6393  // Reap it.
6394  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6395  thread->th.th_next_pool = NULL;
6396  thread->th.th_in_pool = FALSE;
6397  __kmp_reap_thread(thread, 0);
6398  }
6399  __kmp_thread_pool_insert_pt = NULL;
6400 
6401  // Reap teams.
6402  while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6403  // Get the next team from the pool.
6404  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6405  __kmp_team_pool = team->t.t_next_pool;
6406  // Reap it.
6407  team->t.t_next_pool = NULL;
6408  __kmp_reap_team(team);
6409  }
6410 
6411  __kmp_reap_task_teams();
6412 
6413 #if KMP_OS_UNIX
6414  // Threads that are not reaped should not access any resources since they
6415  // are going to be deallocated soon, so the shutdown sequence should wait
6416  // until all threads either exit the final spin-waiting loop or begin
6417  // sleeping after the given blocktime.
6418  for (i = 0; i < __kmp_threads_capacity; i++) {
6419  kmp_info_t *thr = __kmp_threads[i];
6420  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6421  KMP_CPU_PAUSE();
6422  }
6423 #endif
6424 
6425  for (i = 0; i < __kmp_threads_capacity; ++i) {
6426  // TBD: Add some checking...
6427  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6428  }
6429 
6430  /* Make sure all threadprivate destructors get run by joining with all
6431  worker threads before resetting this flag */
6432  TCW_SYNC_4(__kmp_init_common, FALSE);
6433 
6434  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6435  KMP_MB();
6436 
6437 #if KMP_USE_MONITOR
6438  // See note above: One of the possible fixes for CQ138434 / CQ140126
6439  //
6440  // FIXME: push both code fragments down and CSE them?
6441  // push them into __kmp_cleanup() ?
6442  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6443  if (TCR_4(__kmp_init_monitor)) {
6444  __kmp_reap_monitor(&__kmp_monitor);
6445  TCW_4(__kmp_init_monitor, 0);
6446  }
6447  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6448  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6449 #endif
6450  } /* else !__kmp_global.t_active */
6451  TCW_4(__kmp_init_gtid, FALSE);
6452  KMP_MB(); /* Flush all pending memory write invalidates. */
6453 
6454  __kmp_cleanup();
6455 #if OMPT_SUPPORT
6456  ompt_fini();
6457 #endif
6458 }
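
/* [Editor's note: the shutdown above drains two intrusive singly linked
   pools, __kmp_thread_pool and __kmp_team_pool, by repeatedly unlinking the
   head and reaping it. A self-contained sketch of that drain idiom
   (pooled_t/drain_pool are illustrative names only):

     #include <cstdio>

     struct pooled_t {
       int id;
       pooled_t *next;
     };

     static void reap_one(pooled_t *p) { std::printf("reaping %d\n", p->id); }

     static void drain_pool(pooled_t **pool) {
       while (*pool != nullptr) {
         pooled_t *p = *pool; // take the head
         *pool = p->next;     // unlink before reaping
         p->next = nullptr;
         reap_one(p);
       }
     }

     int main() {
       pooled_t c{3, nullptr}, b{2, &c}, a{1, &b};
       pooled_t *pool = &a;
       drain_pool(&pool);
       return pool == nullptr ? 0 : 1;
     }
*/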
6459 
6460 void __kmp_internal_end_library(int gtid_req) {
6461  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6462  /* this shouldn't be a race condition because __kmp_internal_end() is the
6463  only place to clear __kmp_init_serial */
6464  /* we'll check this later too, after we get the lock */
6465  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6466  // redundant, because the next check will work in any case.
6467  if (__kmp_global.g.g_abort) {
6468  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6469  /* TODO abort? */
6470  return;
6471  }
6472  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6473  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6474  return;
6475  }
6476 
6477  // If hidden helper team has been initialized, we need to deinit it
6478  if (TCR_4(__kmp_init_hidden_helper) &&
6479  !TCR_4(__kmp_hidden_helper_team_done)) {
6480  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6481  // First release the main thread to let it continue its work
6482  __kmp_hidden_helper_main_thread_release();
6483  // Wait until the hidden helper team has been destroyed
6484  __kmp_hidden_helper_threads_deinitz_wait();
6485  }
6486 
6487  KMP_MB(); /* Flush all pending memory write invalidates. */
6488  /* find out who we are and what we should do */
6489  {
6490  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6491  KA_TRACE(
6492  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6493  if (gtid == KMP_GTID_SHUTDOWN) {
6494  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6495  "already shutdown\n"));
6496  return;
6497  } else if (gtid == KMP_GTID_MONITOR) {
6498  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6499  "registered, or system shutdown\n"));
6500  return;
6501  } else if (gtid == KMP_GTID_DNE) {
6502  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6503  "shutdown\n"));
6504  /* we don't know who we are, but we may still shutdown the library */
6505  } else if (KMP_UBER_GTID(gtid)) {
6506  /* unregister ourselves as an uber thread. gtid is no longer valid */
6507  if (__kmp_root[gtid]->r.r_active) {
6508  __kmp_global.g.g_abort = -1;
6509  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6510  __kmp_unregister_library();
6511  KA_TRACE(10,
6512  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6513  gtid));
6514  return;
6515  } else {
6516  __kmp_itthash_clean(__kmp_threads[gtid]);
6517  KA_TRACE(
6518  10,
6519  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6520  __kmp_unregister_root_current_thread(gtid);
6521  }
6522  } else {
6523 /* worker threads may call this function through the atexit handler, if they
6524  * call exit() */
6525 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6526  TODO: do a thorough shutdown instead */
6527 #ifdef DUMP_DEBUG_ON_EXIT
6528  if (__kmp_debug_buf)
6529  __kmp_dump_debug_buffer();
6530 #endif
6531  // The unregister-library call was added here when we switched to shared
6532  // memory on Linux; without it, lots of files would be left in /dev/shm.
6533  // Clean up the shared memory file before exiting.
6534  __kmp_unregister_library();
6535  return;
6536  }
6537  }
6538  /* synchronize the termination process */
6539  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6540 
6541  /* have we already finished */
6542  if (__kmp_global.g.g_abort) {
6543  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6544  /* TODO abort? */
6545  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6546  return;
6547  }
6548  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6549  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6550  return;
6551  }
6552 
6553  /* We need this lock to enforce mutex between this reading of
6554  __kmp_threads_capacity and the writing by __kmp_register_root.
6555  Alternatively, we can use a counter of roots that is atomically updated by
6556  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6557  __kmp_internal_end_*. */
6558  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6559 
6560  /* now we can safely conduct the actual termination */
6561  __kmp_internal_end();
6562 
6563  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6564  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6565 
6566  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6567 
6568 #ifdef DUMP_DEBUG_ON_EXIT
6569  if (__kmp_debug_buf)
6570  __kmp_dump_debug_buffer();
6571 #endif
6572 
6573 #if KMP_OS_WINDOWS
6574  __kmp_close_console();
6575 #endif
6576 
6577  __kmp_fini_allocator();
6578 
6579 } // __kmp_internal_end_library
6580 
6581 void __kmp_internal_end_thread(int gtid_req) {
6582  int i;
6583 
6584  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6585  /* this shouldn't be a race condition because __kmp_internal_end() is the
6586  * only place to clear __kmp_init_serial */
6587  /* we'll check this later too, after we get the lock */
6588  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6589  // redundant, because the next check will work in any case.
6590  if (__kmp_global.g.g_abort) {
6591  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6592  /* TODO abort? */
6593  return;
6594  }
6595  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6596  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6597  return;
6598  }
6599 
6600  // If hidden helper team has been initialized, we need to deinit it
6601  if (TCR_4(__kmp_init_hidden_helper) &&
6602  !TCR_4(__kmp_hidden_helper_team_done)) {
6603  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6604  // First release the main thread to let it continue its work
6605  __kmp_hidden_helper_main_thread_release();
6606  // Wait until the hidden helper team has been destroyed
6607  __kmp_hidden_helper_threads_deinitz_wait();
6608  }
6609 
6610  KMP_MB(); /* Flush all pending memory write invalidates. */
6611 
6612  /* find out who we are and what we should do */
6613  {
6614  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6615  KA_TRACE(10,
6616  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6617  if (gtid == KMP_GTID_SHUTDOWN) {
6618  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6619  "already shutdown\n"));
6620  return;
6621  } else if (gtid == KMP_GTID_MONITOR) {
6622  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6623  "registered, or system shutdown\n"));
6624  return;
6625  } else if (gtid == KMP_GTID_DNE) {
6626  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6627  "shutdown\n"));
6628  return;
6629  /* we don't know who we are */
6630  } else if (KMP_UBER_GTID(gtid)) {
6631  /* unregister ourselves as an uber thread. gtid is no longer valid */
6632  if (__kmp_root[gtid]->r.r_active) {
6633  __kmp_global.g.g_abort = -1;
6634  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6635  KA_TRACE(10,
6636  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6637  gtid));
6638  return;
6639  } else {
6640  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6641  gtid));
6642  __kmp_unregister_root_current_thread(gtid);
6643  }
6644  } else {
6645  /* just a worker thread, let's leave */
6646  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6647 
6648  if (gtid >= 0) {
6649  __kmp_threads[gtid]->th.th_task_team = NULL;
6650  }
6651 
6652  KA_TRACE(10,
6653  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6654  gtid));
6655  return;
6656  }
6657  }
6658 #if KMP_DYNAMIC_LIB
6659  if (__kmp_pause_status != kmp_hard_paused)
6660  // AC: let's not shut down the dynamic library at the exit of an uber
6661  // thread; it is better to shut down later in the library destructor.
6662  {
6663  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6664  return;
6665  }
6666 #endif
6667  /* synchronize the termination process */
6668  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6669 
6670  /* have we already finished */
6671  if (__kmp_global.g.g_abort) {
6672  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6673  /* TODO abort? */
6674  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6675  return;
6676  }
6677  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6678  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6679  return;
6680  }
6681 
6682  /* We need this lock to enforce mutex between this reading of
6683  __kmp_threads_capacity and the writing by __kmp_register_root.
6684  Alternatively, we can use a counter of roots that is atomically updated by
6685  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6686  __kmp_internal_end_*. */
6687 
6688  /* should we finish the run-time? are all siblings done? */
6689  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6690 
6691  for (i = 0; i < __kmp_threads_capacity; ++i) {
6692  if (KMP_UBER_GTID(i)) {
6693  KA_TRACE(
6694  10,
6695  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6696  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6697  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6698  return;
6699  }
6700  }
6701 
6702  /* now we can safely conduct the actual termination */
6703 
6704  __kmp_internal_end();
6705 
6706  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6707  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6708 
6709  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6710 
6711 #ifdef DUMP_DEBUG_ON_EXIT
6712  if (__kmp_debug_buf)
6713  __kmp_dump_debug_buffer();
6714 #endif
6715 } // __kmp_internal_end_thread
6716 
6717 // -----------------------------------------------------------------------------
6718 // Library registration stuff.
6719 
6720 static long __kmp_registration_flag = 0;
6721 // Random value used to indicate library initialization.
6722 static char *__kmp_registration_str = NULL;
6723 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6724 
6725 static inline char *__kmp_reg_status_name() {
6726 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6727  each thread. If registration and unregistration go in different threads
6728  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6729  env var cannot be found, because the name will contain a different pid. */
6730 // macOS* complains about name being too long with additional getuid()
6731 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6732  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6733  (int)getuid());
6734 #else
6735  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6736 #endif
6737 } // __kmp_reg_status_name
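
/* [Editor's note: on Unix with a dynamic library the registration name above
   combines the pid and uid, e.g. "__KMP_REGISTERED_LIB_<pid>_<uid>"; elsewhere
   only the pid is used. A Unix-only sketch that builds a name of the same
   shape (illustrative helper, not a runtime API):

     #include <cstdio>
     #include <string>
     #include <unistd.h>

     static std::string reg_status_name() {
       char buf[64];
       std::snprintf(buf, sizeof(buf), "__KMP_REGISTERED_LIB_%d_%d",
                     (int)getpid(), (int)getuid());
       return std::string(buf);
     }

     int main() {
       std::printf("%s\n", reg_status_name().c_str());
       return 0;
     }
*/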
6738 
6739 #if defined(KMP_USE_SHM)
6740 bool __kmp_shm_available = false;
6741 bool __kmp_tmp_available = false;
6742 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6743 char *temp_reg_status_file_name = nullptr;
6744 #endif
6745 
6746 void __kmp_register_library_startup(void) {
6747 
6748  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6749  int done = 0;
6750  union {
6751  double dtime;
6752  long ltime;
6753  } time;
6754 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6755  __kmp_initialize_system_tick();
6756 #endif
6757  __kmp_read_system_time(&time.dtime);
6758  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6759  __kmp_registration_str =
6760  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6761  __kmp_registration_flag, KMP_LIBRARY_FILE);
6762 
6763  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6764  __kmp_registration_str));
6765 
6766  while (!done) {
6767 
6768  char *value = NULL; // Actual value of the environment variable.
6769 
6770 #if defined(KMP_USE_SHM)
6771  char *shm_name = nullptr;
6772  char *data1 = nullptr;
6773  __kmp_shm_available = __kmp_detect_shm();
6774  if (__kmp_shm_available) {
6775  int fd1 = -1;
6776  shm_name = __kmp_str_format("/%s", name);
6777  int shm_preexist = 0;
6778  fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6779  if ((fd1 == -1) && (errno == EEXIST)) {
6780  // file didn't open because it already exists.
6781  // try opening existing file
6782  fd1 = shm_open(shm_name, O_RDWR, 0600);
6783  if (fd1 == -1) { // file didn't open
6784  KMP_WARNING(FunctionError, "Can't open SHM");
6785  __kmp_shm_available = false;
6786  } else { // able to open existing file
6787  shm_preexist = 1;
6788  }
6789  }
6790  if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6791  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6792  KMP_WARNING(FunctionError, "Can't set size of SHM");
6793  __kmp_shm_available = false;
6794  }
6795  }
6796  if (__kmp_shm_available) { // SHM exists, now map it
6797  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6798  fd1, 0);
6799  if (data1 == MAP_FAILED) { // failed to map shared memory
6800  KMP_WARNING(FunctionError, "Can't map SHM");
6801  __kmp_shm_available = false;
6802  }
6803  }
6804  if (__kmp_shm_available) { // SHM mapped
6805  if (shm_preexist == 0) { // set data to SHM, set value
6806  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6807  }
6808  // Read value from either what we just wrote or existing file.
6809  value = __kmp_str_format("%s", data1); // read value from SHM
6810  munmap(data1, SHM_SIZE);
6811  }
6812  if (fd1 != -1)
6813  close(fd1);
6814  }
6815  if (!__kmp_shm_available)
6816  __kmp_tmp_available = __kmp_detect_tmp();
6817  if (!__kmp_shm_available && __kmp_tmp_available) {
6818  // SHM failed to work due to an error other than the file already existing.
6819  // Try to create a temp file under /tmp.
6820  // If /tmp isn't accessible, fall back to using an environment variable.
6821  // TODO: /tmp might not always be the temporary directory. For now we will
6822  // not consider TMPDIR.
6823  int fd1 = -1;
6824  temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6825  int tmp_preexist = 0;
6826  fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6827  if ((fd1 == -1) && (errno == EEXIST)) {
6828  // file didn't open because it already exists.
6829  // try opening existing file
6830  fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6831  if (fd1 == -1) { // file didn't open
6832  KMP_WARNING(FunctionError, "Can't open TEMP");
6833  __kmp_tmp_available = false;
6834  } else {
6835  tmp_preexist = 1;
6836  }
6837  }
6838  if (__kmp_tmp_available && tmp_preexist == 0) {
6839  // we created the /tmp file; now set its size
6840  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6841  KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6842  __kmp_tmp_available = false;
6843  }
6844  }
6845  if (__kmp_tmp_available) {
6846  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6847  fd1, 0);
6848  if (data1 == MAP_FAILED) { // failed to map /tmp
6849  KMP_WARNING(FunctionError, "Can't map /tmp");
6850  __kmp_tmp_available = false;
6851  }
6852  }
6853  if (__kmp_tmp_available) {
6854  if (tmp_preexist == 0) { // set data to TMP, set value
6855  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6856  }
6857  // Read value from either what we just wrote or existing file.
6858  value = __kmp_str_format("%s", data1); // read value from SHM
6859  munmap(data1, SHM_SIZE);
6860  }
6861  if (fd1 != -1)
6862  close(fd1);
6863  }
6864  if (!__kmp_shm_available && !__kmp_tmp_available) {
6865  // no /dev/shm and no /tmp -- fall back to environment variable
6866  // Set environment variable, but do not overwrite if it exists.
6867  __kmp_env_set(name, __kmp_registration_str, 0);
6868  // read value to see if it got set
6869  value = __kmp_env_get(name);
6870  }
6871 #else // Windows and Unix with static library
6872  // Set environment variable, but do not overwrite if it exists.
6873  __kmp_env_set(name, __kmp_registration_str, 0);
6874  // read value to see if it got set
6875  value = __kmp_env_get(name);
6876 #endif
6877 
6878  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6879  done = 1; // Ok, environment variable set successfully, exit the loop.
6880  } else {
6881  // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6882  // Check whether it is alive or dead.
6883  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6884  char *tail = value;
6885  char *flag_addr_str = NULL;
6886  char *flag_val_str = NULL;
6887  char const *file_name = NULL;
6888  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6889  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6890  file_name = tail;
6891  if (tail != NULL) {
6892  unsigned long *flag_addr = 0;
6893  unsigned long flag_val = 0;
6894  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6895  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6896  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6897  // First, check whether environment-encoded address is mapped into
6898  // addr space.
6899  // If so, dereference it to see if it still has the right value.
6900  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6901  neighbor = 1;
6902  } else {
6903  // If not, then we know the other copy of the library is no longer
6904  // running.
6905  neighbor = 2;
6906  }
6907  }
6908  }
6909  switch (neighbor) {
6910  case 0: // Cannot parse environment variable -- neighbor status unknown.
6911  // Assume it is the incompatible format of future version of the
6912  // library. Assume the other library is alive.
6913  // WARN( ... ); // TODO: Issue a warning.
6914  file_name = "unknown library";
6915  KMP_FALLTHROUGH();
6916  // Attention! Falling through to the next case. That's intentional.
6917  case 1: { // Neighbor is alive.
6918  // Check it is allowed.
6919  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6920  if (!__kmp_str_match_true(duplicate_ok)) {
6921  // That's not allowed. Issue fatal error.
6922  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6923  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6924  }
6925  KMP_INTERNAL_FREE(duplicate_ok);
6926  __kmp_duplicate_library_ok = 1;
6927  done = 1; // Exit the loop.
6928  } break;
6929  case 2: { // Neighbor is dead.
6930 
6931 #if defined(KMP_USE_SHM)
6932  if (__kmp_shm_available) { // close shared memory.
6933  shm_unlink(shm_name); // this removes file in /dev/shm
6934  } else if (__kmp_tmp_available) {
6935  unlink(temp_reg_status_file_name); // this removes the temp file
6936  } else {
6937  // Clear the variable and try to register library again.
6938  __kmp_env_unset(name);
6939  }
6940 #else
6941  // Clear the variable and try to register library again.
6942  __kmp_env_unset(name);
6943 #endif
6944  } break;
6945  default: {
6946  KMP_DEBUG_ASSERT(0);
6947  } break;
6948  }
6949  }
6950  KMP_INTERNAL_FREE((void *)value);
6951 #if defined(KMP_USE_SHM)
6952  if (shm_name)
6953  KMP_INTERNAL_FREE((void *)shm_name);
6954 #endif
6955  } // while
6956  KMP_INTERNAL_FREE((void *)name);
6957 
6958 } // func __kmp_register_library_startup
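
/* [Editor's note: the registration value written above has the form
   "<flag address>-<flag value>-<library file>". On a collision the loop
   parses that string back and decides whether the other copy of the runtime
   is still alive: alive if the encoded address is mapped and still holds the
   encoded value, dead otherwise (the stale registration is then removed and
   the loop retries). The sketch below parses a value it just built itself,
   so it can dereference the address directly; the real code must first call
   __kmp_is_address_mapped. All names here are illustrative.

     #include <cstdio>

     static long my_flag = 0xCAFE1234L;

     int main() {
       char value[256];
       std::snprintf(value, sizeof(value), "%p-%lx-%s", (void *)&my_flag,
                     (unsigned long)my_flag, "libomp.so");

       void *flag_addr = nullptr;
       unsigned long flag_val = 0;
       char file_name[128] = "";
       if (std::sscanf(value, "%p-%lx-%127s", &flag_addr, &flag_val,
                       file_name) == 3) {
         int alive = (flag_addr == (void *)&my_flag) &&
                     (*(long *)flag_addr == (long)flag_val);
         std::printf("neighbor is %s (%s)\n", alive ? "alive" : "dead",
                     file_name);
       }
       return 0;
     }
*/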
6959 
6960 void __kmp_unregister_library(void) {
6961 
6962  char *name = __kmp_reg_status_name();
6963  char *value = NULL;
6964 
6965 #if defined(KMP_USE_SHM)
6966  char *shm_name = nullptr;
6967  int fd1;
6968  if (__kmp_shm_available) {
6969  shm_name = __kmp_str_format("/%s", name);
6970  fd1 = shm_open(shm_name, O_RDONLY, 0600);
6971  if (fd1 != -1) { // File opened successfully
6972  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6973  if (data1 != MAP_FAILED) {
6974  value = __kmp_str_format("%s", data1); // read value from SHM
6975  munmap(data1, SHM_SIZE);
6976  }
6977  close(fd1);
6978  }
6979  } else if (__kmp_tmp_available) { // try /tmp
6980  fd1 = open(temp_reg_status_file_name, O_RDONLY);
6981  if (fd1 != -1) { // File opened successfully
6982  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6983  if (data1 != MAP_FAILED) {
6984  value = __kmp_str_format("%s", data1); // read value from /tmp
6985  munmap(data1, SHM_SIZE);
6986  }
6987  close(fd1);
6988  }
6989  } else { // fall back to the environment variable
6990  value = __kmp_env_get(name);
6991  }
6992 #else
6993  value = __kmp_env_get(name);
6994 #endif
6995 
6996  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6997  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6998  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6999 // Ok, this is our variable. Delete it.
7000 #if defined(KMP_USE_SHM)
7001  if (__kmp_shm_available) {
7002  shm_unlink(shm_name); // this removes file in /dev/shm
7003  } else if (__kmp_tmp_available) {
7004  unlink(temp_reg_status_file_name); // this removes the temp file
7005  } else {
7006  __kmp_env_unset(name);
7007  }
7008 #else
7009  __kmp_env_unset(name);
7010 #endif
7011  }
7012 
7013 #if defined(KMP_USE_SHM)
7014  if (shm_name)
7015  KMP_INTERNAL_FREE(shm_name);
7016  if (temp_reg_status_file_name)
7017  KMP_INTERNAL_FREE(temp_reg_status_file_name);
7018 #endif
7019 
7020  KMP_INTERNAL_FREE(__kmp_registration_str);
7021  KMP_INTERNAL_FREE(value);
7022  KMP_INTERNAL_FREE(name);
7023 
7024  __kmp_registration_flag = 0;
7025  __kmp_registration_str = NULL;
7026 
7027 } // __kmp_unregister_library
7028 
7029 // End of Library registration stuff.
7030 // -----------------------------------------------------------------------------
7031 
7032 #if KMP_MIC_SUPPORTED
7033 
7034 static void __kmp_check_mic_type() {
7035  kmp_cpuid_t cpuid_state = {0};
7036  kmp_cpuid_t *cs_p = &cpuid_state;
7037  __kmp_x86_cpuid(1, 0, cs_p);
7038  // We don't support mic1 at the moment
7039  if ((cs_p->eax & 0xff0) == 0xB10) {
7040  __kmp_mic_type = mic2;
7041  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
7042  __kmp_mic_type = mic3;
7043  } else {
7044  __kmp_mic_type = non_mic;
7045  }
7046 }
7047 
7048 #endif /* KMP_MIC_SUPPORTED */
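
/* [Editor's note: CPUID leaf 1 returns the processor signature in EAX with
   stepping in bits [3:0], model in [7:4], family in [11:8], extended model in
   [19:16] and extended family in [27:20]. The masks above compare those
   fields directly: 0xB10 is family 0xB, model 1 (Knights Corner), and 0x50670
   is family 6 with extended model 5 / model 7, i.e. display model 0x57
   (Knights Landing). A sketch of the same field decoding, with the values
   hard-coded for illustration:

     #include <cstdio>

     static void decode_signature(unsigned eax) {
       unsigned stepping = eax & 0xF;
       unsigned model = (eax >> 4) & 0xF;
       unsigned family = (eax >> 8) & 0xF;
       unsigned ext_model = (eax >> 16) & 0xF;
       unsigned display_model =
           (family == 0x6 || family == 0xF) ? (ext_model << 4) + model : model;
       std::printf("family 0x%x, model 0x%x, stepping %u\n", family,
                   display_model, stepping);
     }

     int main() {
       decode_signature(0x00050670); // the mic3 (KNL) pattern tested above
       decode_signature(0x00000B10); // the mic2 (KNC) pattern tested above
       return 0;
     }
*/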
7049 
7050 #if KMP_HAVE_UMWAIT
7051 static void __kmp_user_level_mwait_init() {
7052  struct kmp_cpuid buf;
7053  __kmp_x86_cpuid(7, 0, &buf);
7054  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7055  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7056  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7057  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7058  __kmp_umwait_enabled));
7059 }
7060 #elif KMP_HAVE_MWAIT
7061 #ifndef AT_INTELPHIUSERMWAIT
7062 // Spurious, non-existent value that should always fail to return anything.
7063 // Will be replaced with the correct value once it is known.
7064 #define AT_INTELPHIUSERMWAIT 10000
7065 #endif
7066 // The getauxval() function is available in RHEL7 and SLES12. If a system with an
7067 // earlier OS is used to build the RTL, we'll use the following internal
7068 // function when the entry is not found.
7069 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7070 unsigned long getauxval(unsigned long) { return 0; }
7071 
7072 static void __kmp_user_level_mwait_init() {
7073  // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
7074  // available, use them to determine whether user-level mwait is enabled.
7075  // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
7076  // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
7077  if (__kmp_mic_type == mic3) {
7078  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7079  if ((res & 0x1) || __kmp_user_level_mwait) {
7080  __kmp_mwait_enabled = TRUE;
7081  if (__kmp_user_level_mwait) {
7082  KMP_INFORM(EnvMwaitWarn);
7083  }
7084  } else {
7085  __kmp_mwait_enabled = FALSE;
7086  }
7087  }
7088  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7089  "__kmp_mwait_enabled = %d\n",
7090  __kmp_mic_type, __kmp_mwait_enabled));
7091 }
7092 #endif /* KMP_HAVE_UMWAIT */
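
/* [Editor's note: the umwait/tpause detection above tests the WAITPKG bit,
   which is reported in CPUID.(EAX=7,ECX=0):ECX[5]. A standalone x86 sketch
   using the GCC/Clang <cpuid.h> helper (compiler- and architecture-specific,
   shown only to illustrate the same check):

     #include <cpuid.h>
     #include <cstdio>

     int main() {
       unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
       if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
         int waitpkg = (ecx >> 5) & 1;
         std::printf("waitpkg %ssupported\n", waitpkg ? "" : "not ");
       }
       return 0;
     }
*/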
7093 
7094 static void __kmp_do_serial_initialize(void) {
7095  int i, gtid;
7096  size_t size;
7097 
7098  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7099 
7100  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7101  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7102  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7103  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7104  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7105 
7106 #if OMPT_SUPPORT
7107  ompt_pre_init();
7108 #endif
7109 #if OMPD_SUPPORT
7110  __kmp_env_dump();
7111  ompd_init();
7112 #endif
7113 
7114  __kmp_validate_locks();
7115 
7116 #if ENABLE_LIBOMPTARGET
7117  /* Initialize functions from libomptarget */
7118  __kmp_init_omptarget();
7119 #endif
7120 
7121  /* Initialize internal memory allocator */
7122  __kmp_init_allocator();
7123 
7124  /* Register the library startup via an environment variable or via mapped
7125  shared memory file and check to see whether another copy of the library is
7126  already registered. Since a forked child process is often terminated, we
7127  postpone the registration until middle initialization in the child. */
7128  if (__kmp_need_register_serial)
7129  __kmp_register_library_startup();
7130 
7131  /* TODO reinitialization of library */
7132  if (TCR_4(__kmp_global.g.g_done)) {
7133  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7134  }
7135 
7136  __kmp_global.g.g_abort = 0;
7137  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7138 
7139 /* initialize the locks */
7140 #if KMP_USE_ADAPTIVE_LOCKS
7141 #if KMP_DEBUG_ADAPTIVE_LOCKS
7142  __kmp_init_speculative_stats();
7143 #endif
7144 #endif
7145 #if KMP_STATS_ENABLED
7146  __kmp_stats_init();
7147 #endif
7148  __kmp_init_lock(&__kmp_global_lock);
7149  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7150  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7151  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7152  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7153  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7154  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7155  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7156  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7157  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7158  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7159  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7160  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7161  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7162  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7163  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7164 #if KMP_USE_MONITOR
7165  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7166 #endif
7167  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7168 
7169  /* conduct initialization and initial setup of configuration */
7170 
7171  __kmp_runtime_initialize();
7172 
7173 #if KMP_MIC_SUPPORTED
7174  __kmp_check_mic_type();
7175 #endif
7176 
7177 // Some global variable initialization moved here from kmp_env_initialize()
7178 #ifdef KMP_DEBUG
7179  kmp_diag = 0;
7180 #endif
7181  __kmp_abort_delay = 0;
7182 
7183  // From __kmp_init_dflt_team_nth()
7184  /* assume the entire machine will be used */
7185  __kmp_dflt_team_nth_ub = __kmp_xproc;
7186  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7187  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7188  }
7189  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7190  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7191  }
7192  __kmp_max_nth = __kmp_sys_max_nth;
7193  __kmp_cg_max_nth = __kmp_sys_max_nth;
7194  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7195  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7196  __kmp_teams_max_nth = __kmp_sys_max_nth;
7197  }
7198 
7199  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7200  // part
7201  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7202 #if KMP_USE_MONITOR
7203  __kmp_monitor_wakeups =
7204  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7205  __kmp_bt_intervals =
7206  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7207 #endif
7208  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7209  __kmp_library = library_throughput;
7210  // From KMP_SCHEDULE initialization
7211  __kmp_static = kmp_sch_static_balanced;
7212 // AC: do not use analytical here, because it is non-monotonic
7213 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7214 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7215 // need to repeat assignment
7216 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7217 // bit control and barrier method control parts
7218 #if KMP_FAST_REDUCTION_BARRIER
7219 #define kmp_reduction_barrier_gather_bb ((int)1)
7220 #define kmp_reduction_barrier_release_bb ((int)1)
7221 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7222 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7223 #endif // KMP_FAST_REDUCTION_BARRIER
7224  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7225  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7226  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7227  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7228  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7229 #if KMP_FAST_REDUCTION_BARRIER
7230  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7231  // lin_64 ): hyper,1
7232  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7233  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7234  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7235  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7236  }
7237 #endif // KMP_FAST_REDUCTION_BARRIER
7238  }
7239 #if KMP_FAST_REDUCTION_BARRIER
7240 #undef kmp_reduction_barrier_release_pat
7241 #undef kmp_reduction_barrier_gather_pat
7242 #undef kmp_reduction_barrier_release_bb
7243 #undef kmp_reduction_barrier_gather_bb
7244 #endif // KMP_FAST_REDUCTION_BARRIER
7245 #if KMP_MIC_SUPPORTED
7246  if (__kmp_mic_type == mic2) { // KNC
7247  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7248  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7249  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7250  1; // forkjoin release
7251  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7252  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7253  }
7254 #if KMP_FAST_REDUCTION_BARRIER
7255  if (__kmp_mic_type == mic2) { // KNC
7256  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7257  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7258  }
7259 #endif // KMP_FAST_REDUCTION_BARRIER
7260 #endif // KMP_MIC_SUPPORTED
7261 
7262 // From KMP_CHECKS initialization
7263 #ifdef KMP_DEBUG
7264  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7265 #else
7266  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7267 #endif
7268 
7269  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7270  __kmp_foreign_tp = TRUE;
7271 
7272  __kmp_global.g.g_dynamic = FALSE;
7273  __kmp_global.g.g_dynamic_mode = dynamic_default;
7274 
7275  __kmp_init_nesting_mode();
7276 
7277  __kmp_env_initialize(NULL);
7278 
7279 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7280  __kmp_user_level_mwait_init();
7281 #endif
7282 // Print all messages in message catalog for testing purposes.
7283 #ifdef KMP_DEBUG
7284  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7285  if (__kmp_str_match_true(val)) {
7286  kmp_str_buf_t buffer;
7287  __kmp_str_buf_init(&buffer);
7288  __kmp_i18n_dump_catalog(&buffer);
7289  __kmp_printf("%s", buffer.str);
7290  __kmp_str_buf_free(&buffer);
7291  }
7292  __kmp_env_free(&val);
7293 #endif
7294 
7295  __kmp_threads_capacity =
7296  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7297  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7298  __kmp_tp_capacity = __kmp_default_tp_capacity(
7299  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7300 
7301  // If the library is shut down properly, both pools must be NULL. Just in
7302  // case, set them to NULL -- some memory may leak, but subsequent code will
7303  // work even if pools are not freed.
7304  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7305  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7306  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7307  __kmp_thread_pool = NULL;
7308  __kmp_thread_pool_insert_pt = NULL;
7309  __kmp_team_pool = NULL;
7310 
7311  /* Allocate all of the variable sized records */
7312  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7313  * expandable */
7314  /* Since allocation is cache-aligned, just add extra padding at the end */
7315  size =
7316  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7317  CACHE_LINE;
7318  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7319  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7320  sizeof(kmp_info_t *) * __kmp_threads_capacity);
7321 
7322  /* init thread counts */
7323  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7324  0); // Asserts fail if the library is reinitializing and
7325  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7326  __kmp_all_nth = 0;
7327  __kmp_nth = 0;
7328 
7329  /* setup the uber master thread and hierarchy */
7330  gtid = __kmp_register_root(TRUE);
7331  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7332  KMP_ASSERT(KMP_UBER_GTID(gtid));
7333  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7334 
7335  KMP_MB(); /* Flush all pending memory write invalidates. */
7336 
7337  __kmp_common_initialize();
7338 
7339 #if KMP_OS_UNIX
7340  /* invoke the child fork handler */
7341  __kmp_register_atfork();
7342 #endif
7343 
7344 #if !KMP_DYNAMIC_LIB || \
7345  ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7346  {
7347  /* Invoke the exit handler when the program finishes, only for static
7348  library and macOS* dynamic. For other dynamic libraries, we already
7349  have _fini and DllMain. */
7350  int rc = atexit(__kmp_internal_end_atexit);
7351  if (rc != 0) {
7352  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7353  __kmp_msg_null);
7354  }
7355  }
7356 #endif
7357 
7358 #if KMP_HANDLE_SIGNALS
7359 #if KMP_OS_UNIX
7360  /* NOTE: make sure that this is called before the user installs their own
7361  signal handlers so that the user handlers are called first. This way they
7362  can return false, not call our handler, avoid terminating the library, and
7363  continue execution where they left off. */
7364  __kmp_install_signals(FALSE);
7365 #endif /* KMP_OS_UNIX */
7366 #if KMP_OS_WINDOWS
7367  __kmp_install_signals(TRUE);
7368 #endif /* KMP_OS_WINDOWS */
7369 #endif
7370 
7371  /* we have finished the serial initialization */
7372  __kmp_init_counter++;
7373 
7374  __kmp_init_serial = TRUE;
7375 
7376  if (__kmp_version) {
7377  __kmp_print_version_1();
7378  }
7379 
7380  if (__kmp_settings) {
7381  __kmp_env_print();
7382  }
7383 
7384  if (__kmp_display_env || __kmp_display_env_verbose) {
7385  __kmp_env_print_2();
7386  }
7387 
7388 #if OMPT_SUPPORT
7389  ompt_post_init();
7390 #endif
7391 
7392  KMP_MB();
7393 
7394  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7395 }
7396 
7397 void __kmp_serial_initialize(void) {
7398  if (__kmp_init_serial) {
7399  return;
7400  }
7401  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7402  if (__kmp_init_serial) {
7403  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7404  return;
7405  }
7406  __kmp_do_serial_initialize();
7407  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7408 }
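
/* [Editor's note: __kmp_serial_initialize (and the middle/parallel
   initializers below) follow the same check / acquire-lock / re-check shape
   so that concurrent callers initialize exactly once. A generic standard-C++
   rendering of that double-checked pattern (illustrative names, not the
   runtime's bootstrap locks):

     #include <atomic>
     #include <mutex>

     static std::atomic<bool> initialized{false};
     static std::mutex init_lock; // stands in for __kmp_initz_lock

     static void do_initialize() {} // one-time setup would go here

     static void ensure_initialized() {
       if (initialized.load(std::memory_order_acquire))
         return;                            // fast path, no lock taken
       std::lock_guard<std::mutex> guard(init_lock);
       if (initialized.load(std::memory_order_relaxed))
         return;                            // another caller got here first
       do_initialize();
       initialized.store(true, std::memory_order_release);
     }

     int main() {
       ensure_initialized();
       ensure_initialized(); // second call is a no-op
       return initialized.load() ? 0 : 1;
     }
*/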
7409 
7410 static void __kmp_do_middle_initialize(void) {
7411  int i, j;
7412  int prev_dflt_team_nth;
7413 
7414  if (!__kmp_init_serial) {
7415  __kmp_do_serial_initialize();
7416  }
7417 
7418  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7419 
7420  if (UNLIKELY(!__kmp_need_register_serial)) {
7421  // We are in a forked child process. The registration was skipped during
7422  // serial initialization in __kmp_atfork_child handler. Do it here.
7423  __kmp_register_library_startup();
7424  }
7425 
7426  // Save the previous value for the __kmp_dflt_team_nth so that
7427  // we can avoid some reinitialization if it hasn't changed.
7428  prev_dflt_team_nth = __kmp_dflt_team_nth;
7429 
7430 #if KMP_AFFINITY_SUPPORTED
7431  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7432  // number of cores on the machine.
7433  __kmp_affinity_initialize(__kmp_affinity);
7434 
7435 #endif /* KMP_AFFINITY_SUPPORTED */
7436 
7437  KMP_ASSERT(__kmp_xproc > 0);
7438  if (__kmp_avail_proc == 0) {
7439  __kmp_avail_proc = __kmp_xproc;
7440  }
7441 
7442  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7443  // correct them now
7444  j = 0;
7445  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7446  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7447  __kmp_avail_proc;
7448  j++;
7449  }
7450 
7451  if (__kmp_dflt_team_nth == 0) {
7452 #ifdef KMP_DFLT_NTH_CORES
7453  // Default #threads = #cores
7454  __kmp_dflt_team_nth = __kmp_ncores;
7455  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7456  "__kmp_ncores (%d)\n",
7457  __kmp_dflt_team_nth));
7458 #else
7459  // Default #threads = #available OS procs
7460  __kmp_dflt_team_nth = __kmp_avail_proc;
7461  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7462  "__kmp_avail_proc(%d)\n",
7463  __kmp_dflt_team_nth));
7464 #endif /* KMP_DFLT_NTH_CORES */
7465  }
7466 
7467  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7468  __kmp_dflt_team_nth = KMP_MIN_NTH;
7469  }
7470  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7471  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7472  }
7473 
7474  if (__kmp_nesting_mode > 0)
7475  __kmp_set_nesting_mode_threads();
7476 
7477  // There's no harm in continuing if the following check fails,
7478  // but it indicates an error in the previous logic.
7479  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7480 
7481  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7482  // Run through the __kmp_threads array and set the num threads icv for each
7483  // root thread that is currently registered with the RTL (which has not
7484  // already explicitly set its nthreads-var with a call to
7485  // omp_set_num_threads()).
7486  for (i = 0; i < __kmp_threads_capacity; i++) {
7487  kmp_info_t *thread = __kmp_threads[i];
7488  if (thread == NULL)
7489  continue;
7490  if (thread->th.th_current_task->td_icvs.nproc != 0)
7491  continue;
7492 
7493  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7494  }
7495  }
7496  KA_TRACE(
7497  20,
7498  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7499  __kmp_dflt_team_nth));
7500 
7501 #ifdef KMP_ADJUST_BLOCKTIME
7502  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7503  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7504  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7505  if (__kmp_nth > __kmp_avail_proc) {
7506  __kmp_zero_bt = TRUE;
7507  }
7508  }
7509 #endif /* KMP_ADJUST_BLOCKTIME */
7510 
7511  /* we have finished middle initialization */
7512  TCW_SYNC_4(__kmp_init_middle, TRUE);
7513 
7514  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7515 }
7516 
7517 void __kmp_middle_initialize(void) {
7518  if (__kmp_init_middle) {
7519  return;
7520  }
7521  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7522  if (__kmp_init_middle) {
7523  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7524  return;
7525  }
7526  __kmp_do_middle_initialize();
7527  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7528 }
7529 
7530 void __kmp_parallel_initialize(void) {
7531  int gtid = __kmp_entry_gtid(); // this might be a new root
7532 
7533  /* synchronize parallel initialization (for sibling) */
7534  if (TCR_4(__kmp_init_parallel))
7535  return;
7536  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7537  if (TCR_4(__kmp_init_parallel)) {
7538  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7539  return;
7540  }
7541 
7542  /* TODO reinitialization after we have already shut down */
7543  if (TCR_4(__kmp_global.g.g_done)) {
7544  KA_TRACE(
7545  10,
7546  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7547  __kmp_infinite_loop();
7548  }
7549 
7550  /* jc: The lock __kmp_initz_lock is already held, so calling
7551  __kmp_serial_initialize would cause a deadlock. So we call
7552  __kmp_do_serial_initialize directly. */
7553  if (!__kmp_init_middle) {
7554  __kmp_do_middle_initialize();
7555  }
7556  __kmp_assign_root_init_mask();
7557  __kmp_resume_if_hard_paused();
7558 
7559  /* begin initialization */
7560  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7561  KMP_ASSERT(KMP_UBER_GTID(gtid));
7562 
7563 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7564  // Save the FP control regs.
7565  // Worker threads will set theirs to these values at thread startup.
7566  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7567  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7568  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7569 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7570 
7571 #if KMP_OS_UNIX
7572 #if KMP_HANDLE_SIGNALS
7573  /* must be after __kmp_serial_initialize */
7574  __kmp_install_signals(TRUE);
7575 #endif
7576 #endif
7577 
7578  __kmp_suspend_initialize();
7579 
7580 #if defined(USE_LOAD_BALANCE)
7581  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7582  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7583  }
7584 #else
7585  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7586  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7587  }
7588 #endif
7589 
7590  if (__kmp_version) {
7591  __kmp_print_version_2();
7592  }
7593 
7594  /* we have finished parallel initialization */
7595  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7596 
7597  KMP_MB();
7598  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7599 
7600  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7601 }
7602 
7603 void __kmp_hidden_helper_initialize() {
7604  if (TCR_4(__kmp_init_hidden_helper))
7605  return;
7606 
7607  // __kmp_parallel_initialize is required before we initialize hidden helper
7608  if (!TCR_4(__kmp_init_parallel))
7609  __kmp_parallel_initialize();
7610 
7611  // Double check. Note that this double check should not be placed before
7612  // __kmp_parallel_initialize as it would cause a deadlock.
7613  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7614  if (TCR_4(__kmp_init_hidden_helper)) {
7615  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7616  return;
7617  }
7618 
7619 #if KMP_AFFINITY_SUPPORTED
7620  // Initialize hidden helper affinity settings.
7621  // The above __kmp_parallel_initialize() will initialize
7622  // regular affinity (and topology) if not already done.
7623  if (!__kmp_hh_affinity.flags.initialized)
7624  __kmp_affinity_initialize(__kmp_hh_affinity);
7625 #endif
7626 
7627  // Set the count of hidden helper tasks to be executed to zero
7628  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7629 
7630  // Set the global variable indicating that we're initializing hidden helper
7631  // team/threads
7632  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7633 
7634  // Platform independent initialization
7635  __kmp_do_initialize_hidden_helper_threads();
7636 
7637  // Wait here for the finish of initialization of hidden helper teams
7638  __kmp_hidden_helper_threads_initz_wait();
7639 
7640  // We have finished hidden helper initialization
7641  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7642 
7643  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7644 }
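
// Hidden helper threads exist to execute "hidden helper tasks"; one typical
// producer is an asynchronous target region, which (depending on the compiler
// and libomptarget plumbing -- an assumption, not something this file dictates)
// is wrapped in a task that may be scheduled onto this helper team. A minimal
// user-side sketch:
//
//   int main(void) {
//     int x = 0;
//   #pragma omp target map(tofrom : x) nowait // may become a hidden helper task
//     x = 1;
//   #pragma omp taskwait // wait for the asynchronous target task
//     return x == 1 ? 0 : 1;
//   }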
7645 
7646 /* ------------------------------------------------------------------------ */
7647 
7648 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7649  kmp_team_t *team) {
7650  kmp_disp_t *dispatch;
7651 
7652  KMP_MB();
7653 
7654  /* none of the threads have encountered any constructs, yet. */
7655  this_thr->th.th_local.this_construct = 0;
7656 #if KMP_CACHE_MANAGE
7657  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7658 #endif /* KMP_CACHE_MANAGE */
7659  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7660  KMP_DEBUG_ASSERT(dispatch);
7661  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7662  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7663  // this_thr->th.th_info.ds.ds_tid ] );
7664 
7665  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7666  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7667  if (__kmp_env_consistency_check)
7668  __kmp_push_parallel(gtid, team->t.t_ident);
7669 
7670  KMP_MB(); /* Flush all pending memory write invalidates. */
7671 }
7672 
7673 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7674  kmp_team_t *team) {
7675  if (__kmp_env_consistency_check)
7676  __kmp_pop_parallel(gtid, team->t.t_ident);
7677 
7678  __kmp_finish_implicit_task(this_thr);
7679 }
7680 
7681 int __kmp_invoke_task_func(int gtid) {
7682  int rc;
7683  int tid = __kmp_tid_from_gtid(gtid);
7684  kmp_info_t *this_thr = __kmp_threads[gtid];
7685  kmp_team_t *team = this_thr->th.th_team;
7686 
7687  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7688 #if USE_ITT_BUILD
7689  if (__itt_stack_caller_create_ptr) {
7690  // inform ittnotify about entering user's code
7691  if (team->t.t_stack_id != NULL) {
7692  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7693  } else {
7694  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7695  __kmp_itt_stack_callee_enter(
7696  (__itt_caller)team->t.t_parent->t.t_stack_id);
7697  }
7698  }
7699 #endif /* USE_ITT_BUILD */
7700 #if INCLUDE_SSC_MARKS
7701  SSC_MARK_INVOKING();
7702 #endif
7703 
7704 #if OMPT_SUPPORT
7705  void *dummy;
7706  void **exit_frame_p;
7707  ompt_data_t *my_task_data;
7708  ompt_data_t *my_parallel_data;
7709  int ompt_team_size;
7710 
7711  if (ompt_enabled.enabled) {
7712  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7713  .ompt_task_info.frame.exit_frame.ptr);
7714  } else {
7715  exit_frame_p = &dummy;
7716  }
7717 
7718  my_task_data =
7719  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7720  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7721  if (ompt_enabled.ompt_callback_implicit_task) {
7722  ompt_team_size = team->t.t_nproc;
7723  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7724  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7725  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7726  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7727  }
7728 #endif
7729 
7730 #if KMP_STATS_ENABLED
7731  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7732  if (previous_state == stats_state_e::TEAMS_REGION) {
7733  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7734  } else {
7735  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7736  }
7737  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7738 #endif
7739 
7740  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7741  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7742 #if OMPT_SUPPORT
7743  ,
7744  exit_frame_p
7745 #endif
7746  );
7747 #if OMPT_SUPPORT
7748  *exit_frame_p = NULL;
7749  this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7750 #endif
7751 
7752 #if KMP_STATS_ENABLED
7753  if (previous_state == stats_state_e::TEAMS_REGION) {
7754  KMP_SET_THREAD_STATE(previous_state);
7755  }
7756  KMP_POP_PARTITIONED_TIMER();
7757 #endif
7758 
7759 #if USE_ITT_BUILD
7760  if (__itt_stack_caller_create_ptr) {
7761  // inform ittnotify about leaving user's code
7762  if (team->t.t_stack_id != NULL) {
7763  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7764  } else {
7765  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7766  __kmp_itt_stack_callee_leave(
7767  (__itt_caller)team->t.t_parent->t.t_stack_id);
7768  }
7769  }
7770 #endif /* USE_ITT_BUILD */
7771  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7772 
7773  return rc;
7774 }
7775 
7776 void __kmp_teams_master(int gtid) {
7777  // This routine is called by all primary threads in teams construct
7778  kmp_info_t *thr = __kmp_threads[gtid];
7779  kmp_team_t *team = thr->th.th_team;
7780  ident_t *loc = team->t.t_ident;
7781  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7782  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7783  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7784  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7785  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7786 
7787  // This thread is a new CG root. Set up the proper variables.
7788  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7789  tmp->cg_root = thr; // Make thr the CG root
7790  // Init to thread limit stored when league primary threads were forked
7791  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7792  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7793  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7794  " cg_nthreads to 1\n",
7795  thr, tmp));
7796  tmp->up = thr->th.th_cg_roots;
7797  thr->th.th_cg_roots = tmp;
7798 
7799 // Launch the league of teams now, but do not let the workers execute
7800 // (they hang on the fork barrier until the next parallel region)
7801 #if INCLUDE_SSC_MARKS
7802  SSC_MARK_FORKING();
7803 #endif
7804  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7805  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7806  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7807 #if INCLUDE_SSC_MARKS
7808  SSC_MARK_JOINING();
7809 #endif
7810  // If the team size was reduced from the limit, set it to the new size
7811  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7812  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7813  // AC: last parameter "1" eliminates join barrier which won't work because
7814  // worker threads are in a fork barrier waiting for more parallel regions
7815  __kmp_join_call(loc, gtid
7816 #if OMPT_SUPPORT
7817  ,
7818  fork_context_intel
7819 #endif
7820  ,
7821  1);
7822 }
7823 
7824 int __kmp_invoke_teams_master(int gtid) {
7825  kmp_info_t *this_thr = __kmp_threads[gtid];
7826  kmp_team_t *team = this_thr->th.th_team;
7827 #if KMP_DEBUG
7828  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7829  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7830  (void *)__kmp_teams_master);
7831 #endif
7832  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7833 #if OMPT_SUPPORT
7834  int tid = __kmp_tid_from_gtid(gtid);
7835  ompt_data_t *task_data =
7836  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7837  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7838  if (ompt_enabled.ompt_callback_implicit_task) {
7839  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7840  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7841  ompt_task_initial);
7842  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7843  }
7844 #endif
7845  __kmp_teams_master(gtid);
7846 #if OMPT_SUPPORT
7847  this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7848 #endif
7849  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7850  return 1;
7851 }
7852 
7853 /* This sets the requested number of threads for the next parallel region
7854  encountered by this team. Since this should be enclosed in the fork/join
7855  critical section, it should avoid race conditions with asymmetrical nested
7856  parallelism. */
7857 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7858  kmp_info_t *thr = __kmp_threads[gtid];
7859 
7860  if (num_threads > 0)
7861  thr->th.th_set_nproc = num_threads;
7862 }
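
// A minimal user-side sketch of how a value reaches th_set_nproc here: the
// num_threads clause is typically lowered to the __kmpc_push_num_threads
// entry point (kmp_csupport.cpp), which forwards to __kmp_push_num_threads
// just before the fork (exact lowering is compiler-dependent).
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//   #pragma omp parallel num_threads(3) // roughly: __kmpc_push_num_threads(&loc, gtid, 3)
//     printf("thread %d of %d\n", omp_get_thread_num(), omp_get_num_threads());
//     return 0;
//   }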
7863 
7864 void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7865  int *num_threads_list) {
7866  kmp_info_t *thr = __kmp_threads[gtid];
7867 
7868  KMP_DEBUG_ASSERT(list_length > 1);
7869 
7870  if (num_threads_list[0] > 0)
7871  thr->th.th_set_nproc = num_threads_list[0];
7872  thr->th.th_set_nested_nth =
7873  (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7874  for (kmp_uint32 i = 0; i < list_length; ++i)
7875  thr->th.th_set_nested_nth[i] = num_threads_list[i];
7876  thr->th.th_set_nested_nth_sz = list_length;
7877 }
7878 
7879 void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7880  const char *msg) {
7881  kmp_info_t *thr = __kmp_threads[gtid];
7882  thr->th.th_nt_strict = true;
7883  thr->th.th_nt_loc = loc;
7884  // if sev is unset make fatal
7885  if (sev == severity_warning)
7886  thr->th.th_nt_sev = sev;
7887  else
7888  thr->th.th_nt_sev = severity_fatal;
7889  // if msg is unset, use an appropriate message
7890  if (msg)
7891  thr->th.th_nt_msg = msg;
7892  else
7893  thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7894  "strict num_threads clause.";
7895 }
7896 
7897 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7898  int num_threads) {
7899  KMP_DEBUG_ASSERT(thr);
7900  // Remember the number of threads for inner parallel regions
7901  if (!TCR_4(__kmp_init_middle))
7902  __kmp_middle_initialize(); // get internal globals calculated
7903  __kmp_assign_root_init_mask();
7904  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7905  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7906 
7907  if (num_threads == 0) {
7908  if (__kmp_teams_thread_limit > 0) {
7909  num_threads = __kmp_teams_thread_limit;
7910  } else {
7911  num_threads = __kmp_avail_proc / num_teams;
7912  }
7913  // adjust num_threads w/o warning as it is not a user setting
7914  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7915  // no thread_limit clause specified - do not change thread-limit-var ICV
7916  if (num_threads > __kmp_dflt_team_nth) {
7917  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7918  }
7919  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7920  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7921  } // prevent team size from exceeding thread-limit-var
7922  if (num_teams * num_threads > __kmp_teams_max_nth) {
7923  num_threads = __kmp_teams_max_nth / num_teams;
7924  }
7925  if (num_threads == 0) {
7926  num_threads = 1;
7927  }
7928  } else {
7929  if (num_threads < 0) {
7930  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7931  __kmp_msg_null);
7932  num_threads = 1;
7933  }
7934  // This thread will be the primary thread of the league's primary threads
7935  // Store new thread limit; old limit is saved in th_cg_roots list
7936  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7937  // num_threads = min(num_threads, nthreads-var)
7938  if (num_threads > __kmp_dflt_team_nth) {
7939  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7940  }
7941  if (num_teams * num_threads > __kmp_teams_max_nth) {
7942  int new_threads = __kmp_teams_max_nth / num_teams;
7943  if (new_threads == 0) {
7944  new_threads = 1;
7945  }
7946  if (new_threads != num_threads) {
7947  if (!__kmp_reserve_warn) { // user asked for too many threads
7948  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7949  __kmp_msg(kmp_ms_warning,
7950  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7951  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7952  }
7953  }
7954  num_threads = new_threads;
7955  }
7956  }
7957  thr->th.th_teams_size.nth = num_threads;
7958 }
7959 
7960 /* this sets the requested number of teams for the teams region and/or
7961  the number of threads for the next parallel region encountered */
7962 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7963  int num_threads) {
7964  kmp_info_t *thr = __kmp_threads[gtid];
7965  if (num_teams < 0) {
7966  // OpenMP specification requires requested values to be positive,
7967  // but people can send us any value, so we'd better check
7968  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7969  __kmp_msg_null);
7970  num_teams = 1;
7971  }
7972  if (num_teams == 0) {
7973  if (__kmp_nteams > 0) {
7974  num_teams = __kmp_nteams;
7975  } else {
7976  num_teams = 1; // default number of teams is 1.
7977  }
7978  }
7979  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7980  if (!__kmp_reserve_warn) {
7981  __kmp_reserve_warn = 1;
7982  __kmp_msg(kmp_ms_warning,
7983  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7984  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7985  }
7986  num_teams = __kmp_teams_max_nth;
7987  }
7988  // Set number of teams (number of threads in the outer "parallel" of the
7989  // teams)
7990  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7991 
7992  __kmp_push_thread_limit(thr, num_teams, num_threads);
7993 }
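
// A minimal user-side sketch: the teams construct's num_teams/thread_limit
// clauses are typically lowered to the __kmpc_push_num_teams entry point
// (kmp_csupport.cpp), which calls __kmp_push_num_teams before forking the
// league (exact lowering is compiler-dependent).
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//   #pragma omp teams num_teams(4) thread_limit(2)
//     printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
//     return 0;
//   }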
7994 
7995 /* This sets the requested number of teams for the teams region and/or
7996  the number of threads for the next parallel region encountered */
7997 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7998  int num_teams_ub, int num_threads) {
7999  kmp_info_t *thr = __kmp_threads[gtid];
8000  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
8001  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
8002  KMP_DEBUG_ASSERT(num_threads >= 0);
8003 
8004  if (num_teams_lb > num_teams_ub) {
8005  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
8006  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
8007  }
8008 
8009  int num_teams = 1; // default number of teams is 1.
8010 
8011  if (num_teams_lb == 0 && num_teams_ub > 0)
8012  num_teams_lb = num_teams_ub;
8013 
8014  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
8015  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
8016  if (num_teams > __kmp_teams_max_nth) {
8017  if (!__kmp_reserve_warn) {
8018  __kmp_reserve_warn = 1;
8019  __kmp_msg(kmp_ms_warning,
8020  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
8021  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
8022  }
8023  num_teams = __kmp_teams_max_nth;
8024  }
8025  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
8026  num_teams = num_teams_ub;
8027  } else { // num_teams_lb <= num_teams <= num_teams_ub
8028  if (num_threads <= 0) {
8029  if (num_teams_ub > __kmp_teams_max_nth) {
8030  num_teams = num_teams_lb;
8031  } else {
8032  num_teams = num_teams_ub;
8033  }
8034  } else {
8035  num_teams = (num_threads > __kmp_teams_max_nth)
8036  ? num_teams
8037  : __kmp_teams_max_nth / num_threads;
8038  if (num_teams < num_teams_lb) {
8039  num_teams = num_teams_lb;
8040  } else if (num_teams > num_teams_ub) {
8041  num_teams = num_teams_ub;
8042  }
8043  }
8044  }
8045  // Set number of teams (number of threads in the outer "parallel" of the
8046  // teams)
8047  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8048 
8049  __kmp_push_thread_limit(thr, num_teams, num_threads);
8050 }
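
// The OpenMP 5.1 lower-bound form of the num_teams clause reaches this
// variant (via the __kmpc_push_num_teams_51 entry point; again
// compiler-dependent). A short sketch:
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     // Request between 2 and 8 teams; the runtime picks a value in range,
//     // subject to __kmp_teams_max_nth and the thread limit.
//   #pragma omp teams num_teams(2 : 8) thread_limit(1)
//     if (omp_get_team_num() == 0)
//       printf("league of %d teams\n", omp_get_num_teams());
//     return 0;
//   }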
8051 
8052 // Set the proc_bind var to use in the following parallel region.
8053 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8054  kmp_info_t *thr = __kmp_threads[gtid];
8055  thr->th.th_set_proc_bind = proc_bind;
8056 }
8057 
8058 /* Launch the worker threads into the microtask. */
8059 
8060 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8061  kmp_info_t *this_thr = __kmp_threads[gtid];
8062 
8063 #ifdef KMP_DEBUG
8064  int f;
8065 #endif /* KMP_DEBUG */
8066 
8067  KMP_DEBUG_ASSERT(team);
8068  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8069  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8070  KMP_MB(); /* Flush all pending memory write invalidates. */
8071 
8072  team->t.t_construct = 0; /* no single directives seen yet */
8073  team->t.t_ordered.dt.t_value =
8074  0; /* thread 0 enters the ordered section first */
8075 
8076  /* Reset the identifiers on the dispatch buffer */
8077  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8078  if (team->t.t_max_nproc > 1) {
8079  int i;
8080  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8081  team->t.t_disp_buffer[i].buffer_index = i;
8082  team->t.t_disp_buffer[i].doacross_buf_idx = i;
8083  }
8084  } else {
8085  team->t.t_disp_buffer[0].buffer_index = 0;
8086  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8087  }
8088 
8089  KMP_MB(); /* Flush all pending memory write invalidates. */
8090  KMP_ASSERT(this_thr->th.th_team == team);
8091 
8092 #ifdef KMP_DEBUG
8093  for (f = 0; f < team->t.t_nproc; f++) {
8094  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8095  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8096  }
8097 #endif /* KMP_DEBUG */
8098 
8099  /* release the worker threads so they may begin working */
8100  __kmp_fork_barrier(gtid, 0);
8101 }
8102 
8103 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8104  kmp_info_t *this_thr = __kmp_threads[gtid];
8105 
8106  KMP_DEBUG_ASSERT(team);
8107  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8108  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8109  KMP_MB(); /* Flush all pending memory write invalidates. */
8110 
8111  /* Join barrier after fork */
8112 
8113 #ifdef KMP_DEBUG
8114  if (__kmp_threads[gtid] &&
8115  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8116  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8117  __kmp_threads[gtid]);
8118  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8119  "team->t.t_nproc=%d\n",
8120  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8121  team->t.t_nproc);
8122  __kmp_print_structure();
8123  }
8124  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8125  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8126 #endif /* KMP_DEBUG */
8127 
8128  __kmp_join_barrier(gtid); /* wait for everyone */
8129 #if OMPT_SUPPORT
8130  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8131  if (ompt_enabled.enabled &&
8132  (ompt_state == ompt_state_wait_barrier_teams ||
8133  ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8134  int ds_tid = this_thr->th.th_info.ds.ds_tid;
8135  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8136  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8137 #if OMPT_OPTIONAL
8138  void *codeptr = NULL;
8139  if (KMP_MASTER_TID(ds_tid) &&
8140  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8141  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8142  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8143 
8144  ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8145  if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8146  sync_kind = ompt_sync_region_barrier_teams;
8147  if (ompt_enabled.ompt_callback_sync_region_wait) {
8148  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8149  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8150  }
8151  if (ompt_enabled.ompt_callback_sync_region) {
8152  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8153  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8154  }
8155 #endif
8156  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8157  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8158  ompt_scope_end, NULL, task_data, 0, ds_tid,
8159  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8160  }
8161  }
8162 #endif
8163 
8164  KMP_MB(); /* Flush all pending memory write invalidates. */
8165  KMP_ASSERT(this_thr->th.th_team == team);
8166 }
8167 
8168 /* ------------------------------------------------------------------------ */
8169 
8170 #ifdef USE_LOAD_BALANCE
8171 
8172 // Return the number of worker threads actively spinning in the hot team, if
8173 // we are at the outermost level of parallelism. Otherwise, return 0.
8174 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8175  int i;
8176  int retval;
8177  kmp_team_t *hot_team;
8178 
8179  if (root->r.r_active) {
8180  return 0;
8181  }
8182  hot_team = root->r.r_hot_team;
8183  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8184  return hot_team->t.t_nproc - 1; // Don't count primary thread
8185  }
8186 
8187  // Skip the primary thread - it is accounted for elsewhere.
8188  retval = 0;
8189  for (i = 1; i < hot_team->t.t_nproc; i++) {
8190  if (hot_team->t.t_threads[i]->th.th_active) {
8191  retval++;
8192  }
8193  }
8194  return retval;
8195 }
8196 
8197 // Perform an automatic adjustment to the number of
8198 // threads used by the next parallel region.
8199 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8200  int retval;
8201  int pool_active;
8202  int hot_team_active;
8203  int team_curr_active;
8204  int system_active;
8205 
8206  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8207  set_nproc));
8208  KMP_DEBUG_ASSERT(root);
8209  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8210  ->th.th_current_task->td_icvs.dynamic == TRUE);
8211  KMP_DEBUG_ASSERT(set_nproc > 1);
8212 
8213  if (set_nproc == 1) {
8214  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8215  return 1;
8216  }
8217 
8218  // Threads that are active in the thread pool, active in the hot team for this
8219  // particular root (if we are at the outer par level), and the currently
8220  // executing thread (to become the primary thread) are available to add to the
8221  // new team, but are currently contributing to the system load, and must be
8222  // accounted for.
8223  pool_active = __kmp_thread_pool_active_nth;
8224  hot_team_active = __kmp_active_hot_team_nproc(root);
8225  team_curr_active = pool_active + hot_team_active + 1;
8226 
8227  // Check the system load.
8228  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8229  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8230  "hot team active = %d\n",
8231  system_active, pool_active, hot_team_active));
8232 
8233  if (system_active < 0) {
8234  // There was an error reading the necessary info from /proc, so use the
8235  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8236  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8237  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8238  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8239 
8240  // Make this call behave like the thread limit algorithm.
8241  retval = __kmp_avail_proc - __kmp_nth +
8242  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8243  if (retval > set_nproc) {
8244  retval = set_nproc;
8245  }
8246  if (retval < KMP_MIN_NTH) {
8247  retval = KMP_MIN_NTH;
8248  }
8249 
8250  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8251  retval));
8252  return retval;
8253  }
8254 
8255  // There is a slight delay in the load balance algorithm in detecting new
8256  // running procs. The real system load at this instant should be at least as
8257 // large as the number of active OpenMP threads available to add to the team.
8258  if (system_active < team_curr_active) {
8259  system_active = team_curr_active;
8260  }
8261  retval = __kmp_avail_proc - system_active + team_curr_active;
8262  if (retval > set_nproc) {
8263  retval = set_nproc;
8264  }
8265  if (retval < KMP_MIN_NTH) {
8266  retval = KMP_MIN_NTH;
8267  }
8268 
8269  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8270  return retval;
8271 } // __kmp_load_balance_nproc()
8272 
8273 #endif /* USE_LOAD_BALANCE */
8274 
8275 /* ------------------------------------------------------------------------ */
8276 
8277 /* NOTE: this is called with the __kmp_init_lock held */
8278 void __kmp_cleanup(void) {
8279  int f;
8280 
8281  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8282 
8283  if (TCR_4(__kmp_init_parallel)) {
8284 #if KMP_HANDLE_SIGNALS
8285  __kmp_remove_signals();
8286 #endif
8287  TCW_4(__kmp_init_parallel, FALSE);
8288  }
8289 
8290  if (TCR_4(__kmp_init_middle)) {
8291 #if KMP_AFFINITY_SUPPORTED
8292  __kmp_affinity_uninitialize();
8293 #endif /* KMP_AFFINITY_SUPPORTED */
8294  __kmp_cleanup_hierarchy();
8295  TCW_4(__kmp_init_middle, FALSE);
8296  }
8297 
8298  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8299 
8300  if (__kmp_init_serial) {
8301  __kmp_runtime_destroy();
8302  __kmp_init_serial = FALSE;
8303  }
8304 
8305  __kmp_cleanup_threadprivate_caches();
8306 
8307  for (f = 0; f < __kmp_threads_capacity; f++) {
8308  if (__kmp_root[f] != NULL) {
8309  __kmp_free(__kmp_root[f]);
8310  __kmp_root[f] = NULL;
8311  }
8312  }
8313  __kmp_free(__kmp_threads);
8314  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8315  // there is no need to free __kmp_root separately.
8316  __kmp_threads = NULL;
8317  __kmp_root = NULL;
8318  __kmp_threads_capacity = 0;
8319 
8320  // Free old __kmp_threads arrays if they exist.
8321  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8322  while (ptr) {
8323  kmp_old_threads_list_t *next = ptr->next;
8324  __kmp_free(ptr->threads);
8325  __kmp_free(ptr);
8326  ptr = next;
8327  }
8328 
8329 #if KMP_USE_DYNAMIC_LOCK
8330  __kmp_cleanup_indirect_user_locks();
8331 #else
8332  __kmp_cleanup_user_locks();
8333 #endif
8334 #if OMPD_SUPPORT
8335  if (ompd_state) {
8336  __kmp_free(ompd_env_block);
8337  ompd_env_block = NULL;
8338  ompd_env_block_size = 0;
8339  }
8340 #endif
8341 
8342 #if KMP_AFFINITY_SUPPORTED
8343  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8344  __kmp_cpuinfo_file = NULL;
8345 #endif /* KMP_AFFINITY_SUPPORTED */
8346 
8347 #if KMP_USE_ADAPTIVE_LOCKS
8348 #if KMP_DEBUG_ADAPTIVE_LOCKS
8349  __kmp_print_speculative_stats();
8350 #endif
8351 #endif
8352  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8353  __kmp_nested_nth.nth = NULL;
8354  __kmp_nested_nth.size = 0;
8355  __kmp_nested_nth.used = 0;
8356 
8357  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8358  __kmp_nested_proc_bind.bind_types = NULL;
8359  __kmp_nested_proc_bind.size = 0;
8360  __kmp_nested_proc_bind.used = 0;
8361  if (__kmp_affinity_format) {
8362  KMP_INTERNAL_FREE(__kmp_affinity_format);
8363  __kmp_affinity_format = NULL;
8364  }
8365 
8366  __kmp_i18n_catclose();
8367 
8368 #if KMP_USE_HIER_SCHED
8369  __kmp_hier_scheds.deallocate();
8370 #endif
8371 
8372 #if KMP_STATS_ENABLED
8373  __kmp_stats_fini();
8374 #endif
8375 
8376  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8377 }
8378 
8379 /* ------------------------------------------------------------------------ */
8380 
8381 int __kmp_ignore_mppbeg(void) {
8382  char *env;
8383 
8384  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8385  if (__kmp_str_match_false(env))
8386  return FALSE;
8387  }
8388  // By default __kmpc_begin() is no-op.
8389  return TRUE;
8390 }
8391 
8392 int __kmp_ignore_mppend(void) {
8393  char *env;
8394 
8395  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8396  if (__kmp_str_match_false(env))
8397  return FALSE;
8398  }
8399  // By default __kmpc_end() is no-op.
8400  return TRUE;
8401 }
8402 
8403 void __kmp_internal_begin(void) {
8404  int gtid;
8405  kmp_root_t *root;
8406 
8407  /* this is a very important step as it will register new sibling threads
8408  and assign these new uber threads a new gtid */
8409  gtid = __kmp_entry_gtid();
8410  root = __kmp_threads[gtid]->th.th_root;
8411  KMP_ASSERT(KMP_UBER_GTID(gtid));
8412 
8413  if (root->r.r_begin)
8414  return;
8415  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8416  if (root->r.r_begin) {
8417  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8418  return;
8419  }
8420 
8421  root->r.r_begin = TRUE;
8422 
8423  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8424 }
8425 
8426 /* ------------------------------------------------------------------------ */
8427 
8428 void __kmp_user_set_library(enum library_type arg) {
8429  int gtid;
8430  kmp_root_t *root;
8431  kmp_info_t *thread;
8432 
8433  /* first, make sure we are initialized so we can get our gtid */
8434 
8435  gtid = __kmp_entry_gtid();
8436  thread = __kmp_threads[gtid];
8437 
8438  root = thread->th.th_root;
8439 
8440  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8441  library_serial));
8442  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8443  thread */
8444  KMP_WARNING(SetLibraryIncorrectCall);
8445  return;
8446  }
8447 
8448  switch (arg) {
8449  case library_serial:
8450  thread->th.th_set_nproc = 0;
8451  set__nproc(thread, 1);
8452  break;
8453  case library_turnaround:
8454  thread->th.th_set_nproc = 0;
8455  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8456  : __kmp_dflt_team_nth_ub);
8457  break;
8458  case library_throughput:
8459  thread->th.th_set_nproc = 0;
8460  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8461  : __kmp_dflt_team_nth_ub);
8462  break;
8463  default:
8464  KMP_FATAL(UnknownLibraryType, arg);
8465  }
8466 
8467  __kmp_aux_set_library(arg);
8468 }
8469 
8470 void __kmp_aux_set_stacksize(size_t arg) {
8471  if (!__kmp_init_serial)
8472  __kmp_serial_initialize();
8473 
8474 #if KMP_OS_DARWIN
8475  if (arg & (0x1000 - 1)) {
8476  arg &= ~(0x1000 - 1);
8477  if (arg + 0x1000) /* check for overflow if we round up */
8478  arg += 0x1000;
8479  }
8480 #endif
8481  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8482 
8483  /* only change the default stacksize before the first parallel region */
8484  if (!TCR_4(__kmp_init_parallel)) {
8485  size_t value = arg; /* argument is in bytes */
8486 
8487  if (value < __kmp_sys_min_stksize)
8488  value = __kmp_sys_min_stksize;
8489  else if (value > KMP_MAX_STKSIZE)
8490  value = KMP_MAX_STKSIZE;
8491 
8492  __kmp_stksize = value;
8493 
8494  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8495  }
8496 
8497  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8498 }
8499 
8500 /* set the behaviour of the runtime library */
8501 /* TODO this can cause some odd behaviour with sibling parallelism... */
8502 void __kmp_aux_set_library(enum library_type arg) {
8503  __kmp_library = arg;
8504 
8505  switch (__kmp_library) {
8506  case library_serial: {
8507  KMP_INFORM(LibraryIsSerial);
8508  } break;
8509  case library_turnaround:
8510  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8511  __kmp_use_yield = 2; // only yield when oversubscribed
8512  break;
8513  case library_throughput:
8514  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8515  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8516  break;
8517  default:
8518  KMP_FATAL(UnknownLibraryType, arg);
8519  }
8520 }
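
// These two routines are reached from the KMP_LIBRARY environment variable
// and from the kmp_set_library* extension entry points (assumed here to be
// the ones declared in omp.h). A minimal usage sketch:
//
//   #include <omp.h>
//   int main(void) {
//     kmp_set_library_throughput(); // same effect as KMP_LIBRARY=throughput
//   #pragma omp parallel
//     { /* ... */ }
//     return 0;
//   }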
8521 
8522 /* Getting team information common for all team API */
8523 // Returns NULL if not in teams construct
8524 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8525  kmp_info_t *thr = __kmp_entry_thread();
8526  teams_serialized = 0;
8527  if (thr->th.th_teams_microtask) {
8528  kmp_team_t *team = thr->th.th_team;
8529  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8530  int ii = team->t.t_level;
8531  teams_serialized = team->t.t_serialized;
8532  int level = tlevel + 1;
8533  KMP_DEBUG_ASSERT(ii >= tlevel);
8534  while (ii > level) {
8535  for (teams_serialized = team->t.t_serialized;
8536  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8537  }
8538  if (team->t.t_serialized && (!teams_serialized)) {
8539  team = team->t.t_parent;
8540  continue;
8541  }
8542  if (ii > level) {
8543  team = team->t.t_parent;
8544  ii--;
8545  }
8546  }
8547  return team;
8548  }
8549  return NULL;
8550 }
8551 
8552 int __kmp_aux_get_team_num() {
8553  int serialized;
8554  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8555  if (team) {
8556  if (serialized > 1) {
8557  return 0; // teams region is serialized ( 1 team of 1 thread ).
8558  } else {
8559  return team->t.t_master_tid;
8560  }
8561  }
8562  return 0;
8563 }
8564 
8565 int __kmp_aux_get_num_teams() {
8566  int serialized;
8567  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8568  if (team) {
8569  if (serialized > 1) {
8570  return 1;
8571  } else {
8572  return team->t.t_parent->t.t_nproc;
8573  }
8574  }
8575  return 1;
8576 }
8577 
8578 /* ------------------------------------------------------------------------ */
8579 
8580 /*
8581  * Affinity Format Parser
8582  *
8583  * Field is in form of: %[[[0].]size]type
8584  * % and type are required (%% means print a literal '%')
8585  * type is either single char or long name surrounded by {},
8586  * e.g., N or {num_threads}
8587  * 0 => leading zeros
8588  * . => right justified when size is specified
8589  * by default output is left justified
8590  * size is the *minimum* field length
8591  * All other characters are printed as is
8592  *
8593  * Available field types:
8594  * t {team_num}          - omp_get_team_num()
8595  * T {num_teams}         - omp_get_num_teams()
8596  * L {nesting_level}     - omp_get_level()
8597  * n {thread_num}        - omp_get_thread_num()
8598  * N {num_threads}       - omp_get_num_threads()
8599  * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
8600  * H {host}              - name of host machine
8601  * P {process_id}        - process id (integer)
8602  * i {native_thread_id}  - native thread identifier (integer)
8603  * A {thread_affinity}   - comma separated list of integers or ranges (affinity mask values)
8604  * Implementation-specific field types can be added
8605  * If a type is unknown, print "undefined"
8606  */
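
// A user-side sketch of the grammar above: format strings arrive here from
// the OMP_AFFINITY_FORMAT environment variable and from the OpenMP 5.0
// omp_display_affinity / omp_capture_affinity routines (on builds without
// affinity support, %{thread_affinity} prints "undefined").
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//   #pragma omp parallel num_threads(2)
//     {
//       // %0.4n: thread num, zero padded, right justified, min width 4;
//       // %N: team size; %H: host name; long names use the {...} form.
//       omp_display_affinity("tid=%0.4n of %N on %H aff=%{thread_affinity}");
//       char buf[256];
//       size_t needed = omp_capture_affinity(buf, sizeof(buf), "%i");
//       if (needed < sizeof(buf))
//         printf("native thread id: %s\n", buf);
//     }
//     return 0;
//   }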
8607 
8608 // Structure holding the short name, long name, and corresponding data type
8609 // for snprintf. A table of these represents the entire set of valid keyword
8610 // field types.
8611 typedef struct kmp_affinity_format_field_t {
8612  char short_name; // from spec e.g., L -> thread level
8613  const char *long_name; // from spec thread_level -> thread level
8614  char field_format; // data type for snprintf (typically 'd' or 's'
8615  // for integer or string)
8616 } kmp_affinity_format_field_t;
8617 
8618 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8619 #if KMP_AFFINITY_SUPPORTED
8620  {'A', "thread_affinity", 's'},
8621 #endif
8622  {'t', "team_num", 'd'},
8623  {'T', "num_teams", 'd'},
8624  {'L', "nesting_level", 'd'},
8625  {'n', "thread_num", 'd'},
8626  {'N', "num_threads", 'd'},
8627  {'a', "ancestor_tnum", 'd'},
8628  {'H', "host", 's'},
8629  {'P', "process_id", 'd'},
8630  {'i', "native_thread_id", 'd'}};
8631 
8632 // Return the number of characters it takes to hold the field
8633 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8634  const char **ptr,
8635  kmp_str_buf_t *field_buffer) {
8636  int rc, format_index, field_value;
8637  const char *width_left, *width_right;
8638  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8639  static const int FORMAT_SIZE = 20;
8640  char format[FORMAT_SIZE] = {0};
8641  char absolute_short_name = 0;
8642 
8643  KMP_DEBUG_ASSERT(gtid >= 0);
8644  KMP_DEBUG_ASSERT(th);
8645  KMP_DEBUG_ASSERT(**ptr == '%');
8646  KMP_DEBUG_ASSERT(field_buffer);
8647 
8648  __kmp_str_buf_clear(field_buffer);
8649 
8650  // Skip the initial %
8651  (*ptr)++;
8652 
8653  // Check for %% first
8654  if (**ptr == '%') {
8655  __kmp_str_buf_cat(field_buffer, "%", 1);
8656  (*ptr)++; // skip over the second %
8657  return 1;
8658  }
8659 
8660  // Parse field modifiers if they are present
8661  pad_zeros = false;
8662  if (**ptr == '0') {
8663  pad_zeros = true;
8664  (*ptr)++; // skip over 0
8665  }
8666  right_justify = false;
8667  if (**ptr == '.') {
8668  right_justify = true;
8669  (*ptr)++; // skip over .
8670  }
8671  // Parse width of field: [width_left, width_right)
8672  width_left = width_right = NULL;
8673  if (**ptr >= '0' && **ptr <= '9') {
8674  width_left = *ptr;
8675  SKIP_DIGITS(*ptr);
8676  width_right = *ptr;
8677  }
8678 
8679  // Create the format for KMP_SNPRINTF based on flags parsed above
8680  format_index = 0;
8681  format[format_index++] = '%';
8682  if (!right_justify)
8683  format[format_index++] = '-';
8684  if (pad_zeros)
8685  format[format_index++] = '0';
8686  if (width_left && width_right) {
8687  int i = 0;
8688  // Only allow 8 digit number widths.
8689  // This also prevents overflowing format variable
8690  while (i < 8 && width_left < width_right) {
8691  format[format_index++] = *width_left;
8692  width_left++;
8693  i++;
8694  }
8695  }
8696 
8697  // Parse a name (long or short)
8698  // Canonicalize the name into absolute_short_name
8699  found_valid_name = false;
8700  parse_long_name = (**ptr == '{');
8701  if (parse_long_name)
8702  (*ptr)++; // skip initial left brace
8703  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8704  sizeof(__kmp_affinity_format_table[0]);
8705  ++i) {
8706  char short_name = __kmp_affinity_format_table[i].short_name;
8707  const char *long_name = __kmp_affinity_format_table[i].long_name;
8708  char field_format = __kmp_affinity_format_table[i].field_format;
8709  if (parse_long_name) {
8710  size_t length = KMP_STRLEN(long_name);
8711  if (strncmp(*ptr, long_name, length) == 0) {
8712  found_valid_name = true;
8713  (*ptr) += length; // skip the long name
8714  }
8715  } else if (**ptr == short_name) {
8716  found_valid_name = true;
8717  (*ptr)++; // skip the short name
8718  }
8719  if (found_valid_name) {
8720  format[format_index++] = field_format;
8721  format[format_index++] = '\0';
8722  absolute_short_name = short_name;
8723  break;
8724  }
8725  }
8726  if (parse_long_name) {
8727  if (**ptr != '}') {
8728  absolute_short_name = 0;
8729  } else {
8730  (*ptr)++; // skip over the right brace
8731  }
8732  }
8733 
8734  // Attempt to fill the buffer with the requested
8735  // value using snprintf within __kmp_str_buf_print()
8736  switch (absolute_short_name) {
8737  case 't':
8738  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8739  break;
8740  case 'T':
8741  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8742  break;
8743  case 'L':
8744  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8745  break;
8746  case 'n':
8747  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8748  break;
8749  case 'H': {
8750  static const int BUFFER_SIZE = 256;
8751  char buf[BUFFER_SIZE];
8752  __kmp_expand_host_name(buf, BUFFER_SIZE);
8753  rc = __kmp_str_buf_print(field_buffer, format, buf);
8754  } break;
8755  case 'P':
8756  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8757  break;
8758  case 'i':
8759  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8760  break;
8761  case 'N':
8762  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8763  break;
8764  case 'a':
8765  field_value =
8766  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8767  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8768  break;
8769 #if KMP_AFFINITY_SUPPORTED
8770  case 'A': {
8771  kmp_str_buf_t buf;
8772  __kmp_str_buf_init(&buf);
8773  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8774  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8775  __kmp_str_buf_free(&buf);
8776  } break;
8777 #endif
8778  default:
8779  // According to the spec, if an implementation does not have info for the
8780  // field type, then "undefined" is printed
8781  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8782  // Skip the field
8783  if (parse_long_name) {
8784  SKIP_TOKEN(*ptr);
8785  if (**ptr == '}')
8786  (*ptr)++;
8787  } else {
8788  (*ptr)++;
8789  }
8790  }
8791 
8792  KMP_ASSERT(format_index <= FORMAT_SIZE);
8793  return rc;
8794 }
8795 
8796 /*
8797  * Return number of characters needed to hold the affinity string
8798  * (not including null byte character)
8799  * The resultant string is printed to buffer, which the caller can then
8800  * handle afterwards
8801  */
8802 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8803  kmp_str_buf_t *buffer) {
8804  const char *parse_ptr;
8805  size_t retval;
8806  const kmp_info_t *th;
8807  kmp_str_buf_t field;
8808 
8809  KMP_DEBUG_ASSERT(buffer);
8810  KMP_DEBUG_ASSERT(gtid >= 0);
8811 
8812  __kmp_str_buf_init(&field);
8813  __kmp_str_buf_clear(buffer);
8814 
8815  th = __kmp_threads[gtid];
8816  retval = 0;
8817 
8818  // If format is NULL or zero-length string, then we use
8819  // affinity-format-var ICV
8820  parse_ptr = format;
8821  if (parse_ptr == NULL || *parse_ptr == '\0') {
8822  parse_ptr = __kmp_affinity_format;
8823  }
8824  KMP_DEBUG_ASSERT(parse_ptr);
8825 
8826  while (*parse_ptr != '\0') {
8827  // Parse a field
8828  if (*parse_ptr == '%') {
8829  // Put field in the buffer
8830  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8831  __kmp_str_buf_catbuf(buffer, &field);
8832  retval += rc;
8833  } else {
8834  // Put literal character in buffer
8835  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8836  retval++;
8837  parse_ptr++;
8838  }
8839  }
8840  __kmp_str_buf_free(&field);
8841  return retval;
8842 }
8843 
8844 // Displays the affinity string to stdout
8845 void __kmp_aux_display_affinity(int gtid, const char *format) {
8846  kmp_str_buf_t buf;
8847  __kmp_str_buf_init(&buf);
8848  __kmp_aux_capture_affinity(gtid, format, &buf);
8849  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8850  __kmp_str_buf_free(&buf);
8851 }
8852 
8853 /* ------------------------------------------------------------------------ */
8854 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8855  int blocktime = arg; /* argument is in microseconds */
8856 #if KMP_USE_MONITOR
8857  int bt_intervals;
8858 #endif
8859  kmp_int8 bt_set;
8860 
8861  __kmp_save_internal_controls(thread);
8862 
8863  /* Normalize and set blocktime for the teams */
8864  if (blocktime < KMP_MIN_BLOCKTIME)
8865  blocktime = KMP_MIN_BLOCKTIME;
8866  else if (blocktime > KMP_MAX_BLOCKTIME)
8867  blocktime = KMP_MAX_BLOCKTIME;
8868 
8869  set__blocktime_team(thread->th.th_team, tid, blocktime);
8870  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8871 
8872 #if KMP_USE_MONITOR
8873  /* Calculate and set blocktime intervals for the teams */
8874  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8875 
8876  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8877  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8878 #endif
8879 
8880  /* Set whether blocktime has been set to "TRUE" */
8881  bt_set = TRUE;
8882 
8883  set__bt_set_team(thread->th.th_team, tid, bt_set);
8884  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8885 #if KMP_USE_MONITOR
8886  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8887  "bt_intervals=%d, monitor_updates=%d\n",
8888  __kmp_gtid_from_tid(tid, thread->th.th_team),
8889  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8890  __kmp_monitor_wakeups));
8891 #else
8892  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8893  __kmp_gtid_from_tid(tid, thread->th.th_team),
8894  thread->th.th_team->t.t_id, tid, blocktime));
8895 #endif
8896 }
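
// The value reaches this routine from the KMP_BLOCKTIME environment variable
// (parsed in kmp_settings.cpp) or from the kmp_set_blocktime() extension
// (assumed to be the entry point declared in omp.h; note this internal
// routine takes microseconds, per the comment above). A usage sketch:
//
//   #include <omp.h>
//   int main(void) {
//     kmp_set_blocktime(0); // 0: waiting threads sleep immediately at barriers
//                           // instead of spin-waiting for the block time
//   #pragma omp parallel
//     { /* ... */ }
//     return 0;
//   }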
8897 
8898 void __kmp_aux_set_defaults(char const *str, size_t len) {
8899  if (!__kmp_init_serial) {
8900  __kmp_serial_initialize();
8901  }
8902  __kmp_env_initialize(str);
8903 
8904  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8905  __kmp_env_print();
8906  }
8907 } // __kmp_aux_set_defaults
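
// Usage sketch (kmp_set_defaults is an extension assumed to be declared in
// omp.h; the string uses the same "NAME=value" syntax as the environment, and
// settings that are only read at initialization should be applied before the
// first parallel region):
//
//   #include <omp.h>
//   int main(void) {
//     kmp_set_defaults("KMP_STACKSIZE=8m");
//   #pragma omp parallel
//     { /* ... */ }
//     return 0;
//   }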
8908 
8909 /* ------------------------------------------------------------------------ */
8910 /* internal fast reduction routines */
8911 
8912 PACKED_REDUCTION_METHOD_T
8913 __kmp_determine_reduction_method(
8914  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8915  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8916  kmp_critical_name *lck) {
8917 
8918  // Default reduction method: critical construct ( lck != NULL, like in current
8919  // PAROPT )
8920  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8921  // can be selected by RTL
8922  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8923  // can be selected by RTL
8924  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8925  // among generated by PAROPT.
8926 
8927  PACKED_REDUCTION_METHOD_T retval;
8928 
8929  int team_size;
8930 
8931  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8932 
8933 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8934  (loc && \
8935  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8936 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8937 
8938  retval = critical_reduce_block;
8939 
8940  // another way of getting the team size (with 1 dynamic dereference) is slower
8941  team_size = __kmp_get_team_num_threads(global_tid);
8942  if (team_size == 1) {
8943 
8944  retval = empty_reduce_block;
8945 
8946  } else {
8947 
8948  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8949 
8950 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8951  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8952  KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8953 
8954 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8955  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU || \
8956  KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8957 
8958  int teamsize_cutoff = 4;
8959 
8960 #if KMP_MIC_SUPPORTED
8961  if (__kmp_mic_type != non_mic) {
8962  teamsize_cutoff = 8;
8963  }
8964 #endif
8965  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8966  if (tree_available) {
8967  if (team_size <= teamsize_cutoff) {
8968  if (atomic_available) {
8969  retval = atomic_reduce_block;
8970  }
8971  } else {
8972  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8973  }
8974  } else if (atomic_available) {
8975  retval = atomic_reduce_block;
8976  }
8977 #else
8978 #error "Unknown or unsupported OS"
8979 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8980  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU ||
8981  // KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8982 
8983 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8984  KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC
8985 
8986 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8987  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HAIKU || KMP_OS_HURD || \
8988  KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8989 
8990  // basic tuning
8991 
8992  if (atomic_available) {
8993  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8994  retval = atomic_reduce_block;
8995  }
8996  } // otherwise: use critical section
8997 
8998 #elif KMP_OS_DARWIN
8999 
9000  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9001  if (atomic_available && (num_vars <= 3)) {
9002  retval = atomic_reduce_block;
9003  } else if (tree_available) {
9004  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
9005  (reduce_size < (2000 * sizeof(kmp_real64)))) {
9006  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
9007  }
9008  } // otherwise: use critical section
9009 
9010 #else
9011 #error "Unknown or unsupported OS"
9012 #endif
9013 
9014 #else
9015 #error "Unknown or unsupported architecture"
9016 #endif
9017  }
9018 
9019  // KMP_FORCE_REDUCTION
9020 
9021  // If the team is serialized (team_size == 1), ignore the forced reduction
9022  // method and stay with the unsynchronized method (empty_reduce_block)
9023  if (__kmp_force_reduction_method != reduction_method_not_defined &&
9024  team_size != 1) {
9025 
9026  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
9027 
9028  int atomic_available, tree_available;
9029 
9030  switch ((forced_retval = __kmp_force_reduction_method)) {
9031  case critical_reduce_block:
9032  KMP_ASSERT(lck); // lck should be != 0
9033  break;
9034 
9035  case atomic_reduce_block:
9036  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9037  if (!atomic_available) {
9038  KMP_WARNING(RedMethodNotSupported, "atomic");
9039  forced_retval = critical_reduce_block;
9040  }
9041  break;
9042 
9043  case tree_reduce_block:
9044  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9045  if (!tree_available) {
9046  KMP_WARNING(RedMethodNotSupported, "tree");
9047  forced_retval = critical_reduce_block;
9048  } else {
9049 #if KMP_FAST_REDUCTION_BARRIER
9050  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9051 #endif
9052  }
9053  break;
9054 
9055  default:
9056  KMP_ASSERT(0); // "unsupported method specified"
9057  }
9058 
9059  retval = forced_retval;
9060  }
9061 
9062  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9063 
9064 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9065 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9066 
9067  return (retval);
9068 }
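
// A minimal example that exercises this selection: compilers typically lower
// a reduction clause to __kmpc_reduce_nowait / __kmpc_reduce (kmp_csupport.cpp),
// which call __kmp_determine_reduction_method to choose among the critical,
// atomic, and tree methods based on team size, loc flags, and whether the
// compiler emitted reduce_data/reduce_func.
//
//   #include <stdio.h>
//   int main(void) {
//     long sum = 0;
//   #pragma omp parallel for reduction(+ : sum)
//     for (int i = 0; i < 1000; ++i)
//       sum += i;
//     printf("%ld\n", sum); // 499500
//     return 0;
//   }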
9069 // this function is for testing set/get/determine reduce method
9070 kmp_int32 __kmp_get_reduce_method(void) {
9071  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9072 }
9073 
9074 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9075 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9076 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9077 
9078 // Hard pause shuts down the runtime completely. Resume happens naturally when
9079 // OpenMP is used subsequently.
9080 void __kmp_hard_pause() {
9081  __kmp_pause_status = kmp_hard_paused;
9082  __kmp_internal_end_thread(-1);
9083 }
9084 
9085 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9086 void __kmp_resume_if_soft_paused() {
9087  if (__kmp_pause_status == kmp_soft_paused) {
9088  __kmp_pause_status = kmp_not_paused;
9089 
9090  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9091  kmp_info_t *thread = __kmp_threads[gtid];
9092  if (thread) { // Wake it if sleeping
9093  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9094  thread);
9095  if (fl.is_sleeping())
9096  fl.resume(gtid);
9097  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9098  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9099  } else { // thread holds the lock and may sleep soon
9100  do { // until either the thread sleeps, or we can get the lock
9101  if (fl.is_sleeping()) {
9102  fl.resume(gtid);
9103  break;
9104  } else if (__kmp_try_suspend_mx(thread)) {
9105  __kmp_unlock_suspend_mx(thread);
9106  break;
9107  }
9108  } while (1);
9109  }
9110  }
9111  }
9112  }
9113 }
9114 
9115 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9116 // TODO: add warning messages
9117 int __kmp_pause_resource(kmp_pause_status_t level) {
9118  if (level == kmp_not_paused) { // requesting resume
9119  if (__kmp_pause_status == kmp_not_paused) {
9120  // error message about runtime not being paused, so can't resume
9121  return 1;
9122  } else {
9123  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9124  __kmp_pause_status == kmp_hard_paused);
9125  __kmp_pause_status = kmp_not_paused;
9126  return 0;
9127  }
9128  } else if (level == kmp_soft_paused) { // requesting soft pause
9129  if (__kmp_pause_status != kmp_not_paused) {
9130  // error message about already being paused
9131  return 1;
9132  } else {
9133  __kmp_soft_pause();
9134  return 0;
9135  }
9136  } else if (level == kmp_hard_paused || level == kmp_stop_tool_paused) {
9137  // requesting hard pause or stop_tool pause
9138  if (__kmp_pause_status != kmp_not_paused) {
9139  // error message about already being paused
9140  return 1;
9141  } else {
9142  __kmp_hard_pause();
9143  return 0;
9144  }
9145  } else {
9146  // error message about invalid level
9147  return 1;
9148  }
9149 }
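
// A usage sketch, assuming the standard OpenMP 5.0 omp_pause_resource_all
// entry point routes here (via __kmpc_pause_resource); returns 0 on success:
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//   #pragma omp parallel
//     { /* warm up the thread pool */ }
//     if (omp_pause_resource_all(omp_pause_soft) == 0)
//       printf("runtime soft-paused\n");
//   #pragma omp parallel // using OpenMP again resumes the runtime
//     { /* ... */ }
//     return 0;
//   }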
9150 
9151 void __kmp_omp_display_env(int verbose) {
9152  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9153  if (__kmp_init_serial == 0)
9154  __kmp_do_serial_initialize();
9155  __kmp_display_env_impl(!verbose, verbose);
9156  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9157 }
9158 
9159 // The team size is changing, so distributed barrier must be modified
9160 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9161  int new_nthreads) {
9162  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9163  bp_dist_bar);
9164  kmp_info_t **other_threads = team->t.t_threads;
9165 
9166  // We want all the workers to stop waiting on the barrier while we adjust the
9167  // size of the team.
9168  for (int f = 1; f < old_nthreads; ++f) {
9169  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9170  // Ignore threads that are already inactive or not present in the team
9171  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9172  // teams construct causes thread_limit to get passed in, and some of
9173  // those could be inactive; just ignore them
9174  continue;
9175  }
9176  // If thread is transitioning still to in_use state, wait for it
9177  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9178  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9179  KMP_CPU_PAUSE();
9180  }
9181  // The thread should be in_use now
9182  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9183  // Transition to unused state
9184  team->t.t_threads[f]->th.th_used_in_team.store(2);
9185  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9186  }
9187  // Release all the workers
9188  team->t.b->go_release();
9189 
9190  KMP_MFENCE();
9191 
9192  // Workers should see transition status 2 and move to 0; but may need to be
9193  // woken up first
9194  int count = old_nthreads - 1;
9195  while (count > 0) {
9196  count = old_nthreads - 1;
9197  for (int f = 1; f < old_nthreads; ++f) {
9198  if (other_threads[f]->th.th_used_in_team.load() != 0) {
9199  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9200  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9201  void *, other_threads[f]->th.th_sleep_loc);
9202  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9203  }
9204  } else {
9205  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9206  count--;
9207  }
9208  }
9209  }
9210  // Now update the barrier size
9211  team->t.b->update_num_threads(new_nthreads);
9212  team->t.b->go_reset();
9213 }
9214 
9215 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9216  // Add the threads back to the team
9217  KMP_DEBUG_ASSERT(team);
9218  // Threads were paused and pointed at th_used_in_team temporarily during a
9219  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9220  // the thread that it should transition itself back into the team. Then, if
9221  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9222  // to wake it up.
9223  for (int f = 1; f < new_nthreads; ++f) {
9224  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9225  (void)KMP_COMPARE_AND_STORE_ACQ32(
9226  &(team->t.t_threads[f]->th.th_used_in_team), 0, 3);
9227  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9228  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9229  (kmp_flag_32<false, false> *)NULL);
9230  }
9231  }
9232  // The threads should be transitioning to the team; when they are done, they
9233  // should have set th_used_in_team to 1. This loop forces the primary thread
9234  // to wait until all threads have moved into the team and are waiting in the barrier.
9235  int count = new_nthreads - 1;
9236  while (count > 0) {
9237  count = new_nthreads - 1;
9238  for (int f = 1; f < new_nthreads; ++f) {
9239  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9240  count--;
9241  }
9242  }
9243  }
9244 }
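
// A self-contained sketch, kept out of the build with "#if 0", of the
// th_used_in_team hand-off implemented by the two functions above:
// 1 = in the team, 2 = asked to leave, 0 = out of the team, 3 = asked to
// rejoin. All names below are invented for illustration.
#if 0 // standalone illustration, not runtime code
#include <atomic>
#include <cassert>
#include <thread>

int main() {
  std::atomic<int> used_in_team{1}; // the worker starts as a team member

  std::thread worker([&] {
    // Worker side: leave when asked (2 -> 0), rejoin when asked (3 -> 1).
    while (used_in_team.load() != 2) { }
    used_in_team.store(0);
    while (used_in_team.load() != 3) { }
    used_in_team.store(1);
  });

  // Primary side: request the leave and wait for 0 (the barrier would be
  // resized here), then request the rejoin via CAS and wait for 1.
  used_in_team.store(2);
  while (used_in_team.load() != 0) { }
  int expected = 0;
  used_in_team.compare_exchange_strong(expected, 3);
  while (used_in_team.load() != 1) { }

  worker.join();
  assert(used_in_team.load() == 1);
  return 0;
}
#endif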
9245 
9246 // Globals and functions for hidden helper tasks
9247 kmp_info_t **__kmp_hidden_helper_threads;
9248 kmp_info_t *__kmp_hidden_helper_main_thread;
9249 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9250 #if KMP_OS_LINUX
9251 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9252 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9253 #else
9254 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9255 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9256 #endif
9257 
9258 namespace {
9259 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9260 
9261 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9262  // This is an explicit synchronization across all hidden helper threads, in
9263  // case a regular thread pushes a hidden helper task to a hidden helper thread
9264  // that has not yet been awakened since being released by the main thread
9265  // after the team was created.
9266  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9267  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9268  __kmp_hidden_helper_threads_num)
9269  ;
9270 
9271  // If this is the main thread, wait for the signal
9272  if (__kmpc_master(nullptr, *gtid)) {
9273  // First, unset the initial state and release the initial thread
9274  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9275  __kmp_hidden_helper_initz_release();
9276  __kmp_hidden_helper_main_thread_wait();
9277  // Now wake up all worker threads
9278  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9279  __kmp_hidden_helper_worker_thread_signal();
9280  }
9281  }
9282 }
9283 } // namespace
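
// A standalone sketch, kept out of the build with "#if 0", of the counting
// rendezvous used in __kmp_hidden_helper_wrapper_fn: each thread increments a
// shared atomic counter and spins until all N threads have checked in, which
// guarantees every helper has run at least once before any of them proceeds.
// The names below are invented for illustration.
#if 0 // standalone illustration, not runtime code
#include <atomic>
#include <thread>
#include <vector>

int main() {
  const int N = 8;
  std::atomic<int> arrived{0};
  std::vector<std::thread> helpers;

  for (int i = 0; i < N; ++i) {
    helpers.emplace_back([&] {
      arrived.fetch_add(1, std::memory_order_release);
      while (arrived.load(std::memory_order_acquire) != N) { } // rendezvous
      // ... the per-thread work would start here ...
    });
  }
  for (std::thread &t : helpers)
    t.join();
  return 0;
}
#endif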
9284 
9285 void __kmp_hidden_helper_threads_initz_routine() {
9286  // Create a new root for hidden helper team/threads
9287  const int gtid = __kmp_register_root(TRUE);
9288  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9289  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9290  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9291  __kmp_hidden_helper_threads_num;
9292 
9293  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9294 
9295  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9296 
9297  // Set the initialization flag to FALSE
9298  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9299 
9300  __kmp_hidden_helper_threads_deinitz_release();
9301 }
9302 
9303 /* Nesting Mode:
9304  Set via KMP_NESTING_MODE, which takes an integer.
9305  Note: we skip duplicate topology levels, and skip levels with only
9306  one entity.
9307  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9308  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9309  in the topology, and initializes the number of threads at each of those
9310  levels to the number of entities at each level, respectively, below the
9311  entity at the parent level.
9312  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9313  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9314  the user to turn nesting on explicitly. This is an even more experimental
9315  option within an already experimental feature, and may change or go away
9316  in the future.
9317 */
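
// A minimal observation sketch, kept out of the build with "#if 0": run a
// program like the one below with e.g. KMP_NESTING_MODE=1 to see the nesting
// levels and per-level thread counts the runtime derives from the topology;
// only standard OpenMP queries are used, and the scenario is illustrative.
#if 0 // example user program, not runtime code
#include <omp.h>
#include <stdio.h>

int main(void) {
  printf("max-active-levels = %d\n", omp_get_max_active_levels());
#pragma omp parallel
  {
#pragma omp single
    printf("level 1 threads = %d\n", omp_get_num_threads());
#pragma omp parallel
    {
#pragma omp single
      printf("level 2 threads = %d (per outer thread)\n", omp_get_num_threads());
    }
  }
  return 0;
}
#endif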
9318 
9319 // Allocate space to store nesting levels
9320 void __kmp_init_nesting_mode() {
9321  int levels = KMP_HW_LAST;
9322  __kmp_nesting_mode_nlevels = levels;
9323  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9324  for (int i = 0; i < levels; ++i)
9325  __kmp_nesting_nth_level[i] = 0;
9326  if (__kmp_nested_nth.size < levels) {
9327  __kmp_nested_nth.nth =
9328  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9329  __kmp_nested_nth.size = levels;
9330  }
9331 }
9332 
9333 // Set the number of threads for top nesting levels; must be called after the topology is set
9334 void __kmp_set_nesting_mode_threads() {
9335  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9336 
9337  if (__kmp_nesting_mode == 1)
9338  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9339  else if (__kmp_nesting_mode > 1)
9340  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9341 
9342  if (__kmp_topology) { // use topology info
9343  int loc, hw_level;
9344  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9345  loc < __kmp_nesting_mode_nlevels;
9346  loc++, hw_level++) {
9347  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9348  if (__kmp_nesting_nth_level[loc] == 1)
9349  loc--;
9350  }
9351  // Make sure all cores are used
9352  if (__kmp_nesting_mode > 1 && loc > 1) {
9353  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9354  int num_cores = __kmp_topology->get_count(core_level);
9355  int upper_levels = 1;
9356  for (int level = 0; level < loc - 1; ++level)
9357  upper_levels *= __kmp_nesting_nth_level[level];
9358  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9359  __kmp_nesting_nth_level[loc - 1] =
9360  num_cores / __kmp_nesting_nth_level[loc - 2];
9361  }
9362  __kmp_nesting_mode_nlevels = loc;
9363  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9364  } else { // no topology info available; provide a reasonable estimate
9365  if (__kmp_avail_proc >= 4) {
9366  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9367  __kmp_nesting_nth_level[1] = 2;
9368  __kmp_nesting_mode_nlevels = 2;
9369  } else {
9370  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9371  __kmp_nesting_mode_nlevels = 1;
9372  }
9373  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9374  }
9375  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9376  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9377  }
9378  set__nproc(thread, __kmp_nesting_nth_level[0]);
9379  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9380  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9381  if (get__max_active_levels(thread) > 1) {
9382  // if max levels was set, set nesting mode levels to same
9383  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9384  }
9385  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9386  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9387 }
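
// A standalone worked sketch, kept out of the build with "#if 0", of the
// per-level computation above: given the topology "ratios" (entities per
// parent entity at each level), levels with a single entity are skipped and
// the remaining ratios become the nested thread counts, e.g. 2 sockets x
// 8 cores x 2 hardware threads -> {2, 8, 2}. The numbers are illustrative.
#if 0 // standalone illustration, not runtime code
#include <cstdio>
#include <vector>

int main() {
  // socket, NUMA (ratio 1, skipped), core, hardware thread
  std::vector<int> ratios = {2, 1, 8, 2};
  std::vector<int> nth_level;
  for (int r : ratios)
    if (r != 1) // mirrors the "loc--" skip of single-entity levels above
      nth_level.push_back(r);
  for (std::size_t i = 0; i < nth_level.size(); ++i)
    std::printf("nesting level %zu: %d threads\n", i, nth_level[i]);
  return 0;
}
#endif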
9388 
9389 // Empty symbols to export (see exports_so.txt) when feature is disabled
9390 extern "C" {
9391 #if !KMP_STATS_ENABLED
9392 void __kmp_reset_stats() {}
9393 #endif
9394 #if !USE_DEBUGGER
9395 int __kmp_omp_debug_struct_info = FALSE;
9396 int __kmp_debugging = FALSE;
9397 #endif
9398 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9399 void __kmp_itt_fini_ittlib() {}
9400 void __kmp_itt_init_ittlib() {}
9401 #endif
9402 }
9403 
9404 // end of file