1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #include "kmp_utils.h"
28 #if KMP_USE_HIER_SCHED
29 #include "kmp_dispatch_hier.h"
30 #endif
31 
32 #if OMPT_SUPPORT
33 #include "ompt-specific.h"
34 #endif
35 #if OMPD_SUPPORT
36 #include "ompd-specific.h"
37 #endif
38 
39 #if OMP_PROFILING_SUPPORT
40 #include "llvm/Support/TimeProfiler.h"
41 static char *ProfileTraceFile = nullptr;
42 #endif
43 
44 /* these are temporary issues to be dealt with */
45 #define KMP_USE_PRCTL 0
46 
47 #if KMP_OS_WINDOWS
48 #include <process.h>
49 #endif
50 
51 #ifndef KMP_USE_SHM
52 // Windows and WASI do not need these include files as they don't use shared
53 // memory.
54 #else
55 #include <sys/mman.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #define SHM_SIZE 1024
59 #endif
60 
61 #if defined(KMP_GOMP_COMPAT)
62 char const __kmp_version_alt_comp[] =
63  KMP_VERSION_PREFIX "alternative compiler support: yes";
64 #endif /* defined(KMP_GOMP_COMPAT) */
65 
66 char const __kmp_version_omp_api[] =
67  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
68 
69 #ifdef KMP_DEBUG
70 char const __kmp_version_lock[] =
71  KMP_VERSION_PREFIX "lock type: run time selectable";
72 #endif /* KMP_DEBUG */
73 
74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
75 
76 /* ------------------------------------------------------------------------ */
77 
78 #if KMP_USE_MONITOR
79 kmp_info_t __kmp_monitor;
80 #endif
81 
82 /* Forward declarations */
83 
84 void __kmp_cleanup(void);
85 
86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
87  int gtid);
88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
89  kmp_internal_control_t *new_icvs,
90  ident_t *loc);
91 #if KMP_AFFINITY_SUPPORTED
92 static void __kmp_partition_places(kmp_team_t *team,
93  int update_master_only = 0);
94 #endif
95 static void __kmp_do_serial_initialize(void);
96 void __kmp_fork_barrier(int gtid, int tid);
97 void __kmp_join_barrier(int gtid);
98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
99  kmp_internal_control_t *new_icvs, ident_t *loc);
100 
101 #ifdef USE_LOAD_BALANCE
102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
103 #endif
104 
105 static int __kmp_expand_threads(int nNeed);
106 #if KMP_OS_WINDOWS
107 static int __kmp_unregister_root_other_thread(int gtid);
108 #endif
109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
111 
112 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
113  int new_nthreads);
114 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
115 
116 static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
117  int level) {
118  kmp_nested_nthreads_t *new_nested_nth =
119  (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
120  sizeof(kmp_nested_nthreads_t));
121  int new_size = level + thr->th.th_set_nested_nth_sz;
122  new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
123  for (int i = 0; i < level + 1; ++i)
124  new_nested_nth->nth[i] = 0;
125  for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
126  new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
127  new_nested_nth->size = new_nested_nth->used = new_size;
128  return new_nested_nth;
129 }
130 
131 /* Calculate the identifier of the current thread */
132 /* Fast (and somewhat portable) way to get the unique identifier of the
133  executing thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
134 int __kmp_get_global_thread_id() {
135  int i;
136  kmp_info_t **other_threads;
137  size_t stack_data;
138  char *stack_addr;
139  size_t stack_size;
140  char *stack_base;
141 
142  KA_TRACE(
143  1000,
144  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
145  __kmp_nth, __kmp_all_nth));
146 
147  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
148  to a parallel region, this returns KMP_GTID_DNE to force the caller to run
149  serial_initialize. KMP_GTID_DNE had to be handled at all call sites, or else
150  __kmp_init_gtid had to be guaranteed, for this to work. */
151 
152  if (!TCR_4(__kmp_init_gtid))
153  return KMP_GTID_DNE;
154 
155 #ifdef KMP_TDATA_GTID
156  if (TCR_4(__kmp_gtid_mode) >= 3) {
157  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
158  return __kmp_gtid;
159  }
160 #endif
161  if (TCR_4(__kmp_gtid_mode) >= 2) {
162  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
163  return __kmp_gtid_get_specific();
164  }
165  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
166 
167  stack_addr = (char *)&stack_data;
168  other_threads = __kmp_threads;
169 
170  /* ATT: The code below is a source of potential bugs due to unsynchronized
171  access to __kmp_threads array. For example:
172  1. Current thread loads other_threads[i] to thr and checks it, it is
173  non-NULL.
174  2. Current thread is suspended by OS.
175  3. Another thread unregisters and finishes (debug versions of free()
176  may fill memory with something like 0xEF).
177  4. Current thread is resumed.
178  5. Current thread reads junk from *thr.
179  TODO: Fix it. --ln */
180 
181  for (i = 0; i < __kmp_threads_capacity; i++) {
182 
183  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
184  if (!thr)
185  continue;
186 
187  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
188  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
189 
190  /* stack grows down -- search through all of the active threads */
191 
192  if (stack_addr <= stack_base) {
193  size_t stack_diff = stack_base - stack_addr;
194 
195  if (stack_diff <= stack_size) {
196  /* The only way we can be closer than the allocated */
197  /* stack size is if we are running on this thread. */
198  // __kmp_gtid_get_specific can return negative value because this
199  // function can be called by thread destructor. However, before the
200  // thread destructor is called, the value of the corresponding
201  // thread-specific data will be reset to NULL.
202  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() < 0 ||
203  __kmp_gtid_get_specific() == i);
204  return i;
205  }
206  }
207  }
208 
209  /* get specific to try and determine our gtid */
210  KA_TRACE(1000,
211  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
212  "thread, using TLS\n"));
213  i = __kmp_gtid_get_specific();
214 
215  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
216 
217  /* if we haven't been assigned a gtid, then return the error code */
218  if (i < 0)
219  return i;
220 
221  // other_threads[i] can be nullptr at this point because the corresponding
222  // thread could have already been destroyed. This can happen when this
223  // function is called from the library shutdown routine.
224  if (!TCR_SYNC_PTR(other_threads[i]))
225  return i;
226 
227  /* dynamically updated stack window for uber threads to avoid get_specific
228  call */
229  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
230  KMP_FATAL(StackOverflow, i);
231  }
232 
233  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
234  if (stack_addr > stack_base) {
235  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
236  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
237  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
238  stack_base);
239  } else {
240  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
241  stack_base - stack_addr);
242  }
243 
244  /* Reprint stack bounds for ubermaster since they have been refined */
245  if (__kmp_storage_map) {
246  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
247  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
248  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
249  other_threads[i]->th.th_info.ds.ds_stacksize,
250  "th_%d stack (refinement)", i);
251  }
252  return i;
253 }
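// Illustrative sketch (not part of the runtime): the stack-window lookup above
// reduces to the interval test below. Given the address of a local variable
// and a table of (stack base, stack size) pairs for registered threads, the
// owning slot is the one whose window contains that address, because stacks
// grow downward from their base. The names below are hypothetical and the
// block is compiled out.
#if 0
#include <cstddef>

struct example_stack_desc {
  const char *base;  // highest address of the stack
  std::size_t size;  // number of bytes below 'base'
};

// Return the index of the descriptor whose window contains 'addr', or -1.
static int example_find_owner(const example_stack_desc *descs, int n,
                              const char *addr) {
  for (int i = 0; i < n; ++i) {
    if (addr <= descs[i].base &&
        (std::size_t)(descs[i].base - addr) <= descs[i].size)
      return i;
  }
  return -1;
}
#endif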
254 
255 int __kmp_get_global_thread_id_reg() {
256  int gtid;
257 
258  if (!__kmp_init_serial) {
259  gtid = KMP_GTID_DNE;
260  } else
261 #ifdef KMP_TDATA_GTID
262  if (TCR_4(__kmp_gtid_mode) >= 3) {
263  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
264  gtid = __kmp_gtid;
265  } else
266 #endif
267  if (TCR_4(__kmp_gtid_mode) >= 2) {
268  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
269  gtid = __kmp_gtid_get_specific();
270  } else {
271  KA_TRACE(1000,
272  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
273  gtid = __kmp_get_global_thread_id();
274  }
275 
276  /* we must be a new uber master sibling thread */
277  if (gtid == KMP_GTID_DNE) {
278  KA_TRACE(10,
279  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
280  "Registering a new gtid.\n"));
281  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
282  if (!__kmp_init_serial) {
283  __kmp_do_serial_initialize();
284  gtid = __kmp_gtid_get_specific();
285  } else {
286  gtid = __kmp_register_root(FALSE);
287  }
288  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
289  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
290  }
291 
292  KMP_DEBUG_ASSERT(gtid >= 0);
293 
294  return gtid;
295 }
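// Illustrative sketch (not part of the runtime): the registration path above
// has the usual "check, then lock, then re-check" shape, so that exactly one
// late-arriving root performs serial initialization while others fall through
// to plain registration. A minimal stand-alone version with hypothetical
// names, compiled out:
#if 0
#include <mutex>

static std::mutex example_init_lock;
static bool example_initialized = false;

static int example_register_root(int (*do_serial_init)(void),
                                 int (*do_register)(void)) {
  std::lock_guard<std::mutex> guard(example_init_lock);
  if (!example_initialized) { // re-check under the lock
    example_initialized = true;
    return do_serial_init(); // first root initializes (and registers itself)
  }
  return do_register(); // later roots only register themselves
}
#endif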
296 
297 /* caller must hold forkjoin_lock */
298 void __kmp_check_stack_overlap(kmp_info_t *th) {
299  int f;
300  char *stack_beg = NULL;
301  char *stack_end = NULL;
302  int gtid;
303 
304  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
305  if (__kmp_storage_map) {
306  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
307  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
308 
309  gtid = __kmp_gtid_from_thread(th);
310 
311  if (gtid == KMP_GTID_MONITOR) {
312  __kmp_print_storage_map_gtid(
313  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
314  "th_%s stack (%s)", "mon",
315  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
316  } else {
317  __kmp_print_storage_map_gtid(
318  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
319  "th_%d stack (%s)", gtid,
320  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
321  }
322  }
323 
324  /* No point in checking ubermaster threads since they use refinement and
325  * cannot overlap */
326  gtid = __kmp_gtid_from_thread(th);
327  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
328  KA_TRACE(10,
329  ("__kmp_check_stack_overlap: performing extensive checking\n"));
330  if (stack_beg == NULL) {
331  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
332  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
333  }
334 
335  for (f = 0; f < __kmp_threads_capacity; f++) {
336  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
337 
338  if (f_th && f_th != th) {
339  char *other_stack_end =
340  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
341  char *other_stack_beg =
342  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
343  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
344  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
345 
346  /* Print the other stack values before the abort */
347  if (__kmp_storage_map)
348  __kmp_print_storage_map_gtid(
349  -1, other_stack_beg, other_stack_end,
350  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
351  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
352 
353  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
354  __kmp_msg_null);
355  }
356  }
357  }
358  }
359  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
360 }
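// Illustrative sketch (not part of the runtime): the extensive check above is
// an interval test; a stack conflicts with another thread's stack when either
// of its endpoints falls strictly inside the other stack's window. A
// stand-alone version of that predicate with hypothetical names, compiled out:
#if 0
static bool example_stacks_overlap(const char *beg, const char *end,
                                   const char *other_beg,
                                   const char *other_end) {
  return (beg > other_beg && beg < other_end) ||
         (end > other_beg && end < other_end);
}
#endif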
361 
362 /* ------------------------------------------------------------------------ */
363 
364 void __kmp_infinite_loop(void) {
365  static int done = FALSE;
366 
367  while (!done) {
368  KMP_YIELD(TRUE);
369  }
370 }
371 
372 #define MAX_MESSAGE 512
373 
374 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
375  char const *format, ...) {
376  char buffer[MAX_MESSAGE];
377  va_list ap;
378 
379  va_start(ap, format);
380  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
381  p2, (unsigned long)size, format);
382  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
383  __kmp_vprintf(kmp_err, buffer, ap);
384 #if KMP_PRINT_DATA_PLACEMENT
385  int node;
386  if (gtid >= 0) {
387  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
388  if (__kmp_storage_map_verbose) {
389  node = __kmp_get_host_node(p1);
390  if (node < 0) /* doesn't work, so don't try this next time */
391  __kmp_storage_map_verbose = FALSE;
392  else {
393  char *last;
394  int lastNode;
395  int localProc = __kmp_get_cpu_from_gtid(gtid);
396 
397  const int page_size = KMP_GET_PAGE_SIZE();
398 
399  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
400  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
401  if (localProc >= 0)
402  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
403  localProc >> 1);
404  else
405  __kmp_printf_no_lock(" GTID %d\n", gtid);
406 #if KMP_USE_PRCTL
407  /* The more elaborate format is disabled for now because of the prctl
408  * hanging bug. */
409  do {
410  last = p1;
411  lastNode = node;
412  /* This loop collates adjacent pages with the same host node. */
413  do {
414  p1 = (char *)p1 + page_size;
415  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
416  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
417  lastNode);
418  } while (p1 <= p2);
419 #else
420  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
421  (char *)p1 + (page_size - 1),
422  __kmp_get_host_node(p1));
423  if (p1 < p2) {
424  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
425  (char *)p2 + (page_size - 1),
426  __kmp_get_host_node(p2));
427  }
428 #endif
429  }
430  }
431  } else
432  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
433  }
434 #endif /* KMP_PRINT_DATA_PLACEMENT */
435  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
436 
437  va_end(ap);
438 }
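// Illustrative sketch (not part of the runtime): the data-placement code above
// rounds addresses down to a page boundary with the usual mask trick,
// addr & ~(page_size - 1), which is valid when page_size is a power of two.
// Hypothetical stand-alone helpers, compiled out:
#if 0
#include <cstdint>

static inline std::uintptr_t example_page_floor(std::uintptr_t addr,
                                                std::uintptr_t page_size) {
  return addr & ~(page_size - 1); // start of the page containing 'addr'
}

static inline std::uintptr_t example_page_ceil(std::uintptr_t addr,
                                               std::uintptr_t page_size) {
  return (addr + page_size - 1) & ~(page_size - 1); // next page boundary
}
#endif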
439 
440 void __kmp_warn(char const *format, ...) {
441  char buffer[MAX_MESSAGE];
442  va_list ap;
443 
444  if (__kmp_generate_warnings == kmp_warnings_off) {
445  return;
446  }
447 
448  va_start(ap, format);
449 
450  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
451  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
452  __kmp_vprintf(kmp_err, buffer, ap);
453  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
454 
455  va_end(ap);
456 }
457 
458 void __kmp_abort_process() {
459  // Later threads may stall here, but that's ok because abort() will kill them.
460  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
461 
462  if (__kmp_debug_buf) {
463  __kmp_dump_debug_buffer();
464  }
465 
466 #if KMP_OS_WINDOWS
467  // Let other threads know of abnormal termination and prevent deadlock
468  // if abort happened during library initialization or shutdown
469  __kmp_global.g.g_abort = SIGABRT;
470 
471  /* On Windows* OS, by default abort() causes a pop-up error box, which
472  stalls nightly testing. Unfortunately, we cannot reliably suppress pop-up
473  error boxes. _set_abort_behavior() works well, but this function is not
474  available in VS7 (this is not a problem for the DLL, but it is a problem
475  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
476  does not help, at least in some versions of the MS C RTL.
477
478  The following sequence seems to be the only way to simulate abort() and
479  avoid the pop-up error box. */
480  raise(SIGABRT);
481  _exit(3); // Just in case, if signal ignored, exit anyway.
482 #else
483  __kmp_unregister_library();
484  abort();
485 #endif
486 
487  __kmp_infinite_loop();
488  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
489 
490 } // __kmp_abort_process
491 
492 void __kmp_abort_thread(void) {
493  // TODO: Eliminate g_abort global variable and this function.
494  // In case of abort just call abort(), it will kill all the threads.
495  __kmp_infinite_loop();
496 } // __kmp_abort_thread
497 
498 /* Print out the storage map for the major kmp_info_t thread data structures
499  that are allocated together. */
500 
501 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
502  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
503  gtid);
504 
505  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
506  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
507 
508  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
509  sizeof(kmp_local_t), "th_%d.th_local", gtid);
510 
511  __kmp_print_storage_map_gtid(
512  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
513  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
514 
515  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
516  &thr->th.th_bar[bs_plain_barrier + 1],
517  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
518  gtid);
519 
520  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
521  &thr->th.th_bar[bs_forkjoin_barrier + 1],
522  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
523  gtid);
524 
525 #if KMP_FAST_REDUCTION_BARRIER
526  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
527  &thr->th.th_bar[bs_reduction_barrier + 1],
528  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
529  gtid);
530 #endif // KMP_FAST_REDUCTION_BARRIER
531 }
532 
533 /* Print out the storage map for the major kmp_team_t team data structures
534  that are allocated together. */
535 
536 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
537  int team_id, int num_thr) {
538  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
539  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
540  header, team_id);
541 
542  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
543  &team->t.t_bar[bs_last_barrier],
544  sizeof(kmp_balign_team_t) * bs_last_barrier,
545  "%s_%d.t_bar", header, team_id);
546 
547  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
548  &team->t.t_bar[bs_plain_barrier + 1],
549  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
550  header, team_id);
551 
552  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
553  &team->t.t_bar[bs_forkjoin_barrier + 1],
554  sizeof(kmp_balign_team_t),
555  "%s_%d.t_bar[forkjoin]", header, team_id);
556 
557 #if KMP_FAST_REDUCTION_BARRIER
558  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
559  &team->t.t_bar[bs_reduction_barrier + 1],
560  sizeof(kmp_balign_team_t),
561  "%s_%d.t_bar[reduction]", header, team_id);
562 #endif // KMP_FAST_REDUCTION_BARRIER
563 
564  __kmp_print_storage_map_gtid(
565  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
566  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
567 
568  __kmp_print_storage_map_gtid(
569  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
570  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
571 
572  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
573  &team->t.t_disp_buffer[num_disp_buff],
574  sizeof(dispatch_shared_info_t) * num_disp_buff,
575  "%s_%d.t_disp_buffer", header, team_id);
576 }
577 
578 static void __kmp_init_allocator() {
579  __kmp_init_memkind();
580  __kmp_init_target_mem();
581 }
582 static void __kmp_fini_allocator() {
583  __kmp_fini_target_mem();
584  __kmp_fini_memkind();
585 }
586 
587 /* ------------------------------------------------------------------------ */
588 
589 #if ENABLE_LIBOMPTARGET
590 static void __kmp_init_omptarget() {
591  __kmp_init_target_task();
592 }
593 #endif
594 
595 /* ------------------------------------------------------------------------ */
596 
597 #if KMP_DYNAMIC_LIB
598 #if KMP_OS_WINDOWS
599 
600 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
601  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
602 
603  switch (fdwReason) {
604 
605  case DLL_PROCESS_ATTACH:
606  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
607 
608  return TRUE;
609 
610  case DLL_PROCESS_DETACH:
611  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
612 
613  // According to Windows* documentation for DllMain entry point:
614  // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
615  // lpReserved == NULL when FreeLibrary() is called,
616  // lpReserved != NULL when the process is terminated.
617  // When FreeLibrary() is called, worker threads remain alive. So the
618  // runtime's state is consistent and executing proper shutdown is OK.
619  // When the process is terminated, worker threads have exited or been
620  // forcefully terminated by the OS and only the shutdown thread remains.
621  // This can leave the runtime in an inconsistent state.
622  // Hence, only attempt proper cleanup when FreeLibrary() is called.
623  // Otherwise, rely on OS to reclaim resources.
624  if (lpReserved == NULL)
625  __kmp_internal_end_library(__kmp_gtid_get_specific());
626 
627  return TRUE;
628 
629  case DLL_THREAD_ATTACH:
630  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
631 
632  /* if we wanted to register new siblings all the time, we would call
633  * __kmp_get_gtid() here */
634  return TRUE;
635 
636  case DLL_THREAD_DETACH:
637  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
638 
639  __kmp_internal_end_thread(__kmp_gtid_get_specific());
640  return TRUE;
641  }
642 
643  return TRUE;
644 }
645 
646 #endif /* KMP_OS_WINDOWS */
647 #endif /* KMP_DYNAMIC_LIB */
648 
649 /* __kmp_parallel_deo -- Wait until it's our turn. */
650 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
651  int gtid = *gtid_ref;
652 #ifdef BUILD_PARALLEL_ORDERED
653  kmp_team_t *team = __kmp_team_from_gtid(gtid);
654 #endif /* BUILD_PARALLEL_ORDERED */
655 
656  if (__kmp_env_consistency_check) {
657  if (__kmp_threads[gtid]->th.th_root->r.r_active)
658 #if KMP_USE_DYNAMIC_LOCK
659  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
660 #else
661  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
662 #endif
663  }
664 #ifdef BUILD_PARALLEL_ORDERED
665  if (!team->t.t_serialized) {
666  KMP_MB();
667  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
668  NULL);
669  KMP_MB();
670  }
671 #endif /* BUILD_PARALLEL_ORDERED */
672 }
673 
674 /* __kmp_parallel_dxo -- Signal the next task. */
675 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
676  int gtid = *gtid_ref;
677 #ifdef BUILD_PARALLEL_ORDERED
678  int tid = __kmp_tid_from_gtid(gtid);
679  kmp_team_t *team = __kmp_team_from_gtid(gtid);
680 #endif /* BUILD_PARALLEL_ORDERED */
681 
682  if (__kmp_env_consistency_check) {
683  if (__kmp_threads[gtid]->th.th_root->r.r_active)
684  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
685  }
686 #ifdef BUILD_PARALLEL_ORDERED
687  if (!team->t.t_serialized) {
688  KMP_MB(); /* Flush all pending memory write invalidates. */
689 
690  /* use the tid of the next thread in this team */
691  /* TODO replace with general release procedure */
692  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
693 
694  KMP_MB(); /* Flush all pending memory write invalidates. */
695  }
696 #endif /* BUILD_PARALLEL_ORDERED */
697 }
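// Illustrative sketch (not part of the runtime): __kmp_parallel_deo and
// __kmp_parallel_dxo implement round-robin turn taking on a shared counter.
// Each thread waits until the counter equals its own tid, executes its ordered
// chunk, then stores (tid + 1) % nproc to release the next thread. A minimal
// stand-alone version with hypothetical names, compiled out:
#if 0
#include <atomic>
#include <thread>

static void example_ordered_enter(std::atomic<int> &turn, int tid) {
  while (turn.load(std::memory_order_acquire) != tid)
    std::this_thread::yield(); // not our turn yet
}

static void example_ordered_exit(std::atomic<int> &turn, int tid, int nproc) {
  turn.store((tid + 1) % nproc, std::memory_order_release); // pass the baton
}
#endif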
698 
699 /* ------------------------------------------------------------------------ */
700 /* The BARRIER for a SINGLE process section is always explicit */
701 
702 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
703  int status;
704  kmp_info_t *th;
705  kmp_team_t *team;
706 
707  if (!TCR_4(__kmp_init_parallel))
708  __kmp_parallel_initialize();
709  __kmp_resume_if_soft_paused();
710 
711  th = __kmp_threads[gtid];
712  team = th->th.th_team;
713  status = 0;
714 
715  th->th.th_ident = id_ref;
716 
717  if (team->t.t_serialized) {
718  status = 1;
719  } else {
720  kmp_int32 old_this = th->th.th_local.this_construct;
721 
722  ++th->th.th_local.this_construct;
723  /* try to set team count to thread count--success means thread got the
724  single block */
725  /* TODO: Should this be acquire or release? */
726  if (team->t.t_construct == old_this) {
727  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
728  th->th.th_local.this_construct);
729  }
730 #if USE_ITT_BUILD
731  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
732  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
733  team->t.t_active_level == 1) {
734  // Only report metadata by primary thread of active team at level 1
735  __kmp_itt_metadata_single(id_ref);
736  }
737 #endif /* USE_ITT_BUILD */
738  }
739 
740  if (__kmp_env_consistency_check) {
741  if (status && push_ws) {
742  __kmp_push_workshare(gtid, ct_psingle, id_ref);
743  } else {
744  __kmp_check_workshare(gtid, ct_psingle, id_ref);
745  }
746  }
747 #if USE_ITT_BUILD
748  if (status) {
749  __kmp_itt_single_start(gtid);
750  }
751 #endif /* USE_ITT_BUILD */
752  return status;
753 }
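// Illustrative sketch (not part of the runtime): the winner of a SINGLE region
// is picked above with a compare-and-swap on the team's construct counter.
// Every thread advances its private count of constructs seen and tries to move
// the shared counter forward by one; only the thread whose CAS succeeds
// executes the single block. A stand-alone version with hypothetical names,
// compiled out:
#if 0
#include <atomic>

// 'my_count' is this thread's private count of single constructs encountered.
static bool example_enter_single(std::atomic<int> &team_count, int &my_count) {
  int expected = my_count++;
  // Only one thread can advance team_count from 'expected' to 'expected + 1'.
  return team_count.compare_exchange_strong(expected, my_count,
                                            std::memory_order_acq_rel);
}
#endif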
754 
755 void __kmp_exit_single(int gtid) {
756 #if USE_ITT_BUILD
757  __kmp_itt_single_end(gtid);
758 #endif /* USE_ITT_BUILD */
759  if (__kmp_env_consistency_check)
760  __kmp_pop_workshare(gtid, ct_psingle, NULL);
761 }
762 
763 /* Determine whether we can go parallel or must use a serialized parallel
764  * region, and how many threads we can use.
765  * set_nthreads is the number of threads requested for the team.
766  * Returns 1 if we should serialize (or use only one thread),
767  * otherwise the number of threads to use.
768  * The forkjoin lock is held by the caller. */
769 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
770  int master_tid, int set_nthreads,
771  int enter_teams) {
772  int capacity;
773  int new_nthreads;
774  KMP_DEBUG_ASSERT(__kmp_init_serial);
775  KMP_DEBUG_ASSERT(root && parent_team);
776  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
777 
778  // If dyn-var is set, dynamically adjust the number of desired threads,
779  // according to the method specified by dynamic_mode.
780  new_nthreads = set_nthreads;
781  if (!get__dynamic_2(parent_team, master_tid)) {
782  ;
783  }
784 #ifdef USE_LOAD_BALANCE
785  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
786  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
787  if (new_nthreads == 1) {
788  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
789  "reservation to 1 thread\n",
790  master_tid));
791  return 1;
792  }
793  if (new_nthreads < set_nthreads) {
794  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
795  "reservation to %d threads\n",
796  master_tid, new_nthreads));
797  }
798  }
799 #endif /* USE_LOAD_BALANCE */
800  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
801  new_nthreads = __kmp_avail_proc - __kmp_nth +
802  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
803  if (new_nthreads <= 1) {
804  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
805  "reservation to 1 thread\n",
806  master_tid));
807  return 1;
808  }
809  if (new_nthreads < set_nthreads) {
810  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
811  "reservation to %d threads\n",
812  master_tid, new_nthreads));
813  } else {
814  new_nthreads = set_nthreads;
815  }
816  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
817  if (set_nthreads > 2) {
818  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
819  new_nthreads = (new_nthreads % set_nthreads) + 1;
820  if (new_nthreads == 1) {
821  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
822  "reservation to 1 thread\n",
823  master_tid));
824  return 1;
825  }
826  if (new_nthreads < set_nthreads) {
827  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
828  "reservation to %d threads\n",
829  master_tid, new_nthreads));
830  }
831  }
832  } else {
833  KMP_ASSERT(0);
834  }
835 
836  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
837  if (__kmp_nth + new_nthreads -
838  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
839  __kmp_max_nth) {
840  int tl_nthreads = __kmp_max_nth - __kmp_nth +
841  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
842  if (tl_nthreads <= 0) {
843  tl_nthreads = 1;
844  }
845 
846  // If dyn-var is false, emit a 1-time warning.
847  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
848  __kmp_reserve_warn = 1;
849  __kmp_msg(kmp_ms_warning,
850  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
851  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
852  }
853  if (tl_nthreads == 1) {
854  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
855  "reduced reservation to 1 thread\n",
856  master_tid));
857  return 1;
858  }
859  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
860  "reservation to %d threads\n",
861  master_tid, tl_nthreads));
862  new_nthreads = tl_nthreads;
863  }
864 
865  // Respect OMP_THREAD_LIMIT
866  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
867  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
868  if (cg_nthreads + new_nthreads -
869  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870  max_cg_threads) {
871  int tl_nthreads = max_cg_threads - cg_nthreads +
872  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
873  if (tl_nthreads <= 0) {
874  tl_nthreads = 1;
875  }
876 
877  // If dyn-var is false, emit a 1-time warning.
878  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
879  __kmp_reserve_warn = 1;
880  __kmp_msg(kmp_ms_warning,
881  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
882  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
883  }
884  if (tl_nthreads == 1) {
885  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
886  "reduced reservation to 1 thread\n",
887  master_tid));
888  return 1;
889  }
890  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
891  "reservation to %d threads\n",
892  master_tid, tl_nthreads));
893  new_nthreads = tl_nthreads;
894  }
895 
896  // Check if the threads array is large enough, or needs expanding.
897  // See comment in __kmp_register_root() about the adjustment if
898  // __kmp_threads[0] == NULL.
899  capacity = __kmp_threads_capacity;
900  if (TCR_PTR(__kmp_threads[0]) == NULL) {
901  --capacity;
902  }
903  // If it is not for initializing the hidden helper team, we need to take
904  // __kmp_hidden_helper_threads_num out of the capacity because it is included
905  // in __kmp_threads_capacity.
906  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
907  capacity -= __kmp_hidden_helper_threads_num;
908  }
909  if (__kmp_nth + new_nthreads -
910  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
911  capacity) {
912  // Expand the threads array.
913  int slotsRequired = __kmp_nth + new_nthreads -
914  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
915  capacity;
916  int slotsAdded = __kmp_expand_threads(slotsRequired);
917  if (slotsAdded < slotsRequired) {
918  // The threads array was not expanded enough.
919  new_nthreads -= (slotsRequired - slotsAdded);
920  KMP_ASSERT(new_nthreads >= 1);
921 
922  // If dyn-var is false, emit a 1-time warning.
923  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
924  __kmp_reserve_warn = 1;
925  if (__kmp_tp_cached) {
926  __kmp_msg(kmp_ms_warning,
927  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
928  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
929  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
930  } else {
931  __kmp_msg(kmp_ms_warning,
932  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
933  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
934  }
935  }
936  }
937  }
938 
939 #ifdef KMP_DEBUG
940  if (new_nthreads == 1) {
941  KC_TRACE(10,
942  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
943  "dead roots and rechecking; requested %d threads\n",
944  __kmp_get_gtid(), set_nthreads));
945  } else {
946  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
947  " %d threads\n",
948  __kmp_get_gtid(), new_nthreads, set_nthreads));
949  }
950 #endif // KMP_DEBUG
951 
952  if (this_thr->th.th_nt_strict && new_nthreads < set_nthreads) {
953  __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
954  this_thr->th.th_nt_msg);
955  }
956  return new_nthreads;
957 }
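// Illustrative sketch (not part of the runtime): each limit applied above has
// the same shape. The requested team size is reduced so that the number of
// live threads, minus the threads this root already contributes, stays within
// a cap, and a result of zero or less collapses to a single (serial) thread.
// A hypothetical helper showing that clamp, compiled out:
#if 0
static int example_clamp_team_size(int requested, int already_counted,
                                   int currently_alive, int cap) {
  int allowed = cap - currently_alive + already_counted;
  if (allowed <= 0)
    allowed = 1; // never go below one thread; the caller serializes then
  return requested < allowed ? requested : allowed;
}
#endif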
958 
959 /* Allocate threads from the thread pool and assign them to the new team. We
960  are assured that there are enough threads available, because we checked on
961  that earlier within the forkjoin critical section. */
962 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
963  kmp_info_t *master_th, int master_gtid,
964  int fork_teams_workers) {
965  int i;
966  int use_hot_team;
967 
968  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
969  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
970  KMP_MB();
971 
972  /* first, let's setup the primary thread */
973  master_th->th.th_info.ds.ds_tid = 0;
974  master_th->th.th_team = team;
975  master_th->th.th_team_nproc = team->t.t_nproc;
976  master_th->th.th_team_master = master_th;
977  master_th->th.th_team_serialized = FALSE;
978  master_th->th.th_dispatch = &team->t.t_dispatch[0];
979 
980  /* make sure we are not the optimized hot team */
981  use_hot_team = 0;
982  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
983  if (hot_teams) { // hot teams array is not allocated if
984  // KMP_HOT_TEAMS_MAX_LEVEL=0
985  int level = team->t.t_active_level - 1; // index in array of hot teams
986  if (master_th->th.th_teams_microtask) { // are we inside the teams?
987  if (master_th->th.th_teams_size.nteams > 1) {
988  ++level; // level was not increased in teams construct for
989  // team_of_masters
990  }
991  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
992  master_th->th.th_teams_level == team->t.t_level) {
993  ++level; // level was not increased in teams construct for
994  // team_of_workers before the parallel
995  } // team->t.t_level will be increased inside parallel
996  }
997  if (level < __kmp_hot_teams_max_level) {
998  if (hot_teams[level].hot_team) {
999  // hot team has already been allocated for given level
1000  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1001  use_hot_team = 1; // the team is ready to use
1002  } else {
1003  use_hot_team = 0; // AC: threads are not allocated yet
1004  hot_teams[level].hot_team = team; // remember new hot team
1005  hot_teams[level].hot_team_nth = team->t.t_nproc;
1006  }
1007  } else {
1008  use_hot_team = 0;
1009  }
1010  }
1011  if (!use_hot_team) {
1012 
1013  /* install the primary thread */
1014  team->t.t_threads[0] = master_th;
1015  __kmp_initialize_info(master_th, team, 0, master_gtid);
1016 
1017  /* now, install the worker threads */
1018  for (i = 1; i < team->t.t_nproc; i++) {
1019 
1020  /* fork or reallocate a new thread and install it in team */
1021  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1022  team->t.t_threads[i] = thr;
1023  KMP_DEBUG_ASSERT(thr);
1024  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1025  /* align team and thread arrived states */
1026  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1027  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1028  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1029  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1030  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1031  team->t.t_bar[bs_plain_barrier].b_arrived));
1032  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1033  thr->th.th_teams_level = master_th->th.th_teams_level;
1034  thr->th.th_teams_size = master_th->th.th_teams_size;
1035  { // Initialize threads' barrier data.
1036  int b;
1037  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1038  for (b = 0; b < bs_last_barrier; ++b) {
1039  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1040  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1041 #if USE_DEBUGGER
1042  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1043 #endif
1044  }
1045  }
1046  }
1047 
1048 #if KMP_AFFINITY_SUPPORTED
1049  // Do not partition the places list for teams construct workers who
1050  // haven't actually been forked to do real work yet. This partitioning
1051  // will take place in the parallel region nested within the teams construct.
1052  if (!fork_teams_workers) {
1053  __kmp_partition_places(team);
1054  }
1055 #endif
1056 
1057  if (team->t.t_nproc > 1 &&
1058  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1059  team->t.b->update_num_threads(team->t.t_nproc);
1060  __kmp_add_threads_to_team(team, team->t.t_nproc);
1061  }
1062  }
1063 
1064  // Take care of primary thread's task state
1065  if (__kmp_tasking_mode != tskm_immediate_exec) {
1066  if (use_hot_team) {
1067  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
1068  KA_TRACE(
1069  20,
1070  ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
1071  "%p, new task_team %p / team %p\n",
1072  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
1073  team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
1074  team));
1075 
1076  // Store primary thread's current task state on new team
1077  KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1078  master_th->th.th_task_state);
1079 
1080  // Restore primary thread's task state to hot team's state
1081  // by using thread 1's task state
1082  if (team->t.t_nproc > 1) {
1083  KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
1084  team->t.t_threads[1]->th.th_task_state == 1);
1085  KMP_CHECK_UPDATE(master_th->th.th_task_state,
1086  team->t.t_threads[1]->th.th_task_state);
1087  } else {
1088  master_th->th.th_task_state = 0;
1089  }
1090  } else {
1091  // Store primary thread's current task_state on new team
1092  KMP_CHECK_UPDATE(team->t.t_primary_task_state,
1093  master_th->th.th_task_state);
1094  // Are not using hot team, so set task state to 0.
1095  master_th->th.th_task_state = 0;
1096  }
1097  }
1098 
1099  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1100  for (i = 0; i < team->t.t_nproc; i++) {
1101  kmp_info_t *thr = team->t.t_threads[i];
1102  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1103  thr->th.th_prev_level != team->t.t_level) {
1104  team->t.t_display_affinity = 1;
1105  break;
1106  }
1107  }
1108  }
1109 
1110  KMP_MB();
1111 }
1112 
1113 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1114 // Propagate any changes to the floating point control registers out to the
1115 // team. We try to avoid unnecessary writes to the relevant cache line in the
1116 // team structure, so we don't make changes unless they are needed.
1117 inline static void propagateFPControl(kmp_team_t *team) {
1118  if (__kmp_inherit_fp_control) {
1119  kmp_int16 x87_fpu_control_word;
1120  kmp_uint32 mxcsr;
1121 
1122  // Get primary thread's values of FPU control flags (both X87 and vector)
1123  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1124  __kmp_store_mxcsr(&mxcsr);
1125  mxcsr &= KMP_X86_MXCSR_MASK;
1126 
1127  // There is no point looking at t_fp_control_saved here.
1128  // If it is TRUE, we still have to update the values if they are different
1129  // from those we now have. If it is FALSE we didn't save anything yet, but
1130  // our objective is the same. We have to ensure that the values in the team
1131  // are the same as those we have.
1132  // So, this code achieves what we need whether or not t_fp_control_saved is
1133  // true. By checking whether the value needs updating we avoid unnecessary
1134  // writes that would put the cache-line into a written state, causing all
1135  // threads in the team to have to read it again.
1136  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1137  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1138  // Although we don't use this value, other code in the runtime wants to know
1139  // whether it should restore them. So we must ensure it is correct.
1140  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1141  } else {
1142  // Similarly here. Don't write to this cache-line in the team structure
1143  // unless we have to.
1144  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1145  }
1146 }
1147 
1148 // Do the opposite, setting the hardware registers to the updated values from
1149 // the team.
1150 inline static void updateHWFPControl(kmp_team_t *team) {
1151  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1152  // Only reset the fp control regs if they have been changed in the team
1153  // by the parallel region that we are exiting.
1154  kmp_int16 x87_fpu_control_word;
1155  kmp_uint32 mxcsr;
1156  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1157  __kmp_store_mxcsr(&mxcsr);
1158  mxcsr &= KMP_X86_MXCSR_MASK;
1159 
1160  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1161  __kmp_clear_x87_fpu_status_word();
1162  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1163  }
1164 
1165  if (team->t.t_mxcsr != mxcsr) {
1166  __kmp_load_mxcsr(&team->t.t_mxcsr);
1167  }
1168  }
1169 }
1170 #else
1171 #define propagateFPControl(x) ((void)0)
1172 #define updateHWFPControl(x) ((void)0)
1173 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
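// Illustrative sketch (not part of the runtime): the KMP_CHECK_UPDATE calls
// above use a "compare before store" idiom; skipping the store when the value
// is already correct keeps the shared cache line in a clean state instead of
// forcing every other thread in the team to re-read it. A hypothetical helper
// with the same shape, compiled out:
#if 0
template <typename T>
static inline void example_check_update(T &dst, const T &src) {
  if (dst != src) // write only when the value actually changes
    dst = src;
}
#endif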
1174 
1175 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1176  int realloc); // forward declaration
1177 
1178 /* Run a parallel region that has been serialized, so it runs only in a team
1179  consisting of the single primary thread. */
1180 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1181  kmp_info_t *this_thr;
1182  kmp_team_t *serial_team;
1183 
1184  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1185 
1186  /* Skip all this code for autopar serialized loops since it results in
1187  unacceptable overhead */
1188  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1189  return;
1190 
1191  if (!TCR_4(__kmp_init_parallel))
1192  __kmp_parallel_initialize();
1193  __kmp_resume_if_soft_paused();
1194 
1195  this_thr = __kmp_threads[global_tid];
1196  serial_team = this_thr->th.th_serial_team;
1197 
1198  /* utilize the serialized team held by this thread */
1199  KMP_DEBUG_ASSERT(serial_team);
1200  KMP_MB();
1201 
1202  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1203  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1204  proc_bind = proc_bind_false;
1205  } else if (proc_bind == proc_bind_default) {
1206  // No proc_bind clause was specified, so use the current value
1207  // of proc-bind-var for this parallel region.
1208  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1209  }
1210  // Reset for next parallel region
1211  this_thr->th.th_set_proc_bind = proc_bind_default;
1212 
1213  // OpenMP 6.0 12.1.2 requires the num_threads 'strict' modifier to also have
1214  // effect when parallel execution is disabled by a corresponding if clause
1215  // attached to the parallel directive.
1216  if (this_thr->th.th_nt_strict && this_thr->th.th_set_nproc > 1)
1217  __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
1218  this_thr->th.th_nt_msg);
1219  // Reset num_threads for next parallel region
1220  this_thr->th.th_set_nproc = 0;
1221 
1222 #if OMPT_SUPPORT
1223  ompt_data_t ompt_parallel_data = ompt_data_none;
1224  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1225  if (ompt_enabled.enabled &&
1226  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1227 
1228  ompt_task_info_t *parent_task_info;
1229  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1230 
1231  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1232  if (ompt_enabled.ompt_callback_parallel_begin) {
1233  int team_size = 1;
1234 
1235  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1236  &(parent_task_info->task_data), &(parent_task_info->frame),
1237  &ompt_parallel_data, team_size,
1238  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1239  }
1240  }
1241 #endif // OMPT_SUPPORT
1242 
1243  if (this_thr->th.th_team != serial_team) {
1244  // Nested level will be an index in the nested nthreads array
1245  int level = this_thr->th.th_team->t.t_level;
1246 
1247  if (serial_team->t.t_serialized) {
1248  /* this serial team was already used
1249  TODO: increase performance by making these locks more specific */
1250  kmp_team_t *new_team;
1251 
1252  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1253 
1254  new_team = __kmp_allocate_team(
1255  this_thr->th.th_root, 1, 1,
1256 #if OMPT_SUPPORT
1257  ompt_parallel_data,
1258 #endif
1259  proc_bind, &this_thr->th.th_current_task->td_icvs, 0, NULL);
1260  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1261  KMP_ASSERT(new_team);
1262 
1263  /* setup new serialized team and install it */
1264  new_team->t.t_threads[0] = this_thr;
1265  new_team->t.t_parent = this_thr->th.th_team;
1266  serial_team = new_team;
1267  this_thr->th.th_serial_team = serial_team;
1268 
1269  KF_TRACE(
1270  10,
1271  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1272  global_tid, serial_team));
1273 
1274  /* TODO: the above breaks the requirement that, even if we run out of
1275  resources, we can still guarantee that serialized teams are OK, since we
1276  may need to allocate a new one */
1277  } else {
1278  KF_TRACE(
1279  10,
1280  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1281  global_tid, serial_team));
1282  }
1283 
1284  /* we have to initialize this serial team */
1285  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1286  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1287  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1288  serial_team->t.t_ident = loc;
1289  serial_team->t.t_serialized = 1;
1290  serial_team->t.t_nproc = 1;
1291  serial_team->t.t_parent = this_thr->th.th_team;
1292  if (this_thr->th.th_team->t.t_nested_nth)
1293  serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
1294  else
1295  serial_team->t.t_nested_nth = &__kmp_nested_nth;
1296  // Save previous team's task state on serial team structure
1297  serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
1298  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1299  this_thr->th.th_team = serial_team;
1300  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1301 
1302  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1303  this_thr->th.th_current_task));
1304  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1305  this_thr->th.th_current_task->td_flags.executing = 0;
1306 
1307  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1308 
1309  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1310  implicit task for each serialized task represented by
1311  team->t.t_serialized? */
1312  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1313  &this_thr->th.th_current_task->td_parent->td_icvs);
1314 
1315  // Thread value exists in the nested nthreads array for the next nested
1316  // level
1317  kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1318  if (this_thr->th.th_team->t.t_nested_nth)
1319  nested_nth = this_thr->th.th_team->t.t_nested_nth;
1320  if (nested_nth->used && (level + 1 < nested_nth->used)) {
1321  this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1322  }
1323 
1324  if (__kmp_nested_proc_bind.used &&
1325  (level + 1 < __kmp_nested_proc_bind.used)) {
1326  this_thr->th.th_current_task->td_icvs.proc_bind =
1327  __kmp_nested_proc_bind.bind_types[level + 1];
1328  }
1329 
1330 #if USE_DEBUGGER
1331  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1332 #endif
1333  this_thr->th.th_info.ds.ds_tid = 0;
1334 
1335  /* set thread cache values */
1336  this_thr->th.th_team_nproc = 1;
1337  this_thr->th.th_team_master = this_thr;
1338  this_thr->th.th_team_serialized = 1;
1339  this_thr->th.th_task_team = NULL;
1340  this_thr->th.th_task_state = 0;
1341 
1342  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1343  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1344  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1345 
1346  propagateFPControl(serial_team);
1347 
1348  /* check if we need to allocate dispatch buffers stack */
1349  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1350  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1351  serial_team->t.t_dispatch->th_disp_buffer =
1352  (dispatch_private_info_t *)__kmp_allocate(
1353  sizeof(dispatch_private_info_t));
1354  }
1355  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1356 
1357  KMP_MB();
1358 
1359  } else {
1360  /* this serialized team is already being used,
1361  * that's fine, just add another nested level */
1362  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1363  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1364  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1365  ++serial_team->t.t_serialized;
1366  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1367 
1368  // Nested level will be an index in the nested nthreads array
1369  int level = this_thr->th.th_team->t.t_level;
1370  // Thread value exists in the nested nthreads array for the next nested
1371  // level
1372 
1373  kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
1374  if (serial_team->t.t_nested_nth)
1375  nested_nth = serial_team->t.t_nested_nth;
1376  if (nested_nth->used && (level + 1 < nested_nth->used)) {
1377  this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
1378  }
1379 
1380  serial_team->t.t_level++;
1381  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1382  "of serial team %p to %d\n",
1383  global_tid, serial_team, serial_team->t.t_level));
1384 
1385  /* allocate/push dispatch buffers stack */
1386  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1387  {
1388  dispatch_private_info_t *disp_buffer =
1389  (dispatch_private_info_t *)__kmp_allocate(
1390  sizeof(dispatch_private_info_t));
1391  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1392  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1393  }
1394  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1395 
1396  /* allocate/push task team stack */
1397  __kmp_push_task_team_node(this_thr, serial_team);
1398 
1399  KMP_MB();
1400  }
1401  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1402 
1403  // Perform the display affinity functionality for
1404  // serialized parallel regions
1405  if (__kmp_display_affinity) {
1406  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1407  this_thr->th.th_prev_num_threads != 1) {
1408  // NULL means use the affinity-format-var ICV
1409  __kmp_aux_display_affinity(global_tid, NULL);
1410  this_thr->th.th_prev_level = serial_team->t.t_level;
1411  this_thr->th.th_prev_num_threads = 1;
1412  }
1413  }
1414 
1415  if (__kmp_env_consistency_check)
1416  __kmp_push_parallel(global_tid, NULL);
1417 #if OMPT_SUPPORT
1418  serial_team->t.ompt_team_info.master_return_address = codeptr;
1419  if (ompt_enabled.enabled &&
1420  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1421  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1422  OMPT_GET_FRAME_ADDRESS(0);
1423 
1424  ompt_lw_taskteam_t lw_taskteam;
1425  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1426  &ompt_parallel_data, codeptr);
1427 
1428  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1429  // don't use lw_taskteam after linking. Content was swapped.
1430 
1431  /* OMPT implicit task begin */
1432  if (ompt_enabled.ompt_callback_implicit_task) {
1433  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1434  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1435  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1436  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1437  OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1438  __kmp_tid_from_gtid(global_tid);
1439  }
1440 
1441  /* OMPT state */
1442  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1443  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1444  OMPT_GET_FRAME_ADDRESS(0);
1445  }
1446 #endif
1447 }
1448 
1449 // Test if this fork is for a team closely nested in a teams construct
1450 static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1451  microtask_t microtask, int level,
1452  int teams_level, kmp_va_list ap) {
1453  return (master_th->th.th_teams_microtask && ap &&
1454  microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1455 }
1456 
1457 // Test if this fork is for the teams construct, i.e. to form the outer league
1458 // of teams
1459 static inline bool __kmp_is_entering_teams(int active_level, int level,
1460  int teams_level, kmp_va_list ap) {
1461  return ((ap == NULL && active_level == 0) ||
1462  (ap && teams_level > 0 && teams_level == level));
1463 }
1464 
1465 // AC: This is the start of a parallel region nested inside a teams construct.
1466 // The team is actual (hot); all workers are ready at the fork barrier.
1467 // No lock is needed to initialize the team a bit and then release the workers.
1468 static inline int
1469 __kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1470  kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1471  enum fork_context_e call_context, microtask_t microtask,
1472  launch_t invoker, int master_set_numthreads, int level,
1473 #if OMPT_SUPPORT
1474  ompt_data_t ompt_parallel_data, void *return_address,
1475 #endif
1476  kmp_va_list ap) {
1477  void **argv;
1478  int i;
1479 
1480  parent_team->t.t_ident = loc;
1481  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1482  parent_team->t.t_argc = argc;
1483  argv = (void **)parent_team->t.t_argv;
1484  for (i = argc - 1; i >= 0; --i) {
1485  *argv++ = va_arg(kmp_va_deref(ap), void *);
1486  }
1487  // Increment our nested depth levels, but do not increase the serialization
1488  if (parent_team == master_th->th.th_serial_team) {
1489  // AC: we are in serialized parallel
1490  __kmpc_serialized_parallel(loc, gtid);
1491  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1492 
1493  if (call_context == fork_context_gnu) {
1494  // AC: need to decrement t_serialized for enquiry functions to work
1495  // correctly, will restore at join time
1496  parent_team->t.t_serialized--;
1497  return TRUE;
1498  }
1499 
1500 #if OMPD_SUPPORT
1501  parent_team->t.t_pkfn = microtask;
1502 #endif
1503 
1504 #if OMPT_SUPPORT
1505  void *dummy;
1506  void **exit_frame_p;
1507  ompt_data_t *implicit_task_data;
1508  ompt_lw_taskteam_t lw_taskteam;
1509 
1510  if (ompt_enabled.enabled) {
1511  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1512  &ompt_parallel_data, return_address);
1513  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1514 
1515  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1516  // Don't use lw_taskteam after linking. Content was swapped.
1517 
1518  /* OMPT implicit task begin */
1519  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1520  if (ompt_enabled.ompt_callback_implicit_task) {
1521  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1522  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1523  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1524  1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1525  }
1526 
1527  /* OMPT state */
1528  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1529  } else {
1530  exit_frame_p = &dummy;
1531  }
1532 #endif
1533 
1534  // AC: need to decrement t_serialized for enquiry functions to work
1535  // correctly, will restore at join time
1536  parent_team->t.t_serialized--;
1537 
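  // Invoke the microtask directly on this thread; the region stays serialized,
  // so no worker threads are released here.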
1538  {
1539  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1540  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1541  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1542 #if OMPT_SUPPORT
1543  ,
1544  exit_frame_p
1545 #endif
1546  );
1547  }
1548 
1549 #if OMPT_SUPPORT
1550  if (ompt_enabled.enabled) {
1551  *exit_frame_p = NULL;
1552  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1553  if (ompt_enabled.ompt_callback_implicit_task) {
1554  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1555  ompt_scope_end, NULL, implicit_task_data, 1,
1556  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1557  }
1558  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1559  __ompt_lw_taskteam_unlink(master_th);
1560  if (ompt_enabled.ompt_callback_parallel_end) {
1561  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1562  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1563  OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1564  }
1565  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1566  }
1567 #endif
1568  return TRUE;
1569  }
1570 
1571  parent_team->t.t_pkfn = microtask;
1572  parent_team->t.t_invoke = invoker;
1573  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1574  parent_team->t.t_active_level++;
1575  parent_team->t.t_level++;
1576  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1577 
1578  // If the number of threads allocated to the team is less than the thread
1579  // limit, update the thread limit here. th_teams_size.nth is specific to this
1580  // team nested in a teams construct; the team is fully created, and we are
1581  // about to do the actual fork. Doing the update here ensures the subsequent
1582  // uses below and in the join see the correct value.
1583  master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1584 
1585 #if OMPT_SUPPORT
1586  if (ompt_enabled.enabled) {
1587  ompt_lw_taskteam_t lw_taskteam;
1588  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1589  return_address);
1590  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1591  }
1592 #endif
1593 
1594  /* Change number of threads in the team if requested */
1595  if (master_set_numthreads) { // The parallel has num_threads clause
1596  if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1597  // AC: we can only reduce the number of threads dynamically; we cannot increase it
1598  kmp_info_t **other_threads = parent_team->t.t_threads;
1599  // NOTE: if using distributed barrier, we need to run this code block
1600  // even when the team size appears not to have changed from the max.
1601  int old_proc = master_th->th.th_teams_size.nth;
1602  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1603  __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1604  __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1605  }
1606  parent_team->t.t_nproc = master_set_numthreads;
1607  for (i = 0; i < master_set_numthreads; ++i) {
1608  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1609  }
1610  }
1611  // Keep extra threads hot in the team for possible next parallels
1612  master_th->th.th_set_nproc = 0;
1613  }
1614 
1615 #if USE_DEBUGGER
1616  if (__kmp_debugging) { // Let debugger override number of threads.
1617  int nth = __kmp_omp_num_threads(loc);
1618  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1619  master_set_numthreads = nth;
1620  }
1621  }
1622 #endif
1623 
1624  // Figure out the proc_bind policy for the nested parallel within teams
1625  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1626  // proc_bind_default means don't update
1627  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1628  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1629  proc_bind = proc_bind_false;
1630  } else {
1631  // No proc_bind clause specified; use current proc-bind-var
1632  if (proc_bind == proc_bind_default) {
1633  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1634  }
1635  /* else: The proc_bind policy was specified explicitly on parallel clause.
1636  This overrides proc-bind-var for this parallel region, but does not
1637  change proc-bind-var. */
1638  // Figure the value of proc-bind-var for the child threads.
1639  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1640  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1641  master_th->th.th_current_task->td_icvs.proc_bind)) {
1642  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1643  }
1644  }
1645  KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1646  // Need to change the bind-var ICV to the correct value for each implicit task
1647  if (proc_bind_icv != proc_bind_default &&
1648  master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1649  kmp_info_t **other_threads = parent_team->t.t_threads;
1650  for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1651  other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1652  }
1653  }
1654  // Reset for next parallel region
1655  master_th->th.th_set_proc_bind = proc_bind_default;
1656 
1657 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1658  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1659  KMP_ITT_DEBUG) &&
1660  __kmp_forkjoin_frames_mode == 3 &&
1661  parent_team->t.t_active_level == 1 // only report frames at level 1
1662  && master_th->th.th_teams_size.nteams == 1) {
1663  kmp_uint64 tmp_time = __itt_get_timestamp();
1664  master_th->th.th_frame_time = tmp_time;
1665  parent_team->t.t_region_time = tmp_time;
1666  }
1667  if (__itt_stack_caller_create_ptr) {
1668  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1669  // create new stack stitching id before entering fork barrier
1670  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1671  }
1672 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1673 #if KMP_AFFINITY_SUPPORTED
1674  __kmp_partition_places(parent_team);
1675 #endif
1676 
1677  KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1678  "master_th=%p, gtid=%d\n",
1679  root, parent_team, master_th, gtid));
1680  __kmp_internal_fork(loc, gtid, parent_team);
1681  KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1682  "master_th=%p, gtid=%d\n",
1683  root, parent_team, master_th, gtid));
1684 
1685  if (call_context == fork_context_gnu)
1686  return TRUE;
1687 
1688  /* Invoke microtask for PRIMARY thread */
1689  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1690  parent_team->t.t_id, parent_team->t.t_pkfn));
1691 
1692  if (!parent_team->t.t_invoke(gtid)) {
1693  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1694  }
1695  KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1696  parent_team->t.t_id, parent_team->t.t_pkfn));
1697  KMP_MB(); /* Flush all pending memory write invalidates. */
1698 
1699  KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1700 
1701  return TRUE;
1702 }
1703 
1704 // Create a serialized parallel region
1705 static inline int
1706 __kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1707  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1708  kmp_info_t *master_th, kmp_team_t *parent_team,
1709 #if OMPT_SUPPORT
1710  ompt_data_t *ompt_parallel_data, void **return_address,
1711  ompt_data_t **parent_task_data,
1712 #endif
1713  kmp_va_list ap) {
1714  kmp_team_t *team;
1715  int i;
1716  void **argv;
1717 
1718 /* josh todo: hypothetical question: what do we do for OS X*? */
1719 #if KMP_OS_LINUX && \
1720  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1721  SimpleVLA<void *> args(argc);
1722 #else
1723  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1724 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1725  KMP_ARCH_AARCH64) */
1726 
1727  KA_TRACE(
1728  20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1729 
1730  __kmpc_serialized_parallel(loc, gtid);
1731 
1732 #if OMPD_SUPPORT
1733  master_th->th.th_serial_team->t.t_pkfn = microtask;
1734 #endif
1735 
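  // Three entry flavors below: the Intel path copies the arguments and invokes
  // the microtask here, while the GNU path returns FALSE so that the caller
  // invokes the microtask itself.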
1736  if (call_context == fork_context_intel) {
1737  /* TODO this sucks, use the compiler itself to pass args! :) */
1738  master_th->th.th_serial_team->t.t_ident = loc;
1739  if (!ap) {
1740  // revert change made in __kmpc_serialized_parallel()
1741  master_th->th.th_serial_team->t.t_level--;
1742 // Get args from parent team for teams construct
1743 
1744 #if OMPT_SUPPORT
1745  void *dummy;
1746  void **exit_frame_p;
1747  ompt_task_info_t *task_info;
1748  ompt_lw_taskteam_t lw_taskteam;
1749 
1750  if (ompt_enabled.enabled) {
1751  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1752  ompt_parallel_data, *return_address);
1753 
1754  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1755  // Don't use lw_taskteam after linking. Content was swapped.
1756  task_info = OMPT_CUR_TASK_INFO(master_th);
1757  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1758  if (ompt_enabled.ompt_callback_implicit_task) {
1759  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1760  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1761  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1762  &(task_info->task_data), 1,
1763  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1764  }
1765 
1766  /* OMPT state */
1767  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1768  } else {
1769  exit_frame_p = &dummy;
1770  }
1771 #endif
1772 
1773  {
1774  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1775  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1776  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1777 #if OMPT_SUPPORT
1778  ,
1779  exit_frame_p
1780 #endif
1781  );
1782  }
1783 
1784 #if OMPT_SUPPORT
1785  if (ompt_enabled.enabled) {
1786  *exit_frame_p = NULL;
1787  if (ompt_enabled.ompt_callback_implicit_task) {
1788  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1789  ompt_scope_end, NULL, &(task_info->task_data), 1,
1790  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1791  }
1792  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1793  __ompt_lw_taskteam_unlink(master_th);
1794  if (ompt_enabled.ompt_callback_parallel_end) {
1795  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1796  ompt_parallel_data, *parent_task_data,
1797  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1798  }
1799  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1800  }
1801 #endif
1802  } else if (microtask == (microtask_t)__kmp_teams_master) {
1803  KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1804  team = master_th->th.th_team;
1805  // team->t.t_pkfn = microtask;
1806  team->t.t_invoke = invoker;
1807  __kmp_alloc_argv_entries(argc, team, TRUE);
1808  team->t.t_argc = argc;
1809  argv = (void **)team->t.t_argv;
1810  for (i = argc - 1; i >= 0; --i)
1811  *argv++ = va_arg(kmp_va_deref(ap), void *);
1812  // AC: revert change made in __kmpc_serialized_parallel()
1813  // because initial code in teams should have level=0
1814  team->t.t_level--;
1815  // AC: call special invoker for outer "parallel" of teams construct
1816  invoker(gtid);
1817 #if OMPT_SUPPORT
1818  if (ompt_enabled.enabled) {
1819  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1820  if (ompt_enabled.ompt_callback_implicit_task) {
1821  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1822  ompt_scope_end, NULL, &(task_info->task_data), 0,
1823  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1824  }
1825  if (ompt_enabled.ompt_callback_parallel_end) {
1826  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1827  ompt_parallel_data, *parent_task_data,
1828  OMPT_INVOKER(call_context) | ompt_parallel_league,
1829  *return_address);
1830  }
1831  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1832  }
1833 #endif
1834  } else {
1835  argv = args;
1836  for (i = argc - 1; i >= 0; --i)
1837  *argv++ = va_arg(kmp_va_deref(ap), void *);
1838  KMP_MB();
1839 
1840 #if OMPT_SUPPORT
1841  void *dummy;
1842  void **exit_frame_p;
1843  ompt_task_info_t *task_info;
1844  ompt_lw_taskteam_t lw_taskteam;
1845  ompt_data_t *implicit_task_data;
1846 
1847  if (ompt_enabled.enabled) {
1848  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1849  ompt_parallel_data, *return_address);
1850  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1851  // Don't use lw_taskteam after linking. Content was swapped.
1852  task_info = OMPT_CUR_TASK_INFO(master_th);
1853  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1854 
1855  /* OMPT implicit task begin */
1856  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1857  if (ompt_enabled.ompt_callback_implicit_task) {
1858  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1859  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1860  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1861  ompt_task_implicit);
1862  OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1863  }
1864 
1865  /* OMPT state */
1866  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1867  } else {
1868  exit_frame_p = &dummy;
1869  }
1870 #endif
1871 
1872  {
1873  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1874  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1875  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1876 #if OMPT_SUPPORT
1877  ,
1878  exit_frame_p
1879 #endif
1880  );
1881  }
1882 
1883 #if OMPT_SUPPORT
1884  if (ompt_enabled.enabled) {
1885  *exit_frame_p = NULL;
1886  if (ompt_enabled.ompt_callback_implicit_task) {
1887  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1888  ompt_scope_end, NULL, &(task_info->task_data), 1,
1889  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1890  }
1891 
1892  *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1893  __ompt_lw_taskteam_unlink(master_th);
1894  if (ompt_enabled.ompt_callback_parallel_end) {
1895  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1896  ompt_parallel_data, *parent_task_data,
1897  OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1898  }
1899  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1900  }
1901 #endif
1902  }
1903  } else if (call_context == fork_context_gnu) {
1904 #if OMPT_SUPPORT
1905  if (ompt_enabled.enabled) {
1906  ompt_lw_taskteam_t lwt;
1907  __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1908  *return_address);
1909 
1910  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1911  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1912  }
1913 // Don't use lw_taskteam after linking. Content was swapped.
1914 #endif
1915 
1916  // we were called from GNU native code
1917  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1918  return FALSE;
1919  } else {
1920  KMP_ASSERT2(call_context < fork_context_last,
1921  "__kmp_serial_fork_call: unknown fork_context parameter");
1922  }
1923 
1924  KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1925  KMP_MB();
1926  return FALSE;
1927 }
1928 
1929 /* most of the work for a fork */
1930 /* return true if we really went parallel, false if serialized */
1931 int __kmp_fork_call(ident_t *loc, int gtid,
1932  enum fork_context_e call_context, // Intel, GNU, ...
1933  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1934  kmp_va_list ap) {
1935  void **argv;
1936  int i;
1937  int master_tid;
1938  int master_this_cons;
1939  kmp_team_t *team;
1940  kmp_team_t *parent_team;
1941  kmp_info_t *master_th;
1942  kmp_root_t *root;
1943  int nthreads;
1944  int master_active;
1945  int master_set_numthreads;
1946  int task_thread_limit = 0;
1947  int level;
1948  int active_level;
1949  int teams_level;
1950  kmp_hot_team_ptr_t **p_hot_teams;
1951  { // KMP_TIME_BLOCK
1952  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1953  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1954 
1955  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1956  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1957  /* Some systems prefer the stack for the root thread(s) to start with */
1958  /* some gap from the parent stack to prevent false sharing. */
1959  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1960  /* These 2 lines below are so this does not get optimized out */
1961  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1962  __kmp_stkpadding += (short)((kmp_int64)dummy);
1963  }
1964 
1965  /* initialize if needed */
1966  KMP_DEBUG_ASSERT(
1967  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1968  if (!TCR_4(__kmp_init_parallel))
1969  __kmp_parallel_initialize();
1970  __kmp_resume_if_soft_paused();
1971 
1972  /* setup current data */
1973  // AC: potentially unsafe, not in sync with library shutdown,
1974  // __kmp_threads can be freed
1975  master_th = __kmp_threads[gtid];
1976 
1977  parent_team = master_th->th.th_team;
1978  master_tid = master_th->th.th_info.ds.ds_tid;
1979  master_this_cons = master_th->th.th_local.this_construct;
1980  root = master_th->th.th_root;
1981  master_active = root->r.r_active;
1982  master_set_numthreads = master_th->th.th_set_nproc;
1983  task_thread_limit =
1984  master_th->th.th_current_task->td_icvs.task_thread_limit;
1985 
1986 #if OMPT_SUPPORT
1987  ompt_data_t ompt_parallel_data = ompt_data_none;
1988  ompt_data_t *parent_task_data = NULL;
1989  ompt_frame_t *ompt_frame = NULL;
1990  void *return_address = NULL;
1991 
1992  if (ompt_enabled.enabled) {
1993  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1994  NULL, NULL);
1995  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1996  }
1997 #endif
1998 
1999  // Assign affinity to root thread if it hasn't happened yet
2000  __kmp_assign_root_init_mask();
2001 
2002  // Nested level will be an index in the nested nthreads array
2003  level = parent_team->t.t_level;
2004  // used to launch non-serial teams even if nesting is not allowed
2005  active_level = parent_team->t.t_active_level;
2006  // needed to check nesting inside the teams
2007  teams_level = master_th->th.th_teams_level;
2008  p_hot_teams = &master_th->th.th_hot_teams;
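  // Lazily allocate this thread's hot-teams array on first use when hot teams
  // are enabled (__kmp_hot_teams_max_level > 0).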
2009  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
2010  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
2011  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
2012  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
2013  // the size is either the actual one or not needed (when active_level > 0)
2014  (*p_hot_teams)[0].hot_team_nth = 1;
2015  }
2016 
2017 #if OMPT_SUPPORT
2018  if (ompt_enabled.enabled) {
2019  if (ompt_enabled.ompt_callback_parallel_begin) {
2020  int team_size = master_set_numthreads
2021  ? master_set_numthreads
2022  : get__nproc_2(parent_team, master_tid);
2023  int flags = OMPT_INVOKER(call_context) |
2024  ((microtask == (microtask_t)__kmp_teams_master)
2025  ? ompt_parallel_league
2026  : ompt_parallel_team);
2027  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
2028  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
2029  return_address);
2030  }
2031  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2032  }
2033 #endif
2034 
2035  master_th->th.th_ident = loc;
2036 
2037  // Parallel closely nested in teams construct:
2038  if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
2039  return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
2040  call_context, microtask, invoker,
2041  master_set_numthreads, level,
2042 #if OMPT_SUPPORT
2043  ompt_parallel_data, return_address,
2044 #endif
2045  ap);
2046  } // End parallel closely nested in teams construct
2047 
2048  // Need this to happen before we determine the number of threads, not while
2049  // we are allocating the team
2050  //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
2051 
2052  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
2053 
2054  // Determine the number of threads
2055  int enter_teams =
2056  __kmp_is_entering_teams(active_level, level, teams_level, ap);
2057  if ((!enter_teams &&
2058  (parent_team->t.t_active_level >=
2059  master_th->th.th_current_task->td_icvs.max_active_levels)) ||
2060  (__kmp_library == library_serial)) {
2061  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
2062  nthreads = 1;
2063  } else {
2064  nthreads = master_set_numthreads
2065  ? master_set_numthreads
2066  // TODO: get nproc directly from current task
2067  : get__nproc_2(parent_team, master_tid);
2068  // Use the thread_limit set for the current target task if exists, else go
2069  // with the deduced nthreads
2070  nthreads = task_thread_limit > 0 && task_thread_limit < nthreads
2071  ? task_thread_limit
2072  : nthreads;
2073  // Check whether we need to take the forkjoin lock (there is no need for a
2074  // serialized parallel region outside of a teams construct).
2075  if (nthreads > 1) {
2076  /* determine how many new threads we can use */
2077  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2078  /* AC: If we execute teams from parallel region (on host), then teams
2079  should be created but each can only have 1 thread if nesting is
2080  disabled. If teams called from serial region, then teams and their
2081  threads should be created regardless of the nesting setting. */
2082  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2083  nthreads, enter_teams);
2084  if (nthreads == 1) {
2085  // Free lock for single thread execution here; for multi-thread
2086  // execution it will be freed later after team of threads created
2087  // and initialized
2088  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2089  }
2090  }
2091  }
2092  KMP_DEBUG_ASSERT(nthreads > 0);
2093 
2094  // If we temporarily changed the set number of threads then restore it now
2095  master_th->th.th_set_nproc = 0;
2096 
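  // With only one thread the region is executed serially; hand off to the
  // serial fork path.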
2097  if (nthreads == 1) {
2098  return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2099  invoker, master_th, parent_team,
2100 #if OMPT_SUPPORT
2101  &ompt_parallel_data, &return_address,
2102  &parent_task_data,
2103 #endif
2104  ap);
2105  } // if (nthreads == 1)
2106 
2107  // GEH: only modify the executing flag in the case when not serialized;
2108  // the serialized case is handled in __kmpc_serialized_parallel
2109  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2110  "curtask=%p, curtask_max_aclevel=%d\n",
2111  parent_team->t.t_active_level, master_th,
2112  master_th->th.th_current_task,
2113  master_th->th.th_current_task->td_icvs.max_active_levels));
2114  // TODO: GEH - cannot do this assertion because root thread not set up as
2115  // executing
2116  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2117  master_th->th.th_current_task->td_flags.executing = 0;
2118 
2119  if (!master_th->th.th_teams_microtask || level > teams_level) {
2120  /* Increment our nested depth level */
2121  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2122  }
2123 
2124  // See if we need to make a copy of the ICVs.
2125  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2126  kmp_nested_nthreads_t *nested_nth = NULL;
2127  if (!master_th->th.th_set_nested_nth &&
2128  (level + 1 < parent_team->t.t_nested_nth->used) &&
2129  (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
2130  nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
2131  } else if (master_th->th.th_set_nested_nth) {
2132  nested_nth = __kmp_override_nested_nth(master_th, level);
2133  if ((level + 1 < nested_nth->used) &&
2134  (nested_nth->nth[level + 1] != nthreads_icv))
2135  nthreads_icv = nested_nth->nth[level + 1];
2136  else
2137  nthreads_icv = 0; // don't update
2138  } else {
2139  nthreads_icv = 0; // don't update
2140  }
2141 
2142  // Figure out the proc_bind_policy for the new team.
2143  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2144  // proc_bind_default means don't update
2145  kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2146  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2147  proc_bind = proc_bind_false;
2148  } else {
2149  // No proc_bind clause specified; use current proc-bind-var for this
2150  // parallel region
2151  if (proc_bind == proc_bind_default) {
2152  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2153  }
2154  // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2155  if (master_th->th.th_teams_microtask &&
2156  microtask == (microtask_t)__kmp_teams_master) {
2157  proc_bind = __kmp_teams_proc_bind;
2158  }
2159  /* else: The proc_bind policy was specified explicitly on parallel clause.
2160  This overrides proc-bind-var for this parallel region, but does not
2161  change proc-bind-var. */
2162  // Figure the value of proc-bind-var for the child threads.
2163  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2164  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2165  master_th->th.th_current_task->td_icvs.proc_bind)) {
2166  // Do not modify the proc-bind ICV for the two teams construct forks;
2167  // they just let the proc-bind ICV pass through.
2168  if (!master_th->th.th_teams_microtask ||
2169  !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2170  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2171  }
2172  }
2173 
2174  // Reset for next parallel region
2175  master_th->th.th_set_proc_bind = proc_bind_default;
2176 
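  // If either the nproc or proc-bind ICV must be overridden for the new team,
  // build a private ICV set and allocate the team from it; otherwise the team
  // inherits the current task's ICVs.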
2177  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2178  kmp_internal_control_t new_icvs;
2179  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2180  new_icvs.next = NULL;
2181  if (nthreads_icv > 0) {
2182  new_icvs.nproc = nthreads_icv;
2183  }
2184  if (proc_bind_icv != proc_bind_default) {
2185  new_icvs.proc_bind = proc_bind_icv;
2186  }
2187 
2188  /* allocate a new parallel team */
2189  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2190  team = __kmp_allocate_team(root, nthreads, nthreads,
2191 #if OMPT_SUPPORT
2192  ompt_parallel_data,
2193 #endif
2194  proc_bind, &new_icvs, argc, master_th);
2195  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2196  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2197  } else {
2198  /* allocate a new parallel team */
2199  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2200  team = __kmp_allocate_team(
2201  root, nthreads, nthreads,
2202 #if OMPT_SUPPORT
2203  ompt_parallel_data,
2204 #endif
2205  proc_bind, &master_th->th.th_current_task->td_icvs, argc, master_th);
2206  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2207  copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2208  &master_th->th.th_current_task->td_icvs);
2209  }
2210  KF_TRACE(
2211  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2212 
2213  /* setup the new team */
2214  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2215  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2216  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2217  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2218  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2219 #if OMPT_SUPPORT
2220  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2221  return_address);
2222 #endif
2223  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2224  // TODO: parent_team->t.t_level == INT_MAX ???
2225  if (!master_th->th.th_teams_microtask || level > teams_level) {
2226  int new_level = parent_team->t.t_level + 1;
2227  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2228  new_level = parent_team->t.t_active_level + 1;
2229  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2230  } else {
2231  // AC: Do not increase parallel level at start of the teams construct
2232  int new_level = parent_team->t.t_level;
2233  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2234  new_level = parent_team->t.t_active_level;
2235  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2236  }
2237  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2238  // set primary thread's schedule as new run-time schedule
2239  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2240 
2241  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2242  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2243 
2244  // Check if hot team has potentially outdated list, and if so, free it
2245  if (team->t.t_nested_nth &&
2246  team->t.t_nested_nth != parent_team->t.t_nested_nth) {
2247  KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
2248  KMP_INTERNAL_FREE(team->t.t_nested_nth);
2249  team->t.t_nested_nth = NULL;
2250  }
2251  team->t.t_nested_nth = parent_team->t.t_nested_nth;
2252  if (master_th->th.th_set_nested_nth) {
2253  if (!nested_nth)
2254  nested_nth = __kmp_override_nested_nth(master_th, level);
2255  team->t.t_nested_nth = nested_nth;
2256  KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
2257  master_th->th.th_set_nested_nth = NULL;
2258  master_th->th.th_set_nested_nth_sz = 0;
2259  master_th->th.th_nt_strict = false;
2260  }
2261 
2262  // Update the floating point rounding in the team if required.
2263  propagateFPControl(team);
2264 #if OMPD_SUPPORT
2265  if (ompd_state & OMPD_ENABLE_BP)
2266  ompd_bp_parallel_begin();
2267 #endif
2268 
2269  KA_TRACE(
2270  20,
2271  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2272  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2273  team->t.t_nproc));
2274  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2275  (team->t.t_master_tid == 0 &&
2276  (team->t.t_parent == root->r.r_root_team ||
2277  team->t.t_parent->t.t_serialized)));
2278  KMP_MB();
2279 
2280  /* now, setup the arguments */
2281  argv = (void **)team->t.t_argv;
2282  if (ap) {
2283  for (i = argc - 1; i >= 0; --i) {
2284  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2285  KMP_CHECK_UPDATE(*argv, new_argv);
2286  argv++;
2287  }
2288  } else {
2289  for (i = 0; i < argc; ++i) {
2290  // Get args from parent team for teams construct
2291  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2292  }
2293  }
2294 
2295  /* now actually fork the threads */
2296  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2297  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2298  root->r.r_active = TRUE;
2299 
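  // Populate the team with worker threads and arrange for the primary
  // thread's ICVs to be copied to them at the fork barrier.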
2300  __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2301  __kmp_setup_icv_copy(team, nthreads,
2302  &master_th->th.th_current_task->td_icvs, loc);
2303 
2304 #if OMPT_SUPPORT
2305  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2306 #endif
2307 
2308  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2309 
2310 #if USE_ITT_BUILD
2311  if (team->t.t_active_level == 1 // only report frames at level 1
2312  && !master_th->th.th_teams_microtask) { // not in teams construct
2313 #if USE_ITT_NOTIFY
2314  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2315  (__kmp_forkjoin_frames_mode == 3 ||
2316  __kmp_forkjoin_frames_mode == 1)) {
2317  kmp_uint64 tmp_time = 0;
2318  if (__itt_get_timestamp_ptr)
2319  tmp_time = __itt_get_timestamp();
2320  // Internal fork - report frame begin
2321  master_th->th.th_frame_time = tmp_time;
2322  if (__kmp_forkjoin_frames_mode == 3)
2323  team->t.t_region_time = tmp_time;
2324  } else
2325 // only one notification scheme (either "submit" or "forking/joined", not both)
2326 #endif /* USE_ITT_NOTIFY */
2327  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2328  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2329  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2330  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2331  }
2332  }
2333 #endif /* USE_ITT_BUILD */
2334 
2335  /* now go on and do the work */
2336  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2337  KMP_MB();
2338  KF_TRACE(10,
2339  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2340  root, team, master_th, gtid));
2341 
2342 #if USE_ITT_BUILD
2343  if (__itt_stack_caller_create_ptr) {
2344  // create new stack stitching id before entering fork barrier
2345  if (!enter_teams) {
2346  KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2347  team->t.t_stack_id = __kmp_itt_stack_caller_create();
2348  } else if (parent_team->t.t_serialized) {
2349  // keep stack stitching id in the serialized parent_team;
2350  // current team will be used for parallel inside the teams;
2351  // if parent_team is active, then it already keeps stack stitching id
2352  // for the league of teams
2353  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2354  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2355  }
2356  }
2357 #endif /* USE_ITT_BUILD */
2358 
2359  // AC: skip __kmp_internal_fork at teams construct, let only primary
2360  // threads execute
2361  if (ap) {
2362  __kmp_internal_fork(loc, gtid, team);
2363  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2364  "master_th=%p, gtid=%d\n",
2365  root, team, master_th, gtid));
2366  }
2367 
2368  if (call_context == fork_context_gnu) {
2369  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2370  return TRUE;
2371  }
2372 
2373  /* Invoke microtask for PRIMARY thread */
2374  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2375  team->t.t_id, team->t.t_pkfn));
2376  } // END of timer KMP_fork_call block
2377 
2378 #if KMP_STATS_ENABLED
2379  // If beginning a teams construct, then change thread state
2380  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2381  if (!ap) {
2382  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2383  }
2384 #endif
2385 
2386  if (!team->t.t_invoke(gtid)) {
2387  KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2388  }
2389 
2390 #if KMP_STATS_ENABLED
2391  // If was beginning of a teams construct, then reset thread state
2392  if (!ap) {
2393  KMP_SET_THREAD_STATE(previous_state);
2394  }
2395 #endif
2396 
2397  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2398  team->t.t_id, team->t.t_pkfn));
2399  KMP_MB(); /* Flush all pending memory write invalidates. */
2400 
2401  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2402 #if OMPT_SUPPORT
2403  if (ompt_enabled.enabled) {
2404  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2405  }
2406 #endif
2407 
2408  return TRUE;
2409 }
2410 
2411 #if OMPT_SUPPORT
2412 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2413  kmp_team_t *team) {
2414  // restore state outside the region
2415  thread->th.ompt_thread_info.state =
2416  ((team->t.t_serialized) ? ompt_state_work_serial
2417  : ompt_state_work_parallel);
2418 }
2419 
2420 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2421  kmp_team_t *team, ompt_data_t *parallel_data,
2422  int flags, void *codeptr) {
2423  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2424  if (ompt_enabled.ompt_callback_parallel_end) {
2425  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2426  parallel_data, &(task_info->task_data), flags, codeptr);
2427  }
2428 
2429  task_info->frame.enter_frame = ompt_data_none;
2430  __kmp_join_restore_state(thread, team);
2431 }
2432 #endif
2433 
2434 void __kmp_join_call(ident_t *loc, int gtid
2435 #if OMPT_SUPPORT
2436  ,
2437  enum fork_context_e fork_context
2438 #endif
2439  ,
2440  int exit_teams) {
2441  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2442  kmp_team_t *team;
2443  kmp_team_t *parent_team;
2444  kmp_info_t *master_th;
2445  kmp_root_t *root;
2446  int master_active;
2447 
2448  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2449 
2450  /* setup current data */
2451  master_th = __kmp_threads[gtid];
2452  root = master_th->th.th_root;
2453  team = master_th->th.th_team;
2454  parent_team = team->t.t_parent;
2455 
2456  master_th->th.th_ident = loc;
2457 
2458 #if OMPT_SUPPORT
2459  void *team_microtask = (void *)team->t.t_pkfn;
2460  // For GOMP interface with serialized parallel, need the
2461  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2462  // and end-parallel events.
2463  if (ompt_enabled.enabled &&
2464  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2465  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2466  }
2467 #endif
2468 
2469 #if KMP_DEBUG
2470  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2471  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2472  "th_task_team = %p\n",
2473  __kmp_gtid_from_thread(master_th), team,
2474  team->t.t_task_team[master_th->th.th_task_state],
2475  master_th->th.th_task_team));
2476  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
2477  }
2478 #endif
2479 
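  // Serialized region: undo the fork-time bookkeeping and return early;
  // there is no join barrier to wait on.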
2480  if (team->t.t_serialized) {
2481  if (master_th->th.th_teams_microtask) {
2482  // We are in teams construct
2483  int level = team->t.t_level;
2484  int tlevel = master_th->th.th_teams_level;
2485  if (level == tlevel) {
2486  // AC: we haven't incremented it earlier at start of teams construct,
2487  // so do it here - at the end of teams construct
2488  team->t.t_level++;
2489  } else if (level == tlevel + 1) {
2490  // AC: we are exiting parallel inside teams, need to increment
2491  // serialization in order to restore it in the next call to
2492  // __kmpc_end_serialized_parallel
2493  team->t.t_serialized++;
2494  }
2495  }
2496  __kmpc_end_serialized_parallel(loc, gtid);
2497 
2498 #if OMPT_SUPPORT
2499  if (ompt_enabled.enabled) {
2500  if (fork_context == fork_context_gnu) {
2501  __ompt_lw_taskteam_unlink(master_th);
2502  }
2503  __kmp_join_restore_state(master_th, parent_team);
2504  }
2505 #endif
2506 
2507  return;
2508  }
2509 
2510  master_active = team->t.t_master_active;
2511 
2512  if (!exit_teams) {
2513  // AC: No barrier for internal teams at exit from teams construct.
2514  // But there is barrier for external team (league).
2515  __kmp_internal_join(loc, gtid, team);
2516 #if USE_ITT_BUILD
2517  if (__itt_stack_caller_create_ptr) {
2518  KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2519  // destroy the stack stitching id after join barrier
2520  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2521  team->t.t_stack_id = NULL;
2522  }
2523 #endif
2524  } else {
2525  master_th->th.th_task_state =
2526  0; // AC: no tasking in teams (out of any parallel)
2527 #if USE_ITT_BUILD
2528  if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2529  KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2530  // destroy the stack stitching id on exit from the teams construct
2531  // if parent_team is active, then the id will be destroyed later on
2532  // by master of the league of teams
2533  __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2534  parent_team->t.t_stack_id = NULL;
2535  }
2536 #endif
2537  }
2538 
2539  KMP_MB();
2540 
2541 #if OMPT_SUPPORT
2542  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2543  void *codeptr = team->t.ompt_team_info.master_return_address;
2544 #endif
2545 
2546 #if USE_ITT_BUILD
2547  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2548  if (team->t.t_active_level == 1 &&
2549  (!master_th->th.th_teams_microtask || /* not in teams construct */
2550  master_th->th.th_teams_size.nteams == 1)) {
2551  master_th->th.th_ident = loc;
2552  // only one notification scheme (either "submit" or "forking/joined", not
2553  // both)
2554  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2555  __kmp_forkjoin_frames_mode == 3)
2556  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2557  master_th->th.th_frame_time, 0, loc,
2558  master_th->th.th_team_nproc, 1);
2559  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2560  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2561  __kmp_itt_region_joined(gtid);
2562  } // active_level == 1
2563 #endif /* USE_ITT_BUILD */
2564 
2565 #if KMP_AFFINITY_SUPPORTED
2566  if (!exit_teams) {
2567  // Restore master thread's partition.
2568  master_th->th.th_first_place = team->t.t_first_place;
2569  master_th->th.th_last_place = team->t.t_last_place;
2570  }
2571 #endif // KMP_AFFINITY_SUPPORTED
2572 
2573  if (master_th->th.th_teams_microtask && !exit_teams &&
2574  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2575  team->t.t_level == master_th->th.th_teams_level + 1) {
2576 // AC: We need to leave the team structure intact at the end of parallel
2577 // inside the teams construct, so that at the next parallel same (hot) team
2578 // works, only adjust nesting levels
2579 #if OMPT_SUPPORT
2580  ompt_data_t ompt_parallel_data = ompt_data_none;
2581  if (ompt_enabled.enabled) {
2582  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2583  if (ompt_enabled.ompt_callback_implicit_task) {
2584  int ompt_team_size = team->t.t_nproc;
2585  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2586  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2587  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2588  }
2589  task_info->frame.exit_frame = ompt_data_none;
2590  task_info->task_data = ompt_data_none;
2591  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2592  __ompt_lw_taskteam_unlink(master_th);
2593  }
2594 #endif
2595  /* Decrement our nested depth level */
2596  team->t.t_level--;
2597  team->t.t_active_level--;
2598  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2599 
2600  // Restore number of threads in the team if needed. This code relies on
2601  // the proper adjustment of th_teams_size.nth after the fork in
2602  // __kmp_teams_master on each teams primary thread in the case that
2603  // __kmp_reserve_threads reduced it.
2604  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2605  int old_num = master_th->th.th_team_nproc;
2606  int new_num = master_th->th.th_teams_size.nth;
2607  kmp_info_t **other_threads = team->t.t_threads;
2608  team->t.t_nproc = new_num;
2609  for (int i = 0; i < old_num; ++i) {
2610  other_threads[i]->th.th_team_nproc = new_num;
2611  }
2612  // Adjust states of non-used threads of the team
2613  for (int i = old_num; i < new_num; ++i) {
2614  // Re-initialize thread's barrier data.
2615  KMP_DEBUG_ASSERT(other_threads[i]);
2616  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2617  for (int b = 0; b < bs_last_barrier; ++b) {
2618  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2619  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2620 #if USE_DEBUGGER
2621  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2622 #endif
2623  }
2624  if (__kmp_tasking_mode != tskm_immediate_exec) {
2625  // Synchronize thread's task state
2626  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2627  }
2628  }
2629  }
2630 
2631 #if OMPT_SUPPORT
2632  if (ompt_enabled.enabled) {
2633  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2634  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2635  }
2636 #endif
2637 
2638  return;
2639  }
2640 
2641  /* do cleanup and restore the parent team */
2642  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2643  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2644 
2645  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2646 
2647  /* jc: The following lock has instructions with REL and ACQ semantics,
2648  separating the parallel user code called in this parallel region
2649  from the serial user code called after this function returns. */
2650  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2651 
2652  if (!master_th->th.th_teams_microtask ||
2653  team->t.t_level > master_th->th.th_teams_level) {
2654  /* Decrement our nested depth level */
2655  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2656  }
2657  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2658 
2659 #if OMPT_SUPPORT
2660  if (ompt_enabled.enabled) {
2661  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2662  if (ompt_enabled.ompt_callback_implicit_task) {
2663  int flags = (team_microtask == (void *)__kmp_teams_master)
2664  ? ompt_task_initial
2665  : ompt_task_implicit;
2666  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2667  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2668  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2669  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2670  }
2671  task_info->frame.exit_frame = ompt_data_none;
2672  task_info->task_data = ompt_data_none;
2673  }
2674 #endif
2675 
2676  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2677  master_th, team));
2678  __kmp_pop_current_task_from_thread(master_th);
2679 
2680  master_th->th.th_def_allocator = team->t.t_def_allocator;
2681 
2682 #if OMPD_SUPPORT
2683  if (ompd_state & OMPD_ENABLE_BP)
2684  ompd_bp_parallel_end();
2685 #endif
2686  updateHWFPControl(team);
2687 
2688  if (root->r.r_active != master_active)
2689  root->r.r_active = master_active;
2690 
2691  __kmp_free_team(root, team, master_th); // this will free worker threads
2692 
2693  /* This race was fun to find. Keep the following inside the critical region;
2694  otherwise assertions may fail occasionally because the old team may be
2695  reallocated and the hierarchy then appears inconsistent. It is actually safe
2696  to run and won't cause any bugs, but it will cause those assertion failures.
2697  It is only one deref & assign, so it might as well stay in the region. */
2698  master_th->th.th_team = parent_team;
2699  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2700  master_th->th.th_team_master = parent_team->t.t_threads[0];
2701  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2702 
2703  /* restore serialized team, if need be */
2704  if (parent_team->t.t_serialized &&
2705  parent_team != master_th->th.th_serial_team &&
2706  parent_team != root->r.r_root_team) {
2707  __kmp_free_team(root, master_th->th.th_serial_team, NULL);
2708  master_th->th.th_serial_team = parent_team;
2709  }
2710 
2711  if (__kmp_tasking_mode != tskm_immediate_exec) {
2712  // Restore primary thread's task state from team structure
2713  KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
2714  team->t.t_primary_task_state == 1);
2715  master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
2716 
2717  // Copy the task team from the parent team to the primary thread
2718  master_th->th.th_task_team =
2719  parent_team->t.t_task_team[master_th->th.th_task_state];
2720  KA_TRACE(20,
2721  ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2722  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2723  parent_team));
2724  }
2725 
2726  // TODO: GEH - cannot do this assertion because root thread not set up as
2727  // executing
2728  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2729  master_th->th.th_current_task->td_flags.executing = 1;
2730 
2731  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2732 
2733 #if KMP_AFFINITY_SUPPORTED
2734  if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2735  __kmp_reset_root_init_mask(gtid);
2736  }
2737 #endif
2738 #if OMPT_SUPPORT
2739  int flags =
2740  OMPT_INVOKER(fork_context) |
2741  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2742  : ompt_parallel_team);
2743  if (ompt_enabled.enabled) {
2744  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2745  codeptr);
2746  }
2747 #endif
2748 
2749  KMP_MB();
2750  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2751 }
2752 
2753 /* Check whether we should push an internal control record onto the
2754  serial team stack. If so, do it. */
2755 void __kmp_save_internal_controls(kmp_info_t *thread) {
2756 
2757  if (thread->th.th_team != thread->th.th_serial_team) {
2758  return;
2759  }
2760  if (thread->th.th_team->t.t_serialized > 1) {
2761  int push = 0;
2762 
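    // Push at most one control record per serialization level: skip the push
    // if the record on top of the stack already matches the current level.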
2763  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2764  push = 1;
2765  } else {
2766  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2767  thread->th.th_team->t.t_serialized) {
2768  push = 1;
2769  }
2770  }
2771  if (push) { /* push a record on the serial team's stack */
2772  kmp_internal_control_t *control =
2773  (kmp_internal_control_t *)__kmp_allocate(
2774  sizeof(kmp_internal_control_t));
2775 
2776  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2777 
2778  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2779 
2780  control->next = thread->th.th_team->t.t_control_stack_top;
2781  thread->th.th_team->t.t_control_stack_top = control;
2782  }
2783  }
2784 }
2785 
2786 /* Changes set_nproc */
2787 void __kmp_set_num_threads(int new_nth, int gtid) {
2788  kmp_info_t *thread;
2789  kmp_root_t *root;
2790 
2791  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2792  KMP_DEBUG_ASSERT(__kmp_init_serial);
2793 
2794  if (new_nth < 1)
2795  new_nth = 1;
2796  else if (new_nth > __kmp_max_nth)
2797  new_nth = __kmp_max_nth;
2798 
2799  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2800  thread = __kmp_threads[gtid];
2801  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2802  return; // nothing to do
2803 
2804  __kmp_save_internal_controls(thread);
2805 
2806  set__nproc(thread, new_nth);
2807 
2808  // If this omp_set_num_threads() call will cause the hot team size to be
2809  // reduced (in the absence of a num_threads clause), then reduce it now,
2810  // rather than waiting for the next parallel region.
2811  root = thread->th.th_root;
2812  if (__kmp_init_parallel && (!root->r.r_active) &&
2813  (root->r.r_hot_team->t.t_nproc > new_nth) && __kmp_hot_teams_max_level &&
2814  !__kmp_hot_teams_mode) {
2815  kmp_team_t *hot_team = root->r.r_hot_team;
2816  int f;
2817 
2818  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2819 
2820  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2821  __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2822  }
2823  // Release the extra threads we don't need any more.
2824  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2825  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2826  if (__kmp_tasking_mode != tskm_immediate_exec) {
2827  // When decreasing the team size, threads that are no longer in the team
2828  // should drop their reference to the task team.
2829  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2830  }
2831  __kmp_free_thread(hot_team->t.t_threads[f]);
2832  hot_team->t.t_threads[f] = NULL;
2833  }
2834  hot_team->t.t_nproc = new_nth;
2835  if (thread->th.th_hot_teams) {
2836  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2837  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2838  }
2839 
2840  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2841  hot_team->t.b->update_num_threads(new_nth);
2842  __kmp_add_threads_to_team(hot_team, new_nth);
2843  }
2844 
2845  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2846 
2847  // Update the t_nproc field in the threads that are still active.
2848  for (f = 0; f < new_nth; f++) {
2849  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2850  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2851  }
2852  // Special flag to indicate the size change came from omp_set_num_threads()
2853  hot_team->t.t_size_changed = -1;
2854  }
2855 }
2856 
2857 /* Changes max_active_levels */
2858 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2859  kmp_info_t *thread;
2860 
2861  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2862  "%d = (%d)\n",
2863  gtid, max_active_levels));
2864  KMP_DEBUG_ASSERT(__kmp_init_serial);
2865 
2866  // validate max_active_levels
2867  if (max_active_levels < 0) {
2868  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2869  // We ignore this call if the user has specified a negative value.
2870  // The current setting won't be changed. The last valid setting will be
2871  // used. A warning will be issued (if warnings are allowed as controlled by
2872  // the KMP_WARNINGS env var).
2873  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2874  "max_active_levels for thread %d = (%d)\n",
2875  gtid, max_active_levels));
2876  return;
2877  }
2878  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2879  // it's OK, the max_active_levels is within the valid range: [ 0;
2880  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2881  // We allow a zero value. (implementation defined behavior)
2882  } else {
2883  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2884  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2885  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2886  // Current upper limit is MAX_INT. (implementation defined behavior)
2887  // If the input exceeds the upper limit, we correct the input to be the
2888  // upper limit. (implementation defined behavior)
2889  // Actually, the flow should never get here as long as the limit is MAX_INT.
2890  }
2891  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2892  "max_active_levels for thread %d = (%d)\n",
2893  gtid, max_active_levels));
2894 
2895  thread = __kmp_threads[gtid];
2896 
2897  __kmp_save_internal_controls(thread);
2898 
2899  set__max_active_levels(thread, max_active_levels);
2900 }
2901 
2902 /* Gets max_active_levels */
2903 int __kmp_get_max_active_levels(int gtid) {
2904  kmp_info_t *thread;
2905 
2906  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2907  KMP_DEBUG_ASSERT(__kmp_init_serial);
2908 
2909  thread = __kmp_threads[gtid];
2910  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2911  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2912  "curtask_maxaclevel=%d\n",
2913  gtid, thread->th.th_current_task,
2914  thread->th.th_current_task->td_icvs.max_active_levels));
2915  return thread->th.th_current_task->td_icvs.max_active_levels;
2916 }
2917 
2918 // nteams-var per-device ICV
2919 void __kmp_set_num_teams(int num_teams) {
2920  if (num_teams > 0)
2921  __kmp_nteams = num_teams;
2922 }
2923 int __kmp_get_max_teams(void) { return __kmp_nteams; }
2924 // teams-thread-limit-var per-device ICV
2925 void __kmp_set_teams_thread_limit(int limit) {
2926  if (limit > 0)
2927  __kmp_teams_thread_limit = limit;
2928 }
2929 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2930 
2931 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2932 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2933 
2934 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2935 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2936  kmp_info_t *thread;
2937  kmp_sched_t orig_kind;
2938  // kmp_team_t *team;
2939 
2940  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2941  gtid, (int)kind, chunk));
2942  KMP_DEBUG_ASSERT(__kmp_init_serial);
2943 
2944  // Check if the kind parameter is valid, correct if needed.
2945  // Valid parameters should fit in one of two intervals - standard or extended:
2946  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2947  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2948  orig_kind = kind;
2949  kind = __kmp_sched_without_mods(kind);
2950 
2951  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2952  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2953  // TODO: Hint needs attention in case we change the default schedule.
2954  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2955  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2956  __kmp_msg_null);
2957  kind = kmp_sched_default;
2958  chunk = 0; // ignore chunk value in case of bad kind
2959  }
2960 
2961  thread = __kmp_threads[gtid];
2962 
2963  __kmp_save_internal_controls(thread);
2964 
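  // Map the user-visible kmp_sched_t onto the internal sched_type via
  // __kmp_sch_map; a static kind with a chunk below the default maps to the
  // plain (unchunked) kmp_sch_static.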
2965  if (kind < kmp_sched_upper_std) {
2966  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2967  // differentiate static chunked vs. unchunked: the chunk should be invalid
2968  // to indicate an unchunked schedule (which is the default)
2969  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2970  } else {
2971  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2972  __kmp_sch_map[kind - kmp_sched_lower - 1];
2973  }
2974  } else {
2975  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2976  // kmp_sched_lower - 2 ];
2977  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2978  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2979  kmp_sched_lower - 2];
2980  }
2981  __kmp_sched_apply_mods_intkind(
2982  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2983  if (kind == kmp_sched_auto || chunk < 1) {
2984  // ignore parameter chunk for schedule auto
2985  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2986  } else {
2987  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2988  }
2989 }
2990 
2991 /* Gets def_sched_var ICV values */
2992 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2993  kmp_info_t *thread;
2994  enum sched_type th_type;
2995 
2996  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2997  KMP_DEBUG_ASSERT(__kmp_init_serial);
2998 
2999  thread = __kmp_threads[gtid];
3000 
3001  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
3002  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
3003  case kmp_sch_static:
3004  case kmp_sch_static_greedy:
3005  case kmp_sch_static_balanced:
3006  *kind = kmp_sched_static;
3007  __kmp_sched_apply_mods_stdkind(kind, th_type);
3008  *kind = kmp_sched_static;
3009  return;
3010  case kmp_sch_static_chunked:
3011  *kind = kmp_sched_static;
3012  break;
3013  case kmp_sch_dynamic_chunked:
3014  *kind = kmp_sched_dynamic;
3015  break;
3016  case kmp_sch_guided_chunked:
3017  case kmp_sch_guided_iterative_chunked:
3018  case kmp_sch_guided_analytical_chunked:
3019  *kind = kmp_sched_guided;
3020  break;
3021  case kmp_sch_auto:
3022  *kind = kmp_sched_auto;
3023  break;
3024  case kmp_sch_trapezoidal:
3025  *kind = kmp_sched_trapezoidal;
3026  break;
3027 #if KMP_STATIC_STEAL_ENABLED
3028  case kmp_sch_static_steal:
3029  *kind = kmp_sched_static_steal;
3030  break;
3031 #endif
3032  default:
3033  KMP_FATAL(UnknownSchedulingType, th_type);
3034  }
3035 
3036  __kmp_sched_apply_mods_stdkind(kind, th_type);
3037  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3038 }
3039 
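// Illustrative usage (a sketch, not part of the runtime source; the example_*
// function is made-up user code): omp_set_schedule()/omp_get_schedule() are
// the public counterparts of the two routines above. Note how an invalid chunk
// is normalized by the setter, so the getter reports the default rather than
// the value that was passed in:
//
//   #include <omp.h>
//   void example_schedule(void) {
//     omp_sched_t kind;
//     int chunk;
//     omp_set_schedule(omp_sched_dynamic, 4); // run-time schedule = dynamic,4
//     omp_get_schedule(&kind, &chunk);        // kind == omp_sched_dynamic,
//                                             // chunk == 4
//     omp_set_schedule(omp_sched_auto, 7);    // chunk is ignored for auto
//     omp_get_schedule(&kind, &chunk);        // chunk reported as the default
//   }
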
3040 int __kmp_get_ancestor_thread_num(int gtid, int level) {
3041 
3042  int ii, dd;
3043  kmp_team_t *team;
3044  kmp_info_t *thr;
3045 
3046  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3047  KMP_DEBUG_ASSERT(__kmp_init_serial);
3048 
3049  // validate level
3050  if (level == 0)
3051  return 0;
3052  if (level < 0)
3053  return -1;
3054  thr = __kmp_threads[gtid];
3055  team = thr->th.th_team;
3056  ii = team->t.t_level;
3057  if (level > ii)
3058  return -1;
3059 
3060  if (thr->th.th_teams_microtask) {
3061  // AC: we are in teams region where multiple nested teams have same level
3062  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3063  if (level <=
3064  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3065  KMP_DEBUG_ASSERT(ii >= tlevel);
3066  // AC: As we need to pass by the teams league, we need to artificially
3067  // increase ii
3068  if (ii == tlevel) {
3069  ii += 2; // three teams have same level
3070  } else {
3071  ii++; // two teams have same level
3072  }
3073  }
3074  }
3075 
3076  if (ii == level)
3077  return __kmp_tid_from_gtid(gtid);
3078 
3079  dd = team->t.t_serialized;
3080  level++;
3081  while (ii > level) {
3082  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3083  }
3084  if ((team->t.t_serialized) && (!dd)) {
3085  team = team->t.t_parent;
3086  continue;
3087  }
3088  if (ii > level) {
3089  team = team->t.t_parent;
3090  dd = team->t.t_serialized;
3091  ii--;
3092  }
3093  }
3094 
3095  return (dd > 1) ? (0) : (team->t.t_master_tid);
3096 }
3097 
3098 int __kmp_get_team_size(int gtid, int level) {
3099 
3100  int ii, dd;
3101  kmp_team_t *team;
3102  kmp_info_t *thr;
3103 
3104  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3105  KMP_DEBUG_ASSERT(__kmp_init_serial);
3106 
3107  // validate level
3108  if (level == 0)
3109  return 1;
3110  if (level < 0)
3111  return -1;
3112  thr = __kmp_threads[gtid];
3113  team = thr->th.th_team;
3114  ii = team->t.t_level;
3115  if (level > ii)
3116  return -1;
3117 
3118  if (thr->th.th_teams_microtask) {
3119  // AC: we are in teams region where multiple nested teams have same level
3120  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3121  if (level <=
3122  tlevel) { // otherwise usual algorithm works (will not touch the teams)
3123  KMP_DEBUG_ASSERT(ii >= tlevel);
3124  // AC: As we need to pass by the teams league, we need to artificially
3125  // increase ii
3126  if (ii == tlevel) {
3127  ii += 2; // three teams have same level
3128  } else {
3129  ii++; // two teams have same level
3130  }
3131  }
3132  }
3133 
3134  while (ii > level) {
3135  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3136  }
3137  if (team->t.t_serialized && (!dd)) {
3138  team = team->t.t_parent;
3139  continue;
3140  }
3141  if (ii > level) {
3142  team = team->t.t_parent;
3143  ii--;
3144  }
3145  }
3146 
3147  return team->t.t_nproc;
3148 }
3149 
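// Illustrative usage (a sketch, not part of the runtime source; the example_*
// function is made-up user code): the two queries above back
// omp_get_ancestor_thread_num() and omp_get_team_size(). With nested
// parallelism enabled, an inner thread can inspect the enclosing levels:
//
//   #include <omp.h>
//   #include <stdio.h>
//   void example_ancestors(void) {
//     omp_set_max_active_levels(2);
//     #pragma omp parallel num_threads(2)
//     #pragma omp parallel num_threads(3)
//     {
//       // level 0 is the implicit initial team, level 2 is the current team
//       printf("outer tid=%d of %d, inner tid=%d of %d\n",
//              omp_get_ancestor_thread_num(1), omp_get_team_size(1),
//              omp_get_ancestor_thread_num(2), omp_get_team_size(2));
//     }
//   }
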
3150 kmp_r_sched_t __kmp_get_schedule_global() {
3151  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3152  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3153  // independently, so the updated schedule can be obtained here.
3154 
3155  kmp_r_sched_t r_sched;
3156 
3157  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3158  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3159  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3160  // different roots (even in OMP 2.5)
3161  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3162  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3163  if (s == kmp_sch_static) {
3164  // replace STATIC with more detailed schedule (balanced or greedy)
3165  r_sched.r_sched_type = __kmp_static;
3166  } else if (s == kmp_sch_guided_chunked) {
3167  // replace GUIDED with more detailed schedule (iterative or analytical)
3168  r_sched.r_sched_type = __kmp_guided;
3169  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3170  r_sched.r_sched_type = __kmp_sched;
3171  }
3172  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3173 
3174  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3175  // __kmp_chunk may be wrong here (if it was not ever set)
3176  r_sched.chunk = KMP_DEFAULT_CHUNK;
3177  } else {
3178  r_sched.chunk = __kmp_chunk;
3179  }
3180 
3181  return r_sched;
3182 }
3183 
3184 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3185  at least argc *t_argv entries for the requested team. */
3186 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3187 
3188  KMP_DEBUG_ASSERT(team);
3189  if (!realloc || argc > team->t.t_max_argc) {
3190 
3191  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3192  "current entries=%d\n",
3193  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3194  /* if previously allocated heap space for args, free them */
3195  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3196  __kmp_free((void *)team->t.t_argv);
3197 
3198  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3199  /* use unused space in the cache line for arguments */
3200  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3201  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3202  "argv entries\n",
3203  team->t.t_id, team->t.t_max_argc));
3204  team->t.t_argv = &team->t.t_inline_argv[0];
3205  if (__kmp_storage_map) {
3206  __kmp_print_storage_map_gtid(
3207  -1, &team->t.t_inline_argv[0],
3208  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3209  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3210  team->t.t_id);
3211  }
3212  } else {
3213  /* allocate space for arguments in the heap */
3214  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3215  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3216  : 2 * argc;
3217  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3218  "argv entries\n",
3219  team->t.t_id, team->t.t_max_argc));
3220  team->t.t_argv =
3221  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3222  if (__kmp_storage_map) {
3223  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3224  &team->t.t_argv[team->t.t_max_argc],
3225  sizeof(void *) * team->t.t_max_argc,
3226  "team_%d.t_argv", team->t.t_id);
3227  }
3228  }
3229  }
3230 }
3231 
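// The sizing policy of __kmp_alloc_argv_entries() above, restated as a small
// sketch (illustrative only; pick_max_argc is a made-up helper name, and the
// real constants come from kmp.h):
//
//   static int pick_max_argc(int argc) { // hypothetical helper, not in the RTL
//     if (argc <= KMP_INLINE_ARGV_ENTRIES)
//       return KMP_INLINE_ARGV_ENTRIES;     // reuse spare cache-line space
//     if (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
//       return KMP_MIN_MALLOC_ARGV_ENTRIES; // small heap allocation
//     return 2 * argc;                      // large request: 2x headroom
//   }
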
3232 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3233  int i;
3234  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3235  team->t.t_threads =
3236  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3237  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3238  sizeof(dispatch_shared_info_t) * num_disp_buff);
3239  team->t.t_dispatch =
3240  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3241  team->t.t_implicit_task_taskdata =
3242  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3243  team->t.t_max_nproc = max_nth;
3244 
3245  /* setup dispatch buffers */
3246  for (i = 0; i < num_disp_buff; ++i) {
3247  team->t.t_disp_buffer[i].buffer_index = i;
3248  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3249  }
3250 }
3251 
3252 static void __kmp_free_team_arrays(kmp_team_t *team) {
3253  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3254  int i;
3255  for (i = 0; i < team->t.t_max_nproc; ++i) {
3256  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3257  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3258  team->t.t_dispatch[i].th_disp_buffer = NULL;
3259  }
3260  }
3261 #if KMP_USE_HIER_SCHED
3262  __kmp_dispatch_free_hierarchies(team);
3263 #endif
3264  __kmp_free(team->t.t_threads);
3265  __kmp_free(team->t.t_disp_buffer);
3266  __kmp_free(team->t.t_dispatch);
3267  __kmp_free(team->t.t_implicit_task_taskdata);
3268  team->t.t_threads = NULL;
3269  team->t.t_disp_buffer = NULL;
3270  team->t.t_dispatch = NULL;
3271  team->t.t_implicit_task_taskdata = 0;
3272 }
3273 
3274 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3275  kmp_info_t **oldThreads = team->t.t_threads;
3276 
3277  __kmp_free(team->t.t_disp_buffer);
3278  __kmp_free(team->t.t_dispatch);
3279  __kmp_free(team->t.t_implicit_task_taskdata);
3280  __kmp_allocate_team_arrays(team, max_nth);
3281 
3282  KMP_MEMCPY(team->t.t_threads, oldThreads,
3283  team->t.t_nproc * sizeof(kmp_info_t *));
3284 
3285  __kmp_free(oldThreads);
3286 }
3287 
3288 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3289 
3290  kmp_r_sched_t r_sched =
3291  __kmp_get_schedule_global(); // get current state of scheduling globals
3292 
3293  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3294 
3295  kmp_internal_control_t g_icvs = {
3296  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3297  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3298  // adjustment of threads (per thread)
3299  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3300  // whether blocktime is explicitly set
3301  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3302 #if KMP_USE_MONITOR
3303  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3304 // intervals
3305 #endif
3306  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3307  // next parallel region (per thread)
3308  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3309  __kmp_cg_max_nth, // int thread_limit;
3310  __kmp_task_max_nth, // int task_thread_limit; // to set the thread_limit
3311  // on task. This is used in the case of target thread_limit
3312  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3313  // for max_active_levels
3314  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3315  // {sched,chunk} pair
3316  __kmp_nested_proc_bind.bind_types[0],
3317  __kmp_default_device,
3318  NULL // struct kmp_internal_control *next;
3319  };
3320 
3321  return g_icvs;
3322 }
3323 
3324 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3325 
3326  kmp_internal_control_t gx_icvs;
3327  gx_icvs.serial_nesting_level =
3328  0; // probably =team->t.t_serial like in save_inter_controls
3329  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3330  gx_icvs.next = NULL;
3331 
3332  return gx_icvs;
3333 }
3334 
3335 static void __kmp_initialize_root(kmp_root_t *root) {
3336  int f;
3337  kmp_team_t *root_team;
3338  kmp_team_t *hot_team;
3339  int hot_team_max_nth;
3340  kmp_r_sched_t r_sched =
3341  __kmp_get_schedule_global(); // get current state of scheduling globals
3342  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3343  KMP_DEBUG_ASSERT(root);
3344  KMP_ASSERT(!root->r.r_begin);
3345 
3346  /* setup the root state structure */
3347  __kmp_init_lock(&root->r.r_begin_lock);
3348  root->r.r_begin = FALSE;
3349  root->r.r_active = FALSE;
3350  root->r.r_in_parallel = 0;
3351  root->r.r_blocktime = __kmp_dflt_blocktime;
3352 #if KMP_AFFINITY_SUPPORTED
3353  root->r.r_affinity_assigned = FALSE;
3354 #endif
3355 
3356  /* setup the root team for this task */
3357  /* allocate the root team structure */
3358  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3359 
3360  root_team = __kmp_allocate_team(root,
3361  1, // new_nproc
3362  1, // max_nproc
3363 #if OMPT_SUPPORT
3364  ompt_data_none, // root parallel id
3365 #endif
3366  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3367  0, // argc
3368  NULL // primary thread is unknown
3369  );
3370 #if USE_DEBUGGER
3371  // Non-NULL value should be assigned to make the debugger display the root
3372  // team.
3373  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3374 #endif
3375 
3376  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3377 
3378  root->r.r_root_team = root_team;
3379  root_team->t.t_control_stack_top = NULL;
3380 
3381  /* initialize root team */
3382  root_team->t.t_threads[0] = NULL;
3383  root_team->t.t_nproc = 1;
3384  root_team->t.t_serialized = 1;
3385  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3386  root_team->t.t_sched.sched = r_sched.sched;
3387  root_team->t.t_nested_nth = &__kmp_nested_nth;
3388  KA_TRACE(
3389  20,
3390  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3391  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3392 
3393  /* setup the hot team for this task */
3394  /* allocate the hot team structure */
3395  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3396 
3397  hot_team = __kmp_allocate_team(root,
3398  1, // new_nproc
3399  __kmp_dflt_team_nth_ub * 2, // max_nproc
3400 #if OMPT_SUPPORT
3401  ompt_data_none, // root parallel id
3402 #endif
3403  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3404  0, // argc
3405  NULL // primary thread is unknown
3406  );
3407  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3408 
3409  root->r.r_hot_team = hot_team;
3410  root_team->t.t_control_stack_top = NULL;
3411 
3412  /* first-time initialization */
3413  hot_team->t.t_parent = root_team;
3414 
3415  /* initialize hot team */
3416  hot_team_max_nth = hot_team->t.t_max_nproc;
3417  for (f = 0; f < hot_team_max_nth; ++f) {
3418  hot_team->t.t_threads[f] = NULL;
3419  }
3420  hot_team->t.t_nproc = 1;
3421  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3422  hot_team->t.t_sched.sched = r_sched.sched;
3423  hot_team->t.t_size_changed = 0;
3424  hot_team->t.t_nested_nth = &__kmp_nested_nth;
3425 }
3426 
3427 #ifdef KMP_DEBUG
3428 
3429 typedef struct kmp_team_list_item {
3430  kmp_team_p const *entry;
3431  struct kmp_team_list_item *next;
3432 } kmp_team_list_item_t;
3433 typedef kmp_team_list_item_t *kmp_team_list_t;
3434 
3435 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3436  kmp_team_list_t list, // List of teams.
3437  kmp_team_p const *team // Team to add.
3438 ) {
3439 
3440  // List must terminate with item where both entry and next are NULL.
3441  // Team is added to the list only once.
3442  // List is sorted in ascending order by team id.
3443  // Team id is *not* a key.
3444 
3445  kmp_team_list_t l;
3446 
3447  KMP_DEBUG_ASSERT(list != NULL);
3448  if (team == NULL) {
3449  return;
3450  }
3451 
3452  __kmp_print_structure_team_accum(list, team->t.t_parent);
3453  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3454 
3455  // Search list for the team.
3456  l = list;
3457  while (l->next != NULL && l->entry != team) {
3458  l = l->next;
3459  }
3460  if (l->next != NULL) {
3461  return; // Team has been added before, exit.
3462  }
3463 
3464  // Team is not found. Search list again for insertion point.
3465  l = list;
3466  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3467  l = l->next;
3468  }
3469 
3470  // Insert team.
3471  {
3472  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3473  sizeof(kmp_team_list_item_t));
3474  *item = *l;
3475  l->entry = team;
3476  l->next = item;
3477  }
3478 }
3479 
3480 static void __kmp_print_structure_team(char const *title,
3481                                         kmp_team_p const *team) {
3483  __kmp_printf("%s", title);
3484  if (team != NULL) {
3485  __kmp_printf("%2x %p\n", team->t.t_id, team);
3486  } else {
3487  __kmp_printf(" - (nil)\n");
3488  }
3489 }
3490 
3491 static void __kmp_print_structure_thread(char const *title,
3492  kmp_info_p const *thread) {
3493  __kmp_printf("%s", title);
3494  if (thread != NULL) {
3495  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3496  } else {
3497  __kmp_printf(" - (nil)\n");
3498  }
3499 }
3500 
3501 void __kmp_print_structure(void) {
3502 
3503  kmp_team_list_t list;
3504 
3505  // Initialize list of teams.
3506  list =
3507  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3508  list->entry = NULL;
3509  list->next = NULL;
3510 
3511  __kmp_printf("\n------------------------------\nGlobal Thread "
3512  "Table\n------------------------------\n");
3513  {
3514  int gtid;
3515  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3516  __kmp_printf("%2d", gtid);
3517  if (__kmp_threads != NULL) {
3518  __kmp_printf(" %p", __kmp_threads[gtid]);
3519  }
3520  if (__kmp_root != NULL) {
3521  __kmp_printf(" %p", __kmp_root[gtid]);
3522  }
3523  __kmp_printf("\n");
3524  }
3525  }
3526 
3527  // Print out __kmp_threads array.
3528  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3529  "----------\n");
3530  if (__kmp_threads != NULL) {
3531  int gtid;
3532  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3533  kmp_info_t const *thread = __kmp_threads[gtid];
3534  if (thread != NULL) {
3535  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3536  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3537  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3538  __kmp_print_structure_team(" Serial Team: ",
3539  thread->th.th_serial_team);
3540  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3541  __kmp_print_structure_thread(" Primary: ",
3542  thread->th.th_team_master);
3543  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3544  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3545  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3546  __kmp_print_structure_thread(" Next in pool: ",
3547  thread->th.th_next_pool);
3548  __kmp_printf("\n");
3549  __kmp_print_structure_team_accum(list, thread->th.th_team);
3550  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3551  }
3552  }
3553  } else {
3554  __kmp_printf("Threads array is not allocated.\n");
3555  }
3556 
3557  // Print out __kmp_root array.
3558  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3559  "--------\n");
3560  if (__kmp_root != NULL) {
3561  int gtid;
3562  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3563  kmp_root_t const *root = __kmp_root[gtid];
3564  if (root != NULL) {
3565  __kmp_printf("GTID %2d %p:\n", gtid, root);
3566  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3567  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3568  __kmp_print_structure_thread(" Uber Thread: ",
3569  root->r.r_uber_thread);
3570  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3571  __kmp_printf(" In Parallel: %2d\n",
3572  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3573  __kmp_printf("\n");
3574  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3575  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3576  }
3577  }
3578  } else {
3579  __kmp_printf("Ubers array is not allocated.\n");
3580  }
3581 
3582  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3583  "--------\n");
3584  while (list->next != NULL) {
3585  kmp_team_p const *team = list->entry;
3586  int i;
3587  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3588  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3589  __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3590  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3591  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3592  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3593  for (i = 0; i < team->t.t_nproc; ++i) {
3594  __kmp_printf(" Thread %2d: ", i);
3595  __kmp_print_structure_thread("", team->t.t_threads[i]);
3596  }
3597  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3598  __kmp_printf("\n");
3599  list = list->next;
3600  }
3601 
3602  // Print out __kmp_thread_pool and __kmp_team_pool.
3603  __kmp_printf("\n------------------------------\nPools\n----------------------"
3604  "--------\n");
3605  __kmp_print_structure_thread("Thread pool: ",
3606  CCAST(kmp_info_t *, __kmp_thread_pool));
3607  __kmp_print_structure_team("Team pool: ",
3608  CCAST(kmp_team_t *, __kmp_team_pool));
3609  __kmp_printf("\n");
3610 
3611  // Free team list.
3612  while (list != NULL) {
3613  kmp_team_list_item_t *item = list;
3614  list = list->next;
3615  KMP_INTERNAL_FREE(item);
3616  }
3617 }
3618 
3619 #endif
3620 
3621 //---------------------------------------------------------------------------
3622 // Stuff for per-thread fast random number generator
3623 // Table of primes
3624 static const unsigned __kmp_primes[] = {
3625  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3626  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3627  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3628  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3629  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3630  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3631  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3632  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3633  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3634  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3635  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3636 
3637 //---------------------------------------------------------------------------
3638 // __kmp_get_random: Get a random number using a linear congruential method.
3639 unsigned short __kmp_get_random(kmp_info_t *thread) {
3640  unsigned x = thread->th.th_x;
3641  unsigned short r = (unsigned short)(x >> 16);
3642 
3643  thread->th.th_x = x * thread->th.th_a + 1;
3644 
3645  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3646  thread->th.th_info.ds.ds_tid, r));
3647 
3648  return r;
3649 }
3650 //--------------------------------------------------------
3651 // __kmp_init_random: Initialize a random number generator
3652 void __kmp_init_random(kmp_info_t *thread) {
3653  unsigned seed = thread->th.th_info.ds.ds_tid;
3654 
3655  thread->th.th_a =
3656  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3657  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3658  KA_TRACE(30,
3659  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3660 }
3661 
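// The per-thread generator above, restated as a standalone sketch
// (illustrative only; lcg_next is a made-up helper name): a 32-bit linear
// congruential generator x' = a * x + 1 (mod 2^32) whose multiplier 'a' is a
// per-thread entry of __kmp_primes; only the high 16 bits of the state are
// returned, since the low bits of an LCG are of poor quality:
//
//   static unsigned short lcg_next(unsigned *x, unsigned a) { // hypothetical
//     unsigned short r = (unsigned short)(*x >> 16); // report the high half
//     *x = *x * a + 1;                               // advance the state
//     return r;
//   }
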
3662 #if KMP_OS_WINDOWS
3663 /* reclaim array entries for root threads that are already dead, returns number
3664  * reclaimed */
3665 static int __kmp_reclaim_dead_roots(void) {
3666  int i, r = 0;
3667 
3668  for (i = 0; i < __kmp_threads_capacity; ++i) {
3669  if (KMP_UBER_GTID(i) &&
3670  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3671  !__kmp_root[i]
3672  ->r.r_active) { // AC: reclaim only roots died in non-active state
3673  r += __kmp_unregister_root_other_thread(i);
3674  }
3675  }
3676  return r;
3677 }
3678 #endif
3679 
3680 /* This function attempts to create free entries in __kmp_threads and
3681  __kmp_root, and returns the number of free entries generated.
3682 
3683  For Windows* OS static library, the first mechanism used is to reclaim array
3684  entries for root threads that are already dead.
3685 
3686  On all platforms, expansion is attempted on the arrays __kmp_threads and
3687  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3688  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3689  threadprivate cache array has been created. Synchronization with
3690  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3691 
3692  After any dead root reclamation, if the clipping value allows array expansion
3693  to result in the generation of a total of nNeed free slots, the function does
3694  that expansion. If not, nothing is done beyond the possible initial root
3695  thread reclamation.
3696 
3697  If any argument is negative, the behavior is undefined. */
3698 static int __kmp_expand_threads(int nNeed) {
3699  int added = 0;
3700  int minimumRequiredCapacity;
3701  int newCapacity;
3702  kmp_info_t **newThreads;
3703  kmp_root_t **newRoot;
3704 
3705  // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3706  // resizing __kmp_threads does not need additional protection if foreign
3707  // threads are present
3708 
3709 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3710  /* only for Windows static library */
3711  /* reclaim array entries for root threads that are already dead */
3712  added = __kmp_reclaim_dead_roots();
3713 
3714  if (nNeed) {
3715  nNeed -= added;
3716  if (nNeed < 0)
3717  nNeed = 0;
3718  }
3719 #endif
3720  if (nNeed <= 0)
3721  return added;
3722 
3723  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3724  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3725  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3726  // > __kmp_max_nth in one of two ways:
3727  //
3728  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3729  // may not be reused by another thread, so we may need to increase
3730  // __kmp_threads_capacity to __kmp_max_nth + 1.
3731  //
3732  // 2) New foreign root(s) are encountered. We always register new foreign
3733  // roots. This may cause a smaller # of threads to be allocated at
3734  // subsequent parallel regions, but the worker threads hang around (and
3735  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3736  //
3737  // Anyway, that is the reason for moving the check to see if
3738  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3739  // instead of having it performed here. -BB
3740 
3741  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3742 
3743  /* compute expansion headroom to check if we can expand */
3744  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3745  /* possible expansion too small -- give up */
3746  return added;
3747  }
3748  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3749 
3750  newCapacity = __kmp_threads_capacity;
3751  do {
3752  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3753  : __kmp_sys_max_nth;
3754  } while (newCapacity < minimumRequiredCapacity);
3755  newThreads = (kmp_info_t **)__kmp_allocate(
3756  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3757  newRoot =
3758  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3759  KMP_MEMCPY(newThreads, __kmp_threads,
3760  __kmp_threads_capacity * sizeof(kmp_info_t *));
3761  KMP_MEMCPY(newRoot, __kmp_root,
3762  __kmp_threads_capacity * sizeof(kmp_root_t *));
3763  // Put old __kmp_threads array on a list. Any ongoing references to the old
3764  // list will be valid. This list is cleaned up at library shutdown.
3765  kmp_old_threads_list_t *node =
3766  (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3767  node->threads = __kmp_threads;
3768  node->next = __kmp_old_threads_list;
3769  __kmp_old_threads_list = node;
3770 
3771  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3772  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3773  added += newCapacity - __kmp_threads_capacity;
3774  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3775 
3776  if (newCapacity > __kmp_tp_capacity) {
3777  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3778  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3779  __kmp_threadprivate_resize_cache(newCapacity);
3780  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3781  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3782  }
3783  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3784  }
3785 
3786  return added;
3787 }
3788 
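// The capacity growth rule used by __kmp_expand_threads() above, as a sketch
// (illustrative only; grow_capacity is a made-up helper name): double the
// capacity until the requirement is met, clipping at __kmp_sys_max_nth so the
// result never exceeds the system limit.
//
//   static int grow_capacity(int current, int need, int sys_max) { // hypothetical
//     int required = current + need;
//     int cap = current;
//     do {
//       cap = (cap <= (sys_max >> 1)) ? (cap << 1) : sys_max;
//     } while (cap < required);
//     return cap;
//   }
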
3789 /* Register the current thread as a root thread and obtain our gtid. We must
3790  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3791  thread that calls from __kmp_do_serial_initialize() */
3792 int __kmp_register_root(int initial_thread) {
3793  kmp_info_t *root_thread;
3794  kmp_root_t *root;
3795  int gtid;
3796  int capacity;
3797  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3798  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3799  KMP_MB();
3800 
3801  /* 2007-03-02:
3802  If the initial thread has not invoked the OpenMP RTL yet, and this thread is
3803  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3804  does not work as expected -- it may return false (meaning there is at least
3805  one empty slot in the __kmp_threads array), but it is possible that the only
3806  free slot is #0, which is reserved for the initial thread and so cannot be
3807  used for this one. The following code works around this bug.
3808 
3809  However, the right solution seems to be not reserving slot #0 for the initial
3810  thread because:
3811  (1) there is no magic in slot #0,
3812  (2) we cannot detect the initial thread reliably (the first thread that does
3813  the serial initialization may not be a real initial thread).
3814  */
3815  capacity = __kmp_threads_capacity;
3816  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3817  --capacity;
3818  }
3819 
3820  // If it is not for initializing the hidden helper team, we need to take
3821  // __kmp_hidden_helper_threads_num out of the capacity because it is included
3822  // in __kmp_threads_capacity.
3823  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3824  capacity -= __kmp_hidden_helper_threads_num;
3825  }
3826 
3827  /* see if there are too many threads */
3828  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3829  if (__kmp_tp_cached) {
3830  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3831  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3832  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3833  } else {
3834  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3835  __kmp_msg_null);
3836  }
3837  }
3838 
3839  // When hidden helper task is enabled, __kmp_threads is organized as follows:
3840  // 0: initial thread, also a regular OpenMP thread.
3841  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3842  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3843  // regular OpenMP threads.
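  // For example (a sketch, assuming __kmp_hidden_helper_threads_num == 8):
  //   gtid 0             : initial thread (also a regular OpenMP thread)
  //   gtid 1 .. 8        : hidden helper threads
  //   gtid 9 .. capacity : regular OpenMP threads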
3844  if (TCR_4(__kmp_init_hidden_helper_threads)) {
3845  // Find an available thread slot for hidden helper thread. Slots for hidden
3846  // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3847  for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3848  gtid <= __kmp_hidden_helper_threads_num;
3849  gtid++)
3850  ;
3851  KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3852  KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3853  "hidden helper thread: T#%d\n",
3854  gtid));
3855  } else {
3856  /* find an available thread slot */
3857  // Don't reassign the zero slot since we need that to only be used by
3858  // initial thread. Slots for hidden helper threads should also be skipped.
3859  if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3860  gtid = 0;
3861  } else {
3862  for (gtid = __kmp_hidden_helper_threads_num + 1;
3863  TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3864  ;
3865  }
3866  KA_TRACE(
3867  1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3868  KMP_ASSERT(gtid < __kmp_threads_capacity);
3869  }
3870 
3871  /* update global accounting */
3872  __kmp_all_nth++;
3873  TCW_4(__kmp_nth, __kmp_nth + 1);
3874 
3875  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3876  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3877  if (__kmp_adjust_gtid_mode) {
3878  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3879  if (TCR_4(__kmp_gtid_mode) != 2) {
3880  TCW_4(__kmp_gtid_mode, 2);
3881  }
3882  } else {
3883  if (TCR_4(__kmp_gtid_mode) != 1) {
3884  TCW_4(__kmp_gtid_mode, 1);
3885  }
3886  }
3887  }
3888 
3889 #ifdef KMP_ADJUST_BLOCKTIME
3890  /* Adjust blocktime to zero if necessary */
3891  /* Middle initialization might not have occurred yet */
3892  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3893  if (__kmp_nth > __kmp_avail_proc) {
3894  __kmp_zero_bt = TRUE;
3895  }
3896  }
3897 #endif /* KMP_ADJUST_BLOCKTIME */
3898 
3899  /* setup this new hierarchy */
3900  if (!(root = __kmp_root[gtid])) {
3901  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3902  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3903  }
3904 
3905 #if KMP_STATS_ENABLED
3906  // Initialize stats as soon as possible (right after gtid assignment).
3907  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3908  __kmp_stats_thread_ptr->startLife();
3909  KMP_SET_THREAD_STATE(SERIAL_REGION);
3910  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3911 #endif
3912  __kmp_initialize_root(root);
3913 
3914  /* setup new root thread structure */
3915  if (root->r.r_uber_thread) {
3916  root_thread = root->r.r_uber_thread;
3917  } else {
3918  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3919  if (__kmp_storage_map) {
3920  __kmp_print_thread_storage_map(root_thread, gtid);
3921  }
3922  root_thread->th.th_info.ds.ds_gtid = gtid;
3923 #if OMPT_SUPPORT
3924  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3925 #endif
3926  root_thread->th.th_root = root;
3927  if (__kmp_env_consistency_check) {
3928  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3929  }
3930 #if USE_FAST_MEMORY
3931  __kmp_initialize_fast_memory(root_thread);
3932 #endif /* USE_FAST_MEMORY */
3933 
3934 #if KMP_USE_BGET
3935  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3936  __kmp_initialize_bget(root_thread);
3937 #endif
3938  __kmp_init_random(root_thread); // Initialize random number generator
3939  }
3940 
3941  /* setup the serial team held in reserve by the root thread */
3942  if (!root_thread->th.th_serial_team) {
3943  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3944  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3945  root_thread->th.th_serial_team =
3946  __kmp_allocate_team(root, 1, 1,
3947 #if OMPT_SUPPORT
3948  ompt_data_none, // root parallel id
3949 #endif
3950  proc_bind_default, &r_icvs, 0, NULL);
3951  }
3952  KMP_ASSERT(root_thread->th.th_serial_team);
3953  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3954  root_thread->th.th_serial_team));
3955 
3956  /* drop root_thread into place */
3957  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3958 
3959  root->r.r_root_team->t.t_threads[0] = root_thread;
3960  root->r.r_hot_team->t.t_threads[0] = root_thread;
3961  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3962  // AC: the team created in reserve, not for execution (it is unused for now).
3963  root_thread->th.th_serial_team->t.t_serialized = 0;
3964  root->r.r_uber_thread = root_thread;
3965 
3966  /* initialize the thread, get it ready to go */
3967  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3968  TCW_4(__kmp_init_gtid, TRUE);
3969 
3970  /* prepare the primary thread for get_gtid() */
3971  __kmp_gtid_set_specific(gtid);
3972 
3973 #if USE_ITT_BUILD
3974  __kmp_itt_thread_name(gtid);
3975 #endif /* USE_ITT_BUILD */
3976 
3977 #ifdef KMP_TDATA_GTID
3978  __kmp_gtid = gtid;
3979 #endif
3980  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3981  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3982 
3983  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3984  "plain=%u\n",
3985  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3986  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3987  KMP_INIT_BARRIER_STATE));
3988  { // Initialize barrier data.
3989  int b;
3990  for (b = 0; b < bs_last_barrier; ++b) {
3991  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3992 #if USE_DEBUGGER
3993  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3994 #endif
3995  }
3996  }
3997  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3998  KMP_INIT_BARRIER_STATE);
3999 
4000 #if KMP_AFFINITY_SUPPORTED
4001  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
4002  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
4003  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
4004  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
4005 #endif /* KMP_AFFINITY_SUPPORTED */
4006  root_thread->th.th_def_allocator = __kmp_def_allocator;
4007  root_thread->th.th_prev_level = 0;
4008  root_thread->th.th_prev_num_threads = 1;
4009 
4010  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
4011  tmp->cg_root = root_thread;
4012  tmp->cg_thread_limit = __kmp_cg_max_nth;
4013  tmp->cg_nthreads = 1;
4014  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
4015  " cg_nthreads init to 1\n",
4016  root_thread, tmp));
4017  tmp->up = NULL;
4018  root_thread->th.th_cg_roots = tmp;
4019 
4020  __kmp_root_counter++;
4021 
4022 #if OMPT_SUPPORT
4023  if (ompt_enabled.enabled) {
4024 
4025  kmp_info_t *root_thread = ompt_get_thread();
4026 
4027  ompt_set_thread_state(root_thread, ompt_state_overhead);
4028 
4029  if (ompt_enabled.ompt_callback_thread_begin) {
4030  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4031  ompt_thread_initial, __ompt_get_thread_data_internal());
4032  }
4033  ompt_data_t *task_data;
4034  ompt_data_t *parallel_data;
4035  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4036  NULL);
4037  if (ompt_enabled.ompt_callback_implicit_task) {
4038  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4039  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4040  }
4041 
4042  ompt_set_thread_state(root_thread, ompt_state_work_serial);
4043  }
4044 #endif
4045 #if OMPD_SUPPORT
4046  if (ompd_state & OMPD_ENABLE_BP)
4047  ompd_bp_thread_begin();
4048 #endif
4049 
4050  KMP_MB();
4051  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4052 
4053  return gtid;
4054 }
4055 
4056 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4057  const int max_level) {
4058  int i, n, nth;
4059  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4060  if (!hot_teams || !hot_teams[level].hot_team) {
4061  return 0;
4062  }
4063  KMP_DEBUG_ASSERT(level < max_level);
4064  kmp_team_t *team = hot_teams[level].hot_team;
4065  nth = hot_teams[level].hot_team_nth;
4066  n = nth - 1; // primary thread is not freed
4067  if (level < max_level - 1) {
4068  for (i = 0; i < nth; ++i) {
4069  kmp_info_t *th = team->t.t_threads[i];
4070  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4071  if (i > 0 && th->th.th_hot_teams) {
4072  __kmp_free(th->th.th_hot_teams);
4073  th->th.th_hot_teams = NULL;
4074  }
4075  }
4076  }
4077  __kmp_free_team(root, team, NULL);
4078  return n;
4079 }
4080 
4081 // Resets a root thread and clears its root and hot teams.
4082 // Returns the number of __kmp_threads entries directly and indirectly freed.
4083 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4084  kmp_team_t *root_team = root->r.r_root_team;
4085  kmp_team_t *hot_team = root->r.r_hot_team;
4086  int n = hot_team->t.t_nproc;
4087  int i;
4088 
4089  KMP_DEBUG_ASSERT(!root->r.r_active);
4090 
4091  root->r.r_root_team = NULL;
4092  root->r.r_hot_team = NULL;
4093  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4094  // before call to __kmp_free_team().
4095  __kmp_free_team(root, root_team, NULL);
4096  if (__kmp_hot_teams_max_level >
4097  0) { // need to free nested hot teams and their threads if any
4098  for (i = 0; i < hot_team->t.t_nproc; ++i) {
4099  kmp_info_t *th = hot_team->t.t_threads[i];
4100  if (__kmp_hot_teams_max_level > 1) {
4101  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4102  }
4103  if (th->th.th_hot_teams) {
4104  __kmp_free(th->th.th_hot_teams);
4105  th->th.th_hot_teams = NULL;
4106  }
4107  }
4108  }
4109  __kmp_free_team(root, hot_team, NULL);
4110 
4111  // Before we can reap the thread, we need to make certain that all other
4112  // threads in the teams that had this root as ancestor have stopped trying to
4113  // steal tasks.
4114  if (__kmp_tasking_mode != tskm_immediate_exec) {
4115  __kmp_wait_to_unref_task_teams();
4116  }
4117 
4118 #if KMP_OS_WINDOWS
4119  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4120  KA_TRACE(
4121  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4122  "\n",
4123  (LPVOID) & (root->r.r_uber_thread->th),
4124  root->r.r_uber_thread->th.th_info.ds.ds_thread));
4125  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4126 #endif /* KMP_OS_WINDOWS */
4127 
4128 #if OMPD_SUPPORT
4129  if (ompd_state & OMPD_ENABLE_BP)
4130  ompd_bp_thread_end();
4131 #endif
4132 
4133 #if OMPT_SUPPORT
4134  ompt_data_t *task_data;
4135  ompt_data_t *parallel_data;
4136  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4137  NULL);
4138  if (ompt_enabled.ompt_callback_implicit_task) {
4139  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4140  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4141  }
4142  if (ompt_enabled.ompt_callback_thread_end) {
4143  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4144  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4145  }
4146 #endif
4147 
4148  TCW_4(__kmp_nth,
4149  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4150  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4151  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4152  " to %d\n",
4153  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4154  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4155  if (i == 1) {
4156  // need to free contention group structure
4157  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4158  root->r.r_uber_thread->th.th_cg_roots->cg_root);
4159  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4160  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4161  root->r.r_uber_thread->th.th_cg_roots = NULL;
4162  }
4163  __kmp_reap_thread(root->r.r_uber_thread, 1);
4164 
4165  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4166  // it instead of freeing it.
4167  root->r.r_uber_thread = NULL;
4168  /* mark root as no longer in use */
4169  root->r.r_begin = FALSE;
4170 
4171  return n;
4172 }
4173 
4174 void __kmp_unregister_root_current_thread(int gtid) {
4175  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4176  /* This lock should be OK, since unregister_root_current_thread is never
4177  called during an abort, only during a normal close. Furthermore, if you
4178  hold the forkjoin lock, you should never try to acquire the initz lock. */
4179  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4180  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4181  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4182  "exiting T#%d\n",
4183  gtid));
4184  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4185  return;
4186  }
4187  kmp_root_t *root = __kmp_root[gtid];
4188 
4189  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4190  KMP_ASSERT(KMP_UBER_GTID(gtid));
4191  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4192  KMP_ASSERT(root->r.r_active == FALSE);
4193 
4194  KMP_MB();
4195 
4196  kmp_info_t *thread = __kmp_threads[gtid];
4197  kmp_team_t *team = thread->th.th_team;
4198  kmp_task_team_t *task_team = thread->th.th_task_team;
4199 
4200  // we need to wait for the proxy tasks before finishing the thread
4201  if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4202  task_team->tt.tt_hidden_helper_task_encountered)) {
4203 #if OMPT_SUPPORT
4204  // the runtime is shutting down so we won't report any events
4205  thread->th.ompt_thread_info.state = ompt_state_undefined;
4206 #endif
4207  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4208  }
4209 
4210  __kmp_reset_root(gtid, root);
4211 
4212  KMP_MB();
4213  KC_TRACE(10,
4214  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4215 
4216  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4217 }
4218 
4219 #if KMP_OS_WINDOWS
4220 /* __kmp_forkjoin_lock must be already held
4221  Unregisters a root thread that is not the current thread. Returns the number
4222  of __kmp_threads entries freed as a result. */
4223 static int __kmp_unregister_root_other_thread(int gtid) {
4224  kmp_root_t *root = __kmp_root[gtid];
4225  int r;
4226 
4227  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4228  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4229  KMP_ASSERT(KMP_UBER_GTID(gtid));
4230  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4231  KMP_ASSERT(root->r.r_active == FALSE);
4232 
4233  r = __kmp_reset_root(gtid, root);
4234  KC_TRACE(10,
4235  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4236  return r;
4237 }
4238 #endif
4239 
4240 #if KMP_DEBUG
4241 void __kmp_task_info() {
4242 
4243  kmp_int32 gtid = __kmp_entry_gtid();
4244  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4245  kmp_info_t *this_thr = __kmp_threads[gtid];
4246  kmp_team_t *steam = this_thr->th.th_serial_team;
4247  kmp_team_t *team = this_thr->th.th_team;
4248 
4249  __kmp_printf(
4250  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4251  "ptask=%p\n",
4252  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4253  team->t.t_implicit_task_taskdata[tid].td_parent);
4254 }
4255 #endif // KMP_DEBUG
4256 
4257 /* TODO optimize with one big memclr, take out what isn't needed, split
4258  responsibility to workers as much as possible, and delay initialization of
4259  features as much as possible */
4260 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4261  int tid, int gtid) {
4262  /* this_thr->th.th_info.ds.ds_gtid is setup in
4263  kmp_allocate_thread/create_worker.
4264  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4265  KMP_DEBUG_ASSERT(this_thr != NULL);
4266  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4267  KMP_DEBUG_ASSERT(team);
4268  KMP_DEBUG_ASSERT(team->t.t_threads);
4269  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4270  kmp_info_t *master = team->t.t_threads[0];
4271  KMP_DEBUG_ASSERT(master);
4272  KMP_DEBUG_ASSERT(master->th.th_root);
4273 
4274  KMP_MB();
4275 
4276  TCW_SYNC_PTR(this_thr->th.th_team, team);
4277 
4278  this_thr->th.th_info.ds.ds_tid = tid;
4279  this_thr->th.th_set_nproc = 0;
4280  if (__kmp_tasking_mode != tskm_immediate_exec)
4281  // When tasking is possible, threads are not safe to reap until they are
4282  // done tasking; this will be set when tasking code is exited in wait
4283  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4284  else // no tasking --> always safe to reap
4285  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4286  this_thr->th.th_set_proc_bind = proc_bind_default;
4287 
4288 #if KMP_AFFINITY_SUPPORTED
4289  this_thr->th.th_new_place = this_thr->th.th_current_place;
4290 #endif
4291  this_thr->th.th_root = master->th.th_root;
4292 
4293  /* setup the thread's cache of the team structure */
4294  this_thr->th.th_team_nproc = team->t.t_nproc;
4295  this_thr->th.th_team_master = master;
4296  this_thr->th.th_team_serialized = team->t.t_serialized;
4297 
4298  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4299 
4300  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4301  tid, gtid, this_thr, this_thr->th.th_current_task));
4302 
4303  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4304  team, tid, TRUE);
4305 
4306  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4307  tid, gtid, this_thr, this_thr->th.th_current_task));
4308  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4309  // __kmp_initialize_team()?
4310 
4311  /* TODO no worksharing in speculative threads */
4312  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4313 
4314  this_thr->th.th_local.this_construct = 0;
4315 
4316  if (!this_thr->th.th_pri_common) {
4317  this_thr->th.th_pri_common =
4318  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4319  if (__kmp_storage_map) {
4320  __kmp_print_storage_map_gtid(
4321  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4322  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4323  }
4324  this_thr->th.th_pri_head = NULL;
4325  }
4326 
4327  if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4328  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4329  // Make new thread's CG root same as primary thread's
4330  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4331  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4332  if (tmp) {
4333  // worker changes CG, need to check if old CG should be freed
4334  int i = tmp->cg_nthreads--;
4335  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4336  " on node %p of thread %p to %d\n",
4337  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4338  if (i == 1) {
4339  __kmp_free(tmp); // last thread left CG --> free it
4340  }
4341  }
4342  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4343  // Increment new thread's CG root's counter to add the new thread
4344  this_thr->th.th_cg_roots->cg_nthreads++;
4345  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4346  " node %p of thread %p to %d\n",
4347  this_thr, this_thr->th.th_cg_roots,
4348  this_thr->th.th_cg_roots->cg_root,
4349  this_thr->th.th_cg_roots->cg_nthreads));
4350  this_thr->th.th_current_task->td_icvs.thread_limit =
4351  this_thr->th.th_cg_roots->cg_thread_limit;
4352  }
4353 
4354  /* Initialize dynamic dispatch */
4355  {
4356  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4357  // Use team max_nproc since this will never change for the team.
4358  size_t disp_size =
4359  sizeof(dispatch_private_info_t) *
4360  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4361  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4362  team->t.t_max_nproc));
4363  KMP_ASSERT(dispatch);
4364  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4365  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4366 
4367  dispatch->th_disp_index = 0;
4368  dispatch->th_doacross_buf_idx = 0;
4369  if (!dispatch->th_disp_buffer) {
4370  dispatch->th_disp_buffer =
4371  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4372 
4373  if (__kmp_storage_map) {
4374  __kmp_print_storage_map_gtid(
4375  gtid, &dispatch->th_disp_buffer[0],
4376  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4377  ? 1
4378  : __kmp_dispatch_num_buffers],
4379  disp_size,
4380  "th_%d.th_dispatch.th_disp_buffer "
4381  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4382  gtid, team->t.t_id, gtid);
4383  }
4384  } else {
4385  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4386  }
4387 
4388  dispatch->th_dispatch_pr_current = 0;
4389  dispatch->th_dispatch_sh_current = 0;
4390 
4391  dispatch->th_deo_fcn = 0; /* ORDERED */
4392  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4393  }
4394 
4395  this_thr->th.th_next_pool = NULL;
4396 
4397  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4398  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4399 
4400  KMP_MB();
4401 }
4402 
4403 /* Allocate a new thread for the requesting team. This is only called from
4404  within a forkjoin critical section. We will first try to get an available
4405  thread from the thread pool; if none is available, we will fork a new one,
4406  assuming we are able to create one. This should be assured, as the
4407  caller should check on this first. */
4408 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4409  int new_tid) {
4410  kmp_team_t *serial_team;
4411  kmp_info_t *new_thr;
4412  int new_gtid;
4413 
4414  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4415  KMP_DEBUG_ASSERT(root && team);
4416  KMP_MB();
4417 
4418  /* first, try to get one from the thread pool unless allocating thread is
4419  * the main hidden helper thread. The hidden helper team should always
4420  * allocate new OS threads. */
4421  if (__kmp_thread_pool && !KMP_HIDDEN_HELPER_TEAM(team)) {
4422  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4423  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4424  if (new_thr == __kmp_thread_pool_insert_pt) {
4425  __kmp_thread_pool_insert_pt = NULL;
4426  }
4427  TCW_4(new_thr->th.th_in_pool, FALSE);
4428  __kmp_suspend_initialize_thread(new_thr);
4429  __kmp_lock_suspend_mx(new_thr);
4430  if (new_thr->th.th_active_in_pool == TRUE) {
4431  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4432  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4433  new_thr->th.th_active_in_pool = FALSE;
4434  }
4435  __kmp_unlock_suspend_mx(new_thr);
4436 
4437  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4438  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4439  KMP_ASSERT(!new_thr->th.th_team);
4440  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4441 
4442  /* setup the thread structure */
4443  __kmp_initialize_info(new_thr, team, new_tid,
4444  new_thr->th.th_info.ds.ds_gtid);
4445  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4446 
4447  TCW_4(__kmp_nth, __kmp_nth + 1);
4448 
4449  new_thr->th.th_task_state = 0;
4450 
4451  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4452  // Make sure pool thread has transitioned to waiting on own thread struct
4453  KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4454  // Thread activated in __kmp_allocate_team when increasing team size
4455  }
4456 
4457 #ifdef KMP_ADJUST_BLOCKTIME
4458  /* Adjust blocktime back to zero if necessary */
4459  /* Middle initialization might not have occurred yet */
4460  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4461  if (__kmp_nth > __kmp_avail_proc) {
4462  __kmp_zero_bt = TRUE;
4463  }
4464  }
4465 #endif /* KMP_ADJUST_BLOCKTIME */
4466 
4467 #if KMP_DEBUG
4468  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4469  // KMP_BARRIER_PARENT_FLAG.
4470  int b;
4471  kmp_balign_t *balign = new_thr->th.th_bar;
4472  for (b = 0; b < bs_last_barrier; ++b)
4473  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4474 #endif
4475 
4476  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4477  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4478 
4479  KMP_MB();
4480  return new_thr;
4481  }
4482 
4483  /* no, we'll fork a new one */
4484  KMP_ASSERT(KMP_HIDDEN_HELPER_TEAM(team) || __kmp_nth == __kmp_all_nth);
4485  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4486 
4487 #if KMP_USE_MONITOR
4488  // If this is the first worker thread the RTL is creating, then also
4489  // launch the monitor thread. We try to do this as early as possible.
4490  if (!TCR_4(__kmp_init_monitor)) {
4491  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4492  if (!TCR_4(__kmp_init_monitor)) {
4493  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4494  TCW_4(__kmp_init_monitor, 1);
4495  __kmp_create_monitor(&__kmp_monitor);
4496  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4497 #if KMP_OS_WINDOWS
4498  // AC: wait until monitor has started. This is a fix for CQ232808.
4499  // The reason is that if the library is loaded/unloaded in a loop with
4500  // small (parallel) work in between, then there is a high probability that
4501  // the monitor thread starts only after the library shutdown. At shutdown it
4502  // is too late to cope with the problem, because when the primary thread is
4503  // in DllMain (process detach) the monitor has no chance to start (it is
4504  // blocked), and the primary thread has no means to inform the monitor that
4505  // the library has gone, because all the memory which the monitor can
4506  // access is going to be released/reset.
4507  while (TCR_4(__kmp_init_monitor) < 2) {
4508  KMP_YIELD(TRUE);
4509  }
4510  KF_TRACE(10, ("after monitor thread has started\n"));
4511 #endif
4512  }
4513  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4514  }
4515 #endif
4516 
4517  KMP_MB();
4518 
4519  {
4520  int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4521  ? 1
4522  : __kmp_hidden_helper_threads_num + 1;
4523 
4524  for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4525  ++new_gtid) {
4526  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4527  }
4528 
4529  if (TCR_4(__kmp_init_hidden_helper_threads)) {
4530  KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4531  }
4532  }
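/* Illustrative example (values assumed): with __kmp_hidden_helper_threads_num
   == 8, hidden helper threads are registered at gtids 1..8 (the search above
   starts at 1 while __kmp_init_hidden_helper_threads is set), while regular
   worker threads are assigned the first free gtid starting at 9; gtid 0 is the
   initial (root) thread. */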
4533 
4534  /* allocate space for it. */
4535  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4536 
4537  new_thr->th.th_nt_strict = false;
4538  new_thr->th.th_nt_loc = NULL;
4539  new_thr->th.th_nt_sev = severity_fatal;
4540  new_thr->th.th_nt_msg = NULL;
4541 
4542  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4543 
4544 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4545  // suppress race condition detection on synchronization flags in debug mode;
4546  // this helps to analyze library internals by eliminating false positives
4547  __itt_suppress_mark_range(
4548  __itt_suppress_range, __itt_suppress_threading_errors,
4549  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4550  __itt_suppress_mark_range(
4551  __itt_suppress_range, __itt_suppress_threading_errors,
4552  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4553 #if KMP_OS_WINDOWS
4554  __itt_suppress_mark_range(
4555  __itt_suppress_range, __itt_suppress_threading_errors,
4556  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4557 #else
4558  __itt_suppress_mark_range(__itt_suppress_range,
4559  __itt_suppress_threading_errors,
4560  &new_thr->th.th_suspend_init_count,
4561  sizeof(new_thr->th.th_suspend_init_count));
4562 #endif
4563  // TODO: check if we need to also suppress b_arrived flags
4564  __itt_suppress_mark_range(__itt_suppress_range,
4565  __itt_suppress_threading_errors,
4566  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4567  sizeof(new_thr->th.th_bar[0].bb.b_go));
4568  __itt_suppress_mark_range(__itt_suppress_range,
4569  __itt_suppress_threading_errors,
4570  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4571  sizeof(new_thr->th.th_bar[1].bb.b_go));
4572  __itt_suppress_mark_range(__itt_suppress_range,
4573  __itt_suppress_threading_errors,
4574  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4575  sizeof(new_thr->th.th_bar[2].bb.b_go));
4576 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4577  if (__kmp_storage_map) {
4578  __kmp_print_thread_storage_map(new_thr, new_gtid);
4579  }
4580 
4581  // add the reserve serialized team, initialized from the team's primary thread
4582  {
4583  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4584  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4585  new_thr->th.th_serial_team = serial_team =
4586  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4587 #if OMPT_SUPPORT
4588  ompt_data_none, // root parallel id
4589 #endif
4590  proc_bind_default, &r_icvs, 0, NULL);
4591  }
4592  KMP_ASSERT(serial_team);
4593  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for
4594  // execution (it is unused for now).
4595  serial_team->t.t_threads[0] = new_thr;
4596  KF_TRACE(10,
4597  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4598  new_thr));
4599 
4600  /* setup the thread structures */
4601  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4602 
4603 #if USE_FAST_MEMORY
4604  __kmp_initialize_fast_memory(new_thr);
4605 #endif /* USE_FAST_MEMORY */
4606 
4607 #if KMP_USE_BGET
4608  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4609  __kmp_initialize_bget(new_thr);
4610 #endif
4611 
4612  __kmp_init_random(new_thr); // Initialize random number generator
4613 
4614  /* Initialize these only once when thread is grabbed for a team allocation */
4615  KA_TRACE(20,
4616  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4617  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4618 
4619  int b;
4620  kmp_balign_t *balign = new_thr->th.th_bar;
4621  for (b = 0; b < bs_last_barrier; ++b) {
4622  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4623  balign[b].bb.team = NULL;
4624  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4625  balign[b].bb.use_oncore_barrier = 0;
4626  }
4627 
4628  TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4629  new_thr->th.th_sleep_loc_type = flag_unset;
4630 
4631  new_thr->th.th_spin_here = FALSE;
4632  new_thr->th.th_next_waiting = 0;
4633 #if KMP_OS_UNIX
4634  new_thr->th.th_blocking = false;
4635 #endif
4636 
4637 #if KMP_AFFINITY_SUPPORTED
4638  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4639  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4640  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4641  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4642 #endif
4643  new_thr->th.th_def_allocator = __kmp_def_allocator;
4644  new_thr->th.th_prev_level = 0;
4645  new_thr->th.th_prev_num_threads = 1;
4646 
4647  TCW_4(new_thr->th.th_in_pool, FALSE);
4648  new_thr->th.th_active_in_pool = FALSE;
4649  TCW_4(new_thr->th.th_active, TRUE);
4650 
4651  new_thr->th.th_set_nested_nth = NULL;
4652  new_thr->th.th_set_nested_nth_sz = 0;
4653 
4654  /* adjust the global counters */
4655  __kmp_all_nth++;
4656  __kmp_nth++;
4657 
4658  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4659  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4660  if (__kmp_adjust_gtid_mode) {
4661  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4662  if (TCR_4(__kmp_gtid_mode) != 2) {
4663  TCW_4(__kmp_gtid_mode, 2);
4664  }
4665  } else {
4666  if (TCR_4(__kmp_gtid_mode) != 1) {
4667  TCW_4(__kmp_gtid_mode, 1);
4668  }
4669  }
4670  }
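/* Illustrative example (threshold value is assumed): if __kmp_tls_gtid_min
   were 20, the runtime would keep __kmp_gtid_mode == 1 (stack-pointer search)
   while fewer than 20 threads exist in the process, and switch to mode 2
   (keyed TLS lookup) once the 20th thread is registered. */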
4671 
4672 #ifdef KMP_ADJUST_BLOCKTIME
4673  /* Adjust blocktime back to zero if necessary */
4674  /* Middle initialization might not have occurred yet */
4675  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4676  if (__kmp_nth > __kmp_avail_proc) {
4677  __kmp_zero_bt = TRUE;
4678  }
4679  }
4680 #endif /* KMP_ADJUST_BLOCKTIME */
4681 
4682 #if KMP_AFFINITY_SUPPORTED
4683  // Set the affinity and topology information for new thread
4684  __kmp_affinity_set_init_mask(new_gtid, /*isa_root=*/FALSE);
4685 #endif
4686 
4687  /* actually fork it and create the new worker thread */
4688  KF_TRACE(
4689  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4690  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4691  KF_TRACE(10,
4692  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4693 
4694  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4695  new_gtid));
4696  KMP_MB();
4697  return new_thr;
4698 }
4699 
4700 /* Reinitialize team for reuse.
4701  The hot team code calls this routine at every fork barrier, so EPCC barrier
4702  tests are extremely sensitive to changes in it, esp. writes to the team
4703  struct, which cause a cache invalidation in all threads.
4704  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4705 static void __kmp_reinitialize_team(kmp_team_t *team,
4706  kmp_internal_control_t *new_icvs,
4707  ident_t *loc) {
4708  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4709  team->t.t_threads[0], team));
4710  KMP_DEBUG_ASSERT(team && new_icvs);
4711  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4712  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4713 
4714  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4715  // Copy ICVs to the primary thread's implicit taskdata
4716  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4717  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4718 
4719  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4720  team->t.t_threads[0], team));
4721 }
4722 
4723 /* Initialize the team data structure.
4724  This assumes the t_threads and t_max_nproc are already set.
4725  Also, we don't touch the arguments */
4726 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4727  kmp_internal_control_t *new_icvs,
4728  ident_t *loc) {
4729  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4730 
4731  /* verify */
4732  KMP_DEBUG_ASSERT(team);
4733  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4734  KMP_DEBUG_ASSERT(team->t.t_threads);
4735  KMP_MB();
4736 
4737  team->t.t_master_tid = 0; /* not needed */
4738  /* team->t.t_master_bar; not needed */
4739  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4740  team->t.t_nproc = new_nproc;
4741 
4742  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4743  team->t.t_next_pool = NULL;
4744  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4745  * up hot team */
4746 
4747  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4748  team->t.t_invoke = NULL; /* not needed */
4749 
4750  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4751  team->t.t_sched.sched = new_icvs->sched.sched;
4752 
4753 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4754  team->t.t_fp_control_saved = FALSE; /* not needed */
4755  team->t.t_x87_fpu_control_word = 0; /* not needed */
4756  team->t.t_mxcsr = 0; /* not needed */
4757 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4758 
4759  team->t.t_construct = 0;
4760 
4761  team->t.t_ordered.dt.t_value = 0;
4762  team->t.t_master_active = FALSE;
4763 
4764 #ifdef KMP_DEBUG
4765  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4766 #endif
4767 #if KMP_OS_WINDOWS
4768  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4769 #endif
4770 
4771  team->t.t_control_stack_top = NULL;
4772 
4773  __kmp_reinitialize_team(team, new_icvs, loc);
4774 
4775  KMP_MB();
4776  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4777 }
4778 
4779 #if KMP_AFFINITY_SUPPORTED
4780 static inline void __kmp_set_thread_place(kmp_team_t *team, kmp_info_t *th,
4781  int first, int last, int newp) {
4782  th->th.th_first_place = first;
4783  th->th.th_last_place = last;
4784  th->th.th_new_place = newp;
4785  if (newp != th->th.th_current_place) {
4786  if (__kmp_display_affinity && team->t.t_display_affinity != 1)
4787  team->t.t_display_affinity = 1;
4788  // Copy topology information associated with the new place
4789  th->th.th_topology_ids = __kmp_affinity.ids[th->th.th_new_place];
4790  th->th.th_topology_attrs = __kmp_affinity.attrs[th->th.th_new_place];
4791  }
4792 }
4793 
4794 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4795 // It calculates the worker + primary thread's partition based upon the parent
4796 // thread's partition, and binds each worker to a thread in their partition.
4797 // The primary thread's partition should already include its current binding.
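// Illustrative worked example (values assumed): with proc_bind_close and more
// threads than places, e.g. n_th = 10 threads on a partition of n_places = 4
// places, the code below computes S = 10/4 = 2, rem = 2 and gap = 4/2 = 2, so
// consecutive places receive 3, 2, 3, 2 threads respectively, starting at the
// primary thread's place and wrapping around the partition; the final `place`
// value returns to masters_place, which the trailing KMP_DEBUG_ASSERT checks.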
4798 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4799  // Do not partition places for the hidden helper team
4800  if (KMP_HIDDEN_HELPER_TEAM(team))
4801  return;
4802  // Copy the primary thread's place partition to the team struct
4803  kmp_info_t *master_th = team->t.t_threads[0];
4804  KMP_DEBUG_ASSERT(master_th != NULL);
4805  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4806  int first_place = master_th->th.th_first_place;
4807  int last_place = master_th->th.th_last_place;
4808  int masters_place = master_th->th.th_current_place;
4809  int num_masks = __kmp_affinity.num_masks;
4810  team->t.t_first_place = first_place;
4811  team->t.t_last_place = last_place;
4812 
4813  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4814  "bound to place %d partition = [%d,%d]\n",
4815  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4816  team->t.t_id, masters_place, first_place, last_place));
4817 
4818  switch (proc_bind) {
4819 
4820  case proc_bind_default:
4821  // Serial teams might have the proc_bind policy set to proc_bind_default.
4822  // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4823  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4824  break;
4825 
4826  case proc_bind_primary: {
4827  int f;
4828  int n_th = team->t.t_nproc;
4829  for (f = 1; f < n_th; f++) {
4830  kmp_info_t *th = team->t.t_threads[f];
4831  KMP_DEBUG_ASSERT(th != NULL);
4832  __kmp_set_thread_place(team, th, first_place, last_place, masters_place);
4833 
4834  KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4835  "partition = [%d,%d]\n",
4836  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4837  f, masters_place, first_place, last_place));
4838  }
4839  } break;
4840 
4841  case proc_bind_close: {
4842  int f;
4843  int n_th = team->t.t_nproc;
4844  int n_places;
4845  if (first_place <= last_place) {
4846  n_places = last_place - first_place + 1;
4847  } else {
4848  n_places = num_masks - first_place + last_place + 1;
4849  }
4850  if (n_th <= n_places) {
4851  int place = masters_place;
4852  for (f = 1; f < n_th; f++) {
4853  kmp_info_t *th = team->t.t_threads[f];
4854  KMP_DEBUG_ASSERT(th != NULL);
4855 
4856  if (place == last_place) {
4857  place = first_place;
4858  } else if (place == (num_masks - 1)) {
4859  place = 0;
4860  } else {
4861  place++;
4862  }
4863  __kmp_set_thread_place(team, th, first_place, last_place, place);
4864 
4865  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4866  "partition = [%d,%d]\n",
4867  __kmp_gtid_from_thread(team->t.t_threads[f]),
4868  team->t.t_id, f, place, first_place, last_place));
4869  }
4870  } else {
4871  int S, rem, gap, s_count;
4872  S = n_th / n_places;
4873  s_count = 0;
4874  rem = n_th - (S * n_places);
4875  gap = rem > 0 ? n_places / rem : n_places;
4876  int place = masters_place;
4877  int gap_ct = gap;
4878  for (f = 0; f < n_th; f++) {
4879  kmp_info_t *th = team->t.t_threads[f];
4880  KMP_DEBUG_ASSERT(th != NULL);
4881 
4882  __kmp_set_thread_place(team, th, first_place, last_place, place);
4883  s_count++;
4884 
4885  if ((s_count == S) && rem && (gap_ct == gap)) {
4886  // do nothing, add an extra thread to place on next iteration
4887  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4888  // we added an extra thread to this place; move to next place
4889  if (place == last_place) {
4890  place = first_place;
4891  } else if (place == (num_masks - 1)) {
4892  place = 0;
4893  } else {
4894  place++;
4895  }
4896  s_count = 0;
4897  gap_ct = 1;
4898  rem--;
4899  } else if (s_count == S) { // place full; don't add extra
4900  if (place == last_place) {
4901  place = first_place;
4902  } else if (place == (num_masks - 1)) {
4903  place = 0;
4904  } else {
4905  place++;
4906  }
4907  gap_ct++;
4908  s_count = 0;
4909  }
4910 
4911  KA_TRACE(100,
4912  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4913  "partition = [%d,%d]\n",
4914  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4915  th->th.th_new_place, first_place, last_place));
4916  }
4917  KMP_DEBUG_ASSERT(place == masters_place);
4918  }
4919  } break;
4920 
4921  case proc_bind_spread: {
4922  int f;
4923  int n_th = team->t.t_nproc;
4924  int n_places;
4925  int thidx;
4926  if (first_place <= last_place) {
4927  n_places = last_place - first_place + 1;
4928  } else {
4929  n_places = num_masks - first_place + last_place + 1;
4930  }
4931  if (n_th <= n_places) {
4932  int place = -1;
4933 
4934  if (n_places != num_masks) {
4935  int S = n_places / n_th;
4936  int s_count, rem, gap, gap_ct;
4937 
4938  place = masters_place;
4939  rem = n_places - n_th * S;
4940  gap = rem ? n_th / rem : 1;
4941  gap_ct = gap;
4942  thidx = n_th;
4943  if (update_master_only == 1)
4944  thidx = 1;
4945  for (f = 0; f < thidx; f++) {
4946  kmp_info_t *th = team->t.t_threads[f];
4947  KMP_DEBUG_ASSERT(th != NULL);
4948 
4949  int fplace = place, nplace = place;
4950  s_count = 1;
4951  while (s_count < S) {
4952  if (place == last_place) {
4953  place = first_place;
4954  } else if (place == (num_masks - 1)) {
4955  place = 0;
4956  } else {
4957  place++;
4958  }
4959  s_count++;
4960  }
4961  if (rem && (gap_ct == gap)) {
4962  if (place == last_place) {
4963  place = first_place;
4964  } else if (place == (num_masks - 1)) {
4965  place = 0;
4966  } else {
4967  place++;
4968  }
4969  rem--;
4970  gap_ct = 0;
4971  }
4972  __kmp_set_thread_place(team, th, fplace, place, nplace);
4973  gap_ct++;
4974 
4975  if (place == last_place) {
4976  place = first_place;
4977  } else if (place == (num_masks - 1)) {
4978  place = 0;
4979  } else {
4980  place++;
4981  }
4982 
4983  KA_TRACE(100,
4984  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4985  "partition = [%d,%d], num_masks: %u\n",
4986  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4987  f, th->th.th_new_place, th->th.th_first_place,
4988  th->th.th_last_place, num_masks));
4989  }
4990  } else {
4991  /* Having a uniform space of available computation places, we can create
4992  T partitions of round(P/T) size and put the threads into the first
4993  place of each partition. */
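/* Illustrative worked example (values assumed): with n_th = 3 threads,
   n_places = num_masks = 8 and masters_place = 0, spacing = (8+1)/3 = 3.0,
   so the loop below assigns the partitions [0,2], [3,5] and [6,7] (the last
   one clipped to n_places-1), binding each thread to the first place of its
   partition. */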
4994  double current = static_cast<double>(masters_place);
4995  double spacing =
4996  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4997  int first, last;
4998  kmp_info_t *th;
4999 
5000  thidx = n_th + 1;
5001  if (update_master_only == 1)
5002  thidx = 1;
5003  for (f = 0; f < thidx; f++) {
5004  first = static_cast<int>(current);
5005  last = static_cast<int>(current + spacing) - 1;
5006  KMP_DEBUG_ASSERT(last >= first);
5007  if (first >= n_places) {
5008  if (masters_place) {
5009  first -= n_places;
5010  last -= n_places;
5011  if (first == (masters_place + 1)) {
5012  KMP_DEBUG_ASSERT(f == n_th);
5013  first--;
5014  }
5015  if (last == masters_place) {
5016  KMP_DEBUG_ASSERT(f == (n_th - 1));
5017  last--;
5018  }
5019  } else {
5020  KMP_DEBUG_ASSERT(f == n_th);
5021  first = 0;
5022  last = 0;
5023  }
5024  }
5025  if (last >= n_places) {
5026  last = (n_places - 1);
5027  }
5028  place = first;
5029  current += spacing;
5030  if (f < n_th) {
5031  KMP_DEBUG_ASSERT(0 <= first);
5032  KMP_DEBUG_ASSERT(n_places > first);
5033  KMP_DEBUG_ASSERT(0 <= last);
5034  KMP_DEBUG_ASSERT(n_places > last);
5035  KMP_DEBUG_ASSERT(last_place >= first_place);
5036  th = team->t.t_threads[f];
5037  KMP_DEBUG_ASSERT(th);
5038  __kmp_set_thread_place(team, th, first, last, place);
5039  KA_TRACE(100,
5040  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5041  "partition = [%d,%d], spacing = %.4f\n",
5042  __kmp_gtid_from_thread(team->t.t_threads[f]),
5043  team->t.t_id, f, th->th.th_new_place,
5044  th->th.th_first_place, th->th.th_last_place, spacing));
5045  }
5046  }
5047  }
5048  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5049  } else {
5050  int S, rem, gap, s_count;
5051  S = n_th / n_places;
5052  s_count = 0;
5053  rem = n_th - (S * n_places);
5054  gap = rem > 0 ? n_places / rem : n_places;
5055  int place = masters_place;
5056  int gap_ct = gap;
5057  thidx = n_th;
5058  if (update_master_only == 1)
5059  thidx = 1;
5060  for (f = 0; f < thidx; f++) {
5061  kmp_info_t *th = team->t.t_threads[f];
5062  KMP_DEBUG_ASSERT(th != NULL);
5063 
5064  __kmp_set_thread_place(team, th, place, place, place);
5065  s_count++;
5066 
5067  if ((s_count == S) && rem && (gap_ct == gap)) {
5068  // do nothing, add an extra thread to place on next iteration
5069  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5070  // we added an extra thread to this place; move on to next place
5071  if (place == last_place) {
5072  place = first_place;
5073  } else if (place == (num_masks - 1)) {
5074  place = 0;
5075  } else {
5076  place++;
5077  }
5078  s_count = 0;
5079  gap_ct = 1;
5080  rem--;
5081  } else if (s_count == S) { // place is full; don't add extra thread
5082  if (place == last_place) {
5083  place = first_place;
5084  } else if (place == (num_masks - 1)) {
5085  place = 0;
5086  } else {
5087  place++;
5088  }
5089  gap_ct++;
5090  s_count = 0;
5091  }
5092 
5093  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5094  "partition = [%d,%d]\n",
5095  __kmp_gtid_from_thread(team->t.t_threads[f]),
5096  team->t.t_id, f, th->th.th_new_place,
5097  th->th.th_first_place, th->th.th_last_place));
5098  }
5099  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5100  }
5101  } break;
5102 
5103  default:
5104  break;
5105  }
5106 
5107  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5108 }
5109 
5110 #endif // KMP_AFFINITY_SUPPORTED
5111 
5112 /* allocate a new team data structure to use. take one off of the free pool if
5113  available */
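/* Editor's note: a brief outline of the paths taken below (a sketch, not
   normative):
     1. If a hot team exists for the requested nesting level and more than one
        thread is needed, reuse it, growing or shrinking its thread count as
        required and re-partitioning places if affinity is enabled.
     2. Otherwise, scan __kmp_team_pool for a team whose t_max_nproc is large
        enough, reinitialize it, and return it; undersized teams found on the
        way are reaped.
     3. Otherwise, allocate a brand new kmp_team_t, allocate its arrays and
        (for the distributed barrier) its barrier structure, and initialize it. */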
5114 kmp_team_t *__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5115 #if OMPT_SUPPORT
5116  ompt_data_t ompt_parallel_data,
5117 #endif
5118  kmp_proc_bind_t new_proc_bind,
5119  kmp_internal_control_t *new_icvs, int argc,
5120  kmp_info_t *master) {
5121  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5122  int f;
5123  kmp_team_t *team;
5124  int use_hot_team = !root->r.r_active;
5125  int level = 0;
5126  int do_place_partition = 1;
5127 
5128  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5129  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5130  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5131  KMP_MB();
5132 
5133  kmp_hot_team_ptr_t *hot_teams;
5134  if (master) {
5135  team = master->th.th_team;
5136  level = team->t.t_active_level;
5137  if (master->th.th_teams_microtask) { // in teams construct?
5138  if (master->th.th_teams_size.nteams > 1 &&
5139  ( // #teams > 1
5140  team->t.t_pkfn ==
5141  (microtask_t)__kmp_teams_master || // inner fork of the teams
5142  master->th.th_teams_level <
5143  team->t.t_level)) { // or nested parallel inside the teams
5144  ++level; // do not increment if #teams==1, or for the outer fork of the
5145  // teams; increment otherwise
5146  }
5147  // Do not perform the place partition if inner fork of the teams
5148  // Wait until nested parallel region encountered inside teams construct
5149  if ((master->th.th_teams_size.nteams == 1 &&
5150  master->th.th_teams_level >= team->t.t_level) ||
5151  (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5152  do_place_partition = 0;
5153  }
5154  hot_teams = master->th.th_hot_teams;
5155  if (level < __kmp_hot_teams_max_level && hot_teams &&
5156  hot_teams[level].hot_team) {
5157  // hot team has already been allocated for given level
5158  use_hot_team = 1;
5159  } else {
5160  use_hot_team = 0;
5161  }
5162  } else {
5163  // check we won't access uninitialized hot_teams, just in case
5164  KMP_DEBUG_ASSERT(new_nproc == 1);
5165  }
5166  // Optimization to use a "hot" team
5167  if (use_hot_team && new_nproc > 1) {
5168  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5169  team = hot_teams[level].hot_team;
5170 #if KMP_DEBUG
5171  if (__kmp_tasking_mode != tskm_immediate_exec) {
5172  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5173  "task_team[1] = %p before reinit\n",
5174  team->t.t_task_team[0], team->t.t_task_team[1]));
5175  }
5176 #endif
5177 
5178  if (team->t.t_nproc != new_nproc &&
5179  __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5180  // Distributed barrier may need a resize
5181  int old_nthr = team->t.t_nproc;
5182  __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5183  }
5184 
5185  // If not doing the place partition, then reset the team's proc bind
5186  // to indicate that partitioning of all threads still needs to take place
5187  if (do_place_partition == 0)
5188  team->t.t_proc_bind = proc_bind_default;
5189  // Has the number of threads changed?
5190  /* Let's assume the most common case is that the number of threads is
5191  unchanged, and put that case first. */
5192  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5193  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5194  // This case can mean that omp_set_num_threads() was called and the hot
5195  // team size was already reduced, so we check the special flag
5196  if (team->t.t_size_changed == -1) {
5197  team->t.t_size_changed = 1;
5198  } else {
5199  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5200  }
5201 
5202  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5203  kmp_r_sched_t new_sched = new_icvs->sched;
5204  // set primary thread's schedule as new run-time schedule
5205  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5206 
5207  __kmp_reinitialize_team(team, new_icvs,
5208  root->r.r_uber_thread->th.th_ident);
5209 
5210  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5211  team->t.t_threads[0], team));
5212  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5213 
5214 #if KMP_AFFINITY_SUPPORTED
5215  if ((team->t.t_size_changed == 0) &&
5216  (team->t.t_proc_bind == new_proc_bind)) {
5217  if (new_proc_bind == proc_bind_spread) {
5218  if (do_place_partition) {
5219  // add flag to update only master for spread
5220  __kmp_partition_places(team, 1);
5221  }
5222  }
5223  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5224  "proc_bind = %d, partition = [%d,%d]\n",
5225  team->t.t_id, new_proc_bind, team->t.t_first_place,
5226  team->t.t_last_place));
5227  } else {
5228  if (do_place_partition) {
5229  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5230  __kmp_partition_places(team);
5231  }
5232  }
5233 #else
5234  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5235 #endif /* KMP_AFFINITY_SUPPORTED */
5236  } else if (team->t.t_nproc > new_nproc) {
5237  KA_TRACE(20,
5238  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5239  new_nproc));
5240 
5241  team->t.t_size_changed = 1;
5242  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5243  // Barrier size already reduced earlier in this function
5244  // Activate team threads via th_used_in_team
5245  __kmp_add_threads_to_team(team, new_nproc);
5246  }
5247  // When decreasing team size, threads no longer in the team should
5248  // unref task team.
5249  if (__kmp_tasking_mode != tskm_immediate_exec) {
5250  for (f = new_nproc; f < team->t.t_nproc; f++) {
5251  kmp_info_t *th = team->t.t_threads[f];
5252  KMP_DEBUG_ASSERT(th);
5253  th->th.th_task_team = NULL;
5254  }
5255  }
5256  if (__kmp_hot_teams_mode == 0) {
5257  // AC: the saved number of threads should correspond to the team's value in
5258  // this mode; it can be bigger in mode 1, when the hot team has threads in reserve
5259  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5260  hot_teams[level].hot_team_nth = new_nproc;
5261  /* release the extra threads we don't need any more */
5262  for (f = new_nproc; f < team->t.t_nproc; f++) {
5263  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5264  __kmp_free_thread(team->t.t_threads[f]);
5265  team->t.t_threads[f] = NULL;
5266  }
5267  } // (__kmp_hot_teams_mode == 0)
5268  else {
5269  // When keeping extra threads in team, switch threads to wait on own
5270  // b_go flag
5271  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5272  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5273  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5274  for (int b = 0; b < bs_last_barrier; ++b) {
5275  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5276  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5277  }
5278  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5279  }
5280  }
5281  }
5282  team->t.t_nproc = new_nproc;
5283  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5284  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5285  __kmp_reinitialize_team(team, new_icvs,
5286  root->r.r_uber_thread->th.th_ident);
5287 
5288  // Update remaining threads
5289  for (f = 0; f < new_nproc; ++f) {
5290  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5291  }
5292 
5293  // restore the current task state of the primary thread: should be the
5294  // implicit task
5295  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5296  team->t.t_threads[0], team));
5297 
5298  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5299 
5300 #ifdef KMP_DEBUG
5301  for (f = 0; f < team->t.t_nproc; f++) {
5302  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5303  team->t.t_threads[f]->th.th_team_nproc ==
5304  team->t.t_nproc);
5305  }
5306 #endif
5307 
5308  if (do_place_partition) {
5309  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5310 #if KMP_AFFINITY_SUPPORTED
5311  __kmp_partition_places(team);
5312 #endif
5313  }
5314  } else { // team->t.t_nproc < new_nproc
5315 
5316  KA_TRACE(20,
5317  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5318  new_nproc));
5319  int old_nproc = team->t.t_nproc; // save old value; used below to update only the new threads
5320  team->t.t_size_changed = 1;
5321 
5322  int avail_threads = hot_teams[level].hot_team_nth;
5323  if (new_nproc < avail_threads)
5324  avail_threads = new_nproc;
5325  kmp_info_t **other_threads = team->t.t_threads;
5326  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5327  // Adjust barrier data of reserved threads (if any) of the team
5328  // Other data will be set in __kmp_initialize_info() below.
5329  int b;
5330  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5331  for (b = 0; b < bs_last_barrier; ++b) {
5332  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5333  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5334 #if USE_DEBUGGER
5335  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5336 #endif
5337  }
5338  }
5339  if (hot_teams[level].hot_team_nth >= new_nproc) {
5340  // we have all needed threads in reserve, no need to allocate any
5341  // this is only possible in mode 1; there cannot be reserved threads in mode 0
5342  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5343  team->t.t_nproc = new_nproc; // just get reserved threads involved
5344  } else {
5345  // We may have some threads in reserve, but not enough;
5346  // get reserved threads involved if any.
5347  team->t.t_nproc = hot_teams[level].hot_team_nth;
5348  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5349  if (team->t.t_max_nproc < new_nproc) {
5350  /* reallocate larger arrays */
5351  __kmp_reallocate_team_arrays(team, new_nproc);
5352  __kmp_reinitialize_team(team, new_icvs, NULL);
5353  }
5354 
5355 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5356  KMP_AFFINITY_SUPPORTED
5357  /* Temporarily set full mask for primary thread before creation of
5358  workers. The reason is that workers inherit the affinity from the
5359  primary thread, so if a lot of workers are created on the single
5360  core quickly, they don't get a chance to set their own affinity for
5361  a long time. */
5362  kmp_affinity_raii_t new_temp_affinity{__kmp_affin_fullMask};
5363 #endif
5364 
5365  /* allocate new threads for the hot team */
5366  for (f = team->t.t_nproc; f < new_nproc; f++) {
5367  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5368  KMP_DEBUG_ASSERT(new_worker);
5369  team->t.t_threads[f] = new_worker;
5370 
5371  KA_TRACE(20,
5372  ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: "
5373  "join=%llu, plain=%llu\n",
5374  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5375  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5376  team->t.t_bar[bs_plain_barrier].b_arrived));
5377 
5378  { // Initialize barrier data for new threads.
5379  int b;
5380  kmp_balign_t *balign = new_worker->th.th_bar;
5381  for (b = 0; b < bs_last_barrier; ++b) {
5382  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5383  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5384  KMP_BARRIER_PARENT_FLAG);
5385 #if USE_DEBUGGER
5386  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5387 #endif
5388  }
5389  }
5390  }
5391 
5392 #if (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_DRAGONFLY) && \
5393  KMP_AFFINITY_SUPPORTED
5394  /* Restore initial primary thread's affinity mask */
5395  new_temp_affinity.restore();
5396 #endif
5397  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5398  if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5399  // Barrier size already increased earlier in this function
5400  // Activate team threads via th_used_in_team
5401  __kmp_add_threads_to_team(team, new_nproc);
5402  }
5403  /* make sure everyone is synchronized */
5404  // the new threads are (re)initialized below
5405  __kmp_initialize_team(team, new_nproc, new_icvs,
5406  root->r.r_uber_thread->th.th_ident);
5407 
5408  /* reinitialize the threads */
5409  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5410  for (f = 0; f < team->t.t_nproc; ++f)
5411  __kmp_initialize_info(team->t.t_threads[f], team, f,
5412  __kmp_gtid_from_tid(f, team));
5413 
5414  // set th_task_state for the new threads in the hot team to an older thread's state
5415  kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5416  for (f = old_nproc; f < team->t.t_nproc; ++f)
5417  team->t.t_threads[f]->th.th_task_state = old_state;
5418 
5419 #ifdef KMP_DEBUG
5420  for (f = 0; f < team->t.t_nproc; ++f) {
5421  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5422  team->t.t_threads[f]->th.th_team_nproc ==
5423  team->t.t_nproc);
5424  }
5425 #endif
5426 
5427  if (do_place_partition) {
5428  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5429 #if KMP_AFFINITY_SUPPORTED
5430  __kmp_partition_places(team);
5431 #endif
5432  }
5433  } // Check changes in number of threads
5434 
5435  if (master->th.th_teams_microtask) {
5436  for (f = 1; f < new_nproc; ++f) {
5437  // propagate teams construct specific info to workers
5438  kmp_info_t *thr = team->t.t_threads[f];
5439  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5440  thr->th.th_teams_level = master->th.th_teams_level;
5441  thr->th.th_teams_size = master->th.th_teams_size;
5442  }
5443  }
5444  if (level) {
5445  // Sync barrier state for nested hot teams, not needed for outermost hot
5446  // team.
5447  for (f = 1; f < new_nproc; ++f) {
5448  kmp_info_t *thr = team->t.t_threads[f];
5449  int b;
5450  kmp_balign_t *balign = thr->th.th_bar;
5451  for (b = 0; b < bs_last_barrier; ++b) {
5452  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5453  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5454 #if USE_DEBUGGER
5455  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5456 #endif
5457  }
5458  }
5459  }
5460 
5461  /* reallocate space for arguments if necessary */
5462  __kmp_alloc_argv_entries(argc, team, TRUE);
5463  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5464  // The hot team re-uses the previous task team,
5465  // if untouched during the previous release->gather phase.
5466 
5467  KF_TRACE(10, (" hot_team = %p\n", team));
5468 
5469 #if KMP_DEBUG
5470  if (__kmp_tasking_mode != tskm_immediate_exec) {
5471  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5472  "task_team[1] = %p after reinit\n",
5473  team->t.t_task_team[0], team->t.t_task_team[1]));
5474  }
5475 #endif
5476 
5477 #if OMPT_SUPPORT
5478  __ompt_team_assign_id(team, ompt_parallel_data);
5479 #endif
5480 
5481  KMP_MB();
5482 
5483  return team;
5484  }
5485 
5486  /* next, let's try to take one from the team pool */
5487  KMP_MB();
5488  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5489  /* TODO: consider resizing undersized teams instead of reaping them, now
5490  that we have a resizing mechanism */
5491  if (team->t.t_max_nproc >= max_nproc) {
5492  /* take this team from the team pool */
5493  __kmp_team_pool = team->t.t_next_pool;
5494 
5495  if (max_nproc > 1 &&
5496  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5497  if (!team->t.b) { // Allocate barrier structure
5498  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5499  }
5500  }
5501 
5502  /* setup the team for fresh use */
5503  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5504 
5505  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5506  "task_team[1] %p to NULL\n",
5507  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5508  team->t.t_task_team[0] = NULL;
5509  team->t.t_task_team[1] = NULL;
5510 
5511  /* reallocate space for arguments if necessary */
5512  __kmp_alloc_argv_entries(argc, team, TRUE);
5513  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5514 
5515  KA_TRACE(
5516  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5517  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5518  { // Initialize barrier data.
5519  int b;
5520  for (b = 0; b < bs_last_barrier; ++b) {
5521  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5522 #if USE_DEBUGGER
5523  team->t.t_bar[b].b_master_arrived = 0;
5524  team->t.t_bar[b].b_team_arrived = 0;
5525 #endif
5526  }
5527  }
5528 
5529  team->t.t_proc_bind = new_proc_bind;
5530 
5531  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5532  team->t.t_id));
5533 
5534 #if OMPT_SUPPORT
5535  __ompt_team_assign_id(team, ompt_parallel_data);
5536 #endif
5537 
5538  team->t.t_nested_nth = NULL;
5539 
5540  KMP_MB();
5541 
5542  return team;
5543  }
5544 
5545  /* reap team if it is too small, then loop back and check the next one */
5546  // not sure if this is wise, but it will be redone during the hot-teams
5547  // rewrite.
5548  /* TODO: Use technique to find the right size hot-team, don't reap them */
5549  team = __kmp_reap_team(team);
5550  __kmp_team_pool = team;
5551  }
5552 
5553  /* nothing available in the pool, no matter, make a new team! */
5554  KMP_MB();
5555  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5556 
5557  /* and set it up */
5558  team->t.t_max_nproc = max_nproc;
5559  if (max_nproc > 1 &&
5560  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5561  // Allocate barrier structure
5562  team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5563  }
5564 
5565  /* NOTE well, for some reason allocating one big buffer and dividing it up
5566  seems to really hurt performance a lot on the P4, so let's not use this */
5567  __kmp_allocate_team_arrays(team, max_nproc);
5568 
5569  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5570  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5571 
5572  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5573  "%p to NULL\n",
5574  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5575  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5576  // memory, no need to duplicate
5577  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5578  // memory, no need to duplicate
5579 
5580  if (__kmp_storage_map) {
5581  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5582  }
5583 
5584  /* allocate space for arguments */
5585  __kmp_alloc_argv_entries(argc, team, FALSE);
5586  team->t.t_argc = argc;
5587 
5588  KA_TRACE(20,
5589  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5590  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5591  { // Initialize barrier data.
5592  int b;
5593  for (b = 0; b < bs_last_barrier; ++b) {
5594  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5595 #if USE_DEBUGGER
5596  team->t.t_bar[b].b_master_arrived = 0;
5597  team->t.t_bar[b].b_team_arrived = 0;
5598 #endif
5599  }
5600  }
5601 
5602  team->t.t_proc_bind = new_proc_bind;
5603 
5604 #if OMPT_SUPPORT
5605  __ompt_team_assign_id(team, ompt_parallel_data);
5606  team->t.ompt_serialized_team_info = NULL;
5607 #endif
5608 
5609  KMP_MB();
5610 
5611  team->t.t_nested_nth = NULL;
5612 
5613  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5614  team->t.t_id));
5615 
5616  return team;
5617 }
5618 
5619 /* TODO implement hot-teams at all levels */
5620 /* TODO implement lazy thread release on demand (disband request) */
5621 
5622 /* free the team. return it to the team pool. release all the threads
5623  * associated with it */
5624 void __kmp_free_team(kmp_root_t *root, kmp_team_t *team, kmp_info_t *master) {
5625  int f;
5626  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5627  team->t.t_id));
5628 
5629  /* verify state */
5630  KMP_DEBUG_ASSERT(root);
5631  KMP_DEBUG_ASSERT(team);
5632  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5633  KMP_DEBUG_ASSERT(team->t.t_threads);
5634 
5635  int use_hot_team = team == root->r.r_hot_team;
5636  int level;
5637  if (master) {
5638  level = team->t.t_active_level - 1;
5639  if (master->th.th_teams_microtask) { // in teams construct?
5640  if (master->th.th_teams_size.nteams > 1) {
5641  ++level; // level was not increased in teams construct for
5642  // team_of_masters
5643  }
5644  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5645  master->th.th_teams_level == team->t.t_level) {
5646  ++level; // level was not increased in teams construct for
5647  // team_of_workers before the parallel
5648  } // team->t.t_level will be increased inside parallel
5649  }
5650 #if KMP_DEBUG
5651  kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5652 #endif
5653  if (level < __kmp_hot_teams_max_level) {
5654  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5655  use_hot_team = 1;
5656  }
5657  }
5658 
5659  /* team is done working */
5660  TCW_SYNC_PTR(team->t.t_pkfn,
5661  NULL); // Important for Debugging Support Library.
5662 #if KMP_OS_WINDOWS
5663  team->t.t_copyin_counter = 0; // init counter for possible reuse
5664 #endif
5665  // Do not reset pointer to parent team to NULL for hot teams.
5666 
5667  /* if we are non-hot team, release our threads */
5668  if (!use_hot_team) {
5669  if (__kmp_tasking_mode != tskm_immediate_exec) {
5670  // Wait for threads to reach reapable state
5671  for (f = 1; f < team->t.t_nproc; ++f) {
5672  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5673  kmp_info_t *th = team->t.t_threads[f];
5674  volatile kmp_uint32 *state = &th->th.th_reap_state;
5675  while (*state != KMP_SAFE_TO_REAP) {
5676 #if KMP_OS_WINDOWS
5677  // On Windows a thread can be killed at any time, check this
5678  DWORD ecode;
5679  if (!__kmp_is_thread_alive(th, &ecode)) {
5680  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5681  break;
5682  }
5683 #endif
5684  // first check if thread is sleeping
5685  if (th->th.th_sleep_loc)
5686  __kmp_null_resume_wrapper(th);
5687  KMP_CPU_PAUSE();
5688  }
5689  }
5690 
5691  // Delete task teams
5692  int tt_idx;
5693  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5694  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5695  if (task_team != NULL) {
5696  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5697  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5698  team->t.t_threads[f]->th.th_task_team = NULL;
5699  }
5700  KA_TRACE(
5701  20,
5702  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5703  __kmp_get_gtid(), task_team, team->t.t_id));
5704  __kmp_free_task_team(master, task_team);
5705  team->t.t_task_team[tt_idx] = NULL;
5706  }
5707  }
5708  }
5709 
5710  // Before clearing parent pointer, check if nested_nth list should be freed
5711  if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
5712  team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
5713  KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
5714  KMP_INTERNAL_FREE(team->t.t_nested_nth);
5715  }
5716  team->t.t_nested_nth = NULL;
5717 
5718  // Reset pointer to parent team only for non-hot teams.
5719  team->t.t_parent = NULL;
5720  team->t.t_level = 0;
5721  team->t.t_active_level = 0;
5722 
5723  /* free the worker threads */
5724  for (f = 1; f < team->t.t_nproc; ++f) {
5725  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5726  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5727  (void)KMP_COMPARE_AND_STORE_ACQ32(
5728  &(team->t.t_threads[f]->th.th_used_in_team), 1, 2);
5729  }
5730  __kmp_free_thread(team->t.t_threads[f]);
5731  }
5732 
5733  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5734  if (team->t.b) {
5735  // wake up thread at old location
5736  team->t.b->go_release();
5737  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5738  for (f = 1; f < team->t.t_nproc; ++f) {
5739  if (team->t.b->sleep[f].sleep) {
5740  __kmp_atomic_resume_64(
5741  team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5742  (kmp_atomic_flag_64<> *)NULL);
5743  }
5744  }
5745  }
5746  // Wait for threads to be removed from team
5747  for (int f = 1; f < team->t.t_nproc; ++f) {
5748  while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5749  KMP_CPU_PAUSE();
5750  }
5751  }
5752  }
5753 
5754  for (f = 1; f < team->t.t_nproc; ++f) {
5755  team->t.t_threads[f] = NULL;
5756  }
5757 
5758  if (team->t.t_max_nproc > 1 &&
5759  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5760  distributedBarrier::deallocate(team->t.b);
5761  team->t.b = NULL;
5762  }
5763  /* put the team back in the team pool */
5764  /* TODO limit size of team pool, call reap_team if pool too large */
5765  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5766  __kmp_team_pool = (volatile kmp_team_t *)team;
5767  } else { // Check if team was created for primary threads in teams construct
5768  // See if first worker is a CG root
5769  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5770  team->t.t_threads[1]->th.th_cg_roots);
5771  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5772  // Clean up the CG root nodes on workers so that this team can be re-used
5773  for (f = 1; f < team->t.t_nproc; ++f) {
5774  kmp_info_t *thr = team->t.t_threads[f];
5775  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5776  thr->th.th_cg_roots->cg_root == thr);
5777  // Pop current CG root off list
5778  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5779  thr->th.th_cg_roots = tmp->up;
5780  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5781  " up to node %p. cg_nthreads was %d\n",
5782  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5783  int i = tmp->cg_nthreads--;
5784  if (i == 1) {
5785  __kmp_free(tmp); // free CG if we are the last thread in it
5786  }
5787  // Restore current task's thread_limit from CG root
5788  if (thr->th.th_cg_roots)
5789  thr->th.th_current_task->td_icvs.thread_limit =
5790  thr->th.th_cg_roots->cg_thread_limit;
5791  }
5792  }
5793  }
5794 
5795  KMP_MB();
5796 }
5797 
5798 /* reap the team. destroy it, reclaim all its resources and free its memory */
5799 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5800  kmp_team_t *next_pool = team->t.t_next_pool;
5801 
5802  KMP_DEBUG_ASSERT(team);
5803  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5804  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5805  KMP_DEBUG_ASSERT(team->t.t_threads);
5806  KMP_DEBUG_ASSERT(team->t.t_argv);
5807 
5808  /* TODO clean the threads that are a part of this? */
5809 
5810  /* free stuff */
5811  __kmp_free_team_arrays(team);
5812  if (team->t.t_argv != &team->t.t_inline_argv[0])
5813  __kmp_free((void *)team->t.t_argv);
5814  __kmp_free(team);
5815 
5816  KMP_MB();
5817  return next_pool;
5818 }
5819 
5820 // Free the thread. Don't reap it, just place it on the pool of available
5821 // threads.
5822 //
5823 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5824 // binding for the affinity mechanism to be useful.
5825 //
5826 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5827 // However, we want to avoid a potential performance problem by always
5828 // scanning through the list to find the correct point at which to insert
5829 // the thread (potential N**2 behavior). To do this we keep track of the
5830 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5831 // With single-level parallelism, threads will always be added to the tail
5832 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5833 // parallelism, all bets are off and we may need to scan through the entire
5834 // free list.
5835 //
5836 // This change also has a potentially large performance benefit, for some
5837 // applications. Previously, as threads were freed from the hot team, they
5838 // would be placed back on the free list in inverse order. If the hot team
5839 // grew back to its original size, then the freed threads would be placed
5840 // back on the hot team in reverse order. This could cause bad cache
5841 // locality problems on programs where the size of the hot team regularly
5842 // grew and shrunk.
5843 //
5844 // Now, for single-level parallelism, the OMP tid is always == gtid.
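//
// Editor's note: a minimal sketch (not part of the runtime) of the cached
// insert-point technique described above, using a generic singly linked list;
// all names here are hypothetical:
//
//   struct Node { int key; Node *next; };
//   static Node *cached_pt = nullptr; // plays the role of __kmp_thread_pool_insert_pt
//
//   void insert_sorted(Node *&head, Node *n) {
//     if (cached_pt && cached_pt->key > n->key)
//       cached_pt = nullptr; // cached point is past the insert point; rescan from head
//     Node **scan = cached_pt ? &cached_pt->next : &head;
//     while (*scan && (*scan)->key < n->key) // usually 0 iterations (tail insert)
//       scan = &(*scan)->next;
//     n->next = *scan; // link in and remember where we inserted
//     cached_pt = *scan = n;
//   }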
5845 void __kmp_free_thread(kmp_info_t *this_th) {
5846  int gtid;
5847  kmp_info_t **scan;
5848 
5849  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5850  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5851 
5852  KMP_DEBUG_ASSERT(this_th);
5853 
5854  // When moving a thread to the pool, switch the thread to wait on its own b_go
5855  // flag, and reset its team pointer to NULL (uninitialized).
5856  int b;
5857  kmp_balign_t *balign = this_th->th.th_bar;
5858  for (b = 0; b < bs_last_barrier; ++b) {
5859  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5860  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5861  balign[b].bb.team = NULL;
5862  balign[b].bb.leaf_kids = 0;
5863  }
5864  this_th->th.th_task_state = 0;
5865  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5866 
5867  /* put thread back on the free pool */
5868  TCW_PTR(this_th->th.th_team, NULL);
5869  TCW_PTR(this_th->th.th_root, NULL);
5870  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5871 
5872  while (this_th->th.th_cg_roots) {
5873  this_th->th.th_cg_roots->cg_nthreads--;
5874  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5875  " %p of thread %p to %d\n",
5876  this_th, this_th->th.th_cg_roots,
5877  this_th->th.th_cg_roots->cg_root,
5878  this_th->th.th_cg_roots->cg_nthreads));
5879  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5880  if (tmp->cg_root == this_th) { // Thread is a cg_root
5881  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5882  KA_TRACE(
5883  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5884  this_th->th.th_cg_roots = tmp->up;
5885  __kmp_free(tmp);
5886  } else { // Worker thread
5887  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5888  __kmp_free(tmp);
5889  }
5890  this_th->th.th_cg_roots = NULL;
5891  break;
5892  }
5893  }
5894 
5895  /* If the implicit task assigned to this thread can be used by other threads
5896  * -> multiple threads can share the data and try to free the task at
5897  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5898  * with higher probability when the hot team is disabled, but can occur even when
5899  * the hot team is enabled */
5900  __kmp_free_implicit_task(this_th);
5901  this_th->th.th_current_task = NULL;
5902 
5903  // If the __kmp_thread_pool_insert_pt is already past the new insert
5904  // point, then we need to re-scan the entire list.
5905  gtid = this_th->th.th_info.ds.ds_gtid;
5906  if (__kmp_thread_pool_insert_pt != NULL) {
5907  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5908  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5909  __kmp_thread_pool_insert_pt = NULL;
5910  }
5911  }
5912 
5913  // Scan down the list to find the place to insert the thread.
5914  // scan is the address of a link in the list, possibly the address of
5915  // __kmp_thread_pool itself.
5916  //
5917  // In the absence of nested parallelism, the for loop will have 0 iterations.
5918  if (__kmp_thread_pool_insert_pt != NULL) {
5919  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5920  } else {
5921  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5922  }
5923  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5924  scan = &((*scan)->th.th_next_pool))
5925  ;
5926 
5927  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5928  // to its address.
5929  TCW_PTR(this_th->th.th_next_pool, *scan);
5930  __kmp_thread_pool_insert_pt = *scan = this_th;
5931  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5932  (this_th->th.th_info.ds.ds_gtid <
5933  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5934  TCW_4(this_th->th.th_in_pool, TRUE);
5935  __kmp_suspend_initialize_thread(this_th);
5936  __kmp_lock_suspend_mx(this_th);
5937  if (this_th->th.th_active == TRUE) {
5938  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5939  this_th->th.th_active_in_pool = TRUE;
5940  }
5941 #if KMP_DEBUG
5942  else {
5943  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5944  }
5945 #endif
5946  __kmp_unlock_suspend_mx(this_th);
5947 
5948  TCW_4(__kmp_nth, __kmp_nth - 1);
5949 
5950 #ifdef KMP_ADJUST_BLOCKTIME
5951  /* Adjust blocktime back to user setting or default if necessary */
5952  /* Middle initialization might never have occurred */
5953  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5954  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5955  if (__kmp_nth <= __kmp_avail_proc) {
5956  __kmp_zero_bt = FALSE;
5957  }
5958  }
5959 #endif /* KMP_ADJUST_BLOCKTIME */
5960 
5961  KMP_MB();
5962 }
5963 
5964 /* ------------------------------------------------------------------------ */
5965 
5966 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5967 #if OMP_PROFILING_SUPPORT
5968  ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5969  // TODO: add a configuration option for time granularity
5970  if (ProfileTraceFile)
5971  llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
5972 #endif
5973 
5974  int gtid = this_thr->th.th_info.ds.ds_gtid;
5975  /* void *stack_data;*/
5976  kmp_team_t **volatile pteam;
5977 
5978  KMP_MB();
5979  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5980 
5981  if (__kmp_env_consistency_check) {
5982  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5983  }
5984 
5985 #if OMPD_SUPPORT
5986  if (ompd_state & OMPD_ENABLE_BP)
5987  ompd_bp_thread_begin();
5988 #endif
5989 
5990 #if OMPT_SUPPORT
5991  ompt_data_t *thread_data = nullptr;
5992  if (ompt_enabled.enabled) {
5993  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5994  *thread_data = ompt_data_none;
5995 
5996  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5997  this_thr->th.ompt_thread_info.wait_id = 0;
5998  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5999  this_thr->th.ompt_thread_info.parallel_flags = 0;
6000  if (ompt_enabled.ompt_callback_thread_begin) {
6001  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6002  ompt_thread_worker, thread_data);
6003  }
6004  this_thr->th.ompt_thread_info.state = ompt_state_idle;
6005  }
6006 #endif
6007 
6008  /* This is the place where threads wait for work */
6009  while (!TCR_4(__kmp_global.g.g_done)) {
6010  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6011  KMP_MB();
6012 
6013  /* wait for work to do */
6014  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6015 
6016  /* No tid yet since not part of a team */
6017  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6018 
6019 #if OMPT_SUPPORT
6020  if (ompt_enabled.enabled) {
6021  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6022  }
6023 #endif
6024 
6025  pteam = &this_thr->th.th_team;
6026 
6027  /* have we been allocated? */
6028  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6029  /* we were just woken up, so run our new task */
6030  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6031  int rc;
6032  KA_TRACE(20,
6033  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6034  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6035  (*pteam)->t.t_pkfn));
6036 
6037  updateHWFPControl(*pteam);
6038 
6039 #if OMPT_SUPPORT
6040  if (ompt_enabled.enabled) {
6041  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6042  }
6043 #endif
6044 
6045  rc = (*pteam)->t.t_invoke(gtid);
6046  KMP_ASSERT(rc);
6047 
6048  KMP_MB();
6049  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6050  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6051  (*pteam)->t.t_pkfn));
6052  }
6053 #if OMPT_SUPPORT
6054  if (ompt_enabled.enabled) {
6055  /* no frame set while outside task */
6056  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6057 
6058  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6059  }
6060 #endif
6061  /* join barrier after parallel region */
6062  __kmp_join_barrier(gtid);
6063  }
6064  }
6065 
6066 #if OMPD_SUPPORT
6067  if (ompd_state & OMPD_ENABLE_BP)
6068  ompd_bp_thread_end();
6069 #endif
6070 
6071 #if OMPT_SUPPORT
6072  if (ompt_enabled.ompt_callback_thread_end) {
6073  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6074  }
6075 #endif
6076 
6077  this_thr->th.th_task_team = NULL;
6078  /* run the destructors for the threadprivate data for this thread */
6079  __kmp_common_destroy_gtid(gtid);
6080 
6081  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6082  KMP_MB();
6083 
6084 #if OMP_PROFILING_SUPPORT
6085  llvm::timeTraceProfilerFinishThread();
6086 #endif
6087  return this_thr;
6088 }
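// Illustrative sketch (not part of the runtime): __kmp_launch_thread() is the
// worker's outer loop -- sleep at the fork barrier, run the microtask handed
// out by the primary thread, then meet at the join barrier. A stand-alone
// approximation using standard C++ primitives (all names hypothetical):
#if 0
#include <condition_variable>
#include <functional>
#include <mutex>

struct example_team {
  std::mutex mtx;
  std::condition_variable cv;
  std::function<void(int)> task; // stands in for (*pteam)->t.t_pkfn
  bool done = false;             // stands in for __kmp_global.g.g_done
};

static void example_worker_loop(example_team &team, int gtid) {
  std::unique_lock<std::mutex> lk(team.mtx);
  while (!team.done) {
    // "fork barrier": block until work is published or shutdown is requested.
    team.cv.wait(lk, [&] { return team.done || static_cast<bool>(team.task); });
    if (team.done)
      break;
    std::function<void(int)> work = team.task;
    lk.unlock();
    work(gtid); // analogous to (*pteam)->t.t_invoke(gtid)
    lk.lock();
    team.task = nullptr; // crude stand-in for the join barrier handshake
    team.cv.notify_all();
  }
}
#endif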
6089 
6090 /* ------------------------------------------------------------------------ */
6091 
6092 void __kmp_internal_end_dest(void *specific_gtid) {
6093  // Make sure no significant bits are lost
6094  int gtid;
6095  __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6096 
6097  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6098  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6099  * this is because 0 is reserved for the nothing-stored case */
6100 
6101  __kmp_internal_end_thread(gtid);
6102 }
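// Illustrative sketch (not part of the runtime): the +1 bias noted above lets
// a NULL thread-specific value mean "no gtid stored". Hypothetical
// encode/decode helpers for the same convention:
#if 0
#include <cstdint>
static void *example_encode_gtid(int gtid) {
  // store gtid+1 so that gtid 0 is distinguishable from "nothing stored"
  return reinterpret_cast<void *>(static_cast<intptr_t>(gtid) + 1);
}
static bool example_decode_gtid(void *stored, int *gtid) {
  if (stored == NULL)
    return false; // nothing was stored for this thread
  *gtid = static_cast<int>(reinterpret_cast<intptr_t>(stored) - 1);
  return true;
}
#endif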
6103 
6104 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6105 
6106 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6107  __kmp_internal_end_atexit();
6108 }
6109 
6110 #endif
6111 
6112 /* [Windows] josh: when the atexit handler is called, there may still be more
6113  than one thread alive */
6114 void __kmp_internal_end_atexit(void) {
6115  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6116  /* [Windows]
6117  josh: ideally, we want to completely shutdown the library in this atexit
6118  handler, but stat code that depends on thread specific data for gtid fails
6119  because that data becomes unavailable at some point during the shutdown, so
6120  we call __kmp_internal_end_thread instead. We should eventually remove the
6121  dependency on __kmp_get_specific_gtid in the stat code and use
6122  __kmp_internal_end_library to cleanly shutdown the library.
6123 
6124  // TODO: Can some of this comment about GVS be removed?
6125  I suspect that the offending stat code is executed when the calling thread
6126  tries to clean up a dead root thread's data structures, resulting in GVS
6127  code trying to close the GVS structures for that thread, but since the stat
6128  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
6129  the calling thread is cleaning up itself instead of another thread, it gets
6130  confused. This happens because allowing a thread to unregister and clean up
6131  another thread is a recent modification made to address an issue.
6132  Based on the current design (20050722), a thread may end up
6133  trying to unregister another thread only if thread death does not trigger
6134  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6135  thread specific data destructor function to detect thread death. For
6136  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6137  is nothing. Thus, the workaround is applicable only to the Windows static
6138  stat library. */
6139  __kmp_internal_end_library(-1);
6140 #if KMP_OS_WINDOWS
6141  __kmp_close_console();
6142 #endif
6143 }
6144 
6145 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6146  // It is assumed __kmp_forkjoin_lock is acquired.
6147 
6148  int gtid;
6149 
6150  KMP_DEBUG_ASSERT(thread != NULL);
6151 
6152  gtid = thread->th.th_info.ds.ds_gtid;
6153 
6154  if (!is_root) {
6155  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6156  /* Assume the threads are at the fork barrier here */
6157  KA_TRACE(
6158  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6159  gtid));
6160  if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6161  while (
6162  !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6163  KMP_CPU_PAUSE();
6164  __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6165  } else {
6166  /* Need release fence here to prevent seg faults for tree forkjoin
6167  barrier (GEH) */
6168  kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6169  thread);
6170  __kmp_release_64(&flag);
6171  }
6172  }
6173 
6174  // Terminate OS thread.
6175  __kmp_reap_worker(thread);
6176 
6177  // The thread was killed asynchronously. If it was actively
6178  // spinning in the thread pool, decrement the global count.
6179  //
6180  // There is a small timing hole here - if the worker thread was just waking
6181  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6182  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6183  // the global counter might not get updated.
6184  //
6185  // Currently, this can only happen as the library is unloaded,
6186  // so there are no harmful side effects.
6187  if (thread->th.th_active_in_pool) {
6188  thread->th.th_active_in_pool = FALSE;
6189  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6190  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6191  }
6192  }
6193 
6194  __kmp_free_implicit_task(thread);
6195 
6196 // Free the fast memory for tasking
6197 #if USE_FAST_MEMORY
6198  __kmp_free_fast_memory(thread);
6199 #endif /* USE_FAST_MEMORY */
6200 
6201  __kmp_suspend_uninitialize_thread(thread);
6202 
6203  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6204  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6205 
6206  --__kmp_all_nth;
6207  // __kmp_nth was decremented when the thread was added to the pool.
6208 
6209 #ifdef KMP_ADJUST_BLOCKTIME
6210  /* Adjust blocktime back to user setting or default if necessary */
6211  /* Middle initialization might never have occurred */
6212  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6213  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6214  if (__kmp_nth <= __kmp_avail_proc) {
6215  __kmp_zero_bt = FALSE;
6216  }
6217  }
6218 #endif /* KMP_ADJUST_BLOCKTIME */
6219 
6220  /* free the memory being used */
6221  if (__kmp_env_consistency_check) {
6222  if (thread->th.th_cons) {
6223  __kmp_free_cons_stack(thread->th.th_cons);
6224  thread->th.th_cons = NULL;
6225  }
6226  }
6227 
6228  if (thread->th.th_pri_common != NULL) {
6229  __kmp_free(thread->th.th_pri_common);
6230  thread->th.th_pri_common = NULL;
6231  }
6232 
6233 #if KMP_USE_BGET
6234  if (thread->th.th_local.bget_data != NULL) {
6235  __kmp_finalize_bget(thread);
6236  }
6237 #endif
6238 
6239 #if KMP_AFFINITY_SUPPORTED
6240  if (thread->th.th_affin_mask != NULL) {
6241  KMP_CPU_FREE(thread->th.th_affin_mask);
6242  thread->th.th_affin_mask = NULL;
6243  }
6244 #endif /* KMP_AFFINITY_SUPPORTED */
6245 
6246 #if KMP_USE_HIER_SCHED
6247  if (thread->th.th_hier_bar_data != NULL) {
6248  __kmp_free(thread->th.th_hier_bar_data);
6249  thread->th.th_hier_bar_data = NULL;
6250  }
6251 #endif
6252 
6253  __kmp_reap_team(thread->th.th_serial_team);
6254  thread->th.th_serial_team = NULL;
6255  __kmp_free(thread);
6256 
6257  KMP_MB();
6258 
6259 } // __kmp_reap_thread
6260 
6261 static void __kmp_itthash_clean(kmp_info_t *th) {
6262 #if USE_ITT_NOTIFY
6263  if (__kmp_itt_region_domains.count > 0) {
6264  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6265  kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6266  while (bucket) {
6267  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6268  __kmp_thread_free(th, bucket);
6269  bucket = next;
6270  }
6271  }
6272  }
6273  if (__kmp_itt_barrier_domains.count > 0) {
6274  for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6275  kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6276  while (bucket) {
6277  kmp_itthash_entry_t *next = bucket->next_in_bucket;
6278  __kmp_thread_free(th, bucket);
6279  bucket = next;
6280  }
6281  }
6282  }
6283 #endif
6284 }
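// Illustrative sketch (not part of the runtime): each bucket above is an
// intrusive singly linked chain, so the next pointer must be saved before the
// node is freed. Generic form of the same walk, with hypothetical names:
#if 0
#include <cstdlib>
struct example_entry {
  example_entry *next_in_bucket;
};
static void example_free_buckets(example_entry **buckets, int nbuckets) {
  for (int i = 0; i < nbuckets; ++i) {
    example_entry *e = buckets[i];
    while (e) {
      example_entry *next = e->next_in_bucket; // save before freeing 'e'
      free(e);
      e = next;
    }
    buckets[i] = NULL;
  }
}
#endif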
6285 
6286 static void __kmp_internal_end(void) {
6287  int i;
6288 
6289  /* First, unregister the library */
6290  __kmp_unregister_library();
6291 
6292 #if KMP_OS_WINDOWS
6293  /* In Win static library, we can't tell when a root actually dies, so we
6294  reclaim the data structures for any root threads that have died but not
6295  unregistered themselves, in order to shut down cleanly.
6296  In Win dynamic library we also can't tell when a thread dies. */
6297  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6298 // dead roots
6299 #endif
6300 
6301  for (i = 0; i < __kmp_threads_capacity; i++)
6302  if (__kmp_root[i])
6303  if (__kmp_root[i]->r.r_active)
6304  break;
6305  KMP_MB(); /* Flush all pending memory write invalidates. */
6306  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6307 
6308  if (i < __kmp_threads_capacity) {
6309 #if KMP_USE_MONITOR
6310  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6311  KMP_MB(); /* Flush all pending memory write invalidates. */
6312 
6313  // Need to check that monitor was initialized before reaping it. If we are
6314  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6315  // __kmp_monitor will appear to contain valid data, but it is only valid in
6316  // the parent process, not the child.
6317  // New behavior (201008): instead of keying off of the flag
6318  // __kmp_init_parallel, the monitor thread creation is keyed off
6319  // of the new flag __kmp_init_monitor.
6320  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6321  if (TCR_4(__kmp_init_monitor)) {
6322  __kmp_reap_monitor(&__kmp_monitor);
6323  TCW_4(__kmp_init_monitor, 0);
6324  }
6325  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6326  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6327 #endif // KMP_USE_MONITOR
6328  } else {
6329 /* TODO move this to cleanup code */
6330 #ifdef KMP_DEBUG
6331  /* make sure that everything has properly ended */
6332  for (i = 0; i < __kmp_threads_capacity; i++) {
6333  if (__kmp_root[i]) {
6334  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6335  // there can be uber threads alive here
6336  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6337  }
6338  }
6339 #endif
6340 
6341  KMP_MB();
6342 
6343  // Reap the worker threads.
6344  // This is valid for now, but be careful if threads are reaped sooner.
6345  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6346  // Get the next thread from the pool.
6347  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6348  __kmp_thread_pool = thread->th.th_next_pool;
6349  // Reap it.
6350  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6351  thread->th.th_next_pool = NULL;
6352  thread->th.th_in_pool = FALSE;
6353  __kmp_reap_thread(thread, 0);
6354  }
6355  __kmp_thread_pool_insert_pt = NULL;
6356 
6357  // Reap teams.
6358  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6359  // Get the next team from the pool.
6360  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6361  __kmp_team_pool = team->t.t_next_pool;
6362  // Reap it.
6363  team->t.t_next_pool = NULL;
6364  __kmp_reap_team(team);
6365  }
6366 
6367  __kmp_reap_task_teams();
6368 
6369 #if KMP_OS_UNIX
6370  // Threads that are not reaped should not access any resources since they
6371  // are going to be deallocated soon, so the shutdown sequence should wait
6372  // until all threads either exit the final spin-waiting loop or begin
6373  // sleeping after the given blocktime.
6374  for (i = 0; i < __kmp_threads_capacity; i++) {
6375  kmp_info_t *thr = __kmp_threads[i];
6376  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6377  KMP_CPU_PAUSE();
6378  }
6379 #endif
6380 
6381  for (i = 0; i < __kmp_threads_capacity; ++i) {
6382  // TBD: Add some checking...
6383  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6384  }
6385 
6386  /* Make sure all threadprivate destructors get run by joining with all
6387  worker threads before resetting this flag */
6388  TCW_SYNC_4(__kmp_init_common, FALSE);
6389 
6390  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6391  KMP_MB();
6392 
6393 #if KMP_USE_MONITOR
6394  // See note above: One of the possible fixes for CQ138434 / CQ140126
6395  //
6396  // FIXME: push both code fragments down and CSE them?
6397  // push them into __kmp_cleanup() ?
6398  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6399  if (TCR_4(__kmp_init_monitor)) {
6400  __kmp_reap_monitor(&__kmp_monitor);
6401  TCW_4(__kmp_init_monitor, 0);
6402  }
6403  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6404  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6405 #endif
6406  } /* else !__kmp_global.t_active */
6407  TCW_4(__kmp_init_gtid, FALSE);
6408  KMP_MB(); /* Flush all pending memory write invalidates. */
6409 
6410  __kmp_cleanup();
6411 #if OMPT_SUPPORT
6412  ompt_fini();
6413 #endif
6414 }
6415 
6416 void __kmp_internal_end_library(int gtid_req) {
6417  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6418  /* this shouldn't be a race condition because __kmp_internal_end() is the
6419  only place to clear __kmp_serial_init */
6420  /* we'll check this later too, after we get the lock */
6421  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6422  // redundant, because the next check will work in any case.
6423  if (__kmp_global.g.g_abort) {
6424  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6425  /* TODO abort? */
6426  return;
6427  }
6428  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6429  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6430  return;
6431  }
6432 
6433  // If hidden helper team has been initialized, we need to deinit it
6434  if (TCR_4(__kmp_init_hidden_helper) &&
6435  !TCR_4(__kmp_hidden_helper_team_done)) {
6436  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6437  // First release the main thread to let it continue its work
6438  __kmp_hidden_helper_main_thread_release();
6439  // Wait until the hidden helper team has been destroyed
6440  __kmp_hidden_helper_threads_deinitz_wait();
6441  }
6442 
6443  KMP_MB(); /* Flush all pending memory write invalidates. */
6444  /* find out who we are and what we should do */
6445  {
6446  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6447  KA_TRACE(
6448  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6449  if (gtid == KMP_GTID_SHUTDOWN) {
6450  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6451  "already shutdown\n"));
6452  return;
6453  } else if (gtid == KMP_GTID_MONITOR) {
6454  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6455  "registered, or system shutdown\n"));
6456  return;
6457  } else if (gtid == KMP_GTID_DNE) {
6458  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6459  "shutdown\n"));
6460  /* we don't know who we are, but we may still shutdown the library */
6461  } else if (KMP_UBER_GTID(gtid)) {
6462  /* unregister ourselves as an uber thread. gtid is no longer valid */
6463  if (__kmp_root[gtid]->r.r_active) {
6464  __kmp_global.g.g_abort = -1;
6465  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6466  __kmp_unregister_library();
6467  KA_TRACE(10,
6468  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6469  gtid));
6470  return;
6471  } else {
6472  __kmp_itthash_clean(__kmp_threads[gtid]);
6473  KA_TRACE(
6474  10,
6475  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6476  __kmp_unregister_root_current_thread(gtid);
6477  }
6478  } else {
6479 /* worker threads may call this function through the atexit handler, if they
6480  * call exit() */
6481 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6482  TODO: do a thorough shutdown instead */
6483 #ifdef DUMP_DEBUG_ON_EXIT
6484  if (__kmp_debug_buf)
6485  __kmp_dump_debug_buffer();
6486 #endif
6487  // An unregister-library call was added here when we switched to shm on Linux;
6488  // without it, lots of files would be left behind in /dev/shm.
6489  // Clean up the shared memory file before exiting.
6490  __kmp_unregister_library();
6491  return;
6492  }
6493  }
6494  /* synchronize the termination process */
6495  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6496 
6497  /* have we already finished */
6498  if (__kmp_global.g.g_abort) {
6499  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6500  /* TODO abort? */
6501  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6502  return;
6503  }
6504  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6505  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6506  return;
6507  }
6508 
6509  /* We need this lock to enforce mutex between this reading of
6510  __kmp_threads_capacity and the writing by __kmp_register_root.
6511  Alternatively, we can use a counter of roots that is atomically updated by
6512  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6513  __kmp_internal_end_*. */
6514  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6515 
6516  /* now we can safely conduct the actual termination */
6517  __kmp_internal_end();
6518 
6519  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6520  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6521 
6522  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6523 
6524 #ifdef DUMP_DEBUG_ON_EXIT
6525  if (__kmp_debug_buf)
6526  __kmp_dump_debug_buffer();
6527 #endif
6528 
6529 #if KMP_OS_WINDOWS
6530  __kmp_close_console();
6531 #endif
6532 
6533  __kmp_fini_allocator();
6534 
6535 } // __kmp_internal_end_library
6536 
6537 void __kmp_internal_end_thread(int gtid_req) {
6538  int i;
6539 
6540  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6541  /* this shouldn't be a race condition because __kmp_internal_end() is the
6542  * only place to clear __kmp_serial_init */
6543  /* we'll check this later too, after we get the lock */
6544  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6545  // redundant, because the next check will work in any case.
6546  if (__kmp_global.g.g_abort) {
6547  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6548  /* TODO abort? */
6549  return;
6550  }
6551  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6552  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6553  return;
6554  }
6555 
6556  // If hidden helper team has been initialized, we need to deinit it
6557  if (TCR_4(__kmp_init_hidden_helper) &&
6558  !TCR_4(__kmp_hidden_helper_team_done)) {
6559  TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6560  // First release the main thread to let it continue its work
6561  __kmp_hidden_helper_main_thread_release();
6562  // Wait until the hidden helper team has been destroyed
6563  __kmp_hidden_helper_threads_deinitz_wait();
6564  }
6565 
6566  KMP_MB(); /* Flush all pending memory write invalidates. */
6567 
6568  /* find out who we are and what we should do */
6569  {
6570  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6571  KA_TRACE(10,
6572  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6573  if (gtid == KMP_GTID_SHUTDOWN) {
6574  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6575  "already shutdown\n"));
6576  return;
6577  } else if (gtid == KMP_GTID_MONITOR) {
6578  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6579  "registered, or system shutdown\n"));
6580  return;
6581  } else if (gtid == KMP_GTID_DNE) {
6582  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6583  "shutdown\n"));
6584  return;
6585  /* we don't know who we are */
6586  } else if (KMP_UBER_GTID(gtid)) {
6587  /* unregister ourselves as an uber thread. gtid is no longer valid */
6588  if (__kmp_root[gtid]->r.r_active) {
6589  __kmp_global.g.g_abort = -1;
6590  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6591  KA_TRACE(10,
6592  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6593  gtid));
6594  return;
6595  } else {
6596  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6597  gtid));
6598  __kmp_unregister_root_current_thread(gtid);
6599  }
6600  } else {
6601  /* just a worker thread, let's leave */
6602  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6603 
6604  if (gtid >= 0) {
6605  __kmp_threads[gtid]->th.th_task_team = NULL;
6606  }
6607 
6608  KA_TRACE(10,
6609  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6610  gtid));
6611  return;
6612  }
6613  }
6614 #if KMP_DYNAMIC_LIB
6615  if (__kmp_pause_status != kmp_hard_paused)
6616  // AC: let's not shut down the dynamic library at the exit of the uber thread,
6617  // because it is better to shut down later, in the library destructor.
6618  {
6619  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6620  return;
6621  }
6622 #endif
6623  /* synchronize the termination process */
6624  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6625 
6626  /* have we already finished */
6627  if (__kmp_global.g.g_abort) {
6628  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6629  /* TODO abort? */
6630  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6631  return;
6632  }
6633  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6634  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6635  return;
6636  }
6637 
6638  /* We need this lock to enforce mutex between this reading of
6639  __kmp_threads_capacity and the writing by __kmp_register_root.
6640  Alternatively, we can use a counter of roots that is atomically updated by
6641  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6642  __kmp_internal_end_*. */
6643 
6644  /* should we finish the run-time? are all siblings done? */
6645  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6646 
6647  for (i = 0; i < __kmp_threads_capacity; ++i) {
6648  if (KMP_UBER_GTID(i)) {
6649  KA_TRACE(
6650  10,
6651  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6652  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6653  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6654  return;
6655  }
6656  }
6657 
6658  /* now we can safely conduct the actual termination */
6659 
6660  __kmp_internal_end();
6661 
6662  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6663  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6664 
6665  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6666 
6667 #ifdef DUMP_DEBUG_ON_EXIT
6668  if (__kmp_debug_buf)
6669  __kmp_dump_debug_buffer();
6670 #endif
6671 } // __kmp_internal_end_thread
6672 
6673 // -----------------------------------------------------------------------------
6674 // Library registration stuff.
6675 
6676 static long __kmp_registration_flag = 0;
6677 // Random value used to indicate library initialization.
6678 static char *__kmp_registration_str = NULL;
6679 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6680 
6681 static inline char *__kmp_reg_status_name() {
6682 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6683  each thread. If registration and unregistration go in different threads
6684  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6685  env var cannot be found, because the name will contain a different pid. */
6686 // macOS* complains about the name being too long with the additional getuid()
6687 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6688  return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6689  (int)getuid());
6690 #else
6691  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6692 #endif
6693 } // __kmp_reg_status_name
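// Illustrative sketch (not part of the runtime): on a Unix dynamic build the
// registration key produced above has the form __KMP_REGISTERED_LIB_<pid>_<uid>,
// which can be reproduced for inspection like this (hypothetical helper):
#if 0
#include <cstdio>
#include <unistd.h>
static void example_print_reg_key(void) {
  char key[64];
  snprintf(key, sizeof(key), "__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
           (int)getuid());
  printf("registration key: %s\n", key);
}
#endif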
6694 
6695 #if defined(KMP_USE_SHM)
6696 bool __kmp_shm_available = false;
6697 bool __kmp_tmp_available = false;
6698 // If /dev/shm is not accessible, we will create a temporary file under /tmp.
6699 char *temp_reg_status_file_name = nullptr;
6700 #endif
6701 
6702 void __kmp_register_library_startup(void) {
6703 
6704  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6705  int done = 0;
6706  union {
6707  double dtime;
6708  long ltime;
6709  } time;
6710 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6711  __kmp_initialize_system_tick();
6712 #endif
6713  __kmp_read_system_time(&time.dtime);
6714  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6715  __kmp_registration_str =
6716  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6717  __kmp_registration_flag, KMP_LIBRARY_FILE);
6718 
6719  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6720  __kmp_registration_str));
6721 
6722  while (!done) {
6723 
6724  char *value = NULL; // Actual value of the environment variable.
6725 
6726 #if defined(KMP_USE_SHM)
6727  char *shm_name = nullptr;
6728  char *data1 = nullptr;
6729  __kmp_shm_available = __kmp_detect_shm();
6730  if (__kmp_shm_available) {
6731  int fd1 = -1;
6732  shm_name = __kmp_str_format("/%s", name);
6733  int shm_preexist = 0;
6734  fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6735  if ((fd1 == -1) && (errno == EEXIST)) {
6736  // file didn't open because it already exists.
6737  // try opening existing file
6738  fd1 = shm_open(shm_name, O_RDWR, 0600);
6739  if (fd1 == -1) { // file didn't open
6740  KMP_WARNING(FunctionError, "Can't open SHM");
6741  __kmp_shm_available = false;
6742  } else { // able to open existing file
6743  shm_preexist = 1;
6744  }
6745  }
6746  if (__kmp_shm_available && shm_preexist == 0) { // SHM created, set size
6747  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6748  KMP_WARNING(FunctionError, "Can't set size of SHM");
6749  __kmp_shm_available = false;
6750  }
6751  }
6752  if (__kmp_shm_available) { // SHM exists, now map it
6753  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6754  fd1, 0);
6755  if (data1 == MAP_FAILED) { // failed to map shared memory
6756  KMP_WARNING(FunctionError, "Can't map SHM");
6757  __kmp_shm_available = false;
6758  }
6759  }
6760  if (__kmp_shm_available) { // SHM mapped
6761  if (shm_preexist == 0) { // set data to SHM, set value
6762  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6763  }
6764  // Read value from either what we just wrote or existing file.
6765  value = __kmp_str_format("%s", data1); // read value from SHM
6766  munmap(data1, SHM_SIZE);
6767  }
6768  if (fd1 != -1)
6769  close(fd1);
6770  }
6771  if (!__kmp_shm_available)
6772  __kmp_tmp_available = __kmp_detect_tmp();
6773  if (!__kmp_shm_available && __kmp_tmp_available) {
6774  // SHM failed to work due to an error other than that the file already
6775  // exists. Try to create a temp file under /tmp.
6776  // If /tmp isn't accessible, fall back to using environment variable.
6777  // TODO: /tmp might not always be the temporary directory. For now we will
6778  // not consider TMPDIR.
6779  int fd1 = -1;
6780  temp_reg_status_file_name = __kmp_str_format("/tmp/%s", name);
6781  int tmp_preexist = 0;
6782  fd1 = open(temp_reg_status_file_name, O_CREAT | O_EXCL | O_RDWR, 0600);
6783  if ((fd1 == -1) && (errno == EEXIST)) {
6784  // file didn't open because it already exists.
6785  // try opening existing file
6786  fd1 = open(temp_reg_status_file_name, O_RDWR, 0600);
6787  if (fd1 == -1) { // file didn't open
6788  KMP_WARNING(FunctionError, "Can't open TEMP");
6789  __kmp_tmp_available = false;
6790  } else {
6791  tmp_preexist = 1;
6792  }
6793  }
6794  if (__kmp_tmp_available && tmp_preexist == 0) {
6795  // we created /tmp file now set size
6796  if (ftruncate(fd1, SHM_SIZE) == -1) { // error occurred setting size
6797  KMP_WARNING(FunctionError, "Can't set size of /tmp file");
6798  __kmp_tmp_available = false;
6799  }
6800  }
6801  if (__kmp_tmp_available) {
6802  data1 = (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
6803  fd1, 0);
6804  if (data1 == MAP_FAILED) { // failed to map /tmp
6805  KMP_WARNING(FunctionError, "Can't map /tmp");
6806  __kmp_tmp_available = false;
6807  }
6808  }
6809  if (__kmp_tmp_available) {
6810  if (tmp_preexist == 0) { // set data to TMP, set value
6811  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6812  }
6813  // Read value from either what we just wrote or existing file.
6814  value = __kmp_str_format("%s", data1); // read value from the temp file
6815  munmap(data1, SHM_SIZE);
6816  }
6817  if (fd1 != -1)
6818  close(fd1);
6819  }
6820  if (!__kmp_shm_available && !__kmp_tmp_available) {
6821  // no /dev/shm and no /tmp -- fall back to environment variable
6822  // Set environment variable, but do not overwrite if it exists.
6823  __kmp_env_set(name, __kmp_registration_str, 0);
6824  // read value to see if it got set
6825  value = __kmp_env_get(name);
6826  }
6827 #else // Windows and unix with static library
6828  // Set environment variable, but do not overwrite if it exists.
6829  __kmp_env_set(name, __kmp_registration_str, 0);
6830  // read value to see if it got set
6831  value = __kmp_env_get(name);
6832 #endif
6833 
6834  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6835  done = 1; // Ok, environment variable set successfully, exit the loop.
6836  } else {
6837  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6838  // Check whether it is alive or dead.
6839  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6840  char *tail = value;
6841  char *flag_addr_str = NULL;
6842  char *flag_val_str = NULL;
6843  char const *file_name = NULL;
6844  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6845  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6846  file_name = tail;
6847  if (tail != NULL) {
6848  unsigned long *flag_addr = 0;
6849  unsigned long flag_val = 0;
6850  KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6851  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6852  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6853  // First, check whether environment-encoded address is mapped into
6854  // addr space.
6855  // If so, dereference it to see if it still has the right value.
6856  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6857  neighbor = 1;
6858  } else {
6859  // If not, then we know the other copy of the library is no longer
6860  // running.
6861  neighbor = 2;
6862  }
6863  }
6864  }
6865  switch (neighbor) {
6866  case 0: // Cannot parse environment variable -- neighbor status unknown.
6867  // Assume it is the incompatible format of a future version of the
6868  // library. Assume the other library is alive.
6869  // WARN( ... ); // TODO: Issue a warning.
6870  file_name = "unknown library";
6871  KMP_FALLTHROUGH();
6872  // Attention! Falling through to the next case. That's intentional.
6873  case 1: { // Neighbor is alive.
6874  // Check it is allowed.
6875  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6876  if (!__kmp_str_match_true(duplicate_ok)) {
6877  // That's not allowed. Issue fatal error.
6878  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6879  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6880  }
6881  KMP_INTERNAL_FREE(duplicate_ok);
6882  __kmp_duplicate_library_ok = 1;
6883  done = 1; // Exit the loop.
6884  } break;
6885  case 2: { // Neighbor is dead.
6886 
6887 #if defined(KMP_USE_SHM)
6888  if (__kmp_shm_available) { // close shared memory.
6889  shm_unlink(shm_name); // this removes file in /dev/shm
6890  } else if (__kmp_tmp_available) {
6891  unlink(temp_reg_status_file_name); // this removes the temp file
6892  } else {
6893  // Clear the variable and try to register library again.
6894  __kmp_env_unset(name);
6895  }
6896 #else
6897  // Clear the variable and try to register library again.
6898  __kmp_env_unset(name);
6899 #endif
6900  } break;
6901  default: {
6902  KMP_DEBUG_ASSERT(0);
6903  } break;
6904  }
6905  }
6906  KMP_INTERNAL_FREE((void *)value);
6907 #if defined(KMP_USE_SHM)
6908  if (shm_name)
6909  KMP_INTERNAL_FREE((void *)shm_name);
6910 #endif
6911  } // while
6912  KMP_INTERNAL_FREE((void *)name);
6913 
6914 } // func __kmp_register_library_startup
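// Illustrative sketch (not part of the runtime): the record stored by the loop
// above is "<flag address>-<flag value>-<library file>". A stale record from a
// dead process is detected by checking that the address is still mapped and
// still holds the flag value; a live duplicate is tolerated only when
// KMP_DUPLICATE_LIB_OK evaluates to true. A hypothetical encode/parse round
// trip of the same record format:
#if 0
#include <cstdio>
static long example_flag = 0xCAFE0000L; // stands in for __kmp_registration_flag
static void example_registration_roundtrip(void) {
  char record[256];
  snprintf(record, sizeof(record), "%p-%lx-%s", (void *)&example_flag,
           (unsigned long)example_flag, "libomp.so");
  void *addr = NULL;
  unsigned long val = 0;
  char file[128] = "";
  if (sscanf(record, "%p-%lx-%127[^\n]", &addr, &val, file) == 3) {
    // The runtime instead uses __kmp_str_split() plus __kmp_is_address_mapped()
    // and a dereference of 'addr' to decide whether the neighbor is alive.
    printf("addr=%p val=%lx file=%s\n", addr, val, file);
  }
}
#endif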
6915 
6916 void __kmp_unregister_library(void) {
6917 
6918  char *name = __kmp_reg_status_name();
6919  char *value = NULL;
6920 
6921 #if defined(KMP_USE_SHM)
6922  char *shm_name = nullptr;
6923  int fd1;
6924  if (__kmp_shm_available) {
6925  shm_name = __kmp_str_format("/%s", name);
6926  fd1 = shm_open(shm_name, O_RDONLY, 0600);
6927  if (fd1 != -1) { // File opened successfully
6928  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6929  if (data1 != MAP_FAILED) {
6930  value = __kmp_str_format("%s", data1); // read value from SHM
6931  munmap(data1, SHM_SIZE);
6932  }
6933  close(fd1);
6934  }
6935  } else if (__kmp_tmp_available) { // try /tmp
6936  fd1 = open(temp_reg_status_file_name, O_RDONLY);
6937  if (fd1 != -1) { // File opened successfully
6938  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6939  if (data1 != MAP_FAILED) {
6940  value = __kmp_str_format("%s", data1); // read value from /tmp
6941  munmap(data1, SHM_SIZE);
6942  }
6943  close(fd1);
6944  }
6945  } else { // fall back to the environment variable
6946  value = __kmp_env_get(name);
6947  }
6948 #else
6949  value = __kmp_env_get(name);
6950 #endif
6951 
6952  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6953  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6954  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6955 // Ok, this is our variable. Delete it.
6956 #if defined(KMP_USE_SHM)
6957  if (__kmp_shm_available) {
6958  shm_unlink(shm_name); // this removes file in /dev/shm
6959  } else if (__kmp_tmp_available) {
6960  unlink(temp_reg_status_file_name); // this removes the temp file
6961  } else {
6962  __kmp_env_unset(name);
6963  }
6964 #else
6965  __kmp_env_unset(name);
6966 #endif
6967  }
6968 
6969 #if defined(KMP_USE_SHM)
6970  if (shm_name)
6971  KMP_INTERNAL_FREE(shm_name);
6972  if (temp_reg_status_file_name)
6973  KMP_INTERNAL_FREE(temp_reg_status_file_name);
6974 #endif
6975 
6976  KMP_INTERNAL_FREE(__kmp_registration_str);
6977  KMP_INTERNAL_FREE(value);
6978  KMP_INTERNAL_FREE(name);
6979 
6980  __kmp_registration_flag = 0;
6981  __kmp_registration_str = NULL;
6982 
6983 } // __kmp_unregister_library
6984 
6985 // End of Library registration stuff.
6986 // -----------------------------------------------------------------------------
6987 
6988 #if KMP_MIC_SUPPORTED
6989 
6990 static void __kmp_check_mic_type() {
6991  kmp_cpuid_t cpuid_state = {0};
6992  kmp_cpuid_t *cs_p = &cpuid_state;
6993  __kmp_x86_cpuid(1, 0, cs_p);
6994  // We don't support mic1 at the moment
6995  if ((cs_p->eax & 0xff0) == 0xB10) {
6996  __kmp_mic_type = mic2;
6997  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6998  __kmp_mic_type = mic3;
6999  } else {
7000  __kmp_mic_type = non_mic;
7001  }
7002 }
7003 
7004 #endif /* KMP_MIC_SUPPORTED */
7005 
7006 #if KMP_HAVE_UMWAIT
7007 static void __kmp_user_level_mwait_init() {
7008  struct kmp_cpuid buf;
7009  __kmp_x86_cpuid(7, 0, &buf);
7010  __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
7011  __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
7012  __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
7013  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
7014  __kmp_umwait_enabled));
7015 }
7016 #elif KMP_HAVE_MWAIT
7017 #ifndef AT_INTELPHIUSERMWAIT
7018 // Spurious, non-existent value that should always fail to return anything.
7019 // Will be replaced with the correct value once we know it.
7020 #define AT_INTELPHIUSERMWAIT 10000
7021 #endif
7022 // getauxval() function is available in RHEL7 and SLES12. If a system with an
7023 // earlier OS is used to build the RTL, we'll use the following internal
7024 // function when the entry is not found.
7025 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
7026 unsigned long getauxval(unsigned long) { return 0; }
7027 
7028 static void __kmp_user_level_mwait_init() {
7029  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
7030  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
7031  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7032  // KMP_USER_LEVEL_MWAIT was set to TRUE.
7033  if (__kmp_mic_type == mic3) {
7034  unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7035  if ((res & 0x1) || __kmp_user_level_mwait) {
7036  __kmp_mwait_enabled = TRUE;
7037  if (__kmp_user_level_mwait) {
7038  KMP_INFORM(EnvMwaitWarn);
7039  }
7040  } else {
7041  __kmp_mwait_enabled = FALSE;
7042  }
7043  }
7044  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7045  "__kmp_mwait_enabled = %d\n",
7046  __kmp_mic_type, __kmp_mwait_enabled));
7047 }
7048 #endif /* KMP_HAVE_UMWAIT */
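// Illustrative sketch (not part of the runtime): the WAITPKG probe above reads
// CPUID leaf 7, subleaf 0, and tests ECX bit 5. The same check expressed with
// the GCC/Clang <cpuid.h> helper (assumes an x86 host):
#if 0
#include <cpuid.h>
static bool example_has_waitpkg(void) {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return false;          // leaf 7 not supported on this CPU
  return ((ecx >> 5) & 1); // CPUID.(EAX=7,ECX=0):ECX[5] == WAITPKG
}
#endif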
7049 
7050 static void __kmp_do_serial_initialize(void) {
7051  int i, gtid;
7052  size_t size;
7053 
7054  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7055 
7056  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7057  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7058  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7059  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7060  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7061 
7062 #if OMPT_SUPPORT
7063  ompt_pre_init();
7064 #endif
7065 #if OMPD_SUPPORT
7066  __kmp_env_dump();
7067  ompd_init();
7068 #endif
7069 
7070  __kmp_validate_locks();
7071 
7072 #if ENABLE_LIBOMPTARGET
7073  /* Initialize functions from libomptarget */
7074  __kmp_init_omptarget();
7075 #endif
7076 
7077  /* Initialize internal memory allocator */
7078  __kmp_init_allocator();
7079 
7080  /* Register the library startup via an environment variable or via mapped
7081  shared memory file and check to see whether another copy of the library is
7082  already registered. Since a forked child process is often terminated, we
7083  postpone the registration until middle initialization in the child. */
7084  if (__kmp_need_register_serial)
7085  __kmp_register_library_startup();
7086 
7087  /* TODO reinitialization of library */
7088  if (TCR_4(__kmp_global.g.g_done)) {
7089  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7090  }
7091 
7092  __kmp_global.g.g_abort = 0;
7093  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7094 
7095 /* initialize the locks */
7096 #if KMP_USE_ADAPTIVE_LOCKS
7097 #if KMP_DEBUG_ADAPTIVE_LOCKS
7098  __kmp_init_speculative_stats();
7099 #endif
7100 #endif
7101 #if KMP_STATS_ENABLED
7102  __kmp_stats_init();
7103 #endif
7104  __kmp_init_lock(&__kmp_global_lock);
7105  __kmp_init_atomic_lock(&__kmp_atomic_lock);
7106  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7107  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7108  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7109  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7110  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7111  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7112  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7113  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7114  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7115  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7116  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7117  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7118  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7119  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7120 #if KMP_USE_MONITOR
7121  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7122 #endif
7123  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7124 
7125  /* conduct initialization and initial setup of configuration */
7126 
7127  __kmp_runtime_initialize();
7128 
7129 #if KMP_MIC_SUPPORTED
7130  __kmp_check_mic_type();
7131 #endif
7132 
7133 // Some global variable initialization moved here from kmp_env_initialize()
7134 #ifdef KMP_DEBUG
7135  kmp_diag = 0;
7136 #endif
7137  __kmp_abort_delay = 0;
7138 
7139  // From __kmp_init_dflt_team_nth()
7140  /* assume the entire machine will be used */
7141  __kmp_dflt_team_nth_ub = __kmp_xproc;
7142  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7143  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7144  }
7145  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7146  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7147  }
7148  __kmp_max_nth = __kmp_sys_max_nth;
7149  __kmp_cg_max_nth = __kmp_sys_max_nth;
7150  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7151  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7152  __kmp_teams_max_nth = __kmp_sys_max_nth;
7153  }
7154 
7155  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7156  // part
7157  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7158 #if KMP_USE_MONITOR
7159  __kmp_monitor_wakeups =
7160  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7161  __kmp_bt_intervals =
7162  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7163 #endif
7164  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7165  __kmp_library = library_throughput;
7166  // From KMP_SCHEDULE initialization
7167  __kmp_static = kmp_sch_static_balanced;
7168 // AC: do not use analytical here, because it is non-monotonous
7169 //__kmp_guided = kmp_sch_guided_iterative_chunked;
7170 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7171 // need to repeat assignment
7172 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7173 // bit control and barrier method control parts
7174 #if KMP_FAST_REDUCTION_BARRIER
7175 #define kmp_reduction_barrier_gather_bb ((int)1)
7176 #define kmp_reduction_barrier_release_bb ((int)1)
7177 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7178 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7179 #endif // KMP_FAST_REDUCTION_BARRIER
7180  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7181  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7182  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7183  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7184  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7185 #if KMP_FAST_REDUCTION_BARRIER
7186  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
7187  // lin_64 ): hyper,1
7188  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7189  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7190  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7191  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7192  }
7193 #endif // KMP_FAST_REDUCTION_BARRIER
7194  }
7195 #if KMP_FAST_REDUCTION_BARRIER
7196 #undef kmp_reduction_barrier_release_pat
7197 #undef kmp_reduction_barrier_gather_pat
7198 #undef kmp_reduction_barrier_release_bb
7199 #undef kmp_reduction_barrier_gather_bb
7200 #endif // KMP_FAST_REDUCTION_BARRIER
7201 #if KMP_MIC_SUPPORTED
7202  if (__kmp_mic_type == mic2) { // KNC
7203  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7204  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7205  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7206  1; // forkjoin release
7207  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7208  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7209  }
7210 #if KMP_FAST_REDUCTION_BARRIER
7211  if (__kmp_mic_type == mic2) { // KNC
7212  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7213  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7214  }
7215 #endif // KMP_FAST_REDUCTION_BARRIER
7216 #endif // KMP_MIC_SUPPORTED
7217 
7218 // From KMP_CHECKS initialization
7219 #ifdef KMP_DEBUG
7220  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7221 #else
7222  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7223 #endif
7224 
7225  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7226  __kmp_foreign_tp = TRUE;
7227 
7228  __kmp_global.g.g_dynamic = FALSE;
7229  __kmp_global.g.g_dynamic_mode = dynamic_default;
7230 
7231  __kmp_init_nesting_mode();
7232 
7233  __kmp_env_initialize(NULL);
7234 
7235 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7236  __kmp_user_level_mwait_init();
7237 #endif
7238 // Print all messages in message catalog for testing purposes.
7239 #ifdef KMP_DEBUG
7240  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7241  if (__kmp_str_match_true(val)) {
7242  kmp_str_buf_t buffer;
7243  __kmp_str_buf_init(&buffer);
7244  __kmp_i18n_dump_catalog(&buffer);
7245  __kmp_printf("%s", buffer.str);
7246  __kmp_str_buf_free(&buffer);
7247  }
7248  __kmp_env_free(&val);
7249 #endif
7250 
7251  __kmp_threads_capacity =
7252  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7253  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7254  __kmp_tp_capacity = __kmp_default_tp_capacity(
7255  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7256 
7257  // If the library is shut down properly, both pools must be NULL. Just in
7258  // case, set them to NULL -- some memory may leak, but subsequent code will
7259  // work even if pools are not freed.
7260  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7261  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7262  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7263  __kmp_thread_pool = NULL;
7264  __kmp_thread_pool_insert_pt = NULL;
7265  __kmp_team_pool = NULL;
7266 
7267  /* Allocate all of the variable sized records */
7268  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7269  * expandable */
7270  /* Since allocation is cache-aligned, just add extra padding at the end */
7271  size =
7272  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7273  CACHE_LINE;
7274  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7275  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7276  sizeof(kmp_info_t *) * __kmp_threads_capacity);
7277 
7278  /* init thread counts */
7279  KMP_DEBUG_ASSERT(__kmp_all_nth ==
7280  0); // Asserts fail if the library is reinitializing and
7281  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7282  __kmp_all_nth = 0;
7283  __kmp_nth = 0;
7284 
7285  /* setup the uber master thread and hierarchy */
7286  gtid = __kmp_register_root(TRUE);
7287  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7288  KMP_ASSERT(KMP_UBER_GTID(gtid));
7289  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7290 
7291  KMP_MB(); /* Flush all pending memory write invalidates. */
7292 
7293  __kmp_common_initialize();
7294 
7295 #if KMP_OS_UNIX
7296  /* invoke the child fork handler */
7297  __kmp_register_atfork();
7298 #endif
7299 
7300 #if !KMP_DYNAMIC_LIB || \
7301  ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7302  {
7303  /* Invoke the exit handler when the program finishes, only for static
7304  library and macOS* dynamic. For other dynamic libraries, we already
7305  have _fini and DllMain. */
7306  int rc = atexit(__kmp_internal_end_atexit);
7307  if (rc != 0) {
7308  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7309  __kmp_msg_null);
7310  }
7311  }
7312 #endif
7313 
7314 #if KMP_HANDLE_SIGNALS
7315 #if KMP_OS_UNIX
7316  /* NOTE: make sure that this is called before the user installs their own
7317  signal handlers so that the user handlers are called first. This way they
7318  can return false, not call our handler, avoid terminating the library, and
7319  continue execution where they left off. */
7320  __kmp_install_signals(FALSE);
7321 #endif /* KMP_OS_UNIX */
7322 #if KMP_OS_WINDOWS
7323  __kmp_install_signals(TRUE);
7324 #endif /* KMP_OS_WINDOWS */
7325 #endif
7326 
7327  /* we have finished the serial initialization */
7328  __kmp_init_counter++;
7329 
7330  __kmp_init_serial = TRUE;
7331 
7332  if (__kmp_version) {
7333  __kmp_print_version_1();
7334  }
7335 
7336  if (__kmp_settings) {
7337  __kmp_env_print();
7338  }
7339 
7340  if (__kmp_display_env || __kmp_display_env_verbose) {
7341  __kmp_env_print_2();
7342  }
7343 
7344 #if OMPT_SUPPORT
7345  ompt_post_init();
7346 #endif
7347 
7348  KMP_MB();
7349 
7350  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7351 }
7352 
7353 void __kmp_serial_initialize(void) {
7354  if (__kmp_init_serial) {
7355  return;
7356  }
7357  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7358  if (__kmp_init_serial) {
7359  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7360  return;
7361  }
7362  __kmp_do_serial_initialize();
7363  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7364 }
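// Illustrative sketch (not part of the runtime): __kmp_serial_initialize(),
// __kmp_middle_initialize() and __kmp_parallel_initialize() (below) all follow
// the same check / lock / re-check shape. With standard primitives and
// hypothetical names it looks like this:
#if 0
#include <atomic>
#include <mutex>
static std::atomic<bool> example_initialized{false};
static std::mutex example_initz_lock; // plays the role of __kmp_initz_lock
static void example_do_initialize(void) { /* heavy one-time setup goes here */ }
static void example_initialize(void) {
  if (example_initialized.load(std::memory_order_acquire))
    return; // fast path: already initialized
  std::lock_guard<std::mutex> lk(example_initz_lock);
  if (example_initialized.load(std::memory_order_relaxed))
    return; // another thread completed initialization while we waited
  example_do_initialize();
  example_initialized.store(true, std::memory_order_release);
}
#endif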
7365 
7366 static void __kmp_do_middle_initialize(void) {
7367  int i, j;
7368  int prev_dflt_team_nth;
7369 
7370  if (!__kmp_init_serial) {
7371  __kmp_do_serial_initialize();
7372  }
7373 
7374  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7375 
7376  if (UNLIKELY(!__kmp_need_register_serial)) {
7377  // We are in a forked child process. The registration was skipped during
7378  // serial initialization in __kmp_atfork_child handler. Do it here.
7379  __kmp_register_library_startup();
7380  }
7381 
7382  // Save the previous value for the __kmp_dflt_team_nth so that
7383  // we can avoid some reinitialization if it hasn't changed.
7384  prev_dflt_team_nth = __kmp_dflt_team_nth;
7385 
7386 #if KMP_AFFINITY_SUPPORTED
7387  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7388  // number of cores on the machine.
7389  __kmp_affinity_initialize(__kmp_affinity);
7390 
7391 #endif /* KMP_AFFINITY_SUPPORTED */
7392 
7393  KMP_ASSERT(__kmp_xproc > 0);
7394  if (__kmp_avail_proc == 0) {
7395  __kmp_avail_proc = __kmp_xproc;
7396  }
7397 
7398  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7399  // correct them now
7400  j = 0;
7401  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7402  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7403  __kmp_avail_proc;
7404  j++;
7405  }
7406 
7407  if (__kmp_dflt_team_nth == 0) {
7408 #ifdef KMP_DFLT_NTH_CORES
7409  // Default #threads = #cores
7410  __kmp_dflt_team_nth = __kmp_ncores;
7411  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7412  "__kmp_ncores (%d)\n",
7413  __kmp_dflt_team_nth));
7414 #else
7415  // Default #threads = #available OS procs
7416  __kmp_dflt_team_nth = __kmp_avail_proc;
7417  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7418  "__kmp_avail_proc(%d)\n",
7419  __kmp_dflt_team_nth));
7420 #endif /* KMP_DFLT_NTH_CORES */
7421  }
7422 
7423  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7424  __kmp_dflt_team_nth = KMP_MIN_NTH;
7425  }
7426  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7427  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7428  }
7429 
7430  if (__kmp_nesting_mode > 0)
7431  __kmp_set_nesting_mode_threads();
7432 
7433  // There's no harm in continuing if the following check fails,
7434  // but it indicates an error in the previous logic.
7435  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7436 
7437  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7438  // Run through the __kmp_threads array and set the num threads icv for each
7439  // root thread that is currently registered with the RTL (which has not
7440  // already explicitly set its nthreads-var with a call to
7441  // omp_set_num_threads()).
7442  for (i = 0; i < __kmp_threads_capacity; i++) {
7443  kmp_info_t *thread = __kmp_threads[i];
7444  if (thread == NULL)
7445  continue;
7446  if (thread->th.th_current_task->td_icvs.nproc != 0)
7447  continue;
7448 
7449  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7450  }
7451  }
7452  KA_TRACE(
7453  20,
7454  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7455  __kmp_dflt_team_nth));
7456 
7457 #ifdef KMP_ADJUST_BLOCKTIME
7458  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7459  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7460  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7461  if (__kmp_nth > __kmp_avail_proc) {
7462  __kmp_zero_bt = TRUE;
7463  }
7464  }
7465 #endif /* KMP_ADJUST_BLOCKTIME */
7466 
7467  /* we have finished middle initialization */
7468  TCW_SYNC_4(__kmp_init_middle, TRUE);
7469 
7470  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7471 }
7472 
7473 void __kmp_middle_initialize(void) {
7474  if (__kmp_init_middle) {
7475  return;
7476  }
7477  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7478  if (__kmp_init_middle) {
7479  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7480  return;
7481  }
7482  __kmp_do_middle_initialize();
7483  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7484 }
7485 
7486 void __kmp_parallel_initialize(void) {
7487  int gtid = __kmp_entry_gtid(); // this might be a new root
7488 
7489  /* synchronize parallel initialization (for sibling) */
7490  if (TCR_4(__kmp_init_parallel))
7491  return;
7492  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7493  if (TCR_4(__kmp_init_parallel)) {
7494  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7495  return;
7496  }
7497 
7498  /* TODO reinitialization after we have already shut down */
7499  if (TCR_4(__kmp_global.g.g_done)) {
7500  KA_TRACE(
7501  10,
7502  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7503  __kmp_infinite_loop();
7504  }
7505 
7506  /* jc: The lock __kmp_initz_lock is already held, so calling
7507  __kmp_serial_initialize would cause a deadlock. So we call
7508  __kmp_do_serial_initialize directly. */
7509  if (!__kmp_init_middle) {
7510  __kmp_do_middle_initialize();
7511  }
7512  __kmp_assign_root_init_mask();
7513  __kmp_resume_if_hard_paused();
7514 
7515  /* begin initialization */
7516  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7517  KMP_ASSERT(KMP_UBER_GTID(gtid));
7518 
7519 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7520  // Save the FP control regs.
7521  // Worker threads will set theirs to these values at thread startup.
7522  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7523  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7524  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7525 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7526 
7527 #if KMP_OS_UNIX
7528 #if KMP_HANDLE_SIGNALS
7529  /* must be after __kmp_serial_initialize */
7530  __kmp_install_signals(TRUE);
7531 #endif
7532 #endif
7533 
7534  __kmp_suspend_initialize();
7535 
7536 #if defined(USE_LOAD_BALANCE)
7537  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7538  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7539  }
7540 #else
7541  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7542  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7543  }
7544 #endif
7545 
7546  if (__kmp_version) {
7547  __kmp_print_version_2();
7548  }
7549 
7550  /* we have finished parallel initialization */
7551  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7552 
7553  KMP_MB();
7554  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7555 
7556  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7557 }
7558 
7559 void __kmp_hidden_helper_initialize() {
7560  if (TCR_4(__kmp_init_hidden_helper))
7561  return;
7562 
7563  // __kmp_parallel_initialize is required before we initialize hidden helper
7564  if (!TCR_4(__kmp_init_parallel))
7565  __kmp_parallel_initialize();
7566 
7567  // Double check. Note that this double check should not be placed before
7568  // __kmp_parallel_initialize as it would cause a deadlock.
7569  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7570  if (TCR_4(__kmp_init_hidden_helper)) {
7571  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7572  return;
7573  }
7574 
7575 #if KMP_AFFINITY_SUPPORTED
7576  // Initialize hidden helper affinity settings.
7577  // The above __kmp_parallel_initialize() will initialize
7578  // regular affinity (and topology) if not already done.
7579  if (!__kmp_hh_affinity.flags.initialized)
7580  __kmp_affinity_initialize(__kmp_hh_affinity);
7581 #endif
7582 
7583  // Set the count of hidden helper tasks to be executed to zero
7584  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7585 
7586  // Set the global variable indicating that we're initializing hidden helper
7587  // team/threads
7588  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7589 
7590  // Platform independent initialization
7591  __kmp_do_initialize_hidden_helper_threads();
7592 
7593  // Wait here for the finish of initialization of hidden helper teams
7594  __kmp_hidden_helper_threads_initz_wait();
7595 
7596  // We have finished hidden helper initialization
7597  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7598 
7599  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7600 }
7601 
7602 /* ------------------------------------------------------------------------ */
7603 
7604 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7605  kmp_team_t *team) {
7606  kmp_disp_t *dispatch;
7607 
7608  KMP_MB();
7609 
7610  /* none of the threads have encountered any constructs, yet. */
7611  this_thr->th.th_local.this_construct = 0;
7612 #if KMP_CACHE_MANAGE
7613  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7614 #endif /* KMP_CACHE_MANAGE */
7615  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7616  KMP_DEBUG_ASSERT(dispatch);
7617  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7618  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7619  // this_thr->th.th_info.ds.ds_tid ] );
7620 
7621  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7622  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7623  if (__kmp_env_consistency_check)
7624  __kmp_push_parallel(gtid, team->t.t_ident);
7625 
7626  KMP_MB(); /* Flush all pending memory write invalidates. */
7627 }
7628 
7629 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7630  kmp_team_t *team) {
7631  if (__kmp_env_consistency_check)
7632  __kmp_pop_parallel(gtid, team->t.t_ident);
7633 
7634  __kmp_finish_implicit_task(this_thr);
7635 }
7636 
7637 int __kmp_invoke_task_func(int gtid) {
7638  int rc;
7639  int tid = __kmp_tid_from_gtid(gtid);
7640  kmp_info_t *this_thr = __kmp_threads[gtid];
7641  kmp_team_t *team = this_thr->th.th_team;
7642 
7643  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7644 #if USE_ITT_BUILD
7645  if (__itt_stack_caller_create_ptr) {
7646  // inform ittnotify about entering user's code
7647  if (team->t.t_stack_id != NULL) {
7648  __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7649  } else {
7650  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7651  __kmp_itt_stack_callee_enter(
7652  (__itt_caller)team->t.t_parent->t.t_stack_id);
7653  }
7654  }
7655 #endif /* USE_ITT_BUILD */
7656 #if INCLUDE_SSC_MARKS
7657  SSC_MARK_INVOKING();
7658 #endif
7659 
7660 #if OMPT_SUPPORT
7661  void *dummy;
7662  void **exit_frame_p;
7663  ompt_data_t *my_task_data;
7664  ompt_data_t *my_parallel_data;
7665  int ompt_team_size;
7666 
7667  if (ompt_enabled.enabled) {
7668  exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7669  .ompt_task_info.frame.exit_frame.ptr);
7670  } else {
7671  exit_frame_p = &dummy;
7672  }
7673 
7674  my_task_data =
7675  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7676  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7677  if (ompt_enabled.ompt_callback_implicit_task) {
7678  ompt_team_size = team->t.t_nproc;
7679  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7680  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7681  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7682  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7683  }
7684 #endif
7685 
7686 #if KMP_STATS_ENABLED
7687  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7688  if (previous_state == stats_state_e::TEAMS_REGION) {
7689  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7690  } else {
7691  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7692  }
7693  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7694 #endif
7695 
7696  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7697  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7698 #if OMPT_SUPPORT
7699  ,
7700  exit_frame_p
7701 #endif
7702  );
7703 #if OMPT_SUPPORT
7704  *exit_frame_p = NULL;
7705  this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_team;
7706 #endif
7707 
7708 #if KMP_STATS_ENABLED
7709  if (previous_state == stats_state_e::TEAMS_REGION) {
7710  KMP_SET_THREAD_STATE(previous_state);
7711  }
7712  KMP_POP_PARTITIONED_TIMER();
7713 #endif
7714 
7715 #if USE_ITT_BUILD
7716  if (__itt_stack_caller_create_ptr) {
7717  // inform ittnotify about leaving user's code
7718  if (team->t.t_stack_id != NULL) {
7719  __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7720  } else {
7721  KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7722  __kmp_itt_stack_callee_leave(
7723  (__itt_caller)team->t.t_parent->t.t_stack_id);
7724  }
7725  }
7726 #endif /* USE_ITT_BUILD */
7727  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7728 
7729  return rc;
7730 }
7731 
7732 void __kmp_teams_master(int gtid) {
7733  // This routine is called by all primary threads in teams construct
7734  kmp_info_t *thr = __kmp_threads[gtid];
7735  kmp_team_t *team = thr->th.th_team;
7736  ident_t *loc = team->t.t_ident;
7737  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7738  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7739  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7740  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7741  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7742 
7743  // This thread is a new CG root. Set up the proper variables.
7744  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7745  tmp->cg_root = thr; // Make thr the CG root
7746  // Init to thread limit stored when league primary threads were forked
7747  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7748  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7749  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7750  " cg_nthreads to 1\n",
7751  thr, tmp));
7752  tmp->up = thr->th.th_cg_roots;
7753  thr->th.th_cg_roots = tmp;
7754 
7755 // Launch the league of teams now, but do not let the workers execute
7756 // (they hang on the fork barrier until the next parallel region)
7757 #if INCLUDE_SSC_MARKS
7758  SSC_MARK_FORKING();
7759 #endif
7760  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7761  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7762  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7763 #if INCLUDE_SSC_MARKS
7764  SSC_MARK_JOINING();
7765 #endif
7766  // If the team size was reduced from the limit, set it to the new size
7767  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7768  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7769  // AC: last parameter "1" eliminates join barrier which won't work because
7770  // worker threads are in a fork barrier waiting for more parallel regions
7771  __kmp_join_call(loc, gtid
7772 #if OMPT_SUPPORT
7773  ,
7774  fork_context_intel
7775 #endif
7776  ,
7777  1);
7778 }
7779 
7780 int __kmp_invoke_teams_master(int gtid) {
7781  kmp_info_t *this_thr = __kmp_threads[gtid];
7782  kmp_team_t *team = this_thr->th.th_team;
7783 #if KMP_DEBUG
7784  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7785  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7786  (void *)__kmp_teams_master);
7787 #endif
7788  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7789 #if OMPT_SUPPORT
7790  int tid = __kmp_tid_from_gtid(gtid);
7791  ompt_data_t *task_data =
7792  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7793  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7794  if (ompt_enabled.ompt_callback_implicit_task) {
7795  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7796  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7797  ompt_task_initial);
7798  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7799  }
7800 #endif
7801  __kmp_teams_master(gtid);
7802 #if OMPT_SUPPORT
7803  this_thr->th.ompt_thread_info.parallel_flags = ompt_parallel_league;
7804 #endif
7805  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7806  return 1;
7807 }
7808 
7809 /* This sets the requested number of threads for the next parallel region
7810  encountered by this team. Since this should be enclosed in the forkjoin
7811  critical section, it should avoid race conditions with asymmetrical nested
7812  parallelism. */
7813 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7814  kmp_info_t *thr = __kmp_threads[gtid];
7815 
7816  if (num_threads > 0)
7817  thr->th.th_set_nproc = num_threads;
7818 }
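
// Illustrative sketch (hypothetical helper, guarded out so it is not part of
// the runtime): how a caller uses the routine above to request a team size for
// the next parallel region; the constant 4 stands in for a num_threads clause
// value.
#if 0
static void example_request_num_threads(ident_t *loc, int gtid) {
  // Record the request on the encountering thread; the next fork consumes
  // th_set_nproc when it sizes the team.
  __kmp_push_num_threads(loc, gtid, 4);
}
#endif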
7819 
7820 void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
7821  int *num_threads_list) {
7822  kmp_info_t *thr = __kmp_threads[gtid];
7823 
7824  KMP_DEBUG_ASSERT(list_length > 1);
7825 
7826  if (num_threads_list[0] > 0)
7827  thr->th.th_set_nproc = num_threads_list[0];
7828  thr->th.th_set_nested_nth =
7829  (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
7830  for (kmp_uint32 i = 0; i < list_length; ++i)
7831  thr->th.th_set_nested_nth[i] = num_threads_list[i];
7832  thr->th.th_set_nested_nth_sz = list_length;
7833 }
7834 
7835 void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
7836  const char *msg) {
7837  kmp_info_t *thr = __kmp_threads[gtid];
7838  thr->th.th_nt_strict = true;
7839  thr->th.th_nt_loc = loc;
7840  // if sev is unset, make it fatal
7841  if (sev == severity_warning)
7842  thr->th.th_nt_sev = sev;
7843  else
7844  thr->th.th_nt_sev = severity_fatal;
7845  // if msg is unset, use an appropriate message
7846  if (msg)
7847  thr->th.th_nt_msg = msg;
7848  else
7849  thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
7850  "strict num_threads clause.";
7851 }
7852 
7853 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7854  int num_threads) {
7855  KMP_DEBUG_ASSERT(thr);
7856  // Remember the number of threads for inner parallel regions
7857  if (!TCR_4(__kmp_init_middle))
7858  __kmp_middle_initialize(); // get internal globals calculated
7859  __kmp_assign_root_init_mask();
7860  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7861  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7862 
7863  if (num_threads == 0) {
7864  if (__kmp_teams_thread_limit > 0) {
7865  num_threads = __kmp_teams_thread_limit;
7866  } else {
7867  num_threads = __kmp_avail_proc / num_teams;
7868  }
7869  // adjust num_threads w/o warning as it is not a user setting
7870  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7871  // no thread_limit clause specified - do not change thread-limit-var ICV
7872  if (num_threads > __kmp_dflt_team_nth) {
7873  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7874  }
7875  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7876  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7877  } // prevent team size from exceeding thread-limit-var
7878  if (num_teams * num_threads > __kmp_teams_max_nth) {
7879  num_threads = __kmp_teams_max_nth / num_teams;
7880  }
7881  if (num_threads == 0) {
7882  num_threads = 1;
7883  }
7884  } else {
7885  if (num_threads < 0) {
7886  __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7887  __kmp_msg_null);
7888  num_threads = 1;
7889  }
7890  // This thread will be the primary thread of the league's primary threads
7891  // Store new thread limit; old limit is saved in th_cg_roots list
7892  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7893  // num_threads = min(num_threads, nthreads-var)
7894  if (num_threads > __kmp_dflt_team_nth) {
7895  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7896  }
7897  if (num_teams * num_threads > __kmp_teams_max_nth) {
7898  int new_threads = __kmp_teams_max_nth / num_teams;
7899  if (new_threads == 0) {
7900  new_threads = 1;
7901  }
7902  if (new_threads != num_threads) {
7903  if (!__kmp_reserve_warn) { // user asked for too many threads
7904  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7905  __kmp_msg(kmp_ms_warning,
7906  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7907  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7908  }
7909  }
7910  num_threads = new_threads;
7911  }
7912  }
7913  thr->th.th_teams_size.nth = num_threads;
7914 }
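
// Worked example of the clamping above (hypothetical helper, guarded out and
// simplified: the __kmp_teams_thread_limit and thread-limit-var ICV checks are
// omitted). With avail_proc = 16, num_teams = 4, dflt_team_nth = 8 and
// teams_max_nth = 64, this resolves to 16 / 4 = 4 threads per team.
#if 0
static int example_default_teams_nth(int num_teams, int avail_proc,
                                     int dflt_team_nth, int teams_max_nth) {
  int nth = avail_proc / num_teams; // no thread_limit clause given
  if (nth > dflt_team_nth)
    nth = dflt_team_nth; // honor nthreads-var ICV
  if (num_teams * nth > teams_max_nth)
    nth = teams_max_nth / num_teams; // keep the whole league under the cap
  return nth > 0 ? nth : 1;
}
#endif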
7915 
7916 /* this sets the requested number of teams for the teams region and/or
7917  the number of threads for the next parallel region encountered */
7918 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7919  int num_threads) {
7920  kmp_info_t *thr = __kmp_threads[gtid];
7921  if (num_teams < 0) {
7922  // OpenMP specification requires requested values to be positive,
7923  // but people can send us any value, so we'd better check
7924  __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7925  __kmp_msg_null);
7926  num_teams = 1;
7927  }
7928  if (num_teams == 0) {
7929  if (__kmp_nteams > 0) {
7930  num_teams = __kmp_nteams;
7931  } else {
7932  num_teams = 1; // default number of teams is 1.
7933  }
7934  }
7935  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7936  if (!__kmp_reserve_warn) {
7937  __kmp_reserve_warn = 1;
7938  __kmp_msg(kmp_ms_warning,
7939  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7940  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7941  }
7942  num_teams = __kmp_teams_max_nth;
7943  }
7944  // Set number of teams (number of threads in the outer "parallel" of the
7945  // teams)
7946  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7947 
7948  __kmp_push_thread_limit(thr, num_teams, num_threads);
7949 }
7950 
7951 /* This sets the requested number of teams for the teams region and/or
7952  the number of threads for the next parallel region encountered */
7953 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7954  int num_teams_ub, int num_threads) {
7955  kmp_info_t *thr = __kmp_threads[gtid];
7956  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7957  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7958  KMP_DEBUG_ASSERT(num_threads >= 0);
7959 
7960  if (num_teams_lb > num_teams_ub) {
7961  __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7962  KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7963  }
7964 
7965  int num_teams = 1; // default number of teams is 1.
7966 
7967  if (num_teams_lb == 0 && num_teams_ub > 0)
7968  num_teams_lb = num_teams_ub;
7969 
7970  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7971  num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7972  if (num_teams > __kmp_teams_max_nth) {
7973  if (!__kmp_reserve_warn) {
7974  __kmp_reserve_warn = 1;
7975  __kmp_msg(kmp_ms_warning,
7976  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7977  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7978  }
7979  num_teams = __kmp_teams_max_nth;
7980  }
7981  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7982  num_teams = num_teams_ub;
7983  } else { // num_teams_lb <= num_teams <= num_teams_ub
7984  if (num_threads <= 0) {
7985  if (num_teams_ub > __kmp_teams_max_nth) {
7986  num_teams = num_teams_lb;
7987  } else {
7988  num_teams = num_teams_ub;
7989  }
7990  } else {
7991  num_teams = (num_threads > __kmp_teams_max_nth)
7992  ? num_teams
7993  : __kmp_teams_max_nth / num_threads;
7994  if (num_teams < num_teams_lb) {
7995  num_teams = num_teams_lb;
7996  } else if (num_teams > num_teams_ub) {
7997  num_teams = num_teams_ub;
7998  }
7999  }
8000  }
8001  // Set number of teams (number of threads in the outer "parallel" of the
8002  // teams)
8003  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
8004 
8005  __kmp_push_thread_limit(thr, num_teams, num_threads);
8006 }
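
// Worked example for the bounds logic above (illustrative numbers only): with
// num_teams_lb = 2, num_teams_ub = 100, num_threads = 8 and
// __kmp_teams_max_nth = 256, the computed value is 256 / 8 = 32 teams, which
// already lies inside [2, 100], so 32 teams of 8 threads are requested.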
8007 
8008 // Set the proc_bind var to use in the following parallel region.
8009 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
8010  kmp_info_t *thr = __kmp_threads[gtid];
8011  thr->th.th_set_proc_bind = proc_bind;
8012 }
8013 
8014 /* Launch the worker threads into the microtask. */
8015 
8016 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
8017  kmp_info_t *this_thr = __kmp_threads[gtid];
8018 
8019 #ifdef KMP_DEBUG
8020  int f;
8021 #endif /* KMP_DEBUG */
8022 
8023  KMP_DEBUG_ASSERT(team);
8024  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8025  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8026  KMP_MB(); /* Flush all pending memory write invalidates. */
8027 
8028  team->t.t_construct = 0; /* no single directives seen yet */
8029  team->t.t_ordered.dt.t_value =
8030  0; /* thread 0 enters the ordered section first */
8031 
8032  /* Reset the identifiers on the dispatch buffer */
8033  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
8034  if (team->t.t_max_nproc > 1) {
8035  int i;
8036  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
8037  team->t.t_disp_buffer[i].buffer_index = i;
8038  team->t.t_disp_buffer[i].doacross_buf_idx = i;
8039  }
8040  } else {
8041  team->t.t_disp_buffer[0].buffer_index = 0;
8042  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
8043  }
8044 
8045  KMP_MB(); /* Flush all pending memory write invalidates. */
8046  KMP_ASSERT(this_thr->th.th_team == team);
8047 
8048 #ifdef KMP_DEBUG
8049  for (f = 0; f < team->t.t_nproc; f++) {
8050  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
8051  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
8052  }
8053 #endif /* KMP_DEBUG */
8054 
8055  /* release the worker threads so they may begin working */
8056  __kmp_fork_barrier(gtid, 0);
8057 }
8058 
8059 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
8060  kmp_info_t *this_thr = __kmp_threads[gtid];
8061 
8062  KMP_DEBUG_ASSERT(team);
8063  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
8064  KMP_ASSERT(KMP_MASTER_GTID(gtid));
8065  KMP_MB(); /* Flush all pending memory write invalidates. */
8066 
8067  /* Join barrier after fork */
8068 
8069 #ifdef KMP_DEBUG
8070  if (__kmp_threads[gtid] &&
8071  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8072  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8073  __kmp_threads[gtid]);
8074  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8075  "team->t.t_nproc=%d\n",
8076  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8077  team->t.t_nproc);
8078  __kmp_print_structure();
8079  }
8080  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8081  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8082 #endif /* KMP_DEBUG */
8083 
8084  __kmp_join_barrier(gtid); /* wait for everyone */
8085 #if OMPT_SUPPORT
8086  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
8087  if (ompt_enabled.enabled &&
8088  (ompt_state == ompt_state_wait_barrier_teams ||
8089  ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
8090  int ds_tid = this_thr->th.th_info.ds.ds_tid;
8091  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8092  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8093 #if OMPT_OPTIONAL
8094  void *codeptr = NULL;
8095  if (KMP_MASTER_TID(ds_tid) &&
8096  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8097  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8098  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8099 
8100  ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
8101  if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
8102  sync_kind = ompt_sync_region_barrier_teams;
8103  if (ompt_enabled.ompt_callback_sync_region_wait) {
8104  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8105  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8106  }
8107  if (ompt_enabled.ompt_callback_sync_region) {
8108  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8109  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
8110  }
8111 #endif
8112  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8113  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8114  ompt_scope_end, NULL, task_data, 0, ds_tid,
8115  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8116  }
8117  }
8118 #endif
8119 
8120  KMP_MB(); /* Flush all pending memory write invalidates. */
8121  KMP_ASSERT(this_thr->th.th_team == team);
8122 }
8123 
8124 /* ------------------------------------------------------------------------ */
8125 
8126 #ifdef USE_LOAD_BALANCE
8127 
8128 // Return the number of worker threads actively spinning in the hot team,
8129 // if we are at the outermost level of parallelism. Otherwise, return 0.
8130 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8131  int i;
8132  int retval;
8133  kmp_team_t *hot_team;
8134 
8135  if (root->r.r_active) {
8136  return 0;
8137  }
8138  hot_team = root->r.r_hot_team;
8139  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8140  return hot_team->t.t_nproc - 1; // Don't count primary thread
8141  }
8142 
8143  // Skip the primary thread - it is accounted for elsewhere.
8144  retval = 0;
8145  for (i = 1; i < hot_team->t.t_nproc; i++) {
8146  if (hot_team->t.t_threads[i]->th.th_active) {
8147  retval++;
8148  }
8149  }
8150  return retval;
8151 }
8152 
8153 // Perform an automatic adjustment to the number of
8154 // threads used by the next parallel region.
8155 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8156  int retval;
8157  int pool_active;
8158  int hot_team_active;
8159  int team_curr_active;
8160  int system_active;
8161 
8162  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8163  set_nproc));
8164  KMP_DEBUG_ASSERT(root);
8165  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8166  ->th.th_current_task->td_icvs.dynamic == TRUE);
8167  KMP_DEBUG_ASSERT(set_nproc > 1);
8168 
8169  if (set_nproc == 1) {
8170  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8171  return 1;
8172  }
8173 
8174  // Threads that are active in the thread pool, active in the hot team for this
8175  // particular root (if we are at the outer par level), and the currently
8176  // executing thread (to become the primary thread) are available to add to the
8177  // new team, but are currently contributing to the system load, and must be
8178  // accounted for.
8179  pool_active = __kmp_thread_pool_active_nth;
8180  hot_team_active = __kmp_active_hot_team_nproc(root);
8181  team_curr_active = pool_active + hot_team_active + 1;
8182 
8183  // Check the system load.
8184  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8185  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8186  "hot team active = %d\n",
8187  system_active, pool_active, hot_team_active));
8188 
8189  if (system_active < 0) {
8190  // There was an error reading the necessary info from /proc, so use the
8191  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8192  // = dynamic_thread_limit, we shouldn't wind up getting back here.
8193  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8194  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8195 
8196  // Make this call behave like the thread limit algorithm.
8197  retval = __kmp_avail_proc - __kmp_nth +
8198  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8199  if (retval > set_nproc) {
8200  retval = set_nproc;
8201  }
8202  if (retval < KMP_MIN_NTH) {
8203  retval = KMP_MIN_NTH;
8204  }
8205 
8206  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8207  retval));
8208  return retval;
8209  }
8210 
8211  // There is a slight delay in the load balance algorithm in detecting new
8212  // running procs. The real system load at this instant should be at least
8213  // as large as the number of active OMP threads available to add to the team.
8214  if (system_active < team_curr_active) {
8215  system_active = team_curr_active;
8216  }
8217  retval = __kmp_avail_proc - system_active + team_curr_active;
8218  if (retval > set_nproc) {
8219  retval = set_nproc;
8220  }
8221  if (retval < KMP_MIN_NTH) {
8222  retval = KMP_MIN_NTH;
8223  }
8224 
8225  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8226  return retval;
8227 } // __kmp_load_balance_nproc()
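
// Worked example (illustrative numbers only): with __kmp_avail_proc = 16,
// pool_active = 3 and hot_team_active = 4, team_curr_active is 3 + 4 + 1 = 8;
// a measured system_active of 12 then gives retval = 16 - 12 + 8 = 12, which
// is finally clamped to the [KMP_MIN_NTH, set_nproc] range.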
8228 
8229 #endif /* USE_LOAD_BALANCE */
8230 
8231 /* ------------------------------------------------------------------------ */
8232 
8233 /* NOTE: this is called with the __kmp_init_lock held */
8234 void __kmp_cleanup(void) {
8235  int f;
8236 
8237  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8238 
8239  if (TCR_4(__kmp_init_parallel)) {
8240 #if KMP_HANDLE_SIGNALS
8241  __kmp_remove_signals();
8242 #endif
8243  TCW_4(__kmp_init_parallel, FALSE);
8244  }
8245 
8246  if (TCR_4(__kmp_init_middle)) {
8247 #if KMP_AFFINITY_SUPPORTED
8248  __kmp_affinity_uninitialize();
8249 #endif /* KMP_AFFINITY_SUPPORTED */
8250  __kmp_cleanup_hierarchy();
8251  TCW_4(__kmp_init_middle, FALSE);
8252  }
8253 
8254  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8255 
8256  if (__kmp_init_serial) {
8257  __kmp_runtime_destroy();
8258  __kmp_init_serial = FALSE;
8259  }
8260 
8261  __kmp_cleanup_threadprivate_caches();
8262 
8263  for (f = 0; f < __kmp_threads_capacity; f++) {
8264  if (__kmp_root[f] != NULL) {
8265  __kmp_free(__kmp_root[f]);
8266  __kmp_root[f] = NULL;
8267  }
8268  }
8269  __kmp_free(__kmp_threads);
8270  // __kmp_threads and __kmp_root were allocated at once, as a single block,
8271  // so there is no need to free __kmp_root separately.
8272  __kmp_threads = NULL;
8273  __kmp_root = NULL;
8274  __kmp_threads_capacity = 0;
8275 
8276  // Free old __kmp_threads arrays if they exist.
8277  kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8278  while (ptr) {
8279  kmp_old_threads_list_t *next = ptr->next;
8280  __kmp_free(ptr->threads);
8281  __kmp_free(ptr);
8282  ptr = next;
8283  }
8284  __kmp_old_threads_list = NULL;
8285 
8286 #if KMP_USE_DYNAMIC_LOCK
8287  __kmp_cleanup_indirect_user_locks();
8288 #else
8289  __kmp_cleanup_user_locks();
8290 #endif
8291 #if OMPD_SUPPORT
8292  if (ompd_env_block) {
8293  __kmp_free(ompd_env_block);
8294  ompd_env_block = NULL;
8295  ompd_env_block_size = 0;
8296  }
8297 #endif
8298 
8299 #if KMP_AFFINITY_SUPPORTED
8300  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8301  __kmp_cpuinfo_file = NULL;
8302 #endif /* KMP_AFFINITY_SUPPORTED */
8303 
8304 #if KMP_USE_ADAPTIVE_LOCKS
8305 #if KMP_DEBUG_ADAPTIVE_LOCKS
8306  __kmp_print_speculative_stats();
8307 #endif
8308 #endif
8309  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8310  __kmp_nested_nth.nth = NULL;
8311  __kmp_nested_nth.size = 0;
8312  __kmp_nested_nth.used = 0;
8313 
8314  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8315  __kmp_nested_proc_bind.bind_types = NULL;
8316  __kmp_nested_proc_bind.size = 0;
8317  __kmp_nested_proc_bind.used = 0;
8318  __kmp_dflt_team_nth = 0;
8319  __kmp_dflt_team_nth_ub = 0;
8320  if (__kmp_affinity_format) {
8321  KMP_INTERNAL_FREE(__kmp_affinity_format);
8322  __kmp_affinity_format = NULL;
8323  }
8324 
8325  __kmp_i18n_catclose();
8326 
8327  if (__kmp_nesting_nth_level)
8328  KMP_INTERNAL_FREE(__kmp_nesting_nth_level);
8329 
8330 #if KMP_USE_HIER_SCHED
8331  __kmp_hier_scheds.deallocate();
8332 #endif
8333 
8334 #if KMP_STATS_ENABLED
8335  __kmp_stats_fini();
8336 #endif
8337 
8338  __kmpc_destroy_allocator(KMP_GTID_SHUTDOWN, __kmp_def_allocator);
8339  __kmp_def_allocator = omp_default_mem_alloc;
8340 
8341  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8342 }
8343 
8344 /* ------------------------------------------------------------------------ */
8345 
8346 int __kmp_ignore_mppbeg(void) {
8347  char *env;
8348 
8349  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8350  if (__kmp_str_match_false(env))
8351  return FALSE;
8352  }
8353  // By default __kmpc_begin() is no-op.
8354  return TRUE;
8355 }
8356 
8357 int __kmp_ignore_mppend(void) {
8358  char *env;
8359 
8360  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8361  if (__kmp_str_match_false(env))
8362  return FALSE;
8363  }
8364  // By default __kmpc_end() is no-op.
8365  return TRUE;
8366 }
8367 
8368 void __kmp_internal_begin(void) {
8369  int gtid;
8370  kmp_root_t *root;
8371 
8372  /* this is a very important step as it will register new sibling threads
8373  and assign these new uber threads a new gtid */
8374  gtid = __kmp_entry_gtid();
8375  root = __kmp_threads[gtid]->th.th_root;
8376  KMP_ASSERT(KMP_UBER_GTID(gtid));
8377 
8378  if (root->r.r_begin)
8379  return;
8380  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8381  if (root->r.r_begin) {
8382  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8383  return;
8384  }
8385 
8386  root->r.r_begin = TRUE;
8387 
8388  __kmp_release_lock(&root->r.r_begin_lock, gtid);
8389 }
8390 
8391 /* ------------------------------------------------------------------------ */
8392 
8393 void __kmp_user_set_library(enum library_type arg) {
8394  int gtid;
8395  kmp_root_t *root;
8396  kmp_info_t *thread;
8397 
8398  /* first, make sure we are initialized so we can get our gtid */
8399 
8400  gtid = __kmp_entry_gtid();
8401  thread = __kmp_threads[gtid];
8402 
8403  root = thread->th.th_root;
8404 
8405  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8406  library_serial));
8407  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8408  thread */
8409  KMP_WARNING(SetLibraryIncorrectCall);
8410  return;
8411  }
8412 
8413  switch (arg) {
8414  case library_serial:
8415  thread->th.th_set_nproc = 0;
8416  set__nproc(thread, 1);
8417  break;
8418  case library_turnaround:
8419  thread->th.th_set_nproc = 0;
8420  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8421  : __kmp_dflt_team_nth_ub);
8422  break;
8423  case library_throughput:
8424  thread->th.th_set_nproc = 0;
8425  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8426  : __kmp_dflt_team_nth_ub);
8427  break;
8428  default:
8429  KMP_FATAL(UnknownLibraryType, arg);
8430  }
8431 
8432  __kmp_aux_set_library(arg);
8433 }
8434 
8435 void __kmp_aux_set_stacksize(size_t arg) {
8436  if (!__kmp_init_serial)
8437  __kmp_serial_initialize();
8438 
8439 #if KMP_OS_DARWIN
8440  if (arg & (0x1000 - 1)) {
8441  arg &= ~(0x1000 - 1);
8442  if (arg + 0x1000) /* check for overflow if we round up */
8443  arg += 0x1000;
8444  }
8445 #endif
8446  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8447 
8448  /* only change the default stacksize before the first parallel region */
8449  if (!TCR_4(__kmp_init_parallel)) {
8450  size_t value = arg; /* argument is in bytes */
8451 
8452  if (value < __kmp_sys_min_stksize)
8453  value = __kmp_sys_min_stksize;
8454  else if (value > KMP_MAX_STKSIZE)
8455  value = KMP_MAX_STKSIZE;
8456 
8457  __kmp_stksize = value;
8458 
8459  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8460  }
8461 
8462  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8463 }
8464 
8465 /* set the behaviour of the runtime library */
8466 /* TODO this can cause some odd behaviour with sibling parallelism... */
8467 void __kmp_aux_set_library(enum library_type arg) {
8468  __kmp_library = arg;
8469 
8470  switch (__kmp_library) {
8471  case library_serial: {
8472  KMP_INFORM(LibraryIsSerial);
8473  } break;
8474  case library_turnaround:
8475  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8476  __kmp_use_yield = 2; // only yield when oversubscribed
8477  break;
8478  case library_throughput:
8479  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8480  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8481  break;
8482  default:
8483  KMP_FATAL(UnknownLibraryType, arg);
8484  }
8485 }
8486 
8487 /* Getting team information common for all team API */
8488 // Returns NULL if not in teams construct
8489 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8490  kmp_info_t *thr = __kmp_entry_thread();
8491  teams_serialized = 0;
8492  if (thr->th.th_teams_microtask) {
8493  kmp_team_t *team = thr->th.th_team;
8494  int tlevel = thr->th.th_teams_level; // the level of the teams construct
8495  int ii = team->t.t_level;
8496  teams_serialized = team->t.t_serialized;
8497  int level = tlevel + 1;
8498  KMP_DEBUG_ASSERT(ii >= tlevel);
8499  while (ii > level) {
8500  for (teams_serialized = team->t.t_serialized;
8501  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8502  }
8503  if (team->t.t_serialized && (!teams_serialized)) {
8504  team = team->t.t_parent;
8505  continue;
8506  }
8507  if (ii > level) {
8508  team = team->t.t_parent;
8509  ii--;
8510  }
8511  }
8512  return team;
8513  }
8514  return NULL;
8515 }
8516 
8517 int __kmp_aux_get_team_num() {
8518  int serialized;
8519  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8520  if (team) {
8521  if (serialized > 1) {
8522  return 0; // teams region is serialized ( 1 team of 1 thread ).
8523  } else {
8524  return team->t.t_master_tid;
8525  }
8526  }
8527  return 0;
8528 }
8529 
8530 int __kmp_aux_get_num_teams() {
8531  int serialized;
8532  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8533  if (team) {
8534  if (serialized > 1) {
8535  return 1;
8536  } else {
8537  return team->t.t_parent->t.t_nproc;
8538  }
8539  }
8540  return 1;
8541 }
8542 
8543 /* ------------------------------------------------------------------------ */
8544 
8545 /*
8546  * Affinity Format Parser
8547  *
8548  * A field has the form: %[[[0].]size]type
8549  * % and type are required (%% means print a literal '%')
8550  * type is either single char or long name surrounded by {},
8551  * e.g., N or {num_threads}
8552  * 0 => leading zeros
8553  * . => right justified when size is specified
8554  * by default output is left justified
8555  * size is the *minimum* field length
8556  * All other characters are printed as is
8557  *
8558  * Available field types:
8559  * L {thread_level} - omp_get_level()
8560  * n {thread_num} - omp_get_thread_num()
8561  * h {host} - name of host machine
8562  * P {process_id} - process id (integer)
8563  * T {thread_identifier} - native thread identifier (integer)
8564  * N {num_threads} - omp_get_num_threads()
8565  * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8566  * a {thread_affinity} - comma separated list of integers or integer ranges
8567  * (values of affinity mask)
8568  *
8569  * Implementation-specific field types can be added
8570  * If a type is unknown, print "undefined"
8571  */
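
// Example (hypothetical values): with the format "OMP: pid %P tid %n/%N on %H"
// a thread might print "OMP: pid 4242 tid 3/8 on hostA". "%0.4n" prints the
// thread number right justified in a 4-character zero-padded field (e.g.
// "0003"), and "%{thread_num}" is the long-name spelling of "%n".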
8572 
8573 // Structure holding the short name, long name, and corresponding data type
8574 // for snprintf. A table of these will represent the entire valid keyword
8575 // field types.
8576 typedef struct kmp_affinity_format_field_t {
8577  char short_name; // from spec e.g., L -> thread level
8578  const char *long_name; // from spec thread_level -> thread level
8579  char field_format; // data type for snprintf (typically 'd' or 's'
8580  // for integer or string)
8581 } kmp_affinity_format_field_t;
8582 
8583 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8584 #if KMP_AFFINITY_SUPPORTED
8585  {'A', "thread_affinity", 's'},
8586 #endif
8587  {'t', "team_num", 'd'},
8588  {'T', "num_teams", 'd'},
8589  {'L', "nesting_level", 'd'},
8590  {'n', "thread_num", 'd'},
8591  {'N', "num_threads", 'd'},
8592  {'a', "ancestor_tnum", 'd'},
8593  {'H', "host", 's'},
8594  {'P', "process_id", 'd'},
8595  {'i', "native_thread_id", 'd'}};
8596 
8597 // Return the number of characters it takes to hold the field
8598 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8599  const char **ptr,
8600  kmp_str_buf_t *field_buffer) {
8601  int rc, format_index, field_value;
8602  const char *width_left, *width_right;
8603  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8604  static const int FORMAT_SIZE = 20;
8605  char format[FORMAT_SIZE] = {0};
8606  char absolute_short_name = 0;
8607 
8608  KMP_DEBUG_ASSERT(gtid >= 0);
8609  KMP_DEBUG_ASSERT(th);
8610  KMP_DEBUG_ASSERT(**ptr == '%');
8611  KMP_DEBUG_ASSERT(field_buffer);
8612 
8613  __kmp_str_buf_clear(field_buffer);
8614 
8615  // Skip the initial %
8616  (*ptr)++;
8617 
8618  // Check for %% first
8619  if (**ptr == '%') {
8620  __kmp_str_buf_cat(field_buffer, "%", 1);
8621  (*ptr)++; // skip over the second %
8622  return 1;
8623  }
8624 
8625  // Parse field modifiers if they are present
8626  pad_zeros = false;
8627  if (**ptr == '0') {
8628  pad_zeros = true;
8629  (*ptr)++; // skip over 0
8630  }
8631  right_justify = false;
8632  if (**ptr == '.') {
8633  right_justify = true;
8634  (*ptr)++; // skip over .
8635  }
8636  // Parse width of field: [width_left, width_right)
8637  width_left = width_right = NULL;
8638  if (**ptr >= '0' && **ptr <= '9') {
8639  width_left = *ptr;
8640  SKIP_DIGITS(*ptr);
8641  width_right = *ptr;
8642  }
8643 
8644  // Create the format for KMP_SNPRINTF based on flags parsed above
8645  format_index = 0;
8646  format[format_index++] = '%';
8647  if (!right_justify)
8648  format[format_index++] = '-';
8649  if (pad_zeros)
8650  format[format_index++] = '0';
8651  if (width_left && width_right) {
8652  int i = 0;
8653  // Only allow 8 digit number widths.
8654  // This also prevents overflowing format variable
8655  while (i < 8 && width_left < width_right) {
8656  format[format_index++] = *width_left;
8657  width_left++;
8658  i++;
8659  }
8660  }
8661 
8662  // Parse a name (long or short)
8663  // Canonicalize the name into absolute_short_name
8664  found_valid_name = false;
8665  parse_long_name = (**ptr == '{');
8666  if (parse_long_name)
8667  (*ptr)++; // skip initial left brace
8668  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8669  sizeof(__kmp_affinity_format_table[0]);
8670  ++i) {
8671  char short_name = __kmp_affinity_format_table[i].short_name;
8672  const char *long_name = __kmp_affinity_format_table[i].long_name;
8673  char field_format = __kmp_affinity_format_table[i].field_format;
8674  if (parse_long_name) {
8675  size_t length = KMP_STRLEN(long_name);
8676  if (strncmp(*ptr, long_name, length) == 0) {
8677  found_valid_name = true;
8678  (*ptr) += length; // skip the long name
8679  }
8680  } else if (**ptr == short_name) {
8681  found_valid_name = true;
8682  (*ptr)++; // skip the short name
8683  }
8684  if (found_valid_name) {
8685  format[format_index++] = field_format;
8686  format[format_index++] = '\0';
8687  absolute_short_name = short_name;
8688  break;
8689  }
8690  }
8691  if (parse_long_name) {
8692  if (**ptr != '}') {
8693  absolute_short_name = 0;
8694  } else {
8695  (*ptr)++; // skip over the right brace
8696  }
8697  }
8698 
8699  // Attempt to fill the buffer with the requested
8700  // value using snprintf within __kmp_str_buf_print()
8701  switch (absolute_short_name) {
8702  case 't':
8703  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8704  break;
8705  case 'T':
8706  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8707  break;
8708  case 'L':
8709  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8710  break;
8711  case 'n':
8712  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8713  break;
8714  case 'H': {
8715  static const int BUFFER_SIZE = 256;
8716  char buf[BUFFER_SIZE];
8717  __kmp_expand_host_name(buf, BUFFER_SIZE);
8718  rc = __kmp_str_buf_print(field_buffer, format, buf);
8719  } break;
8720  case 'P':
8721  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8722  break;
8723  case 'i':
8724  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8725  break;
8726  case 'N':
8727  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8728  break;
8729  case 'a':
8730  field_value =
8731  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8732  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8733  break;
8734 #if KMP_AFFINITY_SUPPORTED
8735  case 'A': {
8736  if (th->th.th_affin_mask) {
8737  kmp_str_buf_t buf;
8738  __kmp_str_buf_init(&buf);
8739  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8740  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8741  __kmp_str_buf_free(&buf);
8742  } else {
8743  rc = __kmp_str_buf_print(field_buffer, "%s", "disabled");
8744  }
8745  } break;
8746 #endif
8747  default:
8748  // According to the spec, if an implementation does not have info for a
8749  // field type, then "undefined" is printed
8750  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8751  // Skip the field
8752  if (parse_long_name) {
8753  SKIP_TOKEN(*ptr);
8754  if (**ptr == '}')
8755  (*ptr)++;
8756  } else {
8757  (*ptr)++;
8758  }
8759  }
8760 
8761  KMP_ASSERT(format_index <= FORMAT_SIZE);
8762  return rc;
8763 }
8764 
8765 /*
8766  * Return number of characters needed to hold the affinity string
8767  * (not including null byte character)
8768  * The resultant string is printed to buffer, which the caller can then
8769  * handle afterwards
8770  */
8771 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8772  kmp_str_buf_t *buffer) {
8773  const char *parse_ptr;
8774  size_t retval;
8775  const kmp_info_t *th;
8776  kmp_str_buf_t field;
8777 
8778  KMP_DEBUG_ASSERT(buffer);
8779  KMP_DEBUG_ASSERT(gtid >= 0);
8780 
8781  __kmp_str_buf_init(&field);
8782  __kmp_str_buf_clear(buffer);
8783 
8784  th = __kmp_threads[gtid];
8785  retval = 0;
8786 
8787  // If format is NULL or zero-length string, then we use
8788  // affinity-format-var ICV
8789  parse_ptr = format;
8790  if (parse_ptr == NULL || *parse_ptr == '\0') {
8791  parse_ptr = __kmp_affinity_format;
8792  }
8793  KMP_DEBUG_ASSERT(parse_ptr);
8794 
8795  while (*parse_ptr != '\0') {
8796  // Parse a field
8797  if (*parse_ptr == '%') {
8798  // Put field in the buffer
8799  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8800  __kmp_str_buf_catbuf(buffer, &field);
8801  retval += rc;
8802  } else {
8803  // Put literal character in buffer
8804  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8805  retval++;
8806  parse_ptr++;
8807  }
8808  }
8809  __kmp_str_buf_free(&field);
8810  return retval;
8811 }
8812 
8813 // Displays the affinity string to stdout
8814 void __kmp_aux_display_affinity(int gtid, const char *format) {
8815  kmp_str_buf_t buf;
8816  __kmp_str_buf_init(&buf);
8817  __kmp_aux_capture_affinity(gtid, format, &buf);
8818  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8819  __kmp_str_buf_free(&buf);
8820 }
8821 
8822 /* ------------------------------------------------------------------------ */
8823 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8824  int blocktime = arg; /* argument is in microseconds */
8825 #if KMP_USE_MONITOR
8826  int bt_intervals;
8827 #endif
8828  kmp_int8 bt_set;
8829 
8830  __kmp_save_internal_controls(thread);
8831 
8832  /* Normalize and set blocktime for the teams */
8833  if (blocktime < KMP_MIN_BLOCKTIME)
8834  blocktime = KMP_MIN_BLOCKTIME;
8835  else if (blocktime > KMP_MAX_BLOCKTIME)
8836  blocktime = KMP_MAX_BLOCKTIME;
8837 
8838  set__blocktime_team(thread->th.th_team, tid, blocktime);
8839  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8840 
8841 #if KMP_USE_MONITOR
8842  /* Calculate and set blocktime intervals for the teams */
8843  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8844 
8845  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8846  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8847 #endif
8848 
8849  /* Set whether blocktime has been set to "TRUE" */
8850  bt_set = TRUE;
8851 
8852  set__bt_set_team(thread->th.th_team, tid, bt_set);
8853  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8854 #if KMP_USE_MONITOR
8855  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8856  "bt_intervals=%d, monitor_updates=%d\n",
8857  __kmp_gtid_from_tid(tid, thread->th.th_team),
8858  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8859  __kmp_monitor_wakeups));
8860 #else
8861  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8862  __kmp_gtid_from_tid(tid, thread->th.th_team),
8863  thread->th.th_team->t.t_id, tid, blocktime));
8864 #endif
8865 }
8866 
8867 void __kmp_aux_set_defaults(char const *str, size_t len) {
8868  if (!__kmp_init_serial) {
8869  __kmp_serial_initialize();
8870  }
8871  __kmp_env_initialize(str);
8872 
8873  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8874  __kmp_env_print();
8875  }
8876 } // __kmp_aux_set_defaults
8877 
8878 /* ------------------------------------------------------------------------ */
8879 /* internal fast reduction routines */
8880 
8881 PACKED_REDUCTION_METHOD_T
8882 __kmp_determine_reduction_method(
8883  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8884  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8885  kmp_critical_name *lck) {
8886 
8887  // Default reduction method: critical construct ( lck != NULL, like in current
8888  // PAROPT )
8889  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8890  // can be selected by RTL
8891  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8892  // can be selected by RTL
8893  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8894  // among generated by PAROPT.
8895 
8896  PACKED_REDUCTION_METHOD_T retval;
8897 
8898  int team_size;
8899 
8900  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8901 
8902 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8903  (loc && \
8904  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8905 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8906 
8907  retval = critical_reduce_block;
8908 
8909  // another way of getting the team size (with 1 dynamic dereference) is slower
8910  team_size = __kmp_get_team_num_threads(global_tid);
8911  if (team_size == 1) {
8912 
8913  retval = empty_reduce_block;
8914 
8915  } else {
8916 
8917  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8918 
8919 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8920  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
8921  KMP_ARCH_VE || KMP_ARCH_S390X || KMP_ARCH_WASM
8922 
8923 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8924  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU || \
8925  KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8926 
8927  int teamsize_cutoff = 4;
8928 
8929 #if KMP_MIC_SUPPORTED
8930  if (__kmp_mic_type != non_mic) {
8931  teamsize_cutoff = 8;
8932  }
8933 #endif
8934  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8935  if (tree_available) {
8936  if (team_size <= teamsize_cutoff) {
8937  if (atomic_available) {
8938  retval = atomic_reduce_block;
8939  }
8940  } else {
8941  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8942  }
8943  } else if (atomic_available) {
8944  retval = atomic_reduce_block;
8945  }
8946 #else
8947 #error "Unknown or unsupported OS"
8948 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8949  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HAIKU ||
8950  // KMP_OS_HURD || KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8951 
8952 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS || \
8953  KMP_ARCH_WASM || KMP_ARCH_PPC || KMP_ARCH_AARCH64_32 || KMP_ARCH_SPARC
8954 
8955 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8956  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_HAIKU || KMP_OS_HURD || \
8957  KMP_OS_SOLARIS || KMP_OS_WASI || KMP_OS_AIX
8958 
8959  // basic tuning
8960 
8961  if (atomic_available) {
8962  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8963  retval = atomic_reduce_block;
8964  }
8965  } // otherwise: use critical section
8966 
8967 #elif KMP_OS_DARWIN
8968 
8969  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8970  if (atomic_available && (num_vars <= 3)) {
8971  retval = atomic_reduce_block;
8972  } else if (tree_available) {
8973  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8974  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8975  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8976  }
8977  } // otherwise: use critical section
8978 
8979 #else
8980 #error "Unknown or unsupported OS"
8981 #endif
8982 
8983 #else
8984 #error "Unknown or unsupported architecture"
8985 #endif
8986  }
8987 
8988  // KMP_FORCE_REDUCTION
8989 
8990  // If the team is serialized (team_size == 1), ignore the forced reduction
8991  // method and stay with the unsynchronized method (empty_reduce_block)
8992  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8993  team_size != 1) {
8994 
8995  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8996 
8997  int atomic_available, tree_available;
8998 
8999  switch ((forced_retval = __kmp_force_reduction_method)) {
9000  case critical_reduce_block:
9001  KMP_ASSERT(lck); // lck should be != 0
9002  break;
9003 
9004  case atomic_reduce_block:
9005  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
9006  if (!atomic_available) {
9007  KMP_WARNING(RedMethodNotSupported, "atomic");
9008  forced_retval = critical_reduce_block;
9009  }
9010  break;
9011 
9012  case tree_reduce_block:
9013  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
9014  if (!tree_available) {
9015  KMP_WARNING(RedMethodNotSupported, "tree");
9016  forced_retval = critical_reduce_block;
9017  } else {
9018 #if KMP_FAST_REDUCTION_BARRIER
9019  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
9020 #endif
9021  }
9022  break;
9023 
9024  default:
9025  KMP_ASSERT(0); // "unsupported method specified"
9026  }
9027 
9028  retval = forced_retval;
9029  }
9030 
9031  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
9032 
9033 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
9034 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
9035 
9036  return (retval);
9037 }
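
// Worked example of the selection above (illustrative, assuming x86_64 Linux
// and no forced method): for a team of 16 threads where the compiler emitted
// both a tree-reduction routine (reduce_data/reduce_func) and the
// KMP_IDENT_ATOMIC_REDUCE flag, team_size exceeds the cutoff of 4, so
// TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER is chosen; a 2-thread team with
// the same inputs would pick atomic_reduce_block instead.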
9038 // This function is for testing the set/get/determine reduce method
9039 kmp_int32 __kmp_get_reduce_method(void) {
9040  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
9041 }
9042 
9043 // Soft pause sets up threads to ignore blocktime and just go to sleep.
9044 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
9045 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
9046 
9047 // Hard pause shuts down the runtime completely. Resume happens naturally when
9048 // OpenMP is used subsequently.
9049 void __kmp_hard_pause() {
9050  __kmp_pause_status = kmp_hard_paused;
9051  __kmp_internal_end_thread(-1);
9052 }
9053 
9054 // Soft resume sets __kmp_pause_status, and wakes up all threads.
9055 void __kmp_resume_if_soft_paused() {
9056  if (__kmp_pause_status == kmp_soft_paused) {
9057  __kmp_pause_status = kmp_not_paused;
9058 
9059  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
9060  kmp_info_t *thread = __kmp_threads[gtid];
9061  if (thread) { // Wake it if sleeping
9062  kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
9063  thread);
9064  if (fl.is_sleeping())
9065  fl.resume(gtid);
9066  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
9067  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
9068  } else { // thread holds the lock and may sleep soon
9069  do { // until either the thread sleeps, or we can get the lock
9070  if (fl.is_sleeping()) {
9071  fl.resume(gtid);
9072  break;
9073  } else if (__kmp_try_suspend_mx(thread)) {
9074  __kmp_unlock_suspend_mx(thread);
9075  break;
9076  }
9077  } while (1);
9078  }
9079  }
9080  }
9081  }
9082 }
9083 
9084 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
9085 // TODO: add warning messages
9086 int __kmp_pause_resource(kmp_pause_status_t level) {
9087  if (level == kmp_not_paused) { // requesting resume
9088  if (__kmp_pause_status == kmp_not_paused) {
9089  // error message about runtime not being paused, so can't resume
9090  return 1;
9091  } else {
9092  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9093  __kmp_pause_status == kmp_hard_paused);
9094  __kmp_pause_status = kmp_not_paused;
9095  return 0;
9096  }
9097  } else if (level == kmp_soft_paused) { // requesting soft pause
9098  if (__kmp_pause_status != kmp_not_paused) {
9099  // error message about already being paused
9100  return 1;
9101  } else {
9102  __kmp_soft_pause();
9103  return 0;
9104  }
9105  } else if (level == kmp_hard_paused || level == kmp_stop_tool_paused) {
9106  // requesting hard pause or stop_tool pause
9107  if (__kmp_pause_status != kmp_not_paused) {
9108  // error message about already being paused
9109  return 1;
9110  } else {
9111  __kmp_hard_pause();
9112  return 0;
9113  }
9114  } else {
9115  // error message about invalid level
9116  return 1;
9117  }
9118 }
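
// Minimal usage sketch (hypothetical helper, guarded out): a soft pause
// followed by a resume request through the routine above, which
// __kmpc_pause_resource calls into; each call returns 0 on success and 1
// otherwise (already paused, not paused, or invalid level).
#if 0
static void example_pause_cycle(void) {
  int rc = __kmp_pause_resource(kmp_soft_paused); // request a soft pause
  // ... spin-waiting threads notice kmp_soft_paused and go to sleep ...
  rc = __kmp_pause_resource(kmp_not_paused); // clear the paused state
  (void)rc;
}
#endif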
9119 
9120 void __kmp_omp_display_env(int verbose) {
9121  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9122  if (__kmp_init_serial == 0)
9123  __kmp_do_serial_initialize();
9124  __kmp_display_env_impl(!verbose, verbose);
9125  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9126 }
9127 
9128 // The team size is changing, so distributed barrier must be modified
9129 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9130  int new_nthreads) {
9131  KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9132  bp_dist_bar);
9133  kmp_info_t **other_threads = team->t.t_threads;
9134 
9135  // We want all the workers to stop waiting on the barrier while we adjust the
9136  // size of the team.
9137  for (int f = 1; f < old_nthreads; ++f) {
9138  KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9139  // Ignore threads that are already inactive or not present in the team
9140  if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9141  // teams construct causes thread_limit to get passed in, and some of
9142  // those could be inactive; just ignore them
9143  continue;
9144  }
9145  // If thread is transitioning still to in_use state, wait for it
9146  if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9147  while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9148  KMP_CPU_PAUSE();
9149  }
9150  // The thread should be in_use now
9151  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9152  // Transition to unused state
9153  team->t.t_threads[f]->th.th_used_in_team.store(2);
9154  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9155  }
9156  // Release all the workers
9157  team->t.b->go_release();
9158 
9159  KMP_MFENCE();
9160 
9161  // Workers should see transition status 2 and move to 0, but they may need
9162  // to be woken up first
9163  int count = old_nthreads - 1;
9164  while (count > 0) {
9165  count = old_nthreads - 1;
9166  for (int f = 1; f < old_nthreads; ++f) {
9167  if (other_threads[f]->th.th_used_in_team.load() != 0) {
9168  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9169  kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9170  void *, other_threads[f]->th.th_sleep_loc);
9171  __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9172  }
9173  } else {
9174  KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9175  count--;
9176  }
9177  }
9178  }
9179  // Now update the barrier size
9180  team->t.b->update_num_threads(new_nthreads);
9181  team->t.b->go_reset();
9182 }
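// For reference (as used by the routines above and below), th_used_in_team
// acts as a small per-thread state machine during distributed-barrier resizing:
//   0 - not part of the team
//   1 - in use by the team
//   2 - transitioning out of the team (set by the resizing thread)
//   3 - transitioning back into the team (set when threads are re-added)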
9183 
9184 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9185  // Add the threads back to the team
9186  KMP_DEBUG_ASSERT(team);
9187  // Threads were paused and pointed at th_used_in_team temporarily during a
9188  // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9189  // the thread that it should transition itself back into the team. Then, if
9190  // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9191  // to wake it up.
9192  for (int f = 1; f < new_nthreads; ++f) {
9193  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
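  // Only flip 0 -> 3 here: a thread already marked as in the team (or still
  // mid-transition) is left alone, and the CAS result is intentionally ignored.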
9194  (void)KMP_COMPARE_AND_STORE_ACQ32(
9195  &(team->t.t_threads[f]->th.th_used_in_team), 0, 3);
9196  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9197  __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9198  (kmp_flag_32<false, false> *)NULL);
9199  }
9200  }
9201  // The threads should be transitioning to the team; when they are done, they
9202  // should have set th_used_in_team to 1. This loop forces the master to wait
9203  // until all threads have moved into the team and are waiting in the barrier.
9204  int count = new_nthreads - 1;
9205  while (count > 0) {
9206  count = new_nthreads - 1;
9207  for (int f = 1; f < new_nthreads; ++f) {
9208  if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9209  count--;
9210  }
9211  }
9212  }
9213 }
9214 
9215 // Globals and functions for hidden helper task
9216 kmp_info_t **__kmp_hidden_helper_threads;
9217 kmp_info_t *__kmp_hidden_helper_main_thread;
9218 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9219 #if KMP_OS_LINUX
9220 kmp_int32 __kmp_hidden_helper_threads_num = 8;
9221 kmp_int32 __kmp_enable_hidden_helper = TRUE;
9222 #else
9223 kmp_int32 __kmp_hidden_helper_threads_num = 0;
9224 kmp_int32 __kmp_enable_hidden_helper = FALSE;
9225 #endif
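// Default policy from the block above: hidden helper tasking is enabled with
// 8 helper threads on Linux, and disabled (0 threads) on other platforms.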
9226 
9227 namespace {
9228 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9229 
9230 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9231  // Explicit synchronization among all hidden helper threads: when a regular
9232  // thread pushes a hidden helper task to one of them, that helper thread may
9233  // not have been woken even once since the main thread released them after
9234  // creating the team.
9235  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9236  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9237  __kmp_hidden_helper_threads_num)
9238  ;
9239 
9240  // If main thread, then wait for signal
9241  if (__kmpc_master(nullptr, *gtid)) {
9242  // First, unset the initial state and release the initial thread
9243  TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9244  __kmp_hidden_helper_initz_release();
9245  __kmp_hidden_helper_main_thread_wait();
9246  // Now wake up all worker threads
9247  for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9248  __kmp_hidden_helper_worker_thread_signal();
9249  }
9250  }
9251 }
9252 } // namespace
9253 
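// Initialization flow: the routine below registers a new root, sizes the
// hidden helper team, and forks it with __kmp_hidden_helper_wrapper_fn above.
// Inside the wrapper, every helper thread increments the rendezvous counter;
// the one that wins __kmpc_master releases the thread waiting on hidden helper
// initialization, blocks until the main-thread signal arrives, and then
// signals the remaining helper workers.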
9254 void __kmp_hidden_helper_threads_initz_routine() {
9255  // Create a new root for hidden helper team/threads
9256  const int gtid = __kmp_register_root(TRUE);
9257  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9258  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9259  __kmp_hidden_helper_main_thread->th.th_set_nproc =
9260  __kmp_hidden_helper_threads_num;
9261 
9262  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9263 
9264  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9265 
9266  // Set the initialization flag to FALSE
9267  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9268 
9269  __kmp_hidden_helper_threads_deinitz_release();
9270 }
9271 
9272 /* Nesting Mode:
9273  Set via KMP_NESTING_MODE, which takes an integer.
9274  Note: we skip duplicate topology levels, and skip levels with only
9275  one entity.
9276  KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9277  KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9278  in the topology, and initializes the number of threads at each of those
9279  levels to the number of entities at each level, respectively, below the
9280  entity at the parent level.
9281  KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9282  but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9283  the user to turn nesting on explicitly. This is an even more experimental
9284  option within an already experimental feature, and may change or go away
9285  in the future.
9286 */
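// Illustration (hypothetical topology): if the runtime sees 2 sockets x 8
// cores x 2 hw threads, KMP_NESTING_MODE=1 would produce three nesting levels
// with 2, 8, and 2 threads respectively; a level exposing only one entity
// (e.g., a single NUMA domain) would be skipped.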
9287 
9288 // Allocate space to store nesting levels
9289 void __kmp_init_nesting_mode() {
9290  int levels = KMP_HW_LAST;
9291  __kmp_nesting_mode_nlevels = levels;
9292  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9293  for (int i = 0; i < levels; ++i)
9294  __kmp_nesting_nth_level[i] = 0;
9295  if (__kmp_nested_nth.size < levels) {
9296  __kmp_nested_nth.nth =
9297  (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9298  __kmp_nested_nth.size = levels;
9299  }
9300 }
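// Note: KMP_HW_LAST is the number of recognized hardware topology levels, so
// the arrays above are sized for the worst case before the real topology (and
// hence the real level count) is known.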
9301 
9302 // Set # threads for top levels of nesting; must be called after the topology is set
9303 void __kmp_set_nesting_mode_threads() {
9304  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9305 
9306  if (__kmp_nesting_mode == 1)
9307  __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9308  else if (__kmp_nesting_mode > 1)
9309  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9310 
9311  if (__kmp_topology) { // use topology info
9312  int loc, hw_level;
9313  for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9314  loc < __kmp_nesting_mode_nlevels;
9315  loc++, hw_level++) {
9316  __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9317  if (__kmp_nesting_nth_level[loc] == 1)
9318  loc--;
9319  }
9320  // Make sure all cores are used
9321  if (__kmp_nesting_mode > 1 && loc > 1) {
9322  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9323  int num_cores = __kmp_topology->get_count(core_level);
9324  int upper_levels = 1;
9325  for (int level = 0; level < loc - 1; ++level)
9326  upper_levels *= __kmp_nesting_nth_level[level];
9327  if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9328  __kmp_nesting_nth_level[loc - 1] =
9329  num_cores / __kmp_nesting_nth_level[loc - 2];
9330  }
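  // e.g. (hypothetical numbers): if the detected levels so far are {2, 4} but
  // the machine has 16 physical cores, 2 * 4 < 16, so the last level is bumped
  // to 16 / 2 = 8 and the product of the levels again covers every core.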
9331  __kmp_nesting_mode_nlevels = loc;
9332  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9333  } else { // no topology info available; provide a reasonable estimate
9334  if (__kmp_avail_proc >= 4) {
9335  __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9336  __kmp_nesting_nth_level[1] = 2;
9337  __kmp_nesting_mode_nlevels = 2;
9338  } else {
9339  __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9340  __kmp_nesting_mode_nlevels = 1;
9341  }
9342  __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9343  }
9344  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9345  __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9346  }
9347  set__nproc(thread, __kmp_nesting_nth_level[0]);
9348  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9349  __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9350  if (get__max_active_levels(thread) > 1) {
9351  // if max levels was set, set nesting mode levels to same
9352  __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9353  }
9354  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9355  set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9356 }
9357 
9358 // Empty symbols to export (see exports_so.txt) when feature is disabled
9359 extern "C" {
9360 #if !KMP_STATS_ENABLED
9361 void __kmp_reset_stats() {}
9362 #endif
9363 #if !USE_DEBUGGER
9364 int __kmp_omp_debug_struct_info = FALSE;
9365 int __kmp_debugging = FALSE;
9366 #endif
9367 #if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9368 void __kmp_itt_fini_ittlib() {}
9369 void __kmp_itt_init_ittlib() {}
9370 #endif
9371 }
9372 
9373 // end of file