LLVM OpenMP* Runtime Library
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if KMP_OS_WINDOWS
45 // Windows does not need these include files as it doesn't use shared memory
46 #else
47 #include <sys/mman.h>
48 #include <sys/stat.h>
49 #include <fcntl.h>
50 #define SHM_SIZE 1024
51 #endif
52 
53 #if defined(KMP_GOMP_COMPAT)
54 char const __kmp_version_alt_comp[] =
55  KMP_VERSION_PREFIX "alternative compiler support: yes";
56 #endif /* defined(KMP_GOMP_COMPAT) */
57 
58 char const __kmp_version_omp_api[] =
59  KMP_VERSION_PREFIX "API version: 5.0 (201611)";
60 
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63  KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65 
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67 
68 /* ------------------------------------------------------------------------ */
69 
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73 
74 /* Forward declarations */
75 
76 void __kmp_cleanup(void);
77 
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79  int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81  kmp_internal_control_t *new_icvs,
82  ident_t *loc);
83 #if KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85  int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91  kmp_internal_control_t *new_icvs, ident_t *loc);
92 
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96 
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
102 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
103 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
104 
105 /* Calculate the identifier of the current thread */
106 /* fast (and somewhat portable) way to get unique identifier of executing
107  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
108 int __kmp_get_global_thread_id() {
109  int i;
110  kmp_info_t **other_threads;
111  size_t stack_data;
112  char *stack_addr;
113  size_t stack_size;
114  char *stack_base;
115 
116  KA_TRACE(
117  1000,
118  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
119  __kmp_nth, __kmp_all_nth));
120 
121  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
122  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
123  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
124  __kmp_init_gtid for this to work. */
125 
126  if (!TCR_4(__kmp_init_gtid))
127  return KMP_GTID_DNE;
128 
129 #ifdef KMP_TDATA_GTID
130  if (TCR_4(__kmp_gtid_mode) >= 3) {
131  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
132  return __kmp_gtid;
133  }
134 #endif
135  if (TCR_4(__kmp_gtid_mode) >= 2) {
136  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
137  return __kmp_gtid_get_specific();
138  }
139  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
140 
141  stack_addr = (char *)&stack_data;
142  other_threads = __kmp_threads;
143 
144  /* ATT: The code below is a source of potential bugs due to unsynchronized
145  access to __kmp_threads array. For example:
146  1. Current thread loads other_threads[i] to thr and checks it, it is
147  non-NULL.
148  2. Current thread is suspended by OS.
149  3. Another thread unregisters and finishes (debug versions of free()
150  may fill memory with something like 0xEF).
151  4. Current thread is resumed.
152  5. Current thread reads junk from *thr.
153  TODO: Fix it. --ln */
154 
155  for (i = 0; i < __kmp_threads_capacity; i++) {
156 
157  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
158  if (!thr)
159  continue;
160 
161  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
162  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
163 
164  /* stack grows down -- search through all of the active threads */
165 
166  if (stack_addr <= stack_base) {
167  size_t stack_diff = stack_base - stack_addr;
168 
169  if (stack_diff <= stack_size) {
170  /* The only way we can be closer than the allocated */
171  /* stack size is if we are running on this thread. */
172  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
173  return i;
174  }
175  }
176  }
177 
178  /* get specific to try and determine our gtid */
179  KA_TRACE(1000,
180  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
181  "thread, using TLS\n"));
182  i = __kmp_gtid_get_specific();
183 
184  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
185 
186  /* if we haven't been assigned a gtid, then return the error code */
187  if (i < 0)
188  return i;
189 
190  /* dynamically updated stack window for uber threads to avoid get_specific
191  call */
192  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
193  KMP_FATAL(StackOverflow, i);
194  }
195 
196  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
197  if (stack_addr > stack_base) {
198  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
199  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
200  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
201  stack_base);
202  } else {
203  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
204  stack_base - stack_addr);
205  }
206 
207  /* Reprint stack bounds for ubermaster since they have been refined */
208  if (__kmp_storage_map) {
209  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
210  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
211  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
212  other_threads[i]->th.th_info.ds.ds_stacksize,
213  "th_%d stack (refinement)", i);
214  }
215  return i;
216 }
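
// Illustrative sketch (hypothetical helper, not part of the runtime): the
// stack-containment test used by the internal algorithm above, in isolation.
// Since the stack grows downward, an address belongs to a thread if it lies
// within [stack_base - stack_size, stack_base].
#if 0
#include <cstddef>
static bool addr_belongs_to_stack(const char *addr, const char *stack_base,
                                  size_t stack_size) {
  if (addr > stack_base)
    return false; // above the recorded top of this thread's stack
  return (size_t)(stack_base - addr) <= stack_size; // inside the window
}
#endif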
217 
218 int __kmp_get_global_thread_id_reg() {
219  int gtid;
220 
221  if (!__kmp_init_serial) {
222  gtid = KMP_GTID_DNE;
223  } else
224 #ifdef KMP_TDATA_GTID
225  if (TCR_4(__kmp_gtid_mode) >= 3) {
226  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
227  gtid = __kmp_gtid;
228  } else
229 #endif
230  if (TCR_4(__kmp_gtid_mode) >= 2) {
231  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
232  gtid = __kmp_gtid_get_specific();
233  } else {
234  KA_TRACE(1000,
235  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
236  gtid = __kmp_get_global_thread_id();
237  }
238 
239  /* we must be a new uber master sibling thread */
240  if (gtid == KMP_GTID_DNE) {
241  KA_TRACE(10,
242  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
243  "Registering a new gtid.\n"));
244  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
245  if (!__kmp_init_serial) {
246  __kmp_do_serial_initialize();
247  gtid = __kmp_gtid_get_specific();
248  } else {
249  gtid = __kmp_register_root(FALSE);
250  }
251  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
252  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
253  }
254 
255  KMP_DEBUG_ASSERT(gtid >= 0);
256 
257  return gtid;
258 }
259 
260 /* caller must hold forkjoin_lock */
261 void __kmp_check_stack_overlap(kmp_info_t *th) {
262  int f;
263  char *stack_beg = NULL;
264  char *stack_end = NULL;
265  int gtid;
266 
267  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
268  if (__kmp_storage_map) {
269  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
270  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
271 
272  gtid = __kmp_gtid_from_thread(th);
273 
274  if (gtid == KMP_GTID_MONITOR) {
275  __kmp_print_storage_map_gtid(
276  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
277  "th_%s stack (%s)", "mon",
278  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
279  } else {
280  __kmp_print_storage_map_gtid(
281  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
282  "th_%d stack (%s)", gtid,
283  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
284  }
285  }
286 
287  /* No point in checking ubermaster threads since they use refinement and
288  * cannot overlap */
289  gtid = __kmp_gtid_from_thread(th);
290  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
291  KA_TRACE(10,
292  ("__kmp_check_stack_overlap: performing extensive checking\n"));
293  if (stack_beg == NULL) {
294  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
295  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
296  }
297 
298  for (f = 0; f < __kmp_threads_capacity; f++) {
299  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
300 
301  if (f_th && f_th != th) {
302  char *other_stack_end =
303  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
304  char *other_stack_beg =
305  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
306  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
307  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
308 
309  /* Print the other stack values before the abort */
310  if (__kmp_storage_map)
311  __kmp_print_storage_map_gtid(
312  -1, other_stack_beg, other_stack_end,
313  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
314  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
315 
316  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
317  __kmp_msg_null);
318  }
319  }
320  }
321  }
322  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
323 }
324 
325 /* ------------------------------------------------------------------------ */
326 
327 void __kmp_infinite_loop(void) {
328  static int done = FALSE;
329 
330  while (!done) {
331  KMP_YIELD(TRUE);
332  }
333 }
334 
335 #define MAX_MESSAGE 512
336 
337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
338  char const *format, ...) {
339  char buffer[MAX_MESSAGE];
340  va_list ap;
341 
342  va_start(ap, format);
343  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
344  p2, (unsigned long)size, format);
345  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
346  __kmp_vprintf(kmp_err, buffer, ap);
347 #if KMP_PRINT_DATA_PLACEMENT
348  int node;
349  if (gtid >= 0) {
350  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
351  if (__kmp_storage_map_verbose) {
352  node = __kmp_get_host_node(p1);
353  if (node < 0) /* doesn't work, so don't try this next time */
354  __kmp_storage_map_verbose = FALSE;
355  else {
356  char *last;
357  int lastNode;
358  int localProc = __kmp_get_cpu_from_gtid(gtid);
359 
360  const int page_size = KMP_GET_PAGE_SIZE();
361 
362  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
363  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
364  if (localProc >= 0)
365  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
366  localProc >> 1);
367  else
368  __kmp_printf_no_lock(" GTID %d\n", gtid);
369 #if KMP_USE_PRCTL
370  /* The more elaborate format is disabled for now because of the prctl
371  * hanging bug. */
372  do {
373  last = p1;
374  lastNode = node;
375  /* This loop collates adjacent pages with the same host node. */
376  do {
377  (char *)p1 += page_size;
378  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
379  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
380  lastNode);
381  } while (p1 <= p2);
382 #else
383  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
384  (char *)p1 + (page_size - 1),
385  __kmp_get_host_node(p1));
386  if (p1 < p2) {
387  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
388  (char *)p2 + (page_size - 1),
389  __kmp_get_host_node(p2));
390  }
391 #endif
392  }
393  }
394  } else
395  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
396  }
397 #endif /* KMP_PRINT_DATA_PLACEMENT */
398  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
399 }
400 
401 void __kmp_warn(char const *format, ...) {
402  char buffer[MAX_MESSAGE];
403  va_list ap;
404 
405  if (__kmp_generate_warnings == kmp_warnings_off) {
406  return;
407  }
408 
409  va_start(ap, format);
410 
411  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
412  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
413  __kmp_vprintf(kmp_err, buffer, ap);
414  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
415 
416  va_end(ap);
417 }
418 
419 void __kmp_abort_process() {
420  // Later threads may stall here, but that's ok because abort() will kill them.
421  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
422 
423  if (__kmp_debug_buf) {
424  __kmp_dump_debug_buffer();
425  }
426 
427  if (KMP_OS_WINDOWS) {
428  // Let other threads know of abnormal termination and prevent deadlock
429  // if abort happened during library initialization or shutdown
430  __kmp_global.g.g_abort = SIGABRT;
431 
432  /* On Windows* OS, by default abort() causes a pop-up error box, which stalls
433  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
434  boxes. _set_abort_behavior() works well, but this function is not
435  available in VS7 (this is not a problem for the DLL, but it is a problem
436  for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
437  does not help, at least in some versions of the MS C RTL.
438 
439  It seems the following sequence is the only way to simulate abort() and
440  avoid the pop-up error box. */
441  raise(SIGABRT);
442  _exit(3); // Just in case, if signal ignored, exit anyway.
443  } else {
444  abort();
445  }
446 
447  __kmp_infinite_loop();
448  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449 
450 } // __kmp_abort_process
451 
452 void __kmp_abort_thread(void) {
453  // TODO: Eliminate g_abort global variable and this function.
454  // In case of abort, just call abort(); it will kill all the threads.
455  __kmp_infinite_loop();
456 } // __kmp_abort_thread
457 
458 /* Print out the storage map for the major kmp_info_t thread data structures
459  that are allocated together. */
460 
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463  gtid);
464 
465  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467 
468  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469  sizeof(kmp_local_t), "th_%d.th_local", gtid);
470 
471  __kmp_print_storage_map_gtid(
472  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474 
475  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476  &thr->th.th_bar[bs_plain_barrier + 1],
477  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478  gtid);
479 
480  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481  &thr->th.th_bar[bs_forkjoin_barrier + 1],
482  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483  gtid);
484 
485 #if KMP_FAST_REDUCTION_BARRIER
486  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487  &thr->th.th_bar[bs_reduction_barrier + 1],
488  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489  gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492 
493 /* Print out the storage map for the major kmp_team_t team data structures
494  that are allocated together. */
495 
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497  int team_id, int num_thr) {
498  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500  header, team_id);
501 
502  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503  &team->t.t_bar[bs_last_barrier],
504  sizeof(kmp_balign_team_t) * bs_last_barrier,
505  "%s_%d.t_bar", header, team_id);
506 
507  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508  &team->t.t_bar[bs_plain_barrier + 1],
509  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510  header, team_id);
511 
512  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513  &team->t.t_bar[bs_forkjoin_barrier + 1],
514  sizeof(kmp_balign_team_t),
515  "%s_%d.t_bar[forkjoin]", header, team_id);
516 
517 #if KMP_FAST_REDUCTION_BARRIER
518  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519  &team->t.t_bar[bs_reduction_barrier + 1],
520  sizeof(kmp_balign_team_t),
521  "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523 
524  __kmp_print_storage_map_gtid(
525  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527 
528  __kmp_print_storage_map_gtid(
529  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531 
532  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533  &team->t.t_disp_buffer[num_disp_buff],
534  sizeof(dispatch_shared_info_t) * num_disp_buff,
535  "%s_%d.t_disp_buffer", header, team_id);
536 }
537 
538 static void __kmp_init_allocator() { __kmp_init_memkind(); }
539 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
540 
541 /* ------------------------------------------------------------------------ */
542 
543 #if KMP_DYNAMIC_LIB
544 #if KMP_OS_WINDOWS
545 
546 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
547  // TODO: Change to __kmp_break_bootstrap_lock().
548  __kmp_init_bootstrap_lock(lck); // make the lock released
549 }
550 
551 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
552  int i;
553  int thread_count;
554 
555  // PROCESS_DETACH is expected to be called by a thread that executes
556  // ProcessExit() or FreeLibrary(). The OS terminates other threads (except
557  // the one calling ProcessExit or FreeLibrary). So, it might be safe to
558  // access __kmp_threads[] without taking the forkjoin_lock. However, in fact,
559  // some threads can still be alive here, although they are about to be
560  // terminated. The threads in the array with ds_thread==0 are the most
561  // suspicious, so it may not be safe to access __kmp_threads[].
562 
563  // TODO: does it make sense to check __kmp_roots[] ?
564 
565  // Let's check that there are no other alive threads registered with the OMP
566  // lib.
567  while (1) {
568  thread_count = 0;
569  for (i = 0; i < __kmp_threads_capacity; ++i) {
570  if (!__kmp_threads)
571  continue;
572  kmp_info_t *th = __kmp_threads[i];
573  if (th == NULL)
574  continue;
575  int gtid = th->th.th_info.ds.ds_gtid;
576  if (gtid == gtid_req)
577  continue;
578  if (gtid < 0)
579  continue;
580  DWORD exit_val;
581  int alive = __kmp_is_thread_alive(th, &exit_val);
582  if (alive) {
583  ++thread_count;
584  }
585  }
586  if (thread_count == 0)
587  break; // success
588  }
589 
590  // Assume that I'm alone. Now it might be safe to check and reset locks.
591  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
592  __kmp_reset_lock(&__kmp_forkjoin_lock);
593 #ifdef KMP_DEBUG
594  __kmp_reset_lock(&__kmp_stdio_lock);
595 #endif // KMP_DEBUG
596 }
597 
598 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
599  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
600 
601  switch (fdwReason) {
602 
603  case DLL_PROCESS_ATTACH:
604  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
605 
606  return TRUE;
607 
608  case DLL_PROCESS_DETACH:
609  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
610 
611  if (lpReserved != NULL) {
612  // lpReserved is used for telling the difference:
613  // lpReserved == NULL when FreeLibrary() was called,
614  // lpReserved != NULL when the process terminates.
615  // When FreeLibrary() is called, worker threads remain alive. So they will
616  // release the forkjoin lock by themselves. When the process terminates,
617  // worker threads disappear, triggering the problem of an unreleased forkjoin
618  // lock as described below.
619 
620  // A worker thread can take the forkjoin lock. The problem comes up if
621  // that worker thread becomes dead before it releases the forkjoin lock.
622  // The forkjoin lock remains taken, while the thread executing
623  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
624  // to take the forkjoin lock and will always fail, so that the application
625  // will never finish [normally]. This scenario is possible if
626  // __kmpc_end() has not been executed. This is not just a corner case;
627  // common cases include:
628  // - the main function was compiled by an alternative compiler;
629  // - the main function was compiled by icl but without /Qopenmp
630  // (application with plugins);
631  // - application terminates by calling C exit(), Fortran CALL EXIT() or
632  // Fortran STOP.
633  // - alive foreign thread prevented __kmpc_end from doing cleanup.
634  //
635  // This is a hack to work around the problem.
636  // TODO: !!! figure out something better.
637  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
638  }
639 
640  __kmp_internal_end_library(__kmp_gtid_get_specific());
641 
642  return TRUE;
643 
644  case DLL_THREAD_ATTACH:
645  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
646 
647  /* if we want to register new siblings all the time, call
648  * __kmp_get_gtid() here */
649  return TRUE;
650 
651  case DLL_THREAD_DETACH:
652  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
653 
654  __kmp_internal_end_thread(__kmp_gtid_get_specific());
655  return TRUE;
656  }
657 
658  return TRUE;
659 }
660 
661 #endif /* KMP_OS_WINDOWS */
662 #endif /* KMP_DYNAMIC_LIB */
663 
664 /* __kmp_parallel_deo -- Wait until it's our turn. */
665 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
666  int gtid = *gtid_ref;
667 #ifdef BUILD_PARALLEL_ORDERED
668  kmp_team_t *team = __kmp_team_from_gtid(gtid);
669 #endif /* BUILD_PARALLEL_ORDERED */
670 
671  if (__kmp_env_consistency_check) {
672  if (__kmp_threads[gtid]->th.th_root->r.r_active)
673 #if KMP_USE_DYNAMIC_LOCK
674  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
675 #else
676  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
677 #endif
678  }
679 #ifdef BUILD_PARALLEL_ORDERED
680  if (!team->t.t_serialized) {
681  KMP_MB();
682  KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
683  NULL);
684  KMP_MB();
685  }
686 #endif /* BUILD_PARALLEL_ORDERED */
687 }
688 
689 /* __kmp_parallel_dxo -- Signal the next task. */
690 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
691  int gtid = *gtid_ref;
692 #ifdef BUILD_PARALLEL_ORDERED
693  int tid = __kmp_tid_from_gtid(gtid);
694  kmp_team_t *team = __kmp_team_from_gtid(gtid);
695 #endif /* BUILD_PARALLEL_ORDERED */
696 
697  if (__kmp_env_consistency_check) {
698  if (__kmp_threads[gtid]->th.th_root->r.r_active)
699  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
700  }
701 #ifdef BUILD_PARALLEL_ORDERED
702  if (!team->t.t_serialized) {
703  KMP_MB(); /* Flush all pending memory write invalidates. */
704 
705  /* use the tid of the next thread in this team */
706  /* TODO replace with general release procedure */
707  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
708 
709  KMP_MB(); /* Flush all pending memory write invalidates. */
710  }
711 #endif /* BUILD_PARALLEL_ORDERED */
712 }
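
// Illustrative sketch (user code, not part of the runtime): an OpenMP ordered
// loop of the kind these handshakes support. The work before the ordered block
// runs in parallel; the ordered block executes in iteration order because each
// thread waits for its turn (deo) and then signals the next thread (dxo).
#if 0
#include <cstdio>
void ordered_demo(int n, const int *in, int *out) {
#pragma omp parallel for ordered schedule(static, 1)
  for (int i = 0; i < n; ++i) {
    out[i] = in[i] * 2; // unordered parallel work
#pragma omp ordered
    { std::printf("iteration %d done\n", i); } // emitted in loop order
  }
}
#endif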
713 
714 /* ------------------------------------------------------------------------ */
715 /* The BARRIER for a SINGLE process section is always explicit */
716 
717 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
718  int status;
719  kmp_info_t *th;
720  kmp_team_t *team;
721 
722  if (!TCR_4(__kmp_init_parallel))
723  __kmp_parallel_initialize();
724  __kmp_resume_if_soft_paused();
725 
726  th = __kmp_threads[gtid];
727  team = th->th.th_team;
728  status = 0;
729 
730  th->th.th_ident = id_ref;
731 
732  if (team->t.t_serialized) {
733  status = 1;
734  } else {
735  kmp_int32 old_this = th->th.th_local.this_construct;
736 
737  ++th->th.th_local.this_construct;
738  /* try to set team count to thread count--success means thread got the
739  single block */
740  /* TODO: Should this be acquire or release? */
741  if (team->t.t_construct == old_this) {
742  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
743  th->th.th_local.this_construct);
744  }
745 #if USE_ITT_BUILD
746  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
747  KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
748  team->t.t_active_level ==
749  1) { // Only report metadata by master of active team at level 1
750  __kmp_itt_metadata_single(id_ref);
751  }
752 #endif /* USE_ITT_BUILD */
753  }
754 
755  if (__kmp_env_consistency_check) {
756  if (status && push_ws) {
757  __kmp_push_workshare(gtid, ct_psingle, id_ref);
758  } else {
759  __kmp_check_workshare(gtid, ct_psingle, id_ref);
760  }
761  }
762 #if USE_ITT_BUILD
763  if (status) {
764  __kmp_itt_single_start(gtid);
765  }
766 #endif /* USE_ITT_BUILD */
767  return status;
768 }
769 
770 void __kmp_exit_single(int gtid) {
771 #if USE_ITT_BUILD
772  __kmp_itt_single_end(gtid);
773 #endif /* USE_ITT_BUILD */
774  if (__kmp_env_consistency_check)
775  __kmp_pop_workshare(gtid, ct_psingle, NULL);
776 }
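
// Illustrative sketch (user code, not part of the runtime): a single construct
// of the kind served by __kmp_enter_single/__kmp_exit_single. Exactly one
// thread of the team executes the block; the barrier at the end of the
// construct is emitted explicitly by the compiler, as noted above.
#if 0
#include <cstdio>
void single_demo() {
#pragma omp parallel
  {
#pragma omp single
    { std::printf("executed by exactly one thread\n"); }
    // all threads wait here unless nowait was specified
  }
}
#endif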
777 
778 /* Determine if we can go parallel or must use a serialized parallel region, and
779  * how many threads we can use.
780  * set_nthreads is the number of threads requested for the team.
781  * Returns 1 if we should serialize or only use one thread,
782  * otherwise the number of threads to use.
783  * The forkjoin lock is held by the caller. */
784 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
785  int master_tid, int set_nthreads,
786  int enter_teams) {
787  int capacity;
788  int new_nthreads;
789  KMP_DEBUG_ASSERT(__kmp_init_serial);
790  KMP_DEBUG_ASSERT(root && parent_team);
791  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
792 
793  // If dyn-var is set, dynamically adjust the number of desired threads,
794  // according to the method specified by dynamic_mode.
795  new_nthreads = set_nthreads;
796  if (!get__dynamic_2(parent_team, master_tid)) {
797  ;
798  }
799 #ifdef USE_LOAD_BALANCE
800  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
801  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
802  if (new_nthreads == 1) {
803  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
804  "reservation to 1 thread\n",
805  master_tid));
806  return 1;
807  }
808  if (new_nthreads < set_nthreads) {
809  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
810  "reservation to %d threads\n",
811  master_tid, new_nthreads));
812  }
813  }
814 #endif /* USE_LOAD_BALANCE */
815  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
816  new_nthreads = __kmp_avail_proc - __kmp_nth +
817  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
818  if (new_nthreads <= 1) {
819  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
820  "reservation to 1 thread\n",
821  master_tid));
822  return 1;
823  }
824  if (new_nthreads < set_nthreads) {
825  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
826  "reservation to %d threads\n",
827  master_tid, new_nthreads));
828  } else {
829  new_nthreads = set_nthreads;
830  }
831  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
832  if (set_nthreads > 2) {
833  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
834  new_nthreads = (new_nthreads % set_nthreads) + 1;
835  if (new_nthreads == 1) {
836  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
837  "reservation to 1 thread\n",
838  master_tid));
839  return 1;
840  }
841  if (new_nthreads < set_nthreads) {
842  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
843  "reservation to %d threads\n",
844  master_tid, new_nthreads));
845  }
846  }
847  } else {
848  KMP_ASSERT(0);
849  }
850 
851  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
852  if (__kmp_nth + new_nthreads -
853  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
854  __kmp_max_nth) {
855  int tl_nthreads = __kmp_max_nth - __kmp_nth +
856  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
857  if (tl_nthreads <= 0) {
858  tl_nthreads = 1;
859  }
860 
861  // If dyn-var is false, emit a 1-time warning.
862  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
863  __kmp_reserve_warn = 1;
864  __kmp_msg(kmp_ms_warning,
865  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
866  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
867  }
868  if (tl_nthreads == 1) {
869  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
870  "reduced reservation to 1 thread\n",
871  master_tid));
872  return 1;
873  }
874  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
875  "reservation to %d threads\n",
876  master_tid, tl_nthreads));
877  new_nthreads = tl_nthreads;
878  }
879 
880  // Respect OMP_THREAD_LIMIT
881  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
882  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
883  if (cg_nthreads + new_nthreads -
884  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
885  max_cg_threads) {
886  int tl_nthreads = max_cg_threads - cg_nthreads +
887  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
888  if (tl_nthreads <= 0) {
889  tl_nthreads = 1;
890  }
891 
892  // If dyn-var is false, emit a 1-time warning.
893  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
894  __kmp_reserve_warn = 1;
895  __kmp_msg(kmp_ms_warning,
896  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
897  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
898  }
899  if (tl_nthreads == 1) {
900  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
901  "reduced reservation to 1 thread\n",
902  master_tid));
903  return 1;
904  }
905  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
906  "reservation to %d threads\n",
907  master_tid, tl_nthreads));
908  new_nthreads = tl_nthreads;
909  }
910 
911  // Check if the threads array is large enough, or needs expanding.
912  // See comment in __kmp_register_root() about the adjustment if
913  // __kmp_threads[0] == NULL.
914  capacity = __kmp_threads_capacity;
915  if (TCR_PTR(__kmp_threads[0]) == NULL) {
916  --capacity;
917  }
918  if (__kmp_nth + new_nthreads -
919  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
920  capacity) {
921  // Expand the threads array.
922  int slotsRequired = __kmp_nth + new_nthreads -
923  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
924  capacity;
925  int slotsAdded = __kmp_expand_threads(slotsRequired);
926  if (slotsAdded < slotsRequired) {
927  // The threads array was not expanded enough.
928  new_nthreads -= (slotsRequired - slotsAdded);
929  KMP_ASSERT(new_nthreads >= 1);
930 
931  // If dyn-var is false, emit a 1-time warning.
932  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
933  __kmp_reserve_warn = 1;
934  if (__kmp_tp_cached) {
935  __kmp_msg(kmp_ms_warning,
936  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
937  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
938  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
939  } else {
940  __kmp_msg(kmp_ms_warning,
941  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
943  }
944  }
945  }
946  }
947 
948 #ifdef KMP_DEBUG
949  if (new_nthreads == 1) {
950  KC_TRACE(10,
951  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
952  "dead roots and rechecking; requested %d threads\n",
953  __kmp_get_gtid(), set_nthreads));
954  } else {
955  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
956  " %d threads\n",
957  __kmp_get_gtid(), new_nthreads, set_nthreads));
958  }
959 #endif // KMP_DEBUG
960  return new_nthreads;
961 }
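
// Illustrative sketch (user code, not part of the runtime): when dyn-var is
// true, the reservation logic above may grant fewer threads than requested by
// the num_threads clause; omp_get_num_threads() reports the team size that was
// actually delivered.
#if 0
#include <omp.h>
#include <cstdio>
void reserve_demo() {
  omp_set_dynamic(1); // allow the runtime to adjust the team size
#pragma omp parallel num_threads(64)
  {
#pragma omp master
    std::printf("requested 64, got %d threads\n", omp_get_num_threads());
  }
}
#endif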
962 
963 /* Allocate threads from the thread pool and assign them to the new team. We are
964  assured that there are enough threads available, because we checked on that
965  earlier within the forkjoin critical section. */
966 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
967  kmp_info_t *master_th, int master_gtid) {
968  int i;
969  int use_hot_team;
970 
971  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
972  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
973  KMP_MB();
974 
975  /* first, let's setup the master thread */
976  master_th->th.th_info.ds.ds_tid = 0;
977  master_th->th.th_team = team;
978  master_th->th.th_team_nproc = team->t.t_nproc;
979  master_th->th.th_team_master = master_th;
980  master_th->th.th_team_serialized = FALSE;
981  master_th->th.th_dispatch = &team->t.t_dispatch[0];
982 
983 /* make sure we are not the optimized hot team */
984 #if KMP_NESTED_HOT_TEAMS
985  use_hot_team = 0;
986  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
987  if (hot_teams) { // hot teams array is not allocated if
988  // KMP_HOT_TEAMS_MAX_LEVEL=0
989  int level = team->t.t_active_level - 1; // index in array of hot teams
990  if (master_th->th.th_teams_microtask) { // are we inside the teams?
991  if (master_th->th.th_teams_size.nteams > 1) {
992  ++level; // level was not increased in teams construct for
993  // team_of_masters
994  }
995  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
996  master_th->th.th_teams_level == team->t.t_level) {
997  ++level; // level was not increased in teams construct for
998  // team_of_workers before the parallel
999  } // team->t.t_level will be increased inside parallel
1000  }
1001  if (level < __kmp_hot_teams_max_level) {
1002  if (hot_teams[level].hot_team) {
1003  // hot team has already been allocated for given level
1004  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1005  use_hot_team = 1; // the team is ready to use
1006  } else {
1007  use_hot_team = 0; // AC: threads are not allocated yet
1008  hot_teams[level].hot_team = team; // remember new hot team
1009  hot_teams[level].hot_team_nth = team->t.t_nproc;
1010  }
1011  } else {
1012  use_hot_team = 0;
1013  }
1014  }
1015 #else
1016  use_hot_team = team == root->r.r_hot_team;
1017 #endif
1018  if (!use_hot_team) {
1019 
1020  /* install the master thread */
1021  team->t.t_threads[0] = master_th;
1022  __kmp_initialize_info(master_th, team, 0, master_gtid);
1023 
1024  /* now, install the worker threads */
1025  for (i = 1; i < team->t.t_nproc; i++) {
1026 
1027  /* fork or reallocate a new thread and install it in team */
1028  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1029  team->t.t_threads[i] = thr;
1030  KMP_DEBUG_ASSERT(thr);
1031  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1032  /* align team and thread arrived states */
1033  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1034  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1035  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1036  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1037  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1038  team->t.t_bar[bs_plain_barrier].b_arrived));
1039  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1040  thr->th.th_teams_level = master_th->th.th_teams_level;
1041  thr->th.th_teams_size = master_th->th.th_teams_size;
1042  { // Initialize threads' barrier data.
1043  int b;
1044  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1045  for (b = 0; b < bs_last_barrier; ++b) {
1046  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1047  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1048 #if USE_DEBUGGER
1049  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1050 #endif
1051  }
1052  }
1053  }
1054 
1055 #if KMP_AFFINITY_SUPPORTED
1056  __kmp_partition_places(team);
1057 #endif
1058  }
1059 
1060  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1061  for (i = 0; i < team->t.t_nproc; i++) {
1062  kmp_info_t *thr = team->t.t_threads[i];
1063  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1064  thr->th.th_prev_level != team->t.t_level) {
1065  team->t.t_display_affinity = 1;
1066  break;
1067  }
1068  }
1069  }
1070 
1071  KMP_MB();
1072 }
1073 
1074 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1075 // Propagate any changes to the floating point control registers out to the team.
1076 // We try to avoid unnecessary writes to the relevant cache line in the team
1077 // structure, so we don't make changes unless they are needed.
1078 inline static void propagateFPControl(kmp_team_t *team) {
1079  if (__kmp_inherit_fp_control) {
1080  kmp_int16 x87_fpu_control_word;
1081  kmp_uint32 mxcsr;
1082 
1083  // Get master values of FPU control flags (both X87 and vector)
1084  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1085  __kmp_store_mxcsr(&mxcsr);
1086  mxcsr &= KMP_X86_MXCSR_MASK;
1087 
1088  // There is no point looking at t_fp_control_saved here.
1089  // If it is TRUE, we still have to update the values if they are different
1090  // from those we now have. If it is FALSE we didn't save anything yet, but
1091  // our objective is the same. We have to ensure that the values in the team
1092  // are the same as those we have.
1093  // So, this code achieves what we need whether or not t_fp_control_saved is
1094  // true. By checking whether the value needs updating we avoid unnecessary
1095  // writes that would put the cache-line into a written state, causing all
1096  // threads in the team to have to read it again.
1097  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1098  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1099  // Although we don't use this value, other code in the runtime wants to know
1100  // whether it should restore them. So we must ensure it is correct.
1101  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1102  } else {
1103  // Similarly here. Don't write to this cache-line in the team structure
1104  // unless we have to.
1105  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1106  }
1107 }
1108 
1109 // Do the opposite, setting the hardware registers to the updated values from
1110 // the team.
1111 inline static void updateHWFPControl(kmp_team_t *team) {
1112  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1113  // Only reset the fp control regs if they have been changed in the team,
1114  // i.e. in the parallel region that we are exiting.
1115  kmp_int16 x87_fpu_control_word;
1116  kmp_uint32 mxcsr;
1117  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1118  __kmp_store_mxcsr(&mxcsr);
1119  mxcsr &= KMP_X86_MXCSR_MASK;
1120 
1121  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1122  __kmp_clear_x87_fpu_status_word();
1123  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1124  }
1125 
1126  if (team->t.t_mxcsr != mxcsr) {
1127  __kmp_load_mxcsr(&team->t.t_mxcsr);
1128  }
1129  }
1130 }
1131 #else
1132 #define propagateFPControl(x) ((void)0)
1133 #define updateHWFPControl(x) ((void)0)
1134 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
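
// Illustrative sketch (assumption: KMP_CHECK_UPDATE is a compare-before-store
// macro of roughly this shape; see kmp.h for the real definition). The point
// of the idiom is to avoid dirtying a cache line shared by the whole team when
// the value has not actually changed.
#if 0
#define CHECK_UPDATE_SKETCH(dst, src)                                          \
  do {                                                                         \
    if ((dst) != (src))                                                        \
      (dst) = (src); /* write only when the value differs */                   \
  } while (0)
#endif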
1135 
1136 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1137  int realloc); // forward declaration
1138 
1139 /* Run a parallel region that has been serialized, so it runs only in a team of
1140  the single master thread. */
1141 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1142  kmp_info_t *this_thr;
1143  kmp_team_t *serial_team;
1144 
1145  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1146 
1147  /* Skip all this code for autopar serialized loops since it results in
1148  unacceptable overhead */
1149  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1150  return;
1151 
1152  if (!TCR_4(__kmp_init_parallel))
1153  __kmp_parallel_initialize();
1154  __kmp_resume_if_soft_paused();
1155 
1156  this_thr = __kmp_threads[global_tid];
1157  serial_team = this_thr->th.th_serial_team;
1158 
1159  /* utilize the serialized team held by this thread */
1160  KMP_DEBUG_ASSERT(serial_team);
1161  KMP_MB();
1162 
1163  if (__kmp_tasking_mode != tskm_immediate_exec) {
1164  KMP_DEBUG_ASSERT(
1165  this_thr->th.th_task_team ==
1166  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1167  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1168  NULL);
1169  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1170  "team %p, new task_team = NULL\n",
1171  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1172  this_thr->th.th_task_team = NULL;
1173  }
1174 
1175  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1176  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1177  proc_bind = proc_bind_false;
1178  } else if (proc_bind == proc_bind_default) {
1179  // No proc_bind clause was specified, so use the current value
1180  // of proc-bind-var for this parallel region.
1181  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1182  }
1183  // Reset for next parallel region
1184  this_thr->th.th_set_proc_bind = proc_bind_default;
1185 
1186 #if OMPT_SUPPORT
1187  ompt_data_t ompt_parallel_data = ompt_data_none;
1188  ompt_data_t *implicit_task_data;
1189  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1190  if (ompt_enabled.enabled &&
1191  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1192 
1193  ompt_task_info_t *parent_task_info;
1194  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1195 
1196  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1197  if (ompt_enabled.ompt_callback_parallel_begin) {
1198  int team_size = 1;
1199 
1200  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1201  &(parent_task_info->task_data), &(parent_task_info->frame),
1202  &ompt_parallel_data, team_size,
1203  ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1204  }
1205  }
1206 #endif // OMPT_SUPPORT
1207 
1208  if (this_thr->th.th_team != serial_team) {
1209  // Nested level will be an index in the nested nthreads array
1210  int level = this_thr->th.th_team->t.t_level;
1211 
1212  if (serial_team->t.t_serialized) {
1213  /* this serial team was already used
1214  TODO: increase performance by making these locks more specific */
1215  kmp_team_t *new_team;
1216 
1217  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1218 
1219  new_team =
1220  __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1221 #if OMPT_SUPPORT
1222  ompt_parallel_data,
1223 #endif
1224  proc_bind, &this_thr->th.th_current_task->td_icvs,
1225  0 USE_NESTED_HOT_ARG(NULL));
1226  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1227  KMP_ASSERT(new_team);
1228 
1229  /* setup new serialized team and install it */
1230  new_team->t.t_threads[0] = this_thr;
1231  new_team->t.t_parent = this_thr->th.th_team;
1232  serial_team = new_team;
1233  this_thr->th.th_serial_team = serial_team;
1234 
1235  KF_TRACE(
1236  10,
1237  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1238  global_tid, serial_team));
1239 
1240  /* TODO the above breaks the requirement that if we run out of resources,
1241  then we can still guarantee that serialized teams are ok, since we may
1242  need to allocate a new one */
1243  } else {
1244  KF_TRACE(
1245  10,
1246  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1247  global_tid, serial_team));
1248  }
1249 
1250  /* we have to initialize this serial team */
1251  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1252  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1253  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1254  serial_team->t.t_ident = loc;
1255  serial_team->t.t_serialized = 1;
1256  serial_team->t.t_nproc = 1;
1257  serial_team->t.t_parent = this_thr->th.th_team;
1258  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1259  this_thr->th.th_team = serial_team;
1260  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1261 
1262  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1263  this_thr->th.th_current_task));
1264  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1265  this_thr->th.th_current_task->td_flags.executing = 0;
1266 
1267  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1268 
1269  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1270  implicit task for each serialized task represented by
1271  team->t.t_serialized? */
1272  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1273  &this_thr->th.th_current_task->td_parent->td_icvs);
1274 
1275  // Thread value exists in the nested nthreads array for the next nested
1276  // level
1277  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1278  this_thr->th.th_current_task->td_icvs.nproc =
1279  __kmp_nested_nth.nth[level + 1];
1280  }
1281 
1282  if (__kmp_nested_proc_bind.used &&
1283  (level + 1 < __kmp_nested_proc_bind.used)) {
1284  this_thr->th.th_current_task->td_icvs.proc_bind =
1285  __kmp_nested_proc_bind.bind_types[level + 1];
1286  }
1287 
1288 #if USE_DEBUGGER
1289  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1290 #endif
1291  this_thr->th.th_info.ds.ds_tid = 0;
1292 
1293  /* set thread cache values */
1294  this_thr->th.th_team_nproc = 1;
1295  this_thr->th.th_team_master = this_thr;
1296  this_thr->th.th_team_serialized = 1;
1297 
1298  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1299  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1300  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1301 
1302  propagateFPControl(serial_team);
1303 
1304  /* check if we need to allocate dispatch buffers stack */
1305  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1306  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1307  serial_team->t.t_dispatch->th_disp_buffer =
1308  (dispatch_private_info_t *)__kmp_allocate(
1309  sizeof(dispatch_private_info_t));
1310  }
1311  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1312 
1313  KMP_MB();
1314 
1315  } else {
1316  /* this serialized team is already being used,
1317  * that's fine, just add another nested level */
1318  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1319  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1320  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1321  ++serial_team->t.t_serialized;
1322  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1323 
1324  // Nested level will be an index in the nested nthreads array
1325  int level = this_thr->th.th_team->t.t_level;
1326  // Thread value exists in the nested nthreads array for the next nested
1327  // level
1328  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1329  this_thr->th.th_current_task->td_icvs.nproc =
1330  __kmp_nested_nth.nth[level + 1];
1331  }
1332  serial_team->t.t_level++;
1333  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1334  "of serial team %p to %d\n",
1335  global_tid, serial_team, serial_team->t.t_level));
1336 
1337  /* allocate/push dispatch buffers stack */
1338  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1339  {
1340  dispatch_private_info_t *disp_buffer =
1341  (dispatch_private_info_t *)__kmp_allocate(
1342  sizeof(dispatch_private_info_t));
1343  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1344  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1345  }
1346  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1347 
1348  KMP_MB();
1349  }
1350  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1351 
1352  // Perform the display affinity functionality for
1353  // serialized parallel regions
1354  if (__kmp_display_affinity) {
1355  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1356  this_thr->th.th_prev_num_threads != 1) {
1357  // NULL means use the affinity-format-var ICV
1358  __kmp_aux_display_affinity(global_tid, NULL);
1359  this_thr->th.th_prev_level = serial_team->t.t_level;
1360  this_thr->th.th_prev_num_threads = 1;
1361  }
1362  }
1363 
1364  if (__kmp_env_consistency_check)
1365  __kmp_push_parallel(global_tid, NULL);
1366 #if OMPT_SUPPORT
1367  serial_team->t.ompt_team_info.master_return_address = codeptr;
1368  if (ompt_enabled.enabled &&
1369  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1370  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1371 
1372  ompt_lw_taskteam_t lw_taskteam;
1373  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1374  &ompt_parallel_data, codeptr);
1375 
1376  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1377  // don't use lw_taskteam after linking. Content was swapped.
1378 
1379  /* OMPT implicit task begin */
1380  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1381  if (ompt_enabled.ompt_callback_implicit_task) {
1382  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1383  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1384  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1385  OMPT_CUR_TASK_INFO(this_thr)
1386  ->thread_num = __kmp_tid_from_gtid(global_tid);
1387  }
1388 
1389  /* OMPT state */
1390  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1391  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1392  }
1393 #endif
1394 }
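
// Illustrative sketch (user code, not part of the runtime): a nested parallel
// region that ends up serialized, which is the situation handled by
// __kmp_serialized_parallel above. With max-active-levels set to 1, the inner
// region executes in a team of one thread on each outer thread.
#if 0
#include <omp.h>
#include <cstdio>
void nested_serialized_demo() {
  omp_set_max_active_levels(1);
#pragma omp parallel num_threads(2)
  {
#pragma omp parallel num_threads(4) // serialized: inner team size is 1
    std::printf("outer thread %d, inner team size %d\n",
                omp_get_ancestor_thread_num(1), omp_get_num_threads());
  }
}
#endif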
1395 
1396 /* most of the work for a fork */
1397 /* return true if we really went parallel, false if serialized */
1398 int __kmp_fork_call(ident_t *loc, int gtid,
1399  enum fork_context_e call_context, // Intel, GNU, ...
1400  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1401  kmp_va_list ap) {
1402  void **argv;
1403  int i;
1404  int master_tid;
1405  int master_this_cons;
1406  kmp_team_t *team;
1407  kmp_team_t *parent_team;
1408  kmp_info_t *master_th;
1409  kmp_root_t *root;
1410  int nthreads;
1411  int master_active;
1412  int master_set_numthreads;
1413  int level;
1414  int active_level;
1415  int teams_level;
1416 #if KMP_NESTED_HOT_TEAMS
1417  kmp_hot_team_ptr_t **p_hot_teams;
1418 #endif
1419  { // KMP_TIME_BLOCK
1420  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1421  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1422 
1423  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1424  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1425  /* Some systems prefer the stack for the root thread(s) to start with */
1426  /* some gap from the parent stack to prevent false sharing. */
1427  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1428  /* These 2 lines below are so this does not get optimized out */
1429  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1430  __kmp_stkpadding += (short)((kmp_int64)dummy);
1431  }
1432 
1433  /* initialize if needed */
1434  KMP_DEBUG_ASSERT(
1435  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1436  if (!TCR_4(__kmp_init_parallel))
1437  __kmp_parallel_initialize();
1438  __kmp_resume_if_soft_paused();
1439 
1440  /* setup current data */
1441  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1442  // shutdown
1443  parent_team = master_th->th.th_team;
1444  master_tid = master_th->th.th_info.ds.ds_tid;
1445  master_this_cons = master_th->th.th_local.this_construct;
1446  root = master_th->th.th_root;
1447  master_active = root->r.r_active;
1448  master_set_numthreads = master_th->th.th_set_nproc;
1449 
1450 #if OMPT_SUPPORT
1451  ompt_data_t ompt_parallel_data = ompt_data_none;
1452  ompt_data_t *parent_task_data;
1453  ompt_frame_t *ompt_frame;
1454  ompt_data_t *implicit_task_data;
1455  void *return_address = NULL;
1456 
1457  if (ompt_enabled.enabled) {
1458  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1459  NULL, NULL);
1460  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1461  }
1462 #endif
1463 
1464  // Nested level will be an index in the nested nthreads array
1465  level = parent_team->t.t_level;
1466  // used to launch non-serial teams even if nested is not allowed
1467  active_level = parent_team->t.t_active_level;
1468  // needed to check nesting inside the teams
1469  teams_level = master_th->th.th_teams_level;
1470 #if KMP_NESTED_HOT_TEAMS
1471  p_hot_teams = &master_th->th.th_hot_teams;
1472  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1473  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1474  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1475  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1476  // it is either actual or not needed (when active_level > 0)
1477  (*p_hot_teams)[0].hot_team_nth = 1;
1478  }
1479 #endif
1480 
1481 #if OMPT_SUPPORT
1482  if (ompt_enabled.enabled) {
1483  if (ompt_enabled.ompt_callback_parallel_begin) {
1484  int team_size = master_set_numthreads
1485  ? master_set_numthreads
1486  : get__nproc_2(parent_team, master_tid);
1487  int flags = OMPT_INVOKER(call_context) |
1488  ((microtask == (microtask_t)__kmp_teams_master)
1489  ? ompt_parallel_league
1490  : ompt_parallel_team);
1491  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1492  parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1493  return_address);
1494  }
1495  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1496  }
1497 #endif
1498 
1499  master_th->th.th_ident = loc;
1500 
1501  if (master_th->th.th_teams_microtask && ap &&
1502  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1503  // AC: This is the start of a parallel region nested inside a teams construct.
1504  // The team is actual (hot), all workers are ready at the fork barrier.
1505  // No lock needed to initialize the team a bit, then free workers.
1506  parent_team->t.t_ident = loc;
1507  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1508  parent_team->t.t_argc = argc;
1509  argv = (void **)parent_team->t.t_argv;
1510  for (i = argc - 1; i >= 0; --i)
1511  *argv++ = va_arg(kmp_va_deref(ap), void *);
1512  // Increment our nested depth level, but do not increase the serialization
1513  if (parent_team == master_th->th.th_serial_team) {
1514  // AC: we are in serialized parallel
1515  __kmpc_serialized_parallel(loc, gtid);
1516  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1517 
1518  if (call_context == fork_context_gnu) {
1519  // AC: need to decrement t_serialized for enquiry functions to work
1520  // correctly, will restore at join time
1521  parent_team->t.t_serialized--;
1522  return TRUE;
1523  }
1524 
1525 #if OMPT_SUPPORT
1526  void *dummy;
1527  void **exit_frame_p;
1528 
1529  ompt_lw_taskteam_t lw_taskteam;
1530 
1531  if (ompt_enabled.enabled) {
1532  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1533  &ompt_parallel_data, return_address);
1534  exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1535 
1536  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1537  // don't use lw_taskteam after linking. Content was swapped.
1538 
1539  /* OMPT implicit task begin */
1540  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1541  if (ompt_enabled.ompt_callback_implicit_task) {
1542  OMPT_CUR_TASK_INFO(master_th)
1543  ->thread_num = __kmp_tid_from_gtid(gtid);
1544  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1545  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1546  implicit_task_data, 1,
1547  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1548  }
1549 
1550  /* OMPT state */
1551  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1552  } else {
1553  exit_frame_p = &dummy;
1554  }
1555 #endif
1556  // AC: need to decrement t_serialized for enquiry functions to work
1557  // correctly, will restore at join time
1558  parent_team->t.t_serialized--;
1559 
1560  {
1561  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1562  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1563  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1564 #if OMPT_SUPPORT
1565  ,
1566  exit_frame_p
1567 #endif
1568  );
1569  }
1570 
1571 #if OMPT_SUPPORT
1572  if (ompt_enabled.enabled) {
1573  *exit_frame_p = NULL;
1574  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1575  if (ompt_enabled.ompt_callback_implicit_task) {
1576  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1577  ompt_scope_end, NULL, implicit_task_data, 1,
1578  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1579  }
1580  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1581  __ompt_lw_taskteam_unlink(master_th);
1582  if (ompt_enabled.ompt_callback_parallel_end) {
1583  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1584  &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1585  OMPT_INVOKER(call_context) | ompt_parallel_team,
1586  return_address);
1587  }
1588  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1589  }
1590 #endif
1591  return TRUE;
1592  }
1593 
1594  parent_team->t.t_pkfn = microtask;
1595  parent_team->t.t_invoke = invoker;
1596  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1597  parent_team->t.t_active_level++;
1598  parent_team->t.t_level++;
1599  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1600 
1601 #if OMPT_SUPPORT
1602  if (ompt_enabled.enabled) {
1603  ompt_lw_taskteam_t lw_taskteam;
1604  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1605  &ompt_parallel_data, return_address);
1606  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1607  }
1608 #endif
1609 
1610  /* Change number of threads in the team if requested */
1611  if (master_set_numthreads) { // The parallel has num_threads clause
1612  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1613  // AC: we can only reduce the number of threads dynamically; we can't increase it
1614  kmp_info_t **other_threads = parent_team->t.t_threads;
1615  parent_team->t.t_nproc = master_set_numthreads;
1616  for (i = 0; i < master_set_numthreads; ++i) {
1617  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1618  }
1619  // Keep extra threads hot in the team for possible next parallels
1620  }
1621  master_th->th.th_set_nproc = 0;
1622  }
1623 
1624 #if USE_DEBUGGER
1625  if (__kmp_debugging) { // Let debugger override number of threads.
1626  int nth = __kmp_omp_num_threads(loc);
1627  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1628  master_set_numthreads = nth;
1629  }
1630  }
1631 #endif
1632 
1633 #if USE_ITT_BUILD
1634  if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1635  KMP_ITT_DEBUG) &&
1636  __kmp_forkjoin_frames_mode == 3 &&
1637  parent_team->t.t_active_level == 1 // only report frames at level 1
1638  && master_th->th.th_teams_size.nteams == 1) {
1639  kmp_uint64 tmp_time = __itt_get_timestamp();
1640  master_th->th.th_frame_time = tmp_time;
1641  parent_team->t.t_region_time = tmp_time;
1642  }
1643  if (__itt_stack_caller_create_ptr) {
1644  // create new stack stitching id before entering fork barrier
1645  parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1646  }
1647 #endif /* USE_ITT_BUILD */
1648 
1649  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1650  "master_th=%p, gtid=%d\n",
1651  root, parent_team, master_th, gtid));
1652  __kmp_internal_fork(loc, gtid, parent_team);
1653  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1654  "master_th=%p, gtid=%d\n",
1655  root, parent_team, master_th, gtid));
1656 
1657  if (call_context == fork_context_gnu)
1658  return TRUE;
1659 
1660  /* Invoke microtask for MASTER thread */
1661  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1662  parent_team->t.t_id, parent_team->t.t_pkfn));
1663 
1664  if (!parent_team->t.t_invoke(gtid)) {
1665  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1666  }
1667  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1668  parent_team->t.t_id, parent_team->t.t_pkfn));
1669  KMP_MB(); /* Flush all pending memory write invalidates. */
1670 
1671  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1672 
1673  return TRUE;
1674  } // Parallel closely nested in teams construct
1675 
1676 #if KMP_DEBUG
1677  if (__kmp_tasking_mode != tskm_immediate_exec) {
1678  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1679  parent_team->t.t_task_team[master_th->th.th_task_state]);
1680  }
1681 #endif
1682 
1683  if (parent_team->t.t_active_level >=
1684  master_th->th.th_current_task->td_icvs.max_active_levels) {
1685  nthreads = 1;
1686  } else {
1687  int enter_teams = ((ap == NULL && active_level == 0) ||
1688  (ap && teams_level > 0 && teams_level == level));
1689  nthreads =
1690  master_set_numthreads
1691  ? master_set_numthreads
1692  : get__nproc_2(
1693  parent_team,
1694  master_tid); // TODO: get nproc directly from current task
1695 
1696  // Check whether we need to take the forkjoin lock (no need for a serialized
1697  // parallel outside of a teams construct). This code was moved here from
1698  // __kmp_reserve_threads() to speed up nested serialized parallels.
1699  if (nthreads > 1) {
1700  if ((get__max_active_levels(master_th) == 1 &&
1701  (root->r.r_in_parallel && !enter_teams)) ||
1702  (__kmp_library == library_serial)) {
1703  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1704  " threads\n",
1705  gtid, nthreads));
1706  nthreads = 1;
1707  }
1708  }
1709  if (nthreads > 1) {
1710  /* determine how many new threads we can use */
1711  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1712  /* AC: If we execute teams from a parallel region (on host), then the teams
1713  should be created, but each can only have 1 thread if nesting is
1714  disabled. If teams is called from a serial region, then the teams and
1715  their threads should be created regardless of the nesting setting. */
1716  nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1717  nthreads, enter_teams);
1718  if (nthreads == 1) {
1719  // Free lock for single thread execution here; for multi-thread
1720  // execution it will be freed later after team of threads created
1721  // and initialized
1722  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1723  }
1724  }
1725  }
1726  KMP_DEBUG_ASSERT(nthreads > 0);
1727 
1728  // If we temporarily changed the set number of threads then restore it now
1729  master_th->th.th_set_nproc = 0;
1730 
1731  /* create a serialized parallel region? */
1732  if (nthreads == 1) {
1733 /* josh todo: hypothetical question: what do we do for OS X*? */
1734 #if KMP_OS_LINUX && \
1735  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1736  void *args[argc];
1737 #else
1738  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1739 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1740  KMP_ARCH_AARCH64) */
1741 
1742  KA_TRACE(20,
1743  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1744 
1745  __kmpc_serialized_parallel(loc, gtid);
1746 
1747  if (call_context == fork_context_intel) {
1748  /* TODO this sucks, use the compiler itself to pass args! :) */
1749  master_th->th.th_serial_team->t.t_ident = loc;
1750  if (!ap) {
1751  // revert change made in __kmpc_serialized_parallel()
1752  master_th->th.th_serial_team->t.t_level--;
1753 // Get args from parent team for teams construct
1754 
1755 #if OMPT_SUPPORT
1756  void *dummy;
1757  void **exit_frame_p;
1758  ompt_task_info_t *task_info;
1759 
1760  ompt_lw_taskteam_t lw_taskteam;
1761 
1762  if (ompt_enabled.enabled) {
1763  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1764  &ompt_parallel_data, return_address);
1765 
1766  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1767  // don't use lw_taskteam after linking. content was swapped
1768 
1769  task_info = OMPT_CUR_TASK_INFO(master_th);
1770  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1771  if (ompt_enabled.ompt_callback_implicit_task) {
1772  OMPT_CUR_TASK_INFO(master_th)
1773  ->thread_num = __kmp_tid_from_gtid(gtid);
1774  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776  &(task_info->task_data), 1,
1777  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1778  ompt_task_implicit);
1779  }
1780 
1781  /* OMPT state */
1782  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783  } else {
1784  exit_frame_p = &dummy;
1785  }
1786 #endif
1787 
1788  {
1789  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1792  parent_team->t.t_argv
1793 #if OMPT_SUPPORT
1794  ,
1795  exit_frame_p
1796 #endif
1797  );
1798  }
1799 
1800 #if OMPT_SUPPORT
1801  if (ompt_enabled.enabled) {
1802  *exit_frame_p = NULL;
1803  if (ompt_enabled.ompt_callback_implicit_task) {
1804  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1805  ompt_scope_end, NULL, &(task_info->task_data), 1,
1806  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1807  ompt_task_implicit);
1808  }
1809  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1810  __ompt_lw_taskteam_unlink(master_th);
1811  if (ompt_enabled.ompt_callback_parallel_end) {
1812  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1813  &ompt_parallel_data, parent_task_data,
1814  OMPT_INVOKER(call_context) | ompt_parallel_team,
1815  return_address);
1816  }
1817  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1818  }
1819 #endif
1820  } else if (microtask == (microtask_t)__kmp_teams_master) {
1821  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1822  master_th->th.th_serial_team);
1823  team = master_th->th.th_team;
1824  // team->t.t_pkfn = microtask;
1825  team->t.t_invoke = invoker;
1826  __kmp_alloc_argv_entries(argc, team, TRUE);
1827  team->t.t_argc = argc;
1828  argv = (void **)team->t.t_argv;
1829  if (ap) {
1830  for (i = argc - 1; i >= 0; --i)
1831  *argv++ = va_arg(kmp_va_deref(ap), void *);
1832  } else {
1833  for (i = 0; i < argc; ++i)
1834  // Get args from parent team for teams construct
1835  argv[i] = parent_team->t.t_argv[i];
1836  }
1837  // AC: revert change made in __kmpc_serialized_parallel()
1838  // because initial code in teams should have level=0
1839  team->t.t_level--;
1840  // AC: call special invoker for outer "parallel" of teams construct
1841  invoker(gtid);
1842 #if OMPT_SUPPORT
1843  if (ompt_enabled.enabled) {
1844  ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1845  if (ompt_enabled.ompt_callback_implicit_task) {
1846  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1847  ompt_scope_end, NULL, &(task_info->task_data), 0,
1848  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1849  }
1850  if (ompt_enabled.ompt_callback_parallel_end) {
1851  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1852  &ompt_parallel_data, parent_task_data,
1853  OMPT_INVOKER(call_context) | ompt_parallel_league,
1854  return_address);
1855  }
1856  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1857  }
1858 #endif
1859  } else {
1860  argv = args;
1861  for (i = argc - 1; i >= 0; --i)
1862  *argv++ = va_arg(kmp_va_deref(ap), void *);
1863  KMP_MB();
1864 
1865 #if OMPT_SUPPORT
1866  void *dummy;
1867  void **exit_frame_p;
1868  ompt_task_info_t *task_info;
1869 
1870  ompt_lw_taskteam_t lw_taskteam;
1871 
1872  if (ompt_enabled.enabled) {
1873  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1874  &ompt_parallel_data, return_address);
1875  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1876  // don't use lw_taskteam after linking. content was swapped
1877  task_info = OMPT_CUR_TASK_INFO(master_th);
1878  exit_frame_p = &(task_info->frame.exit_frame.ptr);
1879 
1880  /* OMPT implicit task begin */
1881  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1882  if (ompt_enabled.ompt_callback_implicit_task) {
1883  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1885  implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1886  ompt_task_implicit);
1887  OMPT_CUR_TASK_INFO(master_th)
1888  ->thread_num = __kmp_tid_from_gtid(gtid);
1889  }
1890 
1891  /* OMPT state */
1892  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1893  } else {
1894  exit_frame_p = &dummy;
1895  }
1896 #endif
1897 
1898  {
1899  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1900  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1901  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1902 #if OMPT_SUPPORT
1903  ,
1904  exit_frame_p
1905 #endif
1906  );
1907  }
1908 
1909 #if OMPT_SUPPORT
1910  if (ompt_enabled.enabled) {
1911  *exit_frame_p = NULL;
1912  if (ompt_enabled.ompt_callback_implicit_task) {
1913  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1914  ompt_scope_end, NULL, &(task_info->task_data), 1,
1915  OMPT_CUR_TASK_INFO(master_th)->thread_num,
1916  ompt_task_implicit);
1917  }
1918 
1919  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1920  __ompt_lw_taskteam_unlink(master_th);
1921  if (ompt_enabled.ompt_callback_parallel_end) {
1922  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1923  &ompt_parallel_data, parent_task_data,
1924  OMPT_INVOKER(call_context) | ompt_parallel_team,
1925  return_address);
1926  }
1927  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1928  }
1929 #endif
1930  }
1931  } else if (call_context == fork_context_gnu) {
1932 #if OMPT_SUPPORT
1933  ompt_lw_taskteam_t lwt;
1934  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1935  return_address);
1936 
1937  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1938  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1939 // don't use lw_taskteam after linking. content was swapped
1940 #endif
1941 
1942  // we were called from GNU native code
1943  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1944  return FALSE;
1945  } else {
1946  KMP_ASSERT2(call_context < fork_context_last,
1947  "__kmp_fork_call: unknown fork_context parameter");
1948  }
1949 
1950  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1951  KMP_MB();
1952  return FALSE;
1953  } // if (nthreads == 1)
1954 
1955  // GEH: only modify the executing flag in the case when not serialized
1956  // serialized case is handled in kmpc_serialized_parallel
1957  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1958  "curtask=%p, curtask_max_aclevel=%d\n",
1959  parent_team->t.t_active_level, master_th,
1960  master_th->th.th_current_task,
1961  master_th->th.th_current_task->td_icvs.max_active_levels));
1962  // TODO: GEH - cannot do this assertion because root thread not set up as
1963  // executing
1964  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1965  master_th->th.th_current_task->td_flags.executing = 0;
1966 
1967  if (!master_th->th.th_teams_microtask || level > teams_level) {
1968  /* Increment our nested depth level */
1969  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1970  }
1971 
1972  // See if we need to make a copy of the ICVs.
1973  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1974  if ((level + 1 < __kmp_nested_nth.used) &&
1975  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1976  nthreads_icv = __kmp_nested_nth.nth[level + 1];
1977  } else {
1978  nthreads_icv = 0; // don't update
1979  }
1980 
1981  // Figure out the proc_bind_policy for the new team.
1982  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1983  kmp_proc_bind_t proc_bind_icv =
1984  proc_bind_default; // proc_bind_default means don't update
1985  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1986  proc_bind = proc_bind_false;
1987  } else {
1988  if (proc_bind == proc_bind_default) {
1989  // No proc_bind clause specified; use current proc-bind-var for this
1990  // parallel region
1991  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1992  }
1993  /* else: The proc_bind policy was specified explicitly on parallel clause.
1994  This overrides proc-bind-var for this parallel region, but does not
1995  change proc-bind-var. */
1996  // Figure the value of proc-bind-var for the child threads.
1997  if ((level + 1 < __kmp_nested_proc_bind.used) &&
1998  (__kmp_nested_proc_bind.bind_types[level + 1] !=
1999  master_th->th.th_current_task->td_icvs.proc_bind)) {
2000  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2001  }
2002  }
2003 
2004  // Reset for next parallel region
2005  master_th->th.th_set_proc_bind = proc_bind_default;
2006 
2007  if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2008  kmp_internal_control_t new_icvs;
2009  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2010  new_icvs.next = NULL;
2011  if (nthreads_icv > 0) {
2012  new_icvs.nproc = nthreads_icv;
2013  }
2014  if (proc_bind_icv != proc_bind_default) {
2015  new_icvs.proc_bind = proc_bind_icv;
2016  }
2017 
2018  /* allocate a new parallel team */
2019  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2020  team = __kmp_allocate_team(root, nthreads, nthreads,
2021 #if OMPT_SUPPORT
2022  ompt_parallel_data,
2023 #endif
2024  proc_bind, &new_icvs,
2025  argc USE_NESTED_HOT_ARG(master_th));
2026  } else {
2027  /* allocate a new parallel team */
2028  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2029  team = __kmp_allocate_team(root, nthreads, nthreads,
2030 #if OMPT_SUPPORT
2031  ompt_parallel_data,
2032 #endif
2033  proc_bind,
2034  &master_th->th.th_current_task->td_icvs,
2035  argc USE_NESTED_HOT_ARG(master_th));
2036  }
2037  KF_TRACE(
2038  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2039 
2040  /* setup the new team */
2041  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2042  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2043  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2044  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2045  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2046 #if OMPT_SUPPORT
2047  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2048  return_address);
2049 #endif
2050  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2051  // TODO: parent_team->t.t_level == INT_MAX ???
2052  if (!master_th->th.th_teams_microtask || level > teams_level) {
2053  int new_level = parent_team->t.t_level + 1;
2054  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2055  new_level = parent_team->t.t_active_level + 1;
2056  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2057  } else {
2058  // AC: Do not increase parallel level at start of the teams construct
2059  int new_level = parent_team->t.t_level;
2060  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2061  new_level = parent_team->t.t_active_level;
2062  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2063  }
2064  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2065  // set master's schedule as new run-time schedule
2066  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2067 
2068  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2070 
2071  // Update the floating point rounding in the team if required.
2072  propagateFPControl(team);
2073 
2074  if (__kmp_tasking_mode != tskm_immediate_exec) {
2075  // Set master's task team to team's task team. Unless this is hot team, it
2076  // should be NULL.
2077  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078  parent_team->t.t_task_team[master_th->th.th_task_state]);
2079  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080  "%p, new task_team %p / team %p\n",
2081  __kmp_gtid_from_thread(master_th),
2082  master_th->th.th_task_team, parent_team,
2083  team->t.t_task_team[master_th->th.th_task_state], team));
2084 
2085  if (active_level || master_th->th.th_task_team) {
2086  // Save master's task_state on the memo stack
2087  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2088  if (master_th->th.th_task_state_top >=
2089  master_th->th.th_task_state_stack_sz) { // increase size
2090  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091  kmp_uint8 *old_stack, *new_stack;
2092  kmp_uint32 i;
2093  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096  }
2097  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098  ++i) { // zero-init rest of stack
2099  new_stack[i] = 0;
2100  }
2101  old_stack = master_th->th.th_task_state_memo_stack;
2102  master_th->th.th_task_state_memo_stack = new_stack;
2103  master_th->th.th_task_state_stack_sz = new_size;
2104  __kmp_free(old_stack);
2105  }
2106  // Store master's task_state on stack
2107  master_th->th
2108  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109  master_th->th.th_task_state;
2110  master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112  if (master_th->th.th_hot_teams &&
2113  active_level < __kmp_hot_teams_max_level &&
2114  team == master_th->th.th_hot_teams[active_level].hot_team) {
2115  // Restore master's nested state if nested hot team
2116  master_th->th.th_task_state =
2117  master_th->th
2118  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2119  } else {
2120 #endif
2121  master_th->th.th_task_state = 0;
2122 #if KMP_NESTED_HOT_TEAMS
2123  }
2124 #endif
2125  }
2126 #if !KMP_NESTED_HOT_TEAMS
2127  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2128  (team == root->r.r_hot_team));
2129 #endif
2130  }
2131 
2132  KA_TRACE(
2133  20,
2134  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2135  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2136  team->t.t_nproc));
2137  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2138  (team->t.t_master_tid == 0 &&
2139  (team->t.t_parent == root->r.r_root_team ||
2140  team->t.t_parent->t.t_serialized)));
2141  KMP_MB();
2142 
2143  /* now, setup the arguments */
2144  argv = (void **)team->t.t_argv;
2145  if (ap) {
2146  for (i = argc - 1; i >= 0; --i) {
2147  void *new_argv = va_arg(kmp_va_deref(ap), void *);
2148  KMP_CHECK_UPDATE(*argv, new_argv);
2149  argv++;
2150  }
2151  } else {
2152  for (i = 0; i < argc; ++i) {
2153  // Get args from parent team for teams construct
2154  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2155  }
2156  }
2157 
2158  /* now actually fork the threads */
2159  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2160  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2161  root->r.r_active = TRUE;
2162 
2163  __kmp_fork_team_threads(root, team, master_th, gtid);
2164  __kmp_setup_icv_copy(team, nthreads,
2165  &master_th->th.th_current_task->td_icvs, loc);
2166 
2167 #if OMPT_SUPPORT
2168  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2169 #endif
2170 
2171  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2172 
2173 #if USE_ITT_BUILD
2174  if (team->t.t_active_level == 1 // only report frames at level 1
2175  && !master_th->th.th_teams_microtask) { // not in teams construct
2176 #if USE_ITT_NOTIFY
2177  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2178  (__kmp_forkjoin_frames_mode == 3 ||
2179  __kmp_forkjoin_frames_mode == 1)) {
2180  kmp_uint64 tmp_time = 0;
2181  if (__itt_get_timestamp_ptr)
2182  tmp_time = __itt_get_timestamp();
2183  // Internal fork - report frame begin
2184  master_th->th.th_frame_time = tmp_time;
2185  if (__kmp_forkjoin_frames_mode == 3)
2186  team->t.t_region_time = tmp_time;
2187  } else
2188 // only one notification scheme (either "submit" or "forking/joined", not both)
2189 #endif /* USE_ITT_NOTIFY */
2190  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2191  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2192  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2193  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2194  }
2195  }
2196 #endif /* USE_ITT_BUILD */
2197 
2198  /* now go on and do the work */
2199  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2200  KMP_MB();
2201  KF_TRACE(10,
2202  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2203  root, team, master_th, gtid));
2204 
2205 #if USE_ITT_BUILD
2206  if (__itt_stack_caller_create_ptr) {
2207  team->t.t_stack_id =
2208  __kmp_itt_stack_caller_create(); // create new stack stitching id
2209  // before entering fork barrier
2210  }
2211 #endif /* USE_ITT_BUILD */
2212 
2213  // AC: skip __kmp_internal_fork at teams construct, let only master
2214  // threads execute
2215  if (ap) {
2216  __kmp_internal_fork(loc, gtid, team);
2217  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2218  "master_th=%p, gtid=%d\n",
2219  root, team, master_th, gtid));
2220  }
2221 
2222  if (call_context == fork_context_gnu) {
2223  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2224  return TRUE;
2225  }
2226 
2227  /* Invoke microtask for MASTER thread */
2228  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2229  team->t.t_id, team->t.t_pkfn));
2230  } // END of timer KMP_fork_call block
2231 
2232 #if KMP_STATS_ENABLED
2233  // If beginning a teams construct, then change thread state
2234  stats_state_e previous_state = KMP_GET_THREAD_STATE();
2235  if (!ap) {
2236  KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2237  }
2238 #endif
2239 
2240  if (!team->t.t_invoke(gtid)) {
2241  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2242  }
2243 
2244 #if KMP_STATS_ENABLED
2245  // If was beginning of a teams construct, then reset thread state
2246  if (!ap) {
2247  KMP_SET_THREAD_STATE(previous_state);
2248  }
2249 #endif
2250 
2251  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2252  team->t.t_id, team->t.t_pkfn));
2253  KMP_MB(); /* Flush all pending memory write invalidates. */
2254 
2255  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2256 
2257 #if OMPT_SUPPORT
2258  if (ompt_enabled.enabled) {
2259  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2260  }
2261 #endif
2262 
2263  return TRUE;
2264 }
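/* Illustration (a minimal user-level sketch, not part of the runtime source):
   a parallel construct like the one below is outlined by the compiler into a
   call to the __kmpc_fork_call() entry point, which lands in __kmp_fork_call()
   above; the num_threads clause arrives here as master_set_numthreads, and the
   nthreads == 1 branch covers the serialized case.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       #pragma omp parallel num_threads(4)
       {
         printf("thread %d of %d\n", omp_get_thread_num(),
                omp_get_num_threads());
       }
       return 0;
     }
*/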
2265 
2266 #if OMPT_SUPPORT
2267 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2268  kmp_team_t *team) {
2269  // restore state outside the region
2270  thread->th.ompt_thread_info.state =
2271  ((team->t.t_serialized) ? ompt_state_work_serial
2272  : ompt_state_work_parallel);
2273 }
2274 
2275 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2276  kmp_team_t *team, ompt_data_t *parallel_data,
2277  int flags, void *codeptr) {
2278  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2279  if (ompt_enabled.ompt_callback_parallel_end) {
2280  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2281  parallel_data, &(task_info->task_data), flags, codeptr);
2282  }
2283 
2284  task_info->frame.enter_frame = ompt_data_none;
2285  __kmp_join_restore_state(thread, team);
2286 }
2287 #endif
2288 
2289 void __kmp_join_call(ident_t *loc, int gtid
2290 #if OMPT_SUPPORT
2291  ,
2292  enum fork_context_e fork_context
2293 #endif
2294  ,
2295  int exit_teams) {
2296  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2297  kmp_team_t *team;
2298  kmp_team_t *parent_team;
2299  kmp_info_t *master_th;
2300  kmp_root_t *root;
2301  int master_active;
2302 
2303  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2304 
2305  /* setup current data */
2306  master_th = __kmp_threads[gtid];
2307  root = master_th->th.th_root;
2308  team = master_th->th.th_team;
2309  parent_team = team->t.t_parent;
2310 
2311  master_th->th.th_ident = loc;
2312 
2313 #if OMPT_SUPPORT
2314  void *team_microtask = (void *)team->t.t_pkfn;
2315  // For GOMP interface with serialized parallel, need the
2316  // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2317  // and end-parallel events.
2318  if (ompt_enabled.enabled &&
2319  !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2320  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2321  }
2322 #endif
2323 
2324 #if KMP_DEBUG
2325  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2326  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2327  "th_task_team = %p\n",
2328  __kmp_gtid_from_thread(master_th), team,
2329  team->t.t_task_team[master_th->th.th_task_state],
2330  master_th->th.th_task_team));
2331  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2332  team->t.t_task_team[master_th->th.th_task_state]);
2333  }
2334 #endif
2335 
2336  if (team->t.t_serialized) {
2337  if (master_th->th.th_teams_microtask) {
2338  // We are in teams construct
2339  int level = team->t.t_level;
2340  int tlevel = master_th->th.th_teams_level;
2341  if (level == tlevel) {
2342  // AC: we haven't incremented it earlier at start of teams construct,
2343  // so do it here - at the end of teams construct
2344  team->t.t_level++;
2345  } else if (level == tlevel + 1) {
2346  // AC: we are exiting parallel inside teams, need to increment
2347  // serialization in order to restore it in the next call to
2348  // __kmpc_end_serialized_parallel
2349  team->t.t_serialized++;
2350  }
2351  }
2352  __kmpc_end_serialized_parallel(loc, gtid);
2353 
2354 #if OMPT_SUPPORT
2355  if (ompt_enabled.enabled) {
2356  __kmp_join_restore_state(master_th, parent_team);
2357  }
2358 #endif
2359 
2360  return;
2361  }
2362 
2363  master_active = team->t.t_master_active;
2364 
2365  if (!exit_teams) {
2366  // AC: No barrier for internal teams at exit from teams construct.
2367  // But there is barrier for external team (league).
2368  __kmp_internal_join(loc, gtid, team);
2369  } else {
2370  master_th->th.th_task_state =
2371  0; // AC: no tasking in teams (out of any parallel)
2372  }
2373 
2374  KMP_MB();
2375 
2376 #if OMPT_SUPPORT
2377  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2378  void *codeptr = team->t.ompt_team_info.master_return_address;
2379 #endif
2380 
2381 #if USE_ITT_BUILD
2382  if (__itt_stack_caller_create_ptr) {
2383  // destroy the stack stitching id after join barrier
2384  __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2385  }
2386  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2387  if (team->t.t_active_level == 1 &&
2388  (!master_th->th.th_teams_microtask || /* not in teams construct */
2389  master_th->th.th_teams_size.nteams == 1)) {
2390  master_th->th.th_ident = loc;
2391  // only one notification scheme (either "submit" or "forking/joined", not
2392  // both)
2393  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2394  __kmp_forkjoin_frames_mode == 3)
2395  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2396  master_th->th.th_frame_time, 0, loc,
2397  master_th->th.th_team_nproc, 1);
2398  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2399  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2400  __kmp_itt_region_joined(gtid);
2401  } // active_level == 1
2402 #endif /* USE_ITT_BUILD */
2403 
2404  if (master_th->th.th_teams_microtask && !exit_teams &&
2405  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2406  team->t.t_level == master_th->th.th_teams_level + 1) {
2407 // AC: We need to leave the team structure intact at the end of parallel
2408 // inside the teams construct, so that at the next parallel same (hot) team
2409 // works, only adjust nesting levels
2410 #if OMPT_SUPPORT
2411  ompt_data_t ompt_parallel_data = ompt_data_none;
2412  if (ompt_enabled.enabled) {
2413  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2414  if (ompt_enabled.ompt_callback_implicit_task) {
2415  int ompt_team_size = team->t.t_nproc;
2416  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2417  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2418  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2419  }
2420  task_info->frame.exit_frame = ompt_data_none;
2421  task_info->task_data = ompt_data_none;
2422  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2423  __ompt_lw_taskteam_unlink(master_th);
2424  }
2425 #endif
2426  /* Decrement our nested depth level */
2427  team->t.t_level--;
2428  team->t.t_active_level--;
2429  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2430 
2431  // Restore number of threads in the team if needed. This code relies on
2432  // the proper adjustment of th_teams_size.nth after the fork in
2433  // __kmp_teams_master on each teams master in the case that
2434  // __kmp_reserve_threads reduced it.
2435  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2436  int old_num = master_th->th.th_team_nproc;
2437  int new_num = master_th->th.th_teams_size.nth;
2438  kmp_info_t **other_threads = team->t.t_threads;
2439  team->t.t_nproc = new_num;
2440  for (int i = 0; i < old_num; ++i) {
2441  other_threads[i]->th.th_team_nproc = new_num;
2442  }
2443  // Adjust the states of the unused threads of the team
2444  for (int i = old_num; i < new_num; ++i) {
2445  // Re-initialize thread's barrier data.
2446  KMP_DEBUG_ASSERT(other_threads[i]);
2447  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2448  for (int b = 0; b < bs_last_barrier; ++b) {
2449  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2450  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2451 #if USE_DEBUGGER
2452  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2453 #endif
2454  }
2455  if (__kmp_tasking_mode != tskm_immediate_exec) {
2456  // Synchronize thread's task state
2457  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2458  }
2459  }
2460  }
2461 
2462 #if OMPT_SUPPORT
2463  if (ompt_enabled.enabled) {
2464  __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2465  OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2466  }
2467 #endif
2468 
2469  return;
2470  }
2471 
2472  /* do cleanup and restore the parent team */
2473  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2474  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2475 
2476  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2477 
2478  /* jc: The following lock has instructions with REL and ACQ semantics,
2479  separating the parallel user code called in this parallel region
2480  from the serial user code called after this function returns. */
2481  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2482 
2483  if (!master_th->th.th_teams_microtask ||
2484  team->t.t_level > master_th->th.th_teams_level) {
2485  /* Decrement our nested depth level */
2486  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2487  }
2488  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489 
2490 #if OMPT_SUPPORT
2491  if (ompt_enabled.enabled) {
2492  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493  if (ompt_enabled.ompt_callback_implicit_task) {
2494  int flags = (team_microtask == (void *)__kmp_teams_master)
2495  ? ompt_task_initial
2496  : ompt_task_implicit;
2497  int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2498  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2499  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2500  OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2501  }
2502  task_info->frame.exit_frame = ompt_data_none;
2503  task_info->task_data = ompt_data_none;
2504  }
2505 #endif
2506 
2507  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2508  master_th, team));
2509  __kmp_pop_current_task_from_thread(master_th);
2510 
2511 #if KMP_AFFINITY_SUPPORTED
2512  // Restore master thread's partition.
2513  master_th->th.th_first_place = team->t.t_first_place;
2514  master_th->th.th_last_place = team->t.t_last_place;
2515 #endif // KMP_AFFINITY_SUPPORTED
2516  master_th->th.th_def_allocator = team->t.t_def_allocator;
2517 
2518  updateHWFPControl(team);
2519 
2520  if (root->r.r_active != master_active)
2521  root->r.r_active = master_active;
2522 
2523  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2524  master_th)); // this will free worker threads
2525 
2526  /* this race was fun to find. make sure the following is in the critical
2527  region otherwise assertions may fail occasionally since the old team may be
2528  reallocated and the hierarchy appears inconsistent. it is actually safe to
2529  run and won't cause any bugs, but will cause those assertion failures. it's
2530  only one deref&assign so might as well put this in the critical region */
2531  master_th->th.th_team = parent_team;
2532  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2533  master_th->th.th_team_master = parent_team->t.t_threads[0];
2534  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2535 
2536  /* restore serialized team, if need be */
2537  if (parent_team->t.t_serialized &&
2538  parent_team != master_th->th.th_serial_team &&
2539  parent_team != root->r.r_root_team) {
2540  __kmp_free_team(root,
2541  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2542  master_th->th.th_serial_team = parent_team;
2543  }
2544 
2545  if (__kmp_tasking_mode != tskm_immediate_exec) {
2546  if (master_th->th.th_task_state_top >
2547  0) { // Restore task state from memo stack
2548  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2549  // Remember master's state if we re-use this nested hot team
2550  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2551  master_th->th.th_task_state;
2552  --master_th->th.th_task_state_top; // pop
2553  // Now restore state at this level
2554  master_th->th.th_task_state =
2555  master_th->th
2556  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2557  }
2558  // Copy the task team from the parent team to the master thread
2559  master_th->th.th_task_team =
2560  parent_team->t.t_task_team[master_th->th.th_task_state];
2561  KA_TRACE(20,
2562  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2563  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2564  parent_team));
2565  }
2566 
2567  // TODO: GEH - cannot do this assertion because root thread not set up as
2568  // executing
2569  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2570  master_th->th.th_current_task->td_flags.executing = 1;
2571 
2572  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2573 
2574 #if OMPT_SUPPORT
2575  int flags =
2576  OMPT_INVOKER(fork_context) |
2577  ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2578  : ompt_parallel_team);
2579  if (ompt_enabled.enabled) {
2580  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2581  codeptr);
2582  }
2583 #endif
2584 
2585  KMP_MB();
2586  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2587 }
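/* Illustration (a minimal user-level sketch, assuming nested parallelism is
   left at its default so the inner region is serialized): the inner parallel
   below joins through the team->t.t_serialized path and
   __kmpc_end_serialized_parallel(), while the outer join goes through
   __kmp_internal_join() and the fork/join barrier.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       #pragma omp parallel num_threads(2)
       {
         #pragma omp parallel num_threads(2) // inactive (serialized) by default
         printf("level %d, team size %d\n", omp_get_level(),
                omp_get_num_threads());
       }
       return 0;
     }
*/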
2588 
2589 /* Check whether we should push an internal control record onto the
2590  serial team stack. If so, do it. */
2591 void __kmp_save_internal_controls(kmp_info_t *thread) {
2592 
2593  if (thread->th.th_team != thread->th.th_serial_team) {
2594  return;
2595  }
2596  if (thread->th.th_team->t.t_serialized > 1) {
2597  int push = 0;
2598 
2599  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2600  push = 1;
2601  } else {
2602  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2603  thread->th.th_team->t.t_serialized) {
2604  push = 1;
2605  }
2606  }
2607  if (push) { /* push a record on the serial team's stack */
2608  kmp_internal_control_t *control =
2609  (kmp_internal_control_t *)__kmp_allocate(
2610  sizeof(kmp_internal_control_t));
2611 
2612  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2613 
2614  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2615 
2616  control->next = thread->th.th_team->t.t_control_stack_top;
2617  thread->th.th_team->t.t_control_stack_top = control;
2618  }
2619  }
2620 }
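/* Illustration (a minimal user-level sketch of the per-data-environment ICV
   semantics this control stack helps preserve): an ICV changed inside a
   serialized parallel region does not leak back into the enclosing region.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_set_num_threads(4);
       #pragma omp parallel num_threads(1)
       {
         omp_set_num_threads(2); // affects this data environment only
         printf("inside:  max threads = %d\n", omp_get_max_threads()); // 2
       }
       printf("outside: max threads = %d\n", omp_get_max_threads()); // 4
       return 0;
     }
*/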
2621 
2622 /* Changes set_nproc */
2623 void __kmp_set_num_threads(int new_nth, int gtid) {
2624  kmp_info_t *thread;
2625  kmp_root_t *root;
2626 
2627  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2628  KMP_DEBUG_ASSERT(__kmp_init_serial);
2629 
2630  if (new_nth < 1)
2631  new_nth = 1;
2632  else if (new_nth > __kmp_max_nth)
2633  new_nth = __kmp_max_nth;
2634 
2635  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2636  thread = __kmp_threads[gtid];
2637  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2638  return; // nothing to do
2639 
2640  __kmp_save_internal_controls(thread);
2641 
2642  set__nproc(thread, new_nth);
2643 
2644  // If this omp_set_num_threads() call will cause the hot team size to be
2645  // reduced (in the absence of a num_threads clause), then reduce it now,
2646  // rather than waiting for the next parallel region.
2647  root = thread->th.th_root;
2648  if (__kmp_init_parallel && (!root->r.r_active) &&
2649  (root->r.r_hot_team->t.t_nproc > new_nth)
2650 #if KMP_NESTED_HOT_TEAMS
2651  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2652 #endif
2653  ) {
2654  kmp_team_t *hot_team = root->r.r_hot_team;
2655  int f;
2656 
2657  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2658 
2659  // Release the extra threads we don't need any more.
2660  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2661  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2662  if (__kmp_tasking_mode != tskm_immediate_exec) {
2663  // When decreasing team size, threads no longer in the team should unref
2664  // task team.
2665  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2666  }
2667  __kmp_free_thread(hot_team->t.t_threads[f]);
2668  hot_team->t.t_threads[f] = NULL;
2669  }
2670  hot_team->t.t_nproc = new_nth;
2671 #if KMP_NESTED_HOT_TEAMS
2672  if (thread->th.th_hot_teams) {
2673  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2674  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2675  }
2676 #endif
2677 
2678  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2679 
2680  // Update the t_nproc field in the threads that are still active.
2681  for (f = 0; f < new_nth; f++) {
2682  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2683  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2684  }
2685  // Special flag marking that omp_set_num_threads() changed the hot team size
2686  hot_team->t.t_size_changed = -1;
2687  }
2688 }
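/* Illustration (a minimal user-level sketch): omp_set_num_threads() is routed
   to this routine; besides updating the nproc ICV it may trim the hot team
   right away, as described above, so the next parallel region starts at the
   reduced size.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       #pragma omp parallel // hot team created at the default size
       { }
       omp_set_num_threads(2); // hot team may be trimmed here, not at the fork
       #pragma omp parallel
       {
         #pragma omp single
         printf("team size now %d\n", omp_get_num_threads()); // 2
       }
       return 0;
     }
*/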
2689 
2690 /* Changes max_active_levels */
2691 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2692  kmp_info_t *thread;
2693 
2694  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2695  "%d = (%d)\n",
2696  gtid, max_active_levels));
2697  KMP_DEBUG_ASSERT(__kmp_init_serial);
2698 
2699  // validate max_active_levels
2700  if (max_active_levels < 0) {
2701  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2702  // We ignore this call if the user has specified a negative value.
2703  // The current setting won't be changed. The last valid setting will be
2704  // used. A warning will be issued (if warnings are allowed as controlled by
2705  // the KMP_WARNINGS env var).
2706  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2707  "max_active_levels for thread %d = (%d)\n",
2708  gtid, max_active_levels));
2709  return;
2710  }
2711  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2712  // it's OK, the max_active_levels is within the valid range: [ 0;
2713  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2714  // We allow a zero value. (implementation defined behavior)
2715  } else {
2716  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2717  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2718  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2719  // Current upper limit is MAX_INT. (implementation defined behavior)
2720  // If the input exceeds the upper limit, we correct the input to be the
2721  // upper limit. (implementation defined behavior)
2722  // In practice this branch is unreachable as long as the upper limit is MAX_INT.
2723  }
2724  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2725  "max_active_levels for thread %d = (%d)\n",
2726  gtid, max_active_levels));
2727 
2728  thread = __kmp_threads[gtid];
2729 
2730  __kmp_save_internal_controls(thread);
2731 
2732  set__max_active_levels(thread, max_active_levels);
2733 }
2734 
2735 /* Gets max_active_levels */
2736 int __kmp_get_max_active_levels(int gtid) {
2737  kmp_info_t *thread;
2738 
2739  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2740  KMP_DEBUG_ASSERT(__kmp_init_serial);
2741 
2742  thread = __kmp_threads[gtid];
2743  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2744  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2745  "curtask_maxaclevel=%d\n",
2746  gtid, thread->th.th_current_task,
2747  thread->th.th_current_task->td_icvs.max_active_levels));
2748  return thread->th.th_current_task->td_icvs.max_active_levels;
2749 }
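/* Illustration (a minimal user-level sketch): the max-active-levels ICV set
   and read by the two routines above controls how many nested levels of
   parallelism may stay active; with a value of 2 the inner region below can
   run with more than one thread instead of being serialized.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_set_max_active_levels(2);
       printf("max active levels = %d\n", omp_get_max_active_levels()); // 2
       #pragma omp parallel num_threads(2)
       #pragma omp parallel num_threads(2)
       {
         #pragma omp single
         printf("inner team size = %d\n", omp_get_num_threads()); // 2
       }
       return 0;
     }
*/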
2750 
2751 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2752 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2753 
2754 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2755 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2756  kmp_info_t *thread;
2757  kmp_sched_t orig_kind;
2758  // kmp_team_t *team;
2759 
2760  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2761  gtid, (int)kind, chunk));
2762  KMP_DEBUG_ASSERT(__kmp_init_serial);
2763 
2764  // Check if the kind parameter is valid, correct if needed.
2765  // Valid parameters should fit in one of two intervals - standard or extended:
2766  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2767  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2768  orig_kind = kind;
2769  kind = __kmp_sched_without_mods(kind);
2770 
2771  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2772  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2773  // TODO: Hint needs attention in case we change the default schedule.
2774  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2775  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2776  __kmp_msg_null);
2777  kind = kmp_sched_default;
2778  chunk = 0; // ignore chunk value in case of bad kind
2779  }
2780 
2781  thread = __kmp_threads[gtid];
2782 
2783  __kmp_save_internal_controls(thread);
2784 
2785  if (kind < kmp_sched_upper_std) {
2786  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2787  // differentiate static chunked vs. unchunked: chunk should be invalid to
2788  // indicate an unchunked schedule (which is the default)
2789  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2790  } else {
2791  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2792  __kmp_sch_map[kind - kmp_sched_lower - 1];
2793  }
2794  } else {
2795  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2796  // kmp_sched_lower - 2 ];
2797  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2798  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2799  kmp_sched_lower - 2];
2800  }
2801  __kmp_sched_apply_mods_intkind(
2802  orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2803  if (kind == kmp_sched_auto || chunk < 1) {
2804  // ignore parameter chunk for schedule auto
2805  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2806  } else {
2807  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2808  }
2809 }
2810 
2811 /* Gets def_sched_var ICV values */
2812 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2813  kmp_info_t *thread;
2814  enum sched_type th_type;
2815 
2816  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2817  KMP_DEBUG_ASSERT(__kmp_init_serial);
2818 
2819  thread = __kmp_threads[gtid];
2820 
2821  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2822  switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2823  case kmp_sch_static:
2824  case kmp_sch_static_greedy:
2825  case kmp_sch_static_balanced:
2826  *kind = kmp_sched_static;
2827  __kmp_sched_apply_mods_stdkind(kind, th_type);
2828  *chunk = 0; // chunk was not set, try to show this fact via zero value
2829  return;
2830  case kmp_sch_static_chunked:
2831  *kind = kmp_sched_static;
2832  break;
2833  case kmp_sch_dynamic_chunked:
2834  *kind = kmp_sched_dynamic;
2835  break;
2836  case kmp_sch_guided_chunked:
2837  case kmp_sch_guided_iterative_chunked:
2838  case kmp_sch_guided_analytical_chunked:
2839  *kind = kmp_sched_guided;
2840  break;
2841  case kmp_sch_auto:
2842  *kind = kmp_sched_auto;
2843  break;
2844  case kmp_sch_trapezoidal:
2845  *kind = kmp_sched_trapezoidal;
2846  break;
2847 #if KMP_STATIC_STEAL_ENABLED
2848  case kmp_sch_static_steal:
2849  *kind = kmp_sched_static_steal;
2850  break;
2851 #endif
2852  default:
2853  KMP_FATAL(UnknownSchedulingType, th_type);
2854  }
2855 
2856  __kmp_sched_apply_mods_stdkind(kind, th_type);
2857  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2858 }
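/* Illustration (a minimal user-level sketch): omp_set_schedule() and
   omp_get_schedule() map onto the two routines above; the chunk of 0 reported
   for plain static mirrors the "chunk was not set" case in the switch.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_sched_t kind;
       int chunk;

       omp_set_schedule(omp_sched_dynamic, 4);
       omp_get_schedule(&kind, &chunk);
       printf("kind=%d chunk=%d\n", (int)kind, chunk); // dynamic, chunk 4

       omp_set_schedule(omp_sched_static, 0); // unchunked static
       omp_get_schedule(&kind, &chunk);
       printf("kind=%d chunk=%d\n", (int)kind, chunk); // static, chunk 0
       return 0;
     }
*/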
2859 
2860 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2861 
2862  int ii, dd;
2863  kmp_team_t *team;
2864  kmp_info_t *thr;
2865 
2866  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2867  KMP_DEBUG_ASSERT(__kmp_init_serial);
2868 
2869  // validate level
2870  if (level == 0)
2871  return 0;
2872  if (level < 0)
2873  return -1;
2874  thr = __kmp_threads[gtid];
2875  team = thr->th.th_team;
2876  ii = team->t.t_level;
2877  if (level > ii)
2878  return -1;
2879 
2880  if (thr->th.th_teams_microtask) {
2881  // AC: we are in teams region where multiple nested teams have same level
2882  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2883  if (level <=
2884  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2885  KMP_DEBUG_ASSERT(ii >= tlevel);
2886  // AC: As we need to pass through the teams league, we need to artificially
2887  // increase ii
2888  if (ii == tlevel) {
2889  ii += 2; // three teams have same level
2890  } else {
2891  ii++; // two teams have same level
2892  }
2893  }
2894  }
2895 
2896  if (ii == level)
2897  return __kmp_tid_from_gtid(gtid);
2898 
2899  dd = team->t.t_serialized;
2900  level++;
2901  while (ii > level) {
2902  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2903  }
2904  if ((team->t.t_serialized) && (!dd)) {
2905  team = team->t.t_parent;
2906  continue;
2907  }
2908  if (ii > level) {
2909  team = team->t.t_parent;
2910  dd = team->t.t_serialized;
2911  ii--;
2912  }
2913  }
2914 
2915  return (dd > 1) ? (0) : (team->t.t_master_tid);
2916 }
2917 
2918 int __kmp_get_team_size(int gtid, int level) {
2919 
2920  int ii, dd;
2921  kmp_team_t *team;
2922  kmp_info_t *thr;
2923 
2924  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2925  KMP_DEBUG_ASSERT(__kmp_init_serial);
2926 
2927  // validate level
2928  if (level == 0)
2929  return 1;
2930  if (level < 0)
2931  return -1;
2932  thr = __kmp_threads[gtid];
2933  team = thr->th.th_team;
2934  ii = team->t.t_level;
2935  if (level > ii)
2936  return -1;
2937 
2938  if (thr->th.th_teams_microtask) {
2939  // AC: we are in teams region where multiple nested teams have same level
2940  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2941  if (level <=
2942  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2943  KMP_DEBUG_ASSERT(ii >= tlevel);
2944  // AC: As we need to pass through the teams league, we need to artificially
2945  // increase ii
2946  if (ii == tlevel) {
2947  ii += 2; // three teams have same level
2948  } else {
2949  ii++; // two teams have same level
2950  }
2951  }
2952  }
2953 
2954  while (ii > level) {
2955  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2956  }
2957  if (team->t.t_serialized && (!dd)) {
2958  team = team->t.t_parent;
2959  continue;
2960  }
2961  if (ii > level) {
2962  team = team->t.t_parent;
2963  ii--;
2964  }
2965  }
2966 
2967  return team->t.t_nproc;
2968 }
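/* Illustration (a minimal user-level sketch): omp_get_ancestor_thread_num()
   and omp_get_team_size() are backed by the two walks above, which step
   outwards through t_parent and account for serialized nesting via
   t_serialized.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_set_max_active_levels(2);
       #pragma omp parallel num_threads(2)
       #pragma omp parallel num_threads(2)
       {
         #pragma omp single
         printf("level %d: ancestor@1 = %d, team size@1 = %d\n",
                omp_get_level(), omp_get_ancestor_thread_num(1),
                omp_get_team_size(1));
       }
       return 0;
     }
*/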
2969 
2970 kmp_r_sched_t __kmp_get_schedule_global() {
2971  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2972  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2973  // independently, so the updated schedule can be obtained here.
2974 
2975  kmp_r_sched_t r_sched;
2976 
2977  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2978  // __kmp_guided. __kmp_sched should keep original value, so that user can set
2979  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2980  // different roots (even in OMP 2.5)
2981  enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2982  enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2983  if (s == kmp_sch_static) {
2984  // replace STATIC with more detailed schedule (balanced or greedy)
2985  r_sched.r_sched_type = __kmp_static;
2986  } else if (s == kmp_sch_guided_chunked) {
2987  // replace GUIDED with more detailed schedule (iterative or analytical)
2988  r_sched.r_sched_type = __kmp_guided;
2989  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2990  r_sched.r_sched_type = __kmp_sched;
2991  }
2992  SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2993 
2994  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2995  // __kmp_chunk may be wrong here (if it was never set)
2996  r_sched.chunk = KMP_DEFAULT_CHUNK;
2997  } else {
2998  r_sched.chunk = __kmp_chunk;
2999  }
3000 
3001  return r_sched;
3002 }
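/* Illustration (a minimal user-level sketch): the run-sched-var assembled
   here is what a schedule(runtime) loop consumes, so setting, for example,
   OMP_SCHEDULE="guided,4" in the environment gives the loop below a guided
   schedule with chunk 4.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       double sum = 0.0;
       #pragma omp parallel for schedule(runtime) reduction(+ : sum)
       for (int i = 0; i < 1000; ++i)
         sum += 0.5 * i;
       printf("sum = %f\n", sum);
       return 0;
     }
*/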
3003 
3004 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3005  at least argc number of *t_argv entries for the requested team. */
3006 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3007 
3008  KMP_DEBUG_ASSERT(team);
3009  if (!realloc || argc > team->t.t_max_argc) {
3010 
3011  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3012  "current entries=%d\n",
3013  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3014  /* if previously allocated heap space for args, free them */
3015  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3016  __kmp_free((void *)team->t.t_argv);
3017 
3018  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3019  /* use unused space in the cache line for arguments */
3020  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3021  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3022  "argv entries\n",
3023  team->t.t_id, team->t.t_max_argc));
3024  team->t.t_argv = &team->t.t_inline_argv[0];
3025  if (__kmp_storage_map) {
3026  __kmp_print_storage_map_gtid(
3027  -1, &team->t.t_inline_argv[0],
3028  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3029  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3030  team->t.t_id);
3031  }
3032  } else {
3033  /* allocate space for arguments in the heap */
3034  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3035  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3036  : 2 * argc;
3037  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3038  "argv entries\n",
3039  team->t.t_id, team->t.t_max_argc));
3040  team->t.t_argv =
3041  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3042  if (__kmp_storage_map) {
3043  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3044  &team->t.t_argv[team->t.t_max_argc],
3045  sizeof(void *) * team->t.t_max_argc,
3046  "team_%d.t_argv", team->t.t_id);
3047  }
3048  }
3049  }
3050 }
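/* Sketch of the sizing policy above as a stand-alone function (the parameter
   values are illustrative assumptions, not the library's constants):

     // Mirrors the decision in __kmp_alloc_argv_entries: small argument lists
     // fit in the inline buffer; larger ones get a heap capacity of either the
     // minimum malloc size or twice the request.
     static int argv_capacity(int argc, int inline_entries,
                              int min_malloc_entries) {
       if (argc <= inline_entries)
         return inline_entries;
       return (argc <= (min_malloc_entries >> 1)) ? min_malloc_entries
                                                  : 2 * argc;
     }
*/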
3051 
3052 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3053  int i;
3054  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3055  team->t.t_threads =
3056  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3057  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3058  sizeof(dispatch_shared_info_t) * num_disp_buff);
3059  team->t.t_dispatch =
3060  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3061  team->t.t_implicit_task_taskdata =
3062  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3063  team->t.t_max_nproc = max_nth;
3064 
3065  /* setup dispatch buffers */
3066  for (i = 0; i < num_disp_buff; ++i) {
3067  team->t.t_disp_buffer[i].buffer_index = i;
3068  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3069  }
3070 }
3071 
3072 static void __kmp_free_team_arrays(kmp_team_t *team) {
3073  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3074  int i;
3075  for (i = 0; i < team->t.t_max_nproc; ++i) {
3076  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3077  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3078  team->t.t_dispatch[i].th_disp_buffer = NULL;
3079  }
3080  }
3081 #if KMP_USE_HIER_SCHED
3082  __kmp_dispatch_free_hierarchies(team);
3083 #endif
3084  __kmp_free(team->t.t_threads);
3085  __kmp_free(team->t.t_disp_buffer);
3086  __kmp_free(team->t.t_dispatch);
3087  __kmp_free(team->t.t_implicit_task_taskdata);
3088  team->t.t_threads = NULL;
3089  team->t.t_disp_buffer = NULL;
3090  team->t.t_dispatch = NULL;
3091  team->t.t_implicit_task_taskdata = 0;
3092 }
3093 
3094 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3095  kmp_info_t **oldThreads = team->t.t_threads;
3096 
3097  __kmp_free(team->t.t_disp_buffer);
3098  __kmp_free(team->t.t_dispatch);
3099  __kmp_free(team->t.t_implicit_task_taskdata);
3100  __kmp_allocate_team_arrays(team, max_nth);
3101 
3102  KMP_MEMCPY(team->t.t_threads, oldThreads,
3103  team->t.t_nproc * sizeof(kmp_info_t *));
3104 
3105  __kmp_free(oldThreads);
3106 }
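
// Note that the reallocation above preserves only the first t_nproc t_threads
// pointers; the dispatch buffers, per-thread kmp_disp_t structures and the
// implicit task data are simply freed and recreated from scratch by
// __kmp_allocate_team_arrays().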
3107 
3108 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3109 
3110  kmp_r_sched_t r_sched =
3111  __kmp_get_schedule_global(); // get current state of scheduling globals
3112 
3113  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3114 
3115  kmp_internal_control_t g_icvs = {
3116  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3117  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3118  // adjustment of threads (per thread)
3119  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3120  // whether blocktime is explicitly set
3121  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3122 #if KMP_USE_MONITOR
3123  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3124 // intervals
3125 #endif
3126  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3127  // next parallel region (per thread)
3128  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3129  __kmp_cg_max_nth, // int thread_limit;
3130  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3131  // for max_active_levels
3132  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3133  // {sched,chunk} pair
3134  __kmp_nested_proc_bind.bind_types[0],
3135  __kmp_default_device,
3136  NULL // struct kmp_internal_control *next;
3137  };
3138 
3139  return g_icvs;
3140 }
3141 
3142 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3143 
3144  kmp_internal_control_t gx_icvs;
3145  gx_icvs.serial_nesting_level =
3146  0; // probably =team->t.t_serial like in save_inter_controls
3147  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3148  gx_icvs.next = NULL;
3149 
3150  return gx_icvs;
3151 }
3152 
3153 static void __kmp_initialize_root(kmp_root_t *root) {
3154  int f;
3155  kmp_team_t *root_team;
3156  kmp_team_t *hot_team;
3157  int hot_team_max_nth;
3158  kmp_r_sched_t r_sched =
3159  __kmp_get_schedule_global(); // get current state of scheduling globals
3160  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3161  KMP_DEBUG_ASSERT(root);
3162  KMP_ASSERT(!root->r.r_begin);
3163 
3164  /* setup the root state structure */
3165  __kmp_init_lock(&root->r.r_begin_lock);
3166  root->r.r_begin = FALSE;
3167  root->r.r_active = FALSE;
3168  root->r.r_in_parallel = 0;
3169  root->r.r_blocktime = __kmp_dflt_blocktime;
3170 
3171  /* setup the root team for this task */
3172  /* allocate the root team structure */
3173  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3174 
3175  root_team =
3176  __kmp_allocate_team(root,
3177  1, // new_nproc
3178  1, // max_nproc
3179 #if OMPT_SUPPORT
3180  ompt_data_none, // root parallel id
3181 #endif
3182  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3183  0 // argc
3184  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3185  );
3186 #if USE_DEBUGGER
3187  // Non-NULL value should be assigned to make the debugger display the root
3188  // team.
3189  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3190 #endif
3191 
3192  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3193 
3194  root->r.r_root_team = root_team;
3195  root_team->t.t_control_stack_top = NULL;
3196 
3197  /* initialize root team */
3198  root_team->t.t_threads[0] = NULL;
3199  root_team->t.t_nproc = 1;
3200  root_team->t.t_serialized = 1;
3201  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3202  root_team->t.t_sched.sched = r_sched.sched;
3203  KA_TRACE(
3204  20,
3205  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3206  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3207 
3208  /* setup the hot team for this task */
3209  /* allocate the hot team structure */
3210  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3211 
3212  hot_team =
3213  __kmp_allocate_team(root,
3214  1, // new_nproc
3215  __kmp_dflt_team_nth_ub * 2, // max_nproc
3216 #if OMPT_SUPPORT
3217  ompt_data_none, // root parallel id
3218 #endif
3219  __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3220  0 // argc
3221  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3222  );
3223  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3224 
3225  root->r.r_hot_team = hot_team;
3226  root_team->t.t_control_stack_top = NULL;
3227 
3228  /* first-time initialization */
3229  hot_team->t.t_parent = root_team;
3230 
3231  /* initialize hot team */
3232  hot_team_max_nth = hot_team->t.t_max_nproc;
3233  for (f = 0; f < hot_team_max_nth; ++f) {
3234  hot_team->t.t_threads[f] = NULL;
3235  }
3236  hot_team->t.t_nproc = 1;
3237  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3238  hot_team->t.t_sched.sched = r_sched.sched;
3239  hot_team->t.t_size_changed = 0;
3240 }
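
// Rough summary of the root setup above: every root owns two teams. The root
// team is a serialized team of size 1 representing the sequential part of the
// program, while the hot team (sized for up to __kmp_dflt_team_nth_ub * 2
// threads) is kept around between parallel regions so its workers can be
// reused instead of being forked anew.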
3241 
3242 #ifdef KMP_DEBUG
3243 
3244 typedef struct kmp_team_list_item {
3245  kmp_team_p const *entry;
3246  struct kmp_team_list_item *next;
3247 } kmp_team_list_item_t;
3248 typedef kmp_team_list_item_t *kmp_team_list_t;
3249 
3250 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3251  kmp_team_list_t list, // List of teams.
3252  kmp_team_p const *team // Team to add.
3253  ) {
3254 
3255  // List must terminate with item where both entry and next are NULL.
3256  // Team is added to the list only once.
3257  // List is sorted in ascending order by team id.
3258  // Team id is *not* a key.
3259 
3260  kmp_team_list_t l;
3261 
3262  KMP_DEBUG_ASSERT(list != NULL);
3263  if (team == NULL) {
3264  return;
3265  }
3266 
3267  __kmp_print_structure_team_accum(list, team->t.t_parent);
3268  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3269 
3270  // Search list for the team.
3271  l = list;
3272  while (l->next != NULL && l->entry != team) {
3273  l = l->next;
3274  }
3275  if (l->next != NULL) {
3276  return; // Team has been added before, exit.
3277  }
3278 
3279  // Team is not found. Search list again for insertion point.
3280  l = list;
3281  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3282  l = l->next;
3283  }
3284 
3285  // Insert team.
3286  {
3287  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3288  sizeof(kmp_team_list_item_t));
3289  *item = *l;
3290  l->entry = team;
3291  l->next = item;
3292  }
3293 }
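
// The insertion above avoids keeping a pointer to the previous node: the new
// item receives a copy of the current node (*item = *l), and the current node
// is then overwritten in place with the new team, which keeps the list sorted
// by t_id without a doubly-linked structure.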
3294 
3295 static void __kmp_print_structure_team(char const *title,
3296                                        kmp_team_p const *team) {
3298  __kmp_printf("%s", title);
3299  if (team != NULL) {
3300  __kmp_printf("%2x %p\n", team->t.t_id, team);
3301  } else {
3302  __kmp_printf(" - (nil)\n");
3303  }
3304 }
3305 
3306 static void __kmp_print_structure_thread(char const *title,
3307  kmp_info_p const *thread) {
3308  __kmp_printf("%s", title);
3309  if (thread != NULL) {
3310  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3311  } else {
3312  __kmp_printf(" - (nil)\n");
3313  }
3314 }
3315 
3316 void __kmp_print_structure(void) {
3317 
3318  kmp_team_list_t list;
3319 
3320  // Initialize list of teams.
3321  list =
3322  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3323  list->entry = NULL;
3324  list->next = NULL;
3325 
3326  __kmp_printf("\n------------------------------\nGlobal Thread "
3327  "Table\n------------------------------\n");
3328  {
3329  int gtid;
3330  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3331  __kmp_printf("%2d", gtid);
3332  if (__kmp_threads != NULL) {
3333  __kmp_printf(" %p", __kmp_threads[gtid]);
3334  }
3335  if (__kmp_root != NULL) {
3336  __kmp_printf(" %p", __kmp_root[gtid]);
3337  }
3338  __kmp_printf("\n");
3339  }
3340  }
3341 
3342  // Print out __kmp_threads array.
3343  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3344  "----------\n");
3345  if (__kmp_threads != NULL) {
3346  int gtid;
3347  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3348  kmp_info_t const *thread = __kmp_threads[gtid];
3349  if (thread != NULL) {
3350  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3351  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3352  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3353  __kmp_print_structure_team(" Serial Team: ",
3354  thread->th.th_serial_team);
3355  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3356  __kmp_print_structure_thread(" Master: ",
3357  thread->th.th_team_master);
3358  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3359  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3360  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3361  __kmp_print_structure_thread(" Next in pool: ",
3362  thread->th.th_next_pool);
3363  __kmp_printf("\n");
3364  __kmp_print_structure_team_accum(list, thread->th.th_team);
3365  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3366  }
3367  }
3368  } else {
3369  __kmp_printf("Threads array is not allocated.\n");
3370  }
3371 
3372  // Print out __kmp_root array.
3373  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3374  "--------\n");
3375  if (__kmp_root != NULL) {
3376  int gtid;
3377  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3378  kmp_root_t const *root = __kmp_root[gtid];
3379  if (root != NULL) {
3380  __kmp_printf("GTID %2d %p:\n", gtid, root);
3381  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3382  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3383  __kmp_print_structure_thread(" Uber Thread: ",
3384  root->r.r_uber_thread);
3385  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3386  __kmp_printf(" In Parallel: %2d\n",
3387  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3388  __kmp_printf("\n");
3389  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3390  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3391  }
3392  }
3393  } else {
3394  __kmp_printf("Ubers array is not allocated.\n");
3395  }
3396 
3397  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3398  "--------\n");
3399  while (list->next != NULL) {
3400  kmp_team_p const *team = list->entry;
3401  int i;
3402  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3403  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3404  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3405  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3406  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3407  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3408  for (i = 0; i < team->t.t_nproc; ++i) {
3409  __kmp_printf(" Thread %2d: ", i);
3410  __kmp_print_structure_thread("", team->t.t_threads[i]);
3411  }
3412  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3413  __kmp_printf("\n");
3414  list = list->next;
3415  }
3416 
3417  // Print out __kmp_thread_pool and __kmp_team_pool.
3418  __kmp_printf("\n------------------------------\nPools\n----------------------"
3419  "--------\n");
3420  __kmp_print_structure_thread("Thread pool: ",
3421  CCAST(kmp_info_t *, __kmp_thread_pool));
3422  __kmp_print_structure_team("Team pool: ",
3423  CCAST(kmp_team_t *, __kmp_team_pool));
3424  __kmp_printf("\n");
3425 
3426  // Free team list.
3427  while (list != NULL) {
3428  kmp_team_list_item_t *item = list;
3429  list = list->next;
3430  KMP_INTERNAL_FREE(item);
3431  }
3432 }
3433 
3434 #endif
3435 
3436 //---------------------------------------------------------------------------
3437 // Stuff for per-thread fast random number generator
3438 // Table of primes
3439 static const unsigned __kmp_primes[] = {
3440  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3441  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3442  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3443  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3444  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3445  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3446  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3447  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3448  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3449  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3450  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3451 
3452 //---------------------------------------------------------------------------
3453 // __kmp_get_random: Get a random number using a linear congruential method.
3454 unsigned short __kmp_get_random(kmp_info_t *thread) {
3455  unsigned x = thread->th.th_x;
3456  unsigned short r = x >> 16;
3457 
3458  thread->th.th_x = x * thread->th.th_a + 1;
3459 
3460  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3461  thread->th.th_info.ds.ds_tid, r));
3462 
3463  return r;
3464 }
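
// The generator above is a plain 32-bit linear congruential generator with a
// per-thread multiplier drawn from __kmp_primes and an increment of 1:
//   x_{n+1} = a * x_n + 1 (mod 2^32),   result = upper 16 bits of x_n
// Only the upper half is returned because the low-order bits of a
// power-of-two-modulus LCG have short periods.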
3465 //--------------------------------------------------------
3466 // __kmp_init_random: Initialize a random number generator
3467 void __kmp_init_random(kmp_info_t *thread) {
3468  unsigned seed = thread->th.th_info.ds.ds_tid;
3469 
3470  thread->th.th_a =
3471  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3472  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3473  KA_TRACE(30,
3474  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3475 }
3476 
3477 #if KMP_OS_WINDOWS
3478 /* reclaim array entries for root threads that are already dead, returns number
3479  * reclaimed */
3480 static int __kmp_reclaim_dead_roots(void) {
3481  int i, r = 0;
3482 
3483  for (i = 0; i < __kmp_threads_capacity; ++i) {
3484  if (KMP_UBER_GTID(i) &&
3485  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3486  !__kmp_root[i]
3487  ->r.r_active) { // AC: reclaim only roots died in non-active state
3488  r += __kmp_unregister_root_other_thread(i);
3489  }
3490  }
3491  return r;
3492 }
3493 #endif
3494 
3495 /* This function attempts to create free entries in __kmp_threads and
3496  __kmp_root, and returns the number of free entries generated.
3497 
3498  For Windows* OS static library, the first mechanism used is to reclaim array
3499  entries for root threads that are already dead.
3500 
3501  On all platforms, expansion is attempted on the arrays __kmp_threads and
3502  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3503  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3504  threadprivate cache array has been created. Synchronization with
3505  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3506 
3507  After any dead root reclamation, if the clipping value allows array expansion
3508  to result in the generation of a total of nNeed free slots, the function does
3509  that expansion. If not, nothing is done beyond the possible initial root
3510  thread reclamation.
3511 
3512  If any argument is negative, the behavior is undefined. */
3513 static int __kmp_expand_threads(int nNeed) {
3514  int added = 0;
3515  int minimumRequiredCapacity;
3516  int newCapacity;
3517  kmp_info_t **newThreads;
3518  kmp_root_t **newRoot;
3519 
3520 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3521 // resizing __kmp_threads does not need additional protection if foreign
3522 // threads are present
3523 
3524 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3525  /* only for Windows static library */
3526  /* reclaim array entries for root threads that are already dead */
3527  added = __kmp_reclaim_dead_roots();
3528 
3529  if (nNeed) {
3530  nNeed -= added;
3531  if (nNeed < 0)
3532  nNeed = 0;
3533  }
3534 #endif
3535  if (nNeed <= 0)
3536  return added;
3537 
3538  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3539  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3540  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3541  // > __kmp_max_nth in one of two ways:
3542  //
3543  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3544  // may not be reused by another thread, so we may need to increase
3545  // __kmp_threads_capacity to __kmp_max_nth + 1.
3546  //
3547  // 2) New foreign root(s) are encountered. We always register new foreign
3548  // roots. This may cause a smaller # of threads to be allocated at
3549  // subsequent parallel regions, but the worker threads hang around (and
3550  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3551  //
3552  // Anyway, that is the reason for moving the check to see if
3553  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3554  // instead of having it performed here. -BB
3555 
3556  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3557 
3558  /* compute expansion headroom to check if we can expand */
3559  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3560  /* possible expansion too small -- give up */
3561  return added;
3562  }
3563  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3564 
3565  newCapacity = __kmp_threads_capacity;
3566  do {
3567  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3568  : __kmp_sys_max_nth;
3569  } while (newCapacity < minimumRequiredCapacity);
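  // Worked example of the doubling above (assuming __kmp_sys_max_nth is
  // comfortably larger): with __kmp_threads_capacity == 64 and nNeed == 5,
  // minimumRequiredCapacity is 69 and newCapacity doubles to 128 on the first
  // iteration; doubling is clipped to __kmp_sys_max_nth once it would
  // overshoot that limit.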
3570  newThreads = (kmp_info_t **)__kmp_allocate(
3571  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3572  newRoot =
3573  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
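  // Both arrays share a single allocation: the kmp_info_t* slots come first,
  // immediately followed by the kmp_root_t* slots, with CACHE_LINE extra bytes
  // of slack at the end of the block.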
3574  KMP_MEMCPY(newThreads, __kmp_threads,
3575  __kmp_threads_capacity * sizeof(kmp_info_t *));
3576  KMP_MEMCPY(newRoot, __kmp_root,
3577  __kmp_threads_capacity * sizeof(kmp_root_t *));
3578 
3579  kmp_info_t **temp_threads = __kmp_threads;
3580  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3581  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3582  __kmp_free(temp_threads);
3583  added += newCapacity - __kmp_threads_capacity;
3584  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3585 
3586  if (newCapacity > __kmp_tp_capacity) {
3587  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3588  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3589  __kmp_threadprivate_resize_cache(newCapacity);
3590  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3591  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3592  }
3593  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3594  }
3595 
3596  return added;
3597 }
3598 
3599 /* Register the current thread as a root thread and obtain our gtid. We must
3600  have the __kmp_initz_lock held at this point. Argument TRUE only if we are the
3601  thread that calls from __kmp_do_serial_initialize() */
3602 int __kmp_register_root(int initial_thread) {
3603  kmp_info_t *root_thread;
3604  kmp_root_t *root;
3605  int gtid;
3606  int capacity;
3607  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3608  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3609  KMP_MB();
3610 
3611  /* 2007-03-02:
3612  If the initial thread has not invoked the OpenMP RTL yet, and this thread is
3613  not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3614  does not work as expected -- it may return false (meaning there is at least
3615  one empty slot in the __kmp_threads array), but it is possible that the only
3616  free slot is #0, which is reserved for the initial thread and so cannot be
3617  used for this one. The following code works around this bug.
3618 
3619  However, the right solution seems to be not reserving slot #0 for the
3620  initial thread, because:
3621  (1) there is no magic in slot #0,
3622  (2) we cannot reliably detect the initial thread (the first thread that does
3623  serial initialization may not be a real initial thread).
3624  */
3625  capacity = __kmp_threads_capacity;
3626  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3627  --capacity;
3628  }
3629 
3630  /* see if there are too many threads */
3631  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3632  if (__kmp_tp_cached) {
3633  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3634  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3635  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3636  } else {
3637  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3638  __kmp_msg_null);
3639  }
3640  }
3641 
3642  /* find an available thread slot */
3643  /* Don't reassign the zero slot since we need that to only be used by initial
3644  thread */
3645  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3646  gtid++)
3647  ;
3648  KA_TRACE(1,
3649  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3650  KMP_ASSERT(gtid < __kmp_threads_capacity);
3651 
3652  /* update global accounting */
3653  __kmp_all_nth++;
3654  TCW_4(__kmp_nth, __kmp_nth + 1);
3655 
3656  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3657  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3658  if (__kmp_adjust_gtid_mode) {
3659  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3660  if (TCR_4(__kmp_gtid_mode) != 2) {
3661  TCW_4(__kmp_gtid_mode, 2);
3662  }
3663  } else {
3664  if (TCR_4(__kmp_gtid_mode) != 1) {
3665  TCW_4(__kmp_gtid_mode, 1);
3666  }
3667  }
3668  }
3669 
3670 #ifdef KMP_ADJUST_BLOCKTIME
3671  /* Adjust blocktime to zero if necessary */
3672  /* Middle initialization might not have occurred yet */
3673  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3674  if (__kmp_nth > __kmp_avail_proc) {
3675  __kmp_zero_bt = TRUE;
3676  }
3677  }
3678 #endif /* KMP_ADJUST_BLOCKTIME */
3679 
3680  /* setup this new hierarchy */
3681  if (!(root = __kmp_root[gtid])) {
3682  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3683  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3684  }
3685 
3686 #if KMP_STATS_ENABLED
3687  // Initialize stats as soon as possible (right after gtid assignment).
3688  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3689  __kmp_stats_thread_ptr->startLife();
3690  KMP_SET_THREAD_STATE(SERIAL_REGION);
3691  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3692 #endif
3693  __kmp_initialize_root(root);
3694 
3695  /* setup new root thread structure */
3696  if (root->r.r_uber_thread) {
3697  root_thread = root->r.r_uber_thread;
3698  } else {
3699  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3700  if (__kmp_storage_map) {
3701  __kmp_print_thread_storage_map(root_thread, gtid);
3702  }
3703  root_thread->th.th_info.ds.ds_gtid = gtid;
3704 #if OMPT_SUPPORT
3705  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3706 #endif
3707  root_thread->th.th_root = root;
3708  if (__kmp_env_consistency_check) {
3709  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3710  }
3711 #if USE_FAST_MEMORY
3712  __kmp_initialize_fast_memory(root_thread);
3713 #endif /* USE_FAST_MEMORY */
3714 
3715 #if KMP_USE_BGET
3716  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3717  __kmp_initialize_bget(root_thread);
3718 #endif
3719  __kmp_init_random(root_thread); // Initialize random number generator
3720  }
3721 
3722  /* setup the serial team held in reserve by the root thread */
3723  if (!root_thread->th.th_serial_team) {
3724  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3725  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3726  root_thread->th.th_serial_team = __kmp_allocate_team(
3727  root, 1, 1,
3728 #if OMPT_SUPPORT
3729  ompt_data_none, // root parallel id
3730 #endif
3731  proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3732  }
3733  KMP_ASSERT(root_thread->th.th_serial_team);
3734  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3735  root_thread->th.th_serial_team));
3736 
3737  /* drop root_thread into place */
3738  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3739 
3740  root->r.r_root_team->t.t_threads[0] = root_thread;
3741  root->r.r_hot_team->t.t_threads[0] = root_thread;
3742  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3743  // AC: the team is created in reserve, not for execution (it is unused for now).
3744  root_thread->th.th_serial_team->t.t_serialized = 0;
3745  root->r.r_uber_thread = root_thread;
3746 
3747  /* initialize the thread, get it ready to go */
3748  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3749  TCW_4(__kmp_init_gtid, TRUE);
3750 
3751  /* prepare the master thread for get_gtid() */
3752  __kmp_gtid_set_specific(gtid);
3753 
3754 #if USE_ITT_BUILD
3755  __kmp_itt_thread_name(gtid);
3756 #endif /* USE_ITT_BUILD */
3757 
3758 #ifdef KMP_TDATA_GTID
3759  __kmp_gtid = gtid;
3760 #endif
3761  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3762  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3763 
3764  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3765  "plain=%u\n",
3766  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3767  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3768  KMP_INIT_BARRIER_STATE));
3769  { // Initialize barrier data.
3770  int b;
3771  for (b = 0; b < bs_last_barrier; ++b) {
3772  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3773 #if USE_DEBUGGER
3774  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3775 #endif
3776  }
3777  }
3778  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3779  KMP_INIT_BARRIER_STATE);
3780 
3781 #if KMP_AFFINITY_SUPPORTED
3782  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3783  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3784  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3785  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3786  if (TCR_4(__kmp_init_middle)) {
3787  __kmp_affinity_set_init_mask(gtid, TRUE);
3788  }
3789 #endif /* KMP_AFFINITY_SUPPORTED */
3790  root_thread->th.th_def_allocator = __kmp_def_allocator;
3791  root_thread->th.th_prev_level = 0;
3792  root_thread->th.th_prev_num_threads = 1;
3793 
3794  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3795  tmp->cg_root = root_thread;
3796  tmp->cg_thread_limit = __kmp_cg_max_nth;
3797  tmp->cg_nthreads = 1;
3798  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3799  " cg_nthreads init to 1\n",
3800  root_thread, tmp));
3801  tmp->up = NULL;
3802  root_thread->th.th_cg_roots = tmp;
3803 
3804  __kmp_root_counter++;
3805 
3806 #if OMPT_SUPPORT
3807  if (!initial_thread && ompt_enabled.enabled) {
3808 
3809  kmp_info_t *root_thread = ompt_get_thread();
3810 
3811  ompt_set_thread_state(root_thread, ompt_state_overhead);
3812 
3813  if (ompt_enabled.ompt_callback_thread_begin) {
3814  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3815  ompt_thread_initial, __ompt_get_thread_data_internal());
3816  }
3817  ompt_data_t *task_data;
3818  ompt_data_t *parallel_data;
3819  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3820  if (ompt_enabled.ompt_callback_implicit_task) {
3821  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3822  ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3823  }
3824 
3825  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3826  }
3827 #endif
3828 
3829  KMP_MB();
3830  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3831 
3832  return gtid;
3833 }
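
// A thread registered here keeps its slot in __kmp_threads and owns a
// kmp_root_t -- with a root team, a hot team and a reserve serial team --
// until __kmp_unregister_root_current_thread() (or, on Windows,
// __kmp_unregister_root_other_thread()) tears the hierarchy down again.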
3834 
3835 #if KMP_NESTED_HOT_TEAMS
3836 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3837  const int max_level) {
3838  int i, n, nth;
3839  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3840  if (!hot_teams || !hot_teams[level].hot_team) {
3841  return 0;
3842  }
3843  KMP_DEBUG_ASSERT(level < max_level);
3844  kmp_team_t *team = hot_teams[level].hot_team;
3845  nth = hot_teams[level].hot_team_nth;
3846  n = nth - 1; // master is not freed
3847  if (level < max_level - 1) {
3848  for (i = 0; i < nth; ++i) {
3849  kmp_info_t *th = team->t.t_threads[i];
3850  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3851  if (i > 0 && th->th.th_hot_teams) {
3852  __kmp_free(th->th.th_hot_teams);
3853  th->th.th_hot_teams = NULL;
3854  }
3855  }
3856  }
3857  __kmp_free_team(root, team, NULL);
3858  return n;
3859 }
3860 #endif
3861 
3862 // Resets a root thread and clears its root and hot teams.
3863 // Returns the number of __kmp_threads entries directly and indirectly freed.
3864 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3865  kmp_team_t *root_team = root->r.r_root_team;
3866  kmp_team_t *hot_team = root->r.r_hot_team;
3867  int n = hot_team->t.t_nproc;
3868  int i;
3869 
3870  KMP_DEBUG_ASSERT(!root->r.r_active);
3871 
3872  root->r.r_root_team = NULL;
3873  root->r.r_hot_team = NULL;
3874  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3875  // before call to __kmp_free_team().
3876  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3877 #if KMP_NESTED_HOT_TEAMS
3878  if (__kmp_hot_teams_max_level >
3879  0) { // need to free nested hot teams and their threads if any
3880  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3881  kmp_info_t *th = hot_team->t.t_threads[i];
3882  if (__kmp_hot_teams_max_level > 1) {
3883  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3884  }
3885  if (th->th.th_hot_teams) {
3886  __kmp_free(th->th.th_hot_teams);
3887  th->th.th_hot_teams = NULL;
3888  }
3889  }
3890  }
3891 #endif
3892  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3893 
3894  // Before we can reap the thread, we need to make certain that all other
3895  // threads in the teams that had this root as ancestor have stopped trying to
3896  // steal tasks.
3897  if (__kmp_tasking_mode != tskm_immediate_exec) {
3898  __kmp_wait_to_unref_task_teams();
3899  }
3900 
3901 #if KMP_OS_WINDOWS
3902  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3903  KA_TRACE(
3904  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3905  "\n",
3906  (LPVOID) & (root->r.r_uber_thread->th),
3907  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3908  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3909 #endif /* KMP_OS_WINDOWS */
3910 
3911 #if OMPT_SUPPORT
3912  ompt_data_t *task_data;
3913  ompt_data_t *parallel_data;
3914  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3915  if (ompt_enabled.ompt_callback_implicit_task) {
3916  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3917  ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3918  }
3919  if (ompt_enabled.ompt_callback_thread_end) {
3920  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3921  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3922  }
3923 #endif
3924 
3925  TCW_4(__kmp_nth,
3926  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3927  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3928  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3929  " to %d\n",
3930  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3931  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3932  if (i == 1) {
3933  // need to free contention group structure
3934  KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3935  root->r.r_uber_thread->th.th_cg_roots->cg_root);
3936  KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3937  __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3938  root->r.r_uber_thread->th.th_cg_roots = NULL;
3939  }
3940  __kmp_reap_thread(root->r.r_uber_thread, 1);
3941 
3942  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3943  // it instead of freeing it.
3944  root->r.r_uber_thread = NULL;
3945  /* mark root as no longer in use */
3946  root->r.r_begin = FALSE;
3947 
3948  return n;
3949 }
3950 
3951 void __kmp_unregister_root_current_thread(int gtid) {
3952  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3953  /* This lock should be OK, since unregister_root_current_thread is never
3954  called during an abort, only during a normal close. Furthermore, if you
3955  have the forkjoin lock, you should never try to get the initz lock. */
3956  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3957  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3958  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3959  "exiting T#%d\n",
3960  gtid));
3961  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3962  return;
3963  }
3964  kmp_root_t *root = __kmp_root[gtid];
3965 
3966  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3967  KMP_ASSERT(KMP_UBER_GTID(gtid));
3968  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3969  KMP_ASSERT(root->r.r_active == FALSE);
3970 
3971  KMP_MB();
3972 
3973  kmp_info_t *thread = __kmp_threads[gtid];
3974  kmp_team_t *team = thread->th.th_team;
3975  kmp_task_team_t *task_team = thread->th.th_task_team;
3976 
3977  // we need to wait for the proxy tasks before finishing the thread
3978  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3979 #if OMPT_SUPPORT
3980  // the runtime is shutting down so we won't report any events
3981  thread->th.ompt_thread_info.state = ompt_state_undefined;
3982 #endif
3983  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3984  }
3985 
3986  __kmp_reset_root(gtid, root);
3987 
3988  /* free up this thread slot */
3989  __kmp_gtid_set_specific(KMP_GTID_DNE);
3990 #ifdef KMP_TDATA_GTID
3991  __kmp_gtid = KMP_GTID_DNE;
3992 #endif
3993 
3994  KMP_MB();
3995  KC_TRACE(10,
3996  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3997 
3998  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3999 }
4000 
4001 #if KMP_OS_WINDOWS
4002 /* __kmp_forkjoin_lock must be already held
4003  Unregisters a root thread that is not the current thread. Returns the number
4004  of __kmp_threads entries freed as a result. */
4005 static int __kmp_unregister_root_other_thread(int gtid) {
4006  kmp_root_t *root = __kmp_root[gtid];
4007  int r;
4008 
4009  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4010  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4011  KMP_ASSERT(KMP_UBER_GTID(gtid));
4012  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4013  KMP_ASSERT(root->r.r_active == FALSE);
4014 
4015  r = __kmp_reset_root(gtid, root);
4016  KC_TRACE(10,
4017  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4018  return r;
4019 }
4020 #endif
4021 
4022 #if KMP_DEBUG
4023 void __kmp_task_info() {
4024 
4025  kmp_int32 gtid = __kmp_entry_gtid();
4026  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4027  kmp_info_t *this_thr = __kmp_threads[gtid];
4028  kmp_team_t *steam = this_thr->th.th_serial_team;
4029  kmp_team_t *team = this_thr->th.th_team;
4030 
4031  __kmp_printf(
4032  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4033  "ptask=%p\n",
4034  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4035  team->t.t_implicit_task_taskdata[tid].td_parent);
4036 }
4037 #endif // KMP_DEBUG
4038 
4039 /* TODO optimize with one big memclr, take out what isn't needed, split
4040  responsibility to workers as much as possible, and delay initialization of
4041  features as much as possible */
4042 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4043  int tid, int gtid) {
4044  /* this_thr->th.th_info.ds.ds_gtid is set up in
4045  kmp_allocate_thread/create_worker.
4046  this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4047  kmp_info_t *master = team->t.t_threads[0];
4048  KMP_DEBUG_ASSERT(this_thr != NULL);
4049  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4050  KMP_DEBUG_ASSERT(team);
4051  KMP_DEBUG_ASSERT(team->t.t_threads);
4052  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4053  KMP_DEBUG_ASSERT(master);
4054  KMP_DEBUG_ASSERT(master->th.th_root);
4055 
4056  KMP_MB();
4057 
4058  TCW_SYNC_PTR(this_thr->th.th_team, team);
4059 
4060  this_thr->th.th_info.ds.ds_tid = tid;
4061  this_thr->th.th_set_nproc = 0;
4062  if (__kmp_tasking_mode != tskm_immediate_exec)
4063  // When tasking is possible, threads are not safe to reap until they are
4064  // done tasking; this will be set when tasking code is exited in wait
4065  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4066  else // no tasking --> always safe to reap
4067  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4068  this_thr->th.th_set_proc_bind = proc_bind_default;
4069 #if KMP_AFFINITY_SUPPORTED
4070  this_thr->th.th_new_place = this_thr->th.th_current_place;
4071 #endif
4072  this_thr->th.th_root = master->th.th_root;
4073 
4074  /* setup the thread's cache of the team structure */
4075  this_thr->th.th_team_nproc = team->t.t_nproc;
4076  this_thr->th.th_team_master = master;
4077  this_thr->th.th_team_serialized = team->t.t_serialized;
4078  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4079 
4080  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4081 
4082  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4083  tid, gtid, this_thr, this_thr->th.th_current_task));
4084 
4085  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4086  team, tid, TRUE);
4087 
4088  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4089  tid, gtid, this_thr, this_thr->th.th_current_task));
4090  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4091  // __kmp_initialize_team()?
4092 
4093  /* TODO no worksharing in speculative threads */
4094  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4095 
4096  this_thr->th.th_local.this_construct = 0;
4097 
4098  if (!this_thr->th.th_pri_common) {
4099  this_thr->th.th_pri_common =
4100  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4101  if (__kmp_storage_map) {
4102  __kmp_print_storage_map_gtid(
4103  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4104  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4105  }
4106  this_thr->th.th_pri_head = NULL;
4107  }
4108 
4109  if (this_thr != master && // Master's CG root is initialized elsewhere
4110  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4111  // Make new thread's CG root same as master's
4112  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4113  kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4114  if (tmp) {
4115  // worker changes CG, need to check if old CG should be freed
4116  int i = tmp->cg_nthreads--;
4117  KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4118  " on node %p of thread %p to %d\n",
4119  this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4120  if (i == 1) {
4121  __kmp_free(tmp); // last thread left CG --> free it
4122  }
4123  }
4124  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4125  // Increment new thread's CG root's counter to add the new thread
4126  this_thr->th.th_cg_roots->cg_nthreads++;
4127  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4128  " node %p of thread %p to %d\n",
4129  this_thr, this_thr->th.th_cg_roots,
4130  this_thr->th.th_cg_roots->cg_root,
4131  this_thr->th.th_cg_roots->cg_nthreads));
4132  this_thr->th.th_current_task->td_icvs.thread_limit =
4133  this_thr->th.th_cg_roots->cg_thread_limit;
4134  }
4135 
4136  /* Initialize dynamic dispatch */
4137  {
4138  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4139  // Use team max_nproc since this will never change for the team.
4140  size_t disp_size =
4141  sizeof(dispatch_private_info_t) *
4142  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
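    // For a team with t_max_nproc == 1 a single private buffer is allocated;
    // larger teams get __kmp_dispatch_num_buffers of them so that consecutive
    // dynamically scheduled loops can be pipelined (each loop instance uses
    // the next buffer in the cycle) without waiting on each other's
    // bookkeeping.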
4143  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4144  team->t.t_max_nproc));
4145  KMP_ASSERT(dispatch);
4146  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4147  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4148 
4149  dispatch->th_disp_index = 0;
4150  dispatch->th_doacross_buf_idx = 0;
4151  if (!dispatch->th_disp_buffer) {
4152  dispatch->th_disp_buffer =
4153  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4154 
4155  if (__kmp_storage_map) {
4156  __kmp_print_storage_map_gtid(
4157  gtid, &dispatch->th_disp_buffer[0],
4158  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4159  ? 1
4160  : __kmp_dispatch_num_buffers],
4161  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4162  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4163  gtid, team->t.t_id, gtid);
4164  }
4165  } else {
4166  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4167  }
4168 
4169  dispatch->th_dispatch_pr_current = 0;
4170  dispatch->th_dispatch_sh_current = 0;
4171 
4172  dispatch->th_deo_fcn = 0; /* ORDERED */
4173  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4174  }
4175 
4176  this_thr->th.th_next_pool = NULL;
4177 
4178  if (!this_thr->th.th_task_state_memo_stack) {
4179  size_t i;
4180  this_thr->th.th_task_state_memo_stack =
4181  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4182  this_thr->th.th_task_state_top = 0;
4183  this_thr->th.th_task_state_stack_sz = 4;
4184  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4185  ++i) // zero init the stack
4186  this_thr->th.th_task_state_memo_stack[i] = 0;
4187  }
4188 
4189  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4190  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4191 
4192  KMP_MB();
4193 }
4194 
4195 /* Allocate a new thread for the requesting team. This is only called from
4196  within a forkjoin critical section. We will first try to get an available
4197  thread from the thread pool. If none is available, we will fork a new one,
4198  assuming we are able to create one; this should be assured, as the caller
4199  should have checked this first. */
4200 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4201  int new_tid) {
4202  kmp_team_t *serial_team;
4203  kmp_info_t *new_thr;
4204  int new_gtid;
4205 
4206  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4207  KMP_DEBUG_ASSERT(root && team);
4208 #if !KMP_NESTED_HOT_TEAMS
4209  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4210 #endif
4211  KMP_MB();
4212 
4213  /* first, try to get one from the thread pool */
4214  if (__kmp_thread_pool) {
4215  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4216  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4217  if (new_thr == __kmp_thread_pool_insert_pt) {
4218  __kmp_thread_pool_insert_pt = NULL;
4219  }
4220  TCW_4(new_thr->th.th_in_pool, FALSE);
4221  __kmp_suspend_initialize_thread(new_thr);
4222  __kmp_lock_suspend_mx(new_thr);
4223  if (new_thr->th.th_active_in_pool == TRUE) {
4224  KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4225  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4226  new_thr->th.th_active_in_pool = FALSE;
4227  }
4228  __kmp_unlock_suspend_mx(new_thr);
4229 
4230  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4231  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4232  KMP_ASSERT(!new_thr->th.th_team);
4233  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4234 
4235  /* setup the thread structure */
4236  __kmp_initialize_info(new_thr, team, new_tid,
4237  new_thr->th.th_info.ds.ds_gtid);
4238  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4239 
4240  TCW_4(__kmp_nth, __kmp_nth + 1);
4241 
4242  new_thr->th.th_task_state = 0;
4243  new_thr->th.th_task_state_top = 0;
4244  new_thr->th.th_task_state_stack_sz = 4;
4245 
4246 #ifdef KMP_ADJUST_BLOCKTIME
4247  /* Adjust blocktime back to zero if necessary */
4248  /* Middle initialization might not have occurred yet */
4249  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4250  if (__kmp_nth > __kmp_avail_proc) {
4251  __kmp_zero_bt = TRUE;
4252  }
4253  }
4254 #endif /* KMP_ADJUST_BLOCKTIME */
4255 
4256 #if KMP_DEBUG
4257  // If the thread entered the pool via __kmp_free_thread, wait_flag should not
4258  // be KMP_BARRIER_PARENT_FLAG.
4259  int b;
4260  kmp_balign_t *balign = new_thr->th.th_bar;
4261  for (b = 0; b < bs_last_barrier; ++b)
4262  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4263 #endif
4264 
4265  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4266  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4267 
4268  KMP_MB();
4269  return new_thr;
4270  }
4271 
4272  /* no, we'll fork a new one */
4273  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4274  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4275 
4276 #if KMP_USE_MONITOR
4277  // If this is the first worker thread the RTL is creating, then also
4278  // launch the monitor thread. We try to do this as early as possible.
4279  if (!TCR_4(__kmp_init_monitor)) {
4280  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4281  if (!TCR_4(__kmp_init_monitor)) {
4282  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4283  TCW_4(__kmp_init_monitor, 1);
4284  __kmp_create_monitor(&__kmp_monitor);
4285  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4286 #if KMP_OS_WINDOWS
4287  // AC: wait until monitor has started. This is a fix for CQ232808.
4288  // The reason is that if the library is loaded/unloaded in a loop with
4289  // small (parallel) work in between, then there is a high probability that
4290  // the monitor thread starts after the library has shut down. At shutdown it
4291  // is too late to cope with the problem, because when the master is in
4292  // DllMain (process detach) the monitor has no chance to start (it is
4293  // blocked), and the master has no means to inform the monitor that the
4294  // library has gone, because all the memory which the monitor can access
4295  // is going to be released/reset.
4296  while (TCR_4(__kmp_init_monitor) < 2) {
4297  KMP_YIELD(TRUE);
4298  }
4299  KF_TRACE(10, ("after monitor thread has started\n"));
4300 #endif
4301  }
4302  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4303  }
4304 #endif
4305 
4306  KMP_MB();
4307  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4308  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4309  }
4310 
4311  /* allocate space for it. */
4312  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4313 
4314  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4315 
4316 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4317  // suppress race conditions detection on synchronization flags in debug mode
4318  // this helps to analyze library internals eliminating false positives
4319  __itt_suppress_mark_range(
4320  __itt_suppress_range, __itt_suppress_threading_errors,
4321  &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4322  __itt_suppress_mark_range(
4323  __itt_suppress_range, __itt_suppress_threading_errors,
4324  &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4325 #if KMP_OS_WINDOWS
4326  __itt_suppress_mark_range(
4327  __itt_suppress_range, __itt_suppress_threading_errors,
4328  &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4329 #else
4330  __itt_suppress_mark_range(__itt_suppress_range,
4331  __itt_suppress_threading_errors,
4332  &new_thr->th.th_suspend_init_count,
4333  sizeof(new_thr->th.th_suspend_init_count));
4334 #endif
4335  // TODO: check if we need to also suppress b_arrived flags
4336  __itt_suppress_mark_range(__itt_suppress_range,
4337  __itt_suppress_threading_errors,
4338  CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4339  sizeof(new_thr->th.th_bar[0].bb.b_go));
4340  __itt_suppress_mark_range(__itt_suppress_range,
4341  __itt_suppress_threading_errors,
4342  CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4343  sizeof(new_thr->th.th_bar[1].bb.b_go));
4344  __itt_suppress_mark_range(__itt_suppress_range,
4345  __itt_suppress_threading_errors,
4346  CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4347  sizeof(new_thr->th.th_bar[2].bb.b_go));
4348 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4349  if (__kmp_storage_map) {
4350  __kmp_print_thread_storage_map(new_thr, new_gtid);
4351  }
4352 
4353  // add the reserve serialized team, initialized from the team's master thread
4354  {
4355  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4356  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4357  new_thr->th.th_serial_team = serial_team =
4358  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4359 #if OMPT_SUPPORT
4360  ompt_data_none, // root parallel id
4361 #endif
4362  proc_bind_default, &r_icvs,
4363  0 USE_NESTED_HOT_ARG(NULL));
4364  }
4365  KMP_ASSERT(serial_team);
4366  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not
4367  // for execution (it is unused for now).
4368  serial_team->t.t_threads[0] = new_thr;
4369  KF_TRACE(10,
4370  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4371  new_thr));
4372 
4373  /* setup the thread structures */
4374  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4375 
4376 #if USE_FAST_MEMORY
4377  __kmp_initialize_fast_memory(new_thr);
4378 #endif /* USE_FAST_MEMORY */
4379 
4380 #if KMP_USE_BGET
4381  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4382  __kmp_initialize_bget(new_thr);
4383 #endif
4384 
4385  __kmp_init_random(new_thr); // Initialize random number generator
4386 
4387  /* Initialize these only once when thread is grabbed for a team allocation */
4388  KA_TRACE(20,
4389  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4390  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4391 
4392  int b;
4393  kmp_balign_t *balign = new_thr->th.th_bar;
4394  for (b = 0; b < bs_last_barrier; ++b) {
4395  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4396  balign[b].bb.team = NULL;
4397  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4398  balign[b].bb.use_oncore_barrier = 0;
4399  }
4400 
4401  new_thr->th.th_spin_here = FALSE;
4402  new_thr->th.th_next_waiting = 0;
4403 #if KMP_OS_UNIX
4404  new_thr->th.th_blocking = false;
4405 #endif
4406 
4407 #if KMP_AFFINITY_SUPPORTED
4408  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4409  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4410  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4411  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4412 #endif
4413  new_thr->th.th_def_allocator = __kmp_def_allocator;
4414  new_thr->th.th_prev_level = 0;
4415  new_thr->th.th_prev_num_threads = 1;
4416 
4417  TCW_4(new_thr->th.th_in_pool, FALSE);
4418  new_thr->th.th_active_in_pool = FALSE;
4419  TCW_4(new_thr->th.th_active, TRUE);
4420 
4421  /* adjust the global counters */
4422  __kmp_all_nth++;
4423  __kmp_nth++;
4424 
4425  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4426  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4427  if (__kmp_adjust_gtid_mode) {
4428  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4429  if (TCR_4(__kmp_gtid_mode) != 2) {
4430  TCW_4(__kmp_gtid_mode, 2);
4431  }
4432  } else {
4433  if (TCR_4(__kmp_gtid_mode) != 1) {
4434  TCW_4(__kmp_gtid_mode, 1);
4435  }
4436  }
4437  }
4438 
4439 #ifdef KMP_ADJUST_BLOCKTIME
4440  /* Adjust blocktime back to zero if necessary */
4441  /* Middle initialization might not have occurred yet */
4442  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4443  if (__kmp_nth > __kmp_avail_proc) {
4444  __kmp_zero_bt = TRUE;
4445  }
4446  }
4447 #endif /* KMP_ADJUST_BLOCKTIME */
4448 
4449  /* actually fork it and create the new worker thread */
4450  KF_TRACE(
4451  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4452  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4453  KF_TRACE(10,
4454  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4455 
4456  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4457  new_gtid));
4458  KMP_MB();
4459  return new_thr;
4460 }
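
// Two paths lead out of __kmp_allocate_thread above: reuse of a pooled thread,
// which only needs __kmp_initialize_info and a reset of the task-state stack,
// or creation of a brand-new worker, which additionally allocates the
// kmp_info_t, a reserve serial team, the fast-memory/bget pools (where
// enabled), the random-number state and finally the OS thread via
// __kmp_create_worker().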
4461 
4462 /* Reinitialize team for reuse.
4463  The hot team code calls this at every fork barrier, so EPCC barrier
4464  tests are extremely sensitive to changes in it, esp. writes to the team
4465  struct, which cause a cache invalidation in all threads.
4466  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4467 static void __kmp_reinitialize_team(kmp_team_t *team,
4468  kmp_internal_control_t *new_icvs,
4469  ident_t *loc) {
4470  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4471  team->t.t_threads[0], team));
4472  KMP_DEBUG_ASSERT(team && new_icvs);
4473  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4474  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4475 
4476  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4477  // Copy ICVs to the master thread's implicit taskdata
4478  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4479  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4480 
4481  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4482  team->t.t_threads[0], team));
4483 }
4484 
4485 /* Initialize the team data structure.
4486  This assumes the t_threads and t_max_nproc are already set.
4487  Also, we don't touch the arguments */
4488 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4489  kmp_internal_control_t *new_icvs,
4490  ident_t *loc) {
4491  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4492 
4493  /* verify */
4494  KMP_DEBUG_ASSERT(team);
4495  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4496  KMP_DEBUG_ASSERT(team->t.t_threads);
4497  KMP_MB();
4498 
4499  team->t.t_master_tid = 0; /* not needed */
4500  /* team->t.t_master_bar; not needed */
4501  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4502  team->t.t_nproc = new_nproc;
4503 
4504  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4505  team->t.t_next_pool = NULL;
4506  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4507  * up hot team */
4508 
4509  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4510  team->t.t_invoke = NULL; /* not needed */
4511 
4512  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4513  team->t.t_sched.sched = new_icvs->sched.sched;
4514 
4515 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4516  team->t.t_fp_control_saved = FALSE; /* not needed */
4517  team->t.t_x87_fpu_control_word = 0; /* not needed */
4518  team->t.t_mxcsr = 0; /* not needed */
4519 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4520 
4521  team->t.t_construct = 0;
4522 
4523  team->t.t_ordered.dt.t_value = 0;
4524  team->t.t_master_active = FALSE;
4525 
4526 #ifdef KMP_DEBUG
4527  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4528 #endif
4529 #if KMP_OS_WINDOWS
4530  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4531 #endif
4532 
4533  team->t.t_control_stack_top = NULL;
4534 
4535  __kmp_reinitialize_team(team, new_icvs, loc);
4536 
4537  KMP_MB();
4538  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4539 }
4540 
4541 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4542 /* Sets full mask for thread and returns old mask, no changes to structures. */
4543 static void
4544 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4545  if (KMP_AFFINITY_CAPABLE()) {
4546  int status;
4547  if (old_mask != NULL) {
4548  status = __kmp_get_system_affinity(old_mask, TRUE);
4549  int error = errno;
4550  if (status != 0) {
4551  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4552  __kmp_msg_null);
4553  }
4554  }
4555  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4556  }
4557 }
4558 #endif
4559 
4560 #if KMP_AFFINITY_SUPPORTED
4561 
4562 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4563 // It calculates the worker + master thread's partition based upon the parent
4564 // thread's partition, and binds each worker to a thread in their partition.
4565 // The master thread's partition should already include its current binding.
4566 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4567  // Copy the master thread's place partition to the team struct
4568  kmp_info_t *master_th = team->t.t_threads[0];
4569  KMP_DEBUG_ASSERT(master_th != NULL);
4570  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4571  int first_place = master_th->th.th_first_place;
4572  int last_place = master_th->th.th_last_place;
4573  int masters_place = master_th->th.th_current_place;
4574  team->t.t_first_place = first_place;
4575  team->t.t_last_place = last_place;
4576 
4577  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4578  "bound to place %d partition = [%d,%d]\n",
4579  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4580  team->t.t_id, masters_place, first_place, last_place));
4581 
4582  switch (proc_bind) {
4583 
4584  case proc_bind_default:
4585  // serial teams might have the proc_bind policy set to proc_bind_default. It
4586  // doesn't matter, as we don't rebind the master thread for any proc_bind policy
4587  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4588  break;
4589 
4590  case proc_bind_master: {
4591  int f;
4592  int n_th = team->t.t_nproc;
4593  for (f = 1; f < n_th; f++) {
4594  kmp_info_t *th = team->t.t_threads[f];
4595  KMP_DEBUG_ASSERT(th != NULL);
4596  th->th.th_first_place = first_place;
4597  th->th.th_last_place = last_place;
4598  th->th.th_new_place = masters_place;
4599  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4600  team->t.t_display_affinity != 1) {
4601  team->t.t_display_affinity = 1;
4602  }
4603 
4604  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4605  "partition = [%d,%d]\n",
4606  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4607  f, masters_place, first_place, last_place));
4608  }
4609  } break;
4610 
4611  case proc_bind_close: {
4612  int f;
4613  int n_th = team->t.t_nproc;
4614  int n_places;
4615  if (first_place <= last_place) {
4616  n_places = last_place - first_place + 1;
4617  } else {
4618  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4619  }
4620  if (n_th <= n_places) {
4621  int place = masters_place;
4622  for (f = 1; f < n_th; f++) {
4623  kmp_info_t *th = team->t.t_threads[f];
4624  KMP_DEBUG_ASSERT(th != NULL);
4625 
4626  if (place == last_place) {
4627  place = first_place;
4628  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4629  place = 0;
4630  } else {
4631  place++;
4632  }
4633  th->th.th_first_place = first_place;
4634  th->th.th_last_place = last_place;
4635  th->th.th_new_place = place;
4636  if (__kmp_display_affinity && place != th->th.th_current_place &&
4637  team->t.t_display_affinity != 1) {
4638  team->t.t_display_affinity = 1;
4639  }
4640 
4641  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4642  "partition = [%d,%d]\n",
4643  __kmp_gtid_from_thread(team->t.t_threads[f]),
4644  team->t.t_id, f, place, first_place, last_place));
4645  }
4646  } else {
4647  int S, rem, gap, s_count;
4648  S = n_th / n_places;
4649  s_count = 0;
4650  rem = n_th - (S * n_places);
4651  gap = rem > 0 ? n_places / rem : n_places;
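  // Each place receives S threads; 'rem' places get one extra thread,
  // handed out once every 'gap' places so the surplus is spread evenly
  // across the partition.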
4652  int place = masters_place;
4653  int gap_ct = gap;
4654  for (f = 0; f < n_th; f++) {
4655  kmp_info_t *th = team->t.t_threads[f];
4656  KMP_DEBUG_ASSERT(th != NULL);
4657 
4658  th->th.th_first_place = first_place;
4659  th->th.th_last_place = last_place;
4660  th->th.th_new_place = place;
4661  if (__kmp_display_affinity && place != th->th.th_current_place &&
4662  team->t.t_display_affinity != 1) {
4663  team->t.t_display_affinity = 1;
4664  }
4665  s_count++;
4666 
4667  if ((s_count == S) && rem && (gap_ct == gap)) {
4668  // do nothing, add an extra thread to place on next iteration
4669  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4670  // we added an extra thread to this place; move to next place
4671  if (place == last_place) {
4672  place = first_place;
4673  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4674  place = 0;
4675  } else {
4676  place++;
4677  }
4678  s_count = 0;
4679  gap_ct = 1;
4680  rem--;
4681  } else if (s_count == S) { // place full; don't add extra
4682  if (place == last_place) {
4683  place = first_place;
4684  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4685  place = 0;
4686  } else {
4687  place++;
4688  }
4689  gap_ct++;
4690  s_count = 0;
4691  }
4692 
4693  KA_TRACE(100,
4694  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4695  "partition = [%d,%d]\n",
4696  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4697  th->th.th_new_place, first_place, last_place));
4698  }
4699  KMP_DEBUG_ASSERT(place == masters_place);
4700  }
4701  } break;
4702 
4703  case proc_bind_spread: {
4704  int f;
4705  int n_th = team->t.t_nproc;
4706  int n_places;
4707  int thidx;
4708  if (first_place <= last_place) {
4709  n_places = last_place - first_place + 1;
4710  } else {
4711  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4712  }
4713  if (n_th <= n_places) {
4714  int place = -1;
4715 
4716  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4717  int S = n_places / n_th;
4718  int s_count, rem, gap, gap_ct;
4719 
4720  place = masters_place;
4721  rem = n_places - n_th * S;
4722  gap = rem ? n_th / rem : 1;
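  // Each thread gets a sub-partition of S consecutive places; 'rem' threads
  // receive one extra place, granted once every 'gap' threads so the
  // leftover places are distributed evenly.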
4723  gap_ct = gap;
4724  thidx = n_th;
4725  if (update_master_only == 1)
4726  thidx = 1;
4727  for (f = 0; f < thidx; f++) {
4728  kmp_info_t *th = team->t.t_threads[f];
4729  KMP_DEBUG_ASSERT(th != NULL);
4730 
4731  th->th.th_first_place = place;
4732  th->th.th_new_place = place;
4733  if (__kmp_display_affinity && place != th->th.th_current_place &&
4734  team->t.t_display_affinity != 1) {
4735  team->t.t_display_affinity = 1;
4736  }
4737  s_count = 1;
4738  while (s_count < S) {
4739  if (place == last_place) {
4740  place = first_place;
4741  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4742  place = 0;
4743  } else {
4744  place++;
4745  }
4746  s_count++;
4747  }
4748  if (rem && (gap_ct == gap)) {
4749  if (place == last_place) {
4750  place = first_place;
4751  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4752  place = 0;
4753  } else {
4754  place++;
4755  }
4756  rem--;
4757  gap_ct = 0;
4758  }
4759  th->th.th_last_place = place;
4760  gap_ct++;
4761 
4762  if (place == last_place) {
4763  place = first_place;
4764  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4765  place = 0;
4766  } else {
4767  place++;
4768  }
4769 
4770  KA_TRACE(100,
4771  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4772  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4773  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4774  f, th->th.th_new_place, th->th.th_first_place,
4775  th->th.th_last_place, __kmp_affinity_num_masks));
4776  }
4777  } else {
4778  /* With a uniform space of available computation places we can create
4779  T partitions of roughly P/T places each and put each thread into the
4780  first place of its partition. */
4781  double current = static_cast<double>(masters_place);
4782  double spacing =
4783  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
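  // 'current' advances by the fractional 'spacing' for each thread;
  // truncating it to int yields each partition's first/last place, so
  // partition sizes differ by at most one place (before the wrap-around
  // adjustments below).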
4784  int first, last;
4785  kmp_info_t *th;
4786 
4787  thidx = n_th + 1;
4788  if (update_master_only == 1)
4789  thidx = 1;
4790  for (f = 0; f < thidx; f++) {
4791  first = static_cast<int>(current);
4792  last = static_cast<int>(current + spacing) - 1;
4793  KMP_DEBUG_ASSERT(last >= first);
4794  if (first >= n_places) {
4795  if (masters_place) {
4796  first -= n_places;
4797  last -= n_places;
4798  if (first == (masters_place + 1)) {
4799  KMP_DEBUG_ASSERT(f == n_th);
4800  first--;
4801  }
4802  if (last == masters_place) {
4803  KMP_DEBUG_ASSERT(f == (n_th - 1));
4804  last--;
4805  }
4806  } else {
4807  KMP_DEBUG_ASSERT(f == n_th);
4808  first = 0;
4809  last = 0;
4810  }
4811  }
4812  if (last >= n_places) {
4813  last = (n_places - 1);
4814  }
4815  place = first;
4816  current += spacing;
4817  if (f < n_th) {
4818  KMP_DEBUG_ASSERT(0 <= first);
4819  KMP_DEBUG_ASSERT(n_places > first);
4820  KMP_DEBUG_ASSERT(0 <= last);
4821  KMP_DEBUG_ASSERT(n_places > last);
4822  KMP_DEBUG_ASSERT(last_place >= first_place);
4823  th = team->t.t_threads[f];
4824  KMP_DEBUG_ASSERT(th);
4825  th->th.th_first_place = first;
4826  th->th.th_new_place = place;
4827  th->th.th_last_place = last;
4828  if (__kmp_display_affinity && place != th->th.th_current_place &&
4829  team->t.t_display_affinity != 1) {
4830  team->t.t_display_affinity = 1;
4831  }
4832  KA_TRACE(100,
4833  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4834  "partition = [%d,%d], spacing = %.4f\n",
4835  __kmp_gtid_from_thread(team->t.t_threads[f]),
4836  team->t.t_id, f, th->th.th_new_place,
4837  th->th.th_first_place, th->th.th_last_place, spacing));
4838  }
4839  }
4840  }
4841  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4842  } else {
4843  int S, rem, gap, s_count;
4844  S = n_th / n_places;
4845  s_count = 0;
4846  rem = n_th - (S * n_places);
4847  gap = rem > 0 ? n_places / rem : n_places;
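  // Same distribution as the proc_bind_close overflow case: each place
  // receives S threads, and 'rem' places take one extra thread, spaced
  // every 'gap' places.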
4848  int place = masters_place;
4849  int gap_ct = gap;
4850  thidx = n_th;
4851  if (update_master_only == 1)
4852  thidx = 1;
4853  for (f = 0; f < thidx; f++) {
4854  kmp_info_t *th = team->t.t_threads[f];
4855  KMP_DEBUG_ASSERT(th != NULL);
4856 
4857  th->th.th_first_place = place;
4858  th->th.th_last_place = place;
4859  th->th.th_new_place = place;
4860  if (__kmp_display_affinity && place != th->th.th_current_place &&
4861  team->t.t_display_affinity != 1) {
4862  team->t.t_display_affinity = 1;
4863  }
4864  s_count++;
4865 
4866  if ((s_count == S) && rem && (gap_ct == gap)) {
4867  // do nothing, add an extra thread to place on next iteration
4868  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4869  // we added an extra thread to this place; move on to next place
4870  if (place == last_place) {
4871  place = first_place;
4872  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4873  place = 0;
4874  } else {
4875  place++;
4876  }
4877  s_count = 0;
4878  gap_ct = 1;
4879  rem--;
4880  } else if (s_count == S) { // place is full; don't add extra thread
4881  if (place == last_place) {
4882  place = first_place;
4883  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4884  place = 0;
4885  } else {
4886  place++;
4887  }
4888  gap_ct++;
4889  s_count = 0;
4890  }
4891 
4892  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4893  "partition = [%d,%d]\n",
4894  __kmp_gtid_from_thread(team->t.t_threads[f]),
4895  team->t.t_id, f, th->th.th_new_place,
4896  th->th.th_first_place, th->th.th_last_place));
4897  }
4898  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4899  }
4900  } break;
4901 
4902  default:
4903  break;
4904  }
4905 
4906  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4907 }
4908 
4909 #endif // KMP_AFFINITY_SUPPORTED
4910 
4911 /* allocate a new team data structure to use. take one off of the free pool if
4912  available */
4913 kmp_team_t *
4914 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4915 #if OMPT_SUPPORT
4916  ompt_data_t ompt_parallel_data,
4917 #endif
4918  kmp_proc_bind_t new_proc_bind,
4919  kmp_internal_control_t *new_icvs,
4920  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4921  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4922  int f;
4923  kmp_team_t *team;
4924  int use_hot_team = !root->r.r_active;
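  // The root's hot team can be reused only when the root is not already
  // inside an active parallel region (i.e., this fork is outermost); the
  // nested hot teams logic below may override this.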
4925  int level = 0;
4926 
4927  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4928  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4929  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4930  KMP_MB();
4931 
4932 #if KMP_NESTED_HOT_TEAMS
4933  kmp_hot_team_ptr_t *hot_teams;
4934  if (master) {
4935  team = master->th.th_team;
4936  level = team->t.t_active_level;
4937  if (master->th.th_teams_microtask) { // in teams construct?
4938  if (master->th.th_teams_size.nteams > 1 &&
4939  ( // #teams > 1
4940  team->t.t_pkfn ==
4941  (microtask_t)__kmp_teams_master || // inner fork of the teams
4942  master->th.th_teams_level <
4943  team->t.t_level)) { // or nested parallel inside the teams
4944  ++level; // do not increment if #teams==1 or for the outer fork of the
4945  // teams; increment otherwise
4946  }
4947  }
4948  hot_teams = master->th.th_hot_teams;
4949  if (level < __kmp_hot_teams_max_level && hot_teams &&
4950  hot_teams[level].hot_team) {
4951  // hot team has already been allocated for given level
4952  use_hot_team = 1;
4953  } else {
4954  use_hot_team = 0;
4955  }
4956  } else {
4957  // check we won't access uninitialized hot_teams, just in case
4958  KMP_DEBUG_ASSERT(new_nproc == 1);
4959  }
4960 #endif
4961  // Optimization to use a "hot" team
4962  if (use_hot_team && new_nproc > 1) {
4963  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4964 #if KMP_NESTED_HOT_TEAMS
4965  team = hot_teams[level].hot_team;
4966 #else
4967  team = root->r.r_hot_team;
4968 #endif
4969 #if KMP_DEBUG
4970  if (__kmp_tasking_mode != tskm_immediate_exec) {
4971  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4972  "task_team[1] = %p before reinit\n",
4973  team->t.t_task_team[0], team->t.t_task_team[1]));
4974  }
4975 #endif
4976 
4977  // Has the number of threads changed?
4978  /* Let's assume the most common case is that the number of threads is
4979  unchanged, and put that case first. */
4980  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4981  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4982  // This case can mean that omp_set_num_threads() was called and the hot
4983  // team size was already reduced, so we check the special flag
4984  if (team->t.t_size_changed == -1) {
4985  team->t.t_size_changed = 1;
4986  } else {
4987  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4988  }
4989 
4990  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4991  kmp_r_sched_t new_sched = new_icvs->sched;
4992  // set master's schedule as new run-time schedule
4993  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4994 
4995  __kmp_reinitialize_team(team, new_icvs,
4996  root->r.r_uber_thread->th.th_ident);
4997 
4998  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4999  team->t.t_threads[0], team));
5000  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5001 
5002 #if KMP_AFFINITY_SUPPORTED
5003  if ((team->t.t_size_changed == 0) &&
5004  (team->t.t_proc_bind == new_proc_bind)) {
5005  if (new_proc_bind == proc_bind_spread) {
5006  __kmp_partition_places(
5007  team, 1); // add flag to update only master for spread
5008  }
5009  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5010  "proc_bind = %d, partition = [%d,%d]\n",
5011  team->t.t_id, new_proc_bind, team->t.t_first_place,
5012  team->t.t_last_place));
5013  } else {
5014  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5015  __kmp_partition_places(team);
5016  }
5017 #else
5018  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5019 #endif /* KMP_AFFINITY_SUPPORTED */
5020  } else if (team->t.t_nproc > new_nproc) {
5021  KA_TRACE(20,
5022  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5023  new_nproc));
5024 
5025  team->t.t_size_changed = 1;
5026 #if KMP_NESTED_HOT_TEAMS
5027  if (__kmp_hot_teams_mode == 0) {
5028  // AC: the saved number of threads should match the team's value in this
5029  // mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
5030  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5031  hot_teams[level].hot_team_nth = new_nproc;
5032 #endif // KMP_NESTED_HOT_TEAMS
5033  /* release the extra threads we don't need any more */
5034  for (f = new_nproc; f < team->t.t_nproc; f++) {
5035  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5036  if (__kmp_tasking_mode != tskm_immediate_exec) {
5037  // When decreasing team size, threads no longer in the team should
5038  // unref task team.
5039  team->t.t_threads[f]->th.th_task_team = NULL;
5040  }
5041  __kmp_free_thread(team->t.t_threads[f]);
5042  team->t.t_threads[f] = NULL;
5043  }
5044 #if KMP_NESTED_HOT_TEAMS
5045  } // (__kmp_hot_teams_mode == 0)
5046  else {
5047  // When keeping extra threads in the team, switch them to wait on their
5048  // own b_go flag
5049  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5050  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5051  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5052  for (int b = 0; b < bs_last_barrier; ++b) {
5053  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5054  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5055  }
5056  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5057  }
5058  }
5059  }
5060 #endif // KMP_NESTED_HOT_TEAMS
5061  team->t.t_nproc = new_nproc;
5062  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5063  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5064  __kmp_reinitialize_team(team, new_icvs,
5065  root->r.r_uber_thread->th.th_ident);
5066 
5067  // Update remaining threads
5068  for (f = 0; f < new_nproc; ++f) {
5069  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5070  }
5071 
5072  // restore the current task state of the master thread: should be the
5073  // implicit task
5074  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5075  team->t.t_threads[0], team));
5076 
5077  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5078 
5079 #ifdef KMP_DEBUG
5080  for (f = 0; f < team->t.t_nproc; f++) {
5081  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5082  team->t.t_threads[f]->th.th_team_nproc ==
5083  team->t.t_nproc);
5084  }
5085 #endif
5086 
5087  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5088 #if KMP_AFFINITY_SUPPORTED
5089  __kmp_partition_places(team);
5090 #endif
5091  } else { // team->t.t_nproc < new_nproc
5092 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5093  kmp_affin_mask_t *old_mask;
5094  if (KMP_AFFINITY_CAPABLE()) {
5095  KMP_CPU_ALLOC(old_mask);
5096  }
5097 #endif
5098 
5099  KA_TRACE(20,
5100  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5101  new_nproc));
5102 
5103  team->t.t_size_changed = 1;
5104 
5105 #if KMP_NESTED_HOT_TEAMS
5106  int avail_threads = hot_teams[level].hot_team_nth;
5107  if (new_nproc < avail_threads)
5108  avail_threads = new_nproc;
5109  kmp_info_t **other_threads = team->t.t_threads;
5110  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5111  // Adjust barrier data of reserved threads (if any) of the team
5112  // Other data will be set in __kmp_initialize_info() below.
5113  int b;
5114  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5115  for (b = 0; b < bs_last_barrier; ++b) {
5116  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5117  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5118 #if USE_DEBUGGER
5119  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5120 #endif
5121  }
5122  }
5123  if (hot_teams[level].hot_team_nth >= new_nproc) {
5124  // we have all needed threads in reserve, no need to allocate any
5125  // this is only possible in mode 1; there can be no reserved threads in mode 0
5126  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5127  team->t.t_nproc = new_nproc; // just get reserved threads involved
5128  } else {
5129  // we may have some threads in reserve, but not enough
5130  team->t.t_nproc =
5131  hot_teams[level]
5132  .hot_team_nth; // get reserved threads involved if any
5133  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5134 #endif // KMP_NESTED_HOT_TEAMS
5135  if (team->t.t_max_nproc < new_nproc) {
5136  /* reallocate larger arrays */
5137  __kmp_reallocate_team_arrays(team, new_nproc);
5138  __kmp_reinitialize_team(team, new_icvs, NULL);
5139  }
5140 
5141 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5142  /* Temporarily set full mask for master thread before creation of
5143  workers. Workers inherit their affinity from the master, so if many
5144  workers are created quickly on a single core, they may not get a chance
5145  to set their own affinity for a long time. */
5146  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5147 #endif
5148 
5149  /* allocate new threads for the hot team */
5150  for (f = team->t.t_nproc; f < new_nproc; f++) {
5151  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5152  KMP_DEBUG_ASSERT(new_worker);
5153  team->t.t_threads[f] = new_worker;
5154 
5155  KA_TRACE(20,
5156  ("__kmp_allocate_team: team %d init T#%d arrived: "
5157  "join=%llu, plain=%llu\n",
5158  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5159  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5160  team->t.t_bar[bs_plain_barrier].b_arrived));
5161 
5162  { // Initialize barrier data for new threads.
5163  int b;
5164  kmp_balign_t *balign = new_worker->th.th_bar;
5165  for (b = 0; b < bs_last_barrier; ++b) {
5166  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5167  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5168  KMP_BARRIER_PARENT_FLAG);
5169 #if USE_DEBUGGER
5170  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5171 #endif
5172  }
5173  }
5174  }
5175 
5176 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5177  if (KMP_AFFINITY_CAPABLE()) {
5178  /* Restore initial master thread's affinity mask */
5179  __kmp_set_system_affinity(old_mask, TRUE);
5180  KMP_CPU_FREE(old_mask);
5181  }
5182 #endif
5183 #if KMP_NESTED_HOT_TEAMS
5184  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5185 #endif // KMP_NESTED_HOT_TEAMS
5186  /* make sure everyone is synchronized */
5187  int old_nproc = team->t.t_nproc; // save old value and use to update only
5188  // new threads below
5189  __kmp_initialize_team(team, new_nproc, new_icvs,
5190  root->r.r_uber_thread->th.th_ident);
5191 
5192  /* reinitialize the threads */
5193  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5194  for (f = 0; f < team->t.t_nproc; ++f)
5195  __kmp_initialize_info(team->t.t_threads[f], team, f,
5196  __kmp_gtid_from_tid(f, team));
5197 
5198  if (level) { // set th_task_state for new threads in nested hot team
5199  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5200  // only need to set the th_task_state for the new threads. th_task_state
5201  // for master thread will not be accurate until after this in
5202  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5203  // correct value.
5204  for (f = old_nproc; f < team->t.t_nproc; ++f)
5205  team->t.t_threads[f]->th.th_task_state =
5206  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5207  } else { // set th_task_state for new threads in non-nested hot team
5208  int old_state =
5209  team->t.t_threads[0]->th.th_task_state; // copy master's state
5210  for (f = old_nproc; f < team->t.t_nproc; ++f)
5211  team->t.t_threads[f]->th.th_task_state = old_state;
5212  }
5213 
5214 #ifdef KMP_DEBUG
5215  for (f = 0; f < team->t.t_nproc; ++f) {
5216  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5217  team->t.t_threads[f]->th.th_team_nproc ==
5218  team->t.t_nproc);
5219  }
5220 #endif
5221 
5222  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5223 #if KMP_AFFINITY_SUPPORTED
5224  __kmp_partition_places(team);
5225 #endif
5226  } // Check changes in number of threads
5227 
5228  kmp_info_t *master = team->t.t_threads[0];
5229  if (master->th.th_teams_microtask) {
5230  for (f = 1; f < new_nproc; ++f) {
5231  // propagate teams construct specific info to workers
5232  kmp_info_t *thr = team->t.t_threads[f];
5233  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5234  thr->th.th_teams_level = master->th.th_teams_level;
5235  thr->th.th_teams_size = master->th.th_teams_size;
5236  }
5237  }
5238 #if KMP_NESTED_HOT_TEAMS
5239  if (level) {
5240  // Sync barrier state for nested hot teams, not needed for outermost hot
5241  // team.
5242  for (f = 1; f < new_nproc; ++f) {
5243  kmp_info_t *thr = team->t.t_threads[f];
5244  int b;
5245  kmp_balign_t *balign = thr->th.th_bar;
5246  for (b = 0; b < bs_last_barrier; ++b) {
5247  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5248  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5249 #if USE_DEBUGGER
5250  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5251 #endif
5252  }
5253  }
5254  }
5255 #endif // KMP_NESTED_HOT_TEAMS
5256 
5257  /* reallocate space for arguments if necessary */
5258  __kmp_alloc_argv_entries(argc, team, TRUE);
5259  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5260  // The hot team re-uses the previous task team,
5261  // if untouched during the previous release->gather phase.
5262 
5263  KF_TRACE(10, (" hot_team = %p\n", team));
5264 
5265 #if KMP_DEBUG
5266  if (__kmp_tasking_mode != tskm_immediate_exec) {
5267  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5268  "task_team[1] = %p after reinit\n",
5269  team->t.t_task_team[0], team->t.t_task_team[1]));
5270  }
5271 #endif
5272 
5273 #if OMPT_SUPPORT
5274  __ompt_team_assign_id(team, ompt_parallel_data);
5275 #endif
5276 
5277  KMP_MB();
5278 
5279  return team;
5280  }
5281 
5282  /* next, let's try to take one from the team pool */
5283  KMP_MB();
5284  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5285  /* TODO: consider resizing undersized teams instead of reaping them, now
5286  that we have a resizing mechanism */
5287  if (team->t.t_max_nproc >= max_nproc) {
5288  /* take this team from the team pool */
5289  __kmp_team_pool = team->t.t_next_pool;
5290 
5291  /* setup the team for fresh use */
5292  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5293 
5294  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5295  "task_team[1] %p to NULL\n",
5296  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5297  team->t.t_task_team[0] = NULL;
5298  team->t.t_task_team[1] = NULL;
5299 
5300  /* reallocate space for arguments if necessary */
5301  __kmp_alloc_argv_entries(argc, team, TRUE);
5302  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5303 
5304  KA_TRACE(
5305  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5306  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5307  { // Initialize barrier data.
5308  int b;
5309  for (b = 0; b < bs_last_barrier; ++b) {
5310  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5311 #if USE_DEBUGGER
5312  team->t.t_bar[b].b_master_arrived = 0;
5313  team->t.t_bar[b].b_team_arrived = 0;
5314 #endif
5315  }
5316  }
5317 
5318  team->t.t_proc_bind = new_proc_bind;
5319 
5320  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5321  team->t.t_id));
5322 
5323 #if OMPT_SUPPORT
5324  __ompt_team_assign_id(team, ompt_parallel_data);
5325 #endif
5326 
5327  KMP_MB();
5328 
5329  return team;
5330  }
5331 
5332  /* reap team if it is too small, then loop back and check the next one */
5333  // not sure if this is wise, but it will be redone during the hot-teams
5334  // rewrite.
5335  /* TODO: Use technique to find the right size hot-team, don't reap them */
5336  team = __kmp_reap_team(team);
5337  __kmp_team_pool = team;
5338  }
5339 
5340  /* nothing available in the pool, no matter, make a new team! */
5341  KMP_MB();
5342  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5343 
5344  /* and set it up */
5345  team->t.t_max_nproc = max_nproc;
5346  /* NOTE well: for some reason, allocating one big buffer and dividing it up
5347  seems to really hurt performance on the P4, so let's not use this */
5348  __kmp_allocate_team_arrays(team, max_nproc);
5349 
5350  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5351  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5352 
5353  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5354  "%p to NULL\n",
5355  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5356  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5357  // memory, no need to duplicate
5358  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5359  // memory, no need to duplicate
5360 
5361  if (__kmp_storage_map) {
5362  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5363  }
5364 
5365  /* allocate space for arguments */
5366  __kmp_alloc_argv_entries(argc, team, FALSE);
5367  team->t.t_argc = argc;
5368 
5369  KA_TRACE(20,
5370  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5371  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5372  { // Initialize barrier data.
5373  int b;
5374  for (b = 0; b < bs_last_barrier; ++b) {
5375  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5376 #if USE_DEBUGGER
5377  team->t.t_bar[b].b_master_arrived = 0;
5378  team->t.t_bar[b].b_team_arrived = 0;
5379 #endif
5380  }
5381  }
5382 
5383  team->t.t_proc_bind = new_proc_bind;
5384 
5385 #if OMPT_SUPPORT
5386  __ompt_team_assign_id(team, ompt_parallel_data);
5387  team->t.ompt_serialized_team_info = NULL;
5388 #endif
5389 
5390  KMP_MB();
5391 
5392  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5393  team->t.t_id));
5394 
5395  return team;
5396 }
5397 
5398 /* TODO implement hot-teams at all levels */
5399 /* TODO implement lazy thread release on demand (disband request) */
5400 
5401 /* free the team. return it to the team pool. release all the threads
5402  * associated with it */
5403 void __kmp_free_team(kmp_root_t *root,
5404  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5405  int f;
5406  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5407  team->t.t_id));
5408 
5409  /* verify state */
5410  KMP_DEBUG_ASSERT(root);
5411  KMP_DEBUG_ASSERT(team);
5412  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5413  KMP_DEBUG_ASSERT(team->t.t_threads);
5414 
5415  int use_hot_team = team == root->r.r_hot_team;
5416 #if KMP_NESTED_HOT_TEAMS
5417  int level;
5418  kmp_hot_team_ptr_t *hot_teams;
5419  if (master) {
5420  level = team->t.t_active_level - 1;
5421  if (master->th.th_teams_microtask) { // in teams construct?
5422  if (master->th.th_teams_size.nteams > 1) {
5423  ++level; // level was not increased in teams construct for
5424  // team_of_masters
5425  }
5426  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5427  master->th.th_teams_level == team->t.t_level) {
5428  ++level; // level was not increased in teams construct for
5429  // team_of_workers before the parallel
5430  } // team->t.t_level will be increased inside parallel
5431  }
5432  hot_teams = master->th.th_hot_teams;
5433  if (level < __kmp_hot_teams_max_level) {
5434  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5435  use_hot_team = 1;
5436  }
5437  }
5438 #endif // KMP_NESTED_HOT_TEAMS
5439 
5440  /* team is done working */
5441  TCW_SYNC_PTR(team->t.t_pkfn,
5442  NULL); // Important for Debugging Support Library.
5443 #if KMP_OS_WINDOWS
5444  team->t.t_copyin_counter = 0; // init counter for possible reuse
5445 #endif
5446  // Do not reset pointer to parent team to NULL for hot teams.
5447 
5448  /* if we are non-hot team, release our threads */
5449  if (!use_hot_team) {
5450  if (__kmp_tasking_mode != tskm_immediate_exec) {
5451  // Wait for threads to reach reapable state
5452  for (f = 1; f < team->t.t_nproc; ++f) {
5453  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5454  kmp_info_t *th = team->t.t_threads[f];
5455  volatile kmp_uint32 *state = &th->th.th_reap_state;
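  // Spin until the worker marks itself safe to reap; if it has gone to
  // sleep on its fork/join b_go flag, wake it so it can reach that state.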
5456  while (*state != KMP_SAFE_TO_REAP) {
5457 #if KMP_OS_WINDOWS
5458  // On Windows a thread can be killed at any time, check this
5459  DWORD ecode;
5460  if (!__kmp_is_thread_alive(th, &ecode)) {
5461  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5462  break;
5463  }
5464 #endif
5465  // first check if thread is sleeping
5466  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5467  if (fl.is_sleeping())
5468  fl.resume(__kmp_gtid_from_thread(th));
5469  KMP_CPU_PAUSE();
5470  }
5471  }
5472 
5473  // Delete task teams
5474  int tt_idx;
5475  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5476  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5477  if (task_team != NULL) {
5478  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5479  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5480  team->t.t_threads[f]->th.th_task_team = NULL;
5481  }
5482  KA_TRACE(
5483  20,
5484  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5485  __kmp_get_gtid(), task_team, team->t.t_id));
5486 #if KMP_NESTED_HOT_TEAMS
5487  __kmp_free_task_team(master, task_team);
5488 #endif
5489  team->t.t_task_team[tt_idx] = NULL;
5490  }
5491  }
5492  }
5493 
5494  // Reset pointer to parent team only for non-hot teams.
5495  team->t.t_parent = NULL;
5496  team->t.t_level = 0;
5497  team->t.t_active_level = 0;
5498 
5499  /* free the worker threads */
5500  for (f = 1; f < team->t.t_nproc; ++f) {
5501  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5502  __kmp_free_thread(team->t.t_threads[f]);
5503  team->t.t_threads[f] = NULL;
5504  }
5505 
5506  /* put the team back in the team pool */
5507  /* TODO limit size of team pool, call reap_team if pool too large */
5508  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5509  __kmp_team_pool = (volatile kmp_team_t *)team;
5510  } else { // Check if team was created for the masters in a teams construct
5511  // See if first worker is a CG root
5512  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5513  team->t.t_threads[1]->th.th_cg_roots);
5514  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5515  // Clean up the CG root nodes on workers so that this team can be re-used
5516  for (f = 1; f < team->t.t_nproc; ++f) {
5517  kmp_info_t *thr = team->t.t_threads[f];
5518  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5519  thr->th.th_cg_roots->cg_root == thr);
5520  // Pop current CG root off list
5521  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5522  thr->th.th_cg_roots = tmp->up;
5523  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5524  " up to node %p. cg_nthreads was %d\n",
5525  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5526  int i = tmp->cg_nthreads--;
5527  if (i == 1) {
5528  __kmp_free(tmp); // free CG if we are the last thread in it
5529  }
5530  // Restore current task's thread_limit from CG root
5531  if (thr->th.th_cg_roots)
5532  thr->th.th_current_task->td_icvs.thread_limit =
5533  thr->th.th_cg_roots->cg_thread_limit;
5534  }
5535  }
5536  }
5537 
5538  KMP_MB();
5539 }
5540 
5541 /* reap the team. destroy it, reclaim all its resources and free its memory */
5542 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5543  kmp_team_t *next_pool = team->t.t_next_pool;
5544 
5545  KMP_DEBUG_ASSERT(team);
5546  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5547  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5548  KMP_DEBUG_ASSERT(team->t.t_threads);
5549  KMP_DEBUG_ASSERT(team->t.t_argv);
5550 
5551  /* TODO clean the threads that are a part of this? */
5552 
5553  /* free stuff */
5554  __kmp_free_team_arrays(team);
5555  if (team->t.t_argv != &team->t.t_inline_argv[0])
5556  __kmp_free((void *)team->t.t_argv);
5557  __kmp_free(team);
5558 
5559  KMP_MB();
5560  return next_pool;
5561 }
5562 
5563 // Free the thread. Don't reap it, just place it on the pool of available
5564 // threads.
5565 //
5566 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5567 // binding for the affinity mechanism to be useful.
5568 //
5569 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5570 // However, we want to avoid the potential performance problem of always
5571 // scanning through the list to find the correct point at which to insert
5572 // the thread (potential N**2 behavior). To do this we keep track of the
5573 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5574 // With single-level parallelism, threads will always be added to the tail
5575 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5576 // parallelism, all bets are off and we may need to scan through the entire
5577 // free list.
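// For example, when threads are freed in increasing gtid order (the
// single-level case), __kmp_thread_pool_insert_pt stays at the tail and each
// insertion is O(1); freeing a thread whose gtid is smaller than the saved
// insert point's gtid forces the scan to restart from the head of the pool.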
5578 //
5579 // This change also has a potentially large performance benefit, for some
5580 // applications. Previously, as threads were freed from the hot team, they
5581 // would be placed back on the free list in inverse order. If the hot team
5582 // grew back to its original size, then the freed threads would be placed
5583 // back on the hot team in reverse order. This could cause bad cache
5584 // locality problems in programs where the size of the hot team regularly
5585 // grew and shrank.
5586 //
5587 // Now, for single-level parallelism, the OMP tid is always == gtid.
5588 void __kmp_free_thread(kmp_info_t *this_th) {
5589  int gtid;
5590  kmp_info_t **scan;
5591 
5592  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5593  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5594 
5595  KMP_DEBUG_ASSERT(this_th);
5596 
5597  // When moving the thread to the pool, switch it to wait on its own b_go
5598  // flag and clear its barrier team pointers (NULL team).
5599  int b;
5600  kmp_balign_t *balign = this_th->th.th_bar;
5601  for (b = 0; b < bs_last_barrier; ++b) {
5602  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5603  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5604  balign[b].bb.team = NULL;
5605  balign[b].bb.leaf_kids = 0;
5606  }
5607  this_th->th.th_task_state = 0;
5608  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5609 
5610  /* put thread back on the free pool */
5611  TCW_PTR(this_th->th.th_team, NULL);
5612  TCW_PTR(this_th->th.th_root, NULL);
5613  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5614 
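  // Unwind this thread's contention-group chain: decrement the reference
  // count on each node; a node rooted at this thread is freed and popped,
  // while a node rooted elsewhere is freed only if this was its last
  // reference, after which the walk stops.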
5615  while (this_th->th.th_cg_roots) {
5616  this_th->th.th_cg_roots->cg_nthreads--;
5617  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5618  " %p of thread %p to %d\n",
5619  this_th, this_th->th.th_cg_roots,
5620  this_th->th.th_cg_roots->cg_root,
5621  this_th->th.th_cg_roots->cg_nthreads));
5622  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5623  if (tmp->cg_root == this_th) { // Thread is a cg_root
5624  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5625  KA_TRACE(
5626  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5627  this_th->th.th_cg_roots = tmp->up;
5628  __kmp_free(tmp);
5629  } else { // Worker thread
5630  if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5631  __kmp_free(tmp);
5632  }
5633  this_th->th.th_cg_roots = NULL;
5634  break;
5635  }
5636  }
5637 
5638  /* If the implicit task assigned to this thread can be used by other threads
5639  * -> multiple threads can share the data and try to free the task at
5640  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5641  * with higher probability when the hot team is disabled, but can occur even
5642  * when the hot team is enabled */
5643  __kmp_free_implicit_task(this_th);
5644  this_th->th.th_current_task = NULL;
5645 
5646  // If the __kmp_thread_pool_insert_pt is already past the new insert
5647  // point, then we need to re-scan the entire list.
5648  gtid = this_th->th.th_info.ds.ds_gtid;
5649  if (__kmp_thread_pool_insert_pt != NULL) {
5650  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5651  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5652  __kmp_thread_pool_insert_pt = NULL;
5653  }
5654  }
5655 
5656  // Scan down the list to find the place to insert the thread.
5657  // scan is the address of a link in the list, possibly the address of
5658  // __kmp_thread_pool itself.
5659  //
5660  // In the absence of nested parallelism, the for loop will have 0 iterations.
5661  if (__kmp_thread_pool_insert_pt != NULL) {
5662  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5663  } else {
5664  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5665  }
5666  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5667  scan = &((*scan)->th.th_next_pool))
5668  ;
5669 
5670  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5671  // to its address.
5672  TCW_PTR(this_th->th.th_next_pool, *scan);
5673  __kmp_thread_pool_insert_pt = *scan = this_th;
5674  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5675  (this_th->th.th_info.ds.ds_gtid <
5676  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5677  TCW_4(this_th->th.th_in_pool, TRUE);
5678  __kmp_suspend_initialize_thread(this_th);
5679  __kmp_lock_suspend_mx(this_th);
5680  if (this_th->th.th_active == TRUE) {
5681  KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5682  this_th->th.th_active_in_pool = TRUE;
5683  }
5684 #if KMP_DEBUG
5685  else {
5686  KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5687  }
5688 #endif
5689  __kmp_unlock_suspend_mx(this_th);
5690 
5691  TCW_4(__kmp_nth, __kmp_nth - 1);
5692 
5693 #ifdef KMP_ADJUST_BLOCKTIME
5694  /* Adjust blocktime back to user setting or default if necessary */
5695  /* Middle initialization might never have occurred */
5696  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5697  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5698  if (__kmp_nth <= __kmp_avail_proc) {
5699  __kmp_zero_bt = FALSE;
5700  }
5701  }
5702 #endif /* KMP_ADJUST_BLOCKTIME */
5703 
5704  KMP_MB();
5705 }
5706 
5707 /* ------------------------------------------------------------------------ */
5708 
5709 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5710  int gtid = this_thr->th.th_info.ds.ds_gtid;
5711  /* void *stack_data;*/
5712  kmp_team_t **volatile pteam;
5713 
5714  KMP_MB();
5715  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5716 
5717  if (__kmp_env_consistency_check) {
5718  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5719  }
5720 
5721 #if OMPT_SUPPORT
5722  ompt_data_t *thread_data;
5723  if (ompt_enabled.enabled) {
5724  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5725  *thread_data = ompt_data_none;
5726 
5727  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5728  this_thr->th.ompt_thread_info.wait_id = 0;
5729  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5730  this_thr->th.ompt_thread_info.parallel_flags = 0;
5731  if (ompt_enabled.ompt_callback_thread_begin) {
5732  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5733  ompt_thread_worker, thread_data);
5734  }
5735  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5736  }
5737 #endif
5738 
5739  /* This is the place where threads wait for work */
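  /* Worker main loop: block in the fork barrier until the master releases us
     with a team and a microtask, invoke the microtask, then meet the team at
     the join barrier; repeat until shutdown sets __kmp_global.g.g_done. */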
5740  while (!TCR_4(__kmp_global.g.g_done)) {
5741  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5742  KMP_MB();
5743 
5744  /* wait for work to do */
5745  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5746 
5747  /* No tid yet since not part of a team */
5748  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5749 
5750 #if OMPT_SUPPORT
5751  if (ompt_enabled.enabled) {
5752  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5753  }
5754 #endif
5755 
5756  pteam = &this_thr->th.th_team;
5757 
5758  /* have we been allocated? */
5759  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5760  /* we were just woken up, so run our new task */
5761  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5762  int rc;
5763  KA_TRACE(20,
5764  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5765  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5766  (*pteam)->t.t_pkfn));
5767 
5768  updateHWFPControl(*pteam);
5769 
5770 #if OMPT_SUPPORT
5771  if (ompt_enabled.enabled) {
5772  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5773  }
5774 #endif
5775 
5776  rc = (*pteam)->t.t_invoke(gtid);
5777  KMP_ASSERT(rc);
5778 
5779  KMP_MB();
5780  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5781  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5782  (*pteam)->t.t_pkfn));
5783  }
5784 #if OMPT_SUPPORT
5785  if (ompt_enabled.enabled) {
5786  /* no frame set while outside task */
5787  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5788 
5789  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5790  }
5791 #endif
5792  /* join barrier after parallel region */
5793  __kmp_join_barrier(gtid);
5794  }
5795  }
5796  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5797 
5798 #if OMPT_SUPPORT
5799  if (ompt_enabled.ompt_callback_thread_end) {
5800  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5801  }
5802 #endif
5803 
5804  this_thr->th.th_task_team = NULL;
5805  /* run the destructors for the threadprivate data for this thread */
5806  __kmp_common_destroy_gtid(gtid);
5807 
5808  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5809  KMP_MB();
5810  return this_thr;
5811 }
5812 
5813 /* ------------------------------------------------------------------------ */
5814 
5815 void __kmp_internal_end_dest(void *specific_gtid) {
5816 #if KMP_COMPILER_ICC
5817 #pragma warning(push)
5818 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5819 // significant bits
5820 #endif
5821  // Make sure no significant bits are lost
5822  int gtid = (kmp_intptr_t)specific_gtid - 1;
5823 #if KMP_COMPILER_ICC
5824 #pragma warning(pop)
5825 #endif
5826 
5827  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5828  /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
5829  * because 0 is reserved for the nothing-stored case */
5830 
5831  /* josh: One reason for setting the gtid specific data even when it is being
5832  destroyed by pthread is to allow gtid lookup through thread specific data
5833  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5834  that gets executed in the call to __kmp_internal_end_thread, actually
5835  gets the gtid through the thread specific data. Setting it here seems
5836  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5837  to run smoothly.
5838  todo: get rid of this after we remove the dependence on
5839  __kmp_gtid_get_specific */
5840  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5841  __kmp_gtid_set_specific(gtid);
5842 #ifdef KMP_TDATA_GTID
5843  __kmp_gtid = gtid;
5844 #endif
5845  __kmp_internal_end_thread(gtid);
5846 }
5847 
5848 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5849 
5850 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5851  __kmp_internal_end_atexit();
5852 }
5853 
5854 #endif
5855 
5856 /* [Windows] josh: when the atexit handler is called, there may still be more
5857  than one thread alive */
5858 void __kmp_internal_end_atexit(void) {
5859  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5860  /* [Windows]
5861  josh: ideally, we want to completely shutdown the library in this atexit
5862  handler, but stat code that depends on thread specific data for gtid fails
5863  because that data becomes unavailable at some point during the shutdown, so
5864  we call __kmp_internal_end_thread instead. We should eventually remove the
5865  dependency on __kmp_get_specific_gtid in the stat code and use
5866  __kmp_internal_end_library to cleanly shutdown the library.
5867 
5868  // TODO: Can some of this comment about GVS be removed?
5869  I suspect that the offending stat code is executed when the calling thread
5870  tries to clean up a dead root thread's data structures, resulting in GVS
5871  code trying to close the GVS structures for that thread, but since the stat
5872  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5873  the calling thread is cleaning up itself instead of another thread, it gets
5874  confused. This happens because allowing a thread to unregister and clean up
5875  another thread is a recent modification for addressing an issue.
5876  Based on the current design (20050722), a thread may end up
5877  trying to unregister another thread only if thread death does not trigger
5878  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5879  thread specific data destructor function to detect thread death. For
5880  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5881  is nothing. Thus, the workaround is applicable only for Windows static
5882  stat library. */
5883  __kmp_internal_end_library(-1);
5884 #if KMP_OS_WINDOWS
5885  __kmp_close_console();
5886 #endif
5887 }
5888 
5889 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5890  // It is assumed __kmp_forkjoin_lock is acquired.
5891 
5892  int gtid;
5893 
5894  KMP_DEBUG_ASSERT(thread != NULL);
5895 
5896  gtid = thread->th.th_info.ds.ds_gtid;
5897 
5898  if (!is_root) {
5899  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5900  /* Assume the threads are at the fork barrier here */
5901  KA_TRACE(
5902  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5903  gtid));
5904  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5905  * (GEH) */
5906  ANNOTATE_HAPPENS_BEFORE(thread);
5907  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5908  __kmp_release_64(&flag);
5909  }
5910 
5911  // Terminate OS thread.
5912  __kmp_reap_worker(thread);
5913 
5914  // The thread was killed asynchronously. If it was actively
5915  // spinning in the thread pool, decrement the global count.
5916  //
5917  // There is a small timing hole here - if the worker thread was just waking
5918  // up after sleeping in the pool, had reset its th_active_in_pool flag but
5919  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5920  // the global counter might not get updated.
5921  //
5922  // Currently, this can only happen as the library is unloaded,
5923  // so there are no harmful side effects.
5924  if (thread->th.th_active_in_pool) {
5925  thread->th.th_active_in_pool = FALSE;
5926  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5927  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5928  }
5929  }
5930 
5931  __kmp_free_implicit_task(thread);
5932 
5933 // Free the fast memory for tasking
5934 #if USE_FAST_MEMORY
5935  __kmp_free_fast_memory(thread);
5936 #endif /* USE_FAST_MEMORY */
5937 
5938  __kmp_suspend_uninitialize_thread(thread);
5939 
5940  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5941  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5942 
5943  --__kmp_all_nth;
5944 // __kmp_nth was decremented when the thread was added to the pool.
5945 
5946 #ifdef KMP_ADJUST_BLOCKTIME
5947  /* Adjust blocktime back to user setting or default if necessary */
5948  /* Middle initialization might never have occurred */
5949  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5950  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5951  if (__kmp_nth <= __kmp_avail_proc) {
5952  __kmp_zero_bt = FALSE;
5953  }
5954  }
5955 #endif /* KMP_ADJUST_BLOCKTIME */
5956 
5957  /* free the memory being used */
5958  if (__kmp_env_consistency_check) {
5959  if (thread->th.th_cons) {
5960  __kmp_free_cons_stack(thread->th.th_cons);
5961  thread->th.th_cons = NULL;
5962  }
5963  }
5964 
5965  if (thread->th.th_pri_common != NULL) {
5966  __kmp_free(thread->th.th_pri_common);
5967  thread->th.th_pri_common = NULL;
5968  }
5969 
5970  if (thread->th.th_task_state_memo_stack != NULL) {
5971  __kmp_free(thread->th.th_task_state_memo_stack);
5972  thread->th.th_task_state_memo_stack = NULL;
5973  }
5974 
5975 #if KMP_USE_BGET
5976  if (thread->th.th_local.bget_data != NULL) {
5977  __kmp_finalize_bget(thread);
5978  }
5979 #endif
5980 
5981 #if KMP_AFFINITY_SUPPORTED
5982  if (thread->th.th_affin_mask != NULL) {
5983  KMP_CPU_FREE(thread->th.th_affin_mask);
5984  thread->th.th_affin_mask = NULL;
5985  }
5986 #endif /* KMP_AFFINITY_SUPPORTED */
5987 
5988 #if KMP_USE_HIER_SCHED
5989  if (thread->th.th_hier_bar_data != NULL) {
5990  __kmp_free(thread->th.th_hier_bar_data);
5991  thread->th.th_hier_bar_data = NULL;
5992  }
5993 #endif
5994 
5995  __kmp_reap_team(thread->th.th_serial_team);
5996  thread->th.th_serial_team = NULL;
5997  __kmp_free(thread);
5998 
5999  KMP_MB();
6000 
6001 } // __kmp_reap_thread
6002 
6003 static void __kmp_internal_end(void) {
6004  int i;
6005 
6006  /* First, unregister the library */
6007  __kmp_unregister_library();
6008 
6009 #if KMP_OS_WINDOWS
6010  /* In Win static library, we can't tell when a root actually dies, so we
6011  reclaim the data structures for any root threads that have died but not
6012  unregistered themselves, in order to shut down cleanly.
6013  In Win dynamic library we also can't tell when a thread dies. */
6014  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6015 // dead roots
6016 #endif
6017 
6018  for (i = 0; i < __kmp_threads_capacity; i++)
6019  if (__kmp_root[i])
6020  if (__kmp_root[i]->r.r_active)
6021  break;
6022  KMP_MB(); /* Flush all pending memory write invalidates. */
6023  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6024 
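  // If the scan above found an active root (i < __kmp_threads_capacity), only
  // the monitor thread (when in use) is reaped below; otherwise no roots are
  // active and all pooled worker threads, teams, and task teams are reaped.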
6025  if (i < __kmp_threads_capacity) {
6026 #if KMP_USE_MONITOR
6027  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6028  KMP_MB(); /* Flush all pending memory write invalidates. */
6029 
6030  // Need to check that monitor was initialized before reaping it. If we are
6031  // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6032  // __kmp_monitor will appear to contain valid data, but it is only valid in
6033  // the parent process, not the child.
6034  // New behavior (201008): instead of keying off of the flag
6035  // __kmp_init_parallel, the monitor thread creation is keyed off
6036  // of the new flag __kmp_init_monitor.
6037  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6038  if (TCR_4(__kmp_init_monitor)) {
6039  __kmp_reap_monitor(&__kmp_monitor);
6040  TCW_4(__kmp_init_monitor, 0);
6041  }
6042  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6043  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6044 #endif // KMP_USE_MONITOR
6045  } else {
6046 /* TODO move this to cleanup code */
6047 #ifdef KMP_DEBUG
6048  /* make sure that everything has properly ended */
6049  for (i = 0; i < __kmp_threads_capacity; i++) {
6050  if (__kmp_root[i]) {
6051  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6052  // there can be uber threads alive here
6053  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6054  }
6055  }
6056 #endif
6057 
6058  KMP_MB();
6059 
6060  // Reap the worker threads.
6061  // This is valid for now, but be careful if threads are reaped sooner.
6062  while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6063  // Get the next thread from the pool.
6064  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6065  __kmp_thread_pool = thread->th.th_next_pool;
6066  // Reap it.
6067  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6068  thread->th.th_next_pool = NULL;
6069  thread->th.th_in_pool = FALSE;
6070  __kmp_reap_thread(thread, 0);
6071  }
6072  __kmp_thread_pool_insert_pt = NULL;
6073 
6074  // Reap teams.
6075  while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6076  // Get the next team from the pool.
6077  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6078  __kmp_team_pool = team->t.t_next_pool;
6079  // Reap it.
6080  team->t.t_next_pool = NULL;
6081  __kmp_reap_team(team);
6082  }
6083 
6084  __kmp_reap_task_teams();
6085 
6086 #if KMP_OS_UNIX
6087  // Threads that are not reaped should not access any resources since they
6088  // are going to be deallocated soon, so the shutdown sequence should wait
6089  // until all threads either exit the final spin-waiting loop or begin
6090  // sleeping after the given blocktime.
6091  for (i = 0; i < __kmp_threads_capacity; i++) {
6092  kmp_info_t *thr = __kmp_threads[i];
6093  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6094  KMP_CPU_PAUSE();
6095  }
6096 #endif
6097 
6098  for (i = 0; i < __kmp_threads_capacity; ++i) {
6099  // TBD: Add some checking...
6100  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6101  }
6102 
6103  /* Make sure all threadprivate destructors get run by joining with all
6104  worker threads before resetting this flag */
6105  TCW_SYNC_4(__kmp_init_common, FALSE);
6106 
6107  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6108  KMP_MB();
6109 
6110 #if KMP_USE_MONITOR
6111  // See note above: One of the possible fixes for CQ138434 / CQ140126
6112  //
6113  // FIXME: push both code fragments down and CSE them?
6114  // push them into __kmp_cleanup() ?
6115  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6116  if (TCR_4(__kmp_init_monitor)) {
6117  __kmp_reap_monitor(&__kmp_monitor);
6118  TCW_4(__kmp_init_monitor, 0);
6119  }
6120  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6121  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6122 #endif
6123  } /* else !__kmp_global.t_active */
6124  TCW_4(__kmp_init_gtid, FALSE);
6125  KMP_MB(); /* Flush all pending memory write invalidates. */
6126 
6127  __kmp_cleanup();
6128 #if OMPT_SUPPORT
6129  ompt_fini();
6130 #endif
6131 }
6132 
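// Shut the runtime down on behalf of the library as a whole. gtid_req is the
// caller's gtid when known; negative values mean it must be looked up. This is
// typically reached from the library's shutdown hooks (atexit handler or
// shared-library destructor).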
6133 void __kmp_internal_end_library(int gtid_req) {
6134  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6135  /* this shouldn't be a race condition because __kmp_internal_end() is the
6136  only place to clear __kmp_serial_init */
6137  /* we'll check this later too, after we get the lock */
6138  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6139  // redundant, because the next check will work in any case.
6140  if (__kmp_global.g.g_abort) {
6141  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6142  /* TODO abort? */
6143  return;
6144  }
6145  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6146  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6147  return;
6148  }
6149 
6150  KMP_MB(); /* Flush all pending memory write invalidates. */
6151  /* find out who we are and what we should do */
6152  {
6153  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6154  KA_TRACE(
6155  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6156  if (gtid == KMP_GTID_SHUTDOWN) {
6157  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6158  "already shutdown\n"));
6159  return;
6160  } else if (gtid == KMP_GTID_MONITOR) {
6161  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6162  "registered, or system shutdown\n"));
6163  return;
6164  } else if (gtid == KMP_GTID_DNE) {
6165  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6166  "shutdown\n"));
6167  /* we don't know who we are, but we may still shutdown the library */
6168  } else if (KMP_UBER_GTID(gtid)) {
6169  /* unregister ourselves as an uber thread. gtid is no longer valid */
6170  if (__kmp_root[gtid]->r.r_active) {
6171  __kmp_global.g.g_abort = -1;
6172  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6173  KA_TRACE(10,
6174  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6175  gtid));
6176  return;
6177  } else {
6178  KA_TRACE(
6179  10,
6180  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6181  __kmp_unregister_root_current_thread(gtid);
6182  }
6183  } else {
6184 /* worker threads may call this function through the atexit handler, if they
6185  * call exit() */
6186 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6187  TODO: do a thorough shutdown instead */
6188 #ifdef DUMP_DEBUG_ON_EXIT
6189  if (__kmp_debug_buf)
6190  __kmp_dump_debug_buffer();
6191 #endif
6192  // The unregister-library call was added here for the shared-memory (Linux)
6193  // implementation; without it, stale files are left behind in /dev/shm.
6194  // Clean up the shared memory file before exiting.
6195  __kmp_unregister_library();
6196  return;
6197  }
6198  }
6199  /* synchronize the termination process */
6200  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6201 
6202  /* have we already finished */
6203  if (__kmp_global.g.g_abort) {
6204  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6205  /* TODO abort? */
6206  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6207  return;
6208  }
6209  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6210  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6211  return;
6212  }
6213 
6214  /* We need this lock to enforce mutex between this reading of
6215  __kmp_threads_capacity and the writing by __kmp_register_root.
6216  Alternatively, we can use a counter of roots that is atomically updated by
6217  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6218  __kmp_internal_end_*. */
6219  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6220 
6221  /* now we can safely conduct the actual termination */
6222  __kmp_internal_end();
6223 
6224  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6225  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6226 
6227  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6228 
6229 #ifdef DUMP_DEBUG_ON_EXIT
6230  if (__kmp_debug_buf)
6231  __kmp_dump_debug_buffer();
6232 #endif
6233 
6234 #if KMP_OS_WINDOWS
6235  __kmp_close_console();
6236 #endif
6237 
6238  __kmp_fini_allocator();
6239 
6240 } // __kmp_internal_end_library
6241 
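// Shut the runtime down when a single (uber) thread exits. The library itself
// is torn down only after no sibling uber threads remain registered.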
6242 void __kmp_internal_end_thread(int gtid_req) {
6243  int i;
6244 
6245  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6246  /* this shouldn't be a race condition because __kmp_internal_end() is the
6247  * only place to clear __kmp_serial_init */
6248  /* we'll check this later too, after we get the lock */
6249  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6250  // redundant, because the next check will work in any case.
6251  if (__kmp_global.g.g_abort) {
6252  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6253  /* TODO abort? */
6254  return;
6255  }
6256  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6257  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6258  return;
6259  }
6260 
6261  KMP_MB(); /* Flush all pending memory write invalidates. */
6262 
6263  /* find out who we are and what we should do */
6264  {
6265  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6266  KA_TRACE(10,
6267  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6268  if (gtid == KMP_GTID_SHUTDOWN) {
6269  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6270  "already shutdown\n"));
6271  return;
6272  } else if (gtid == KMP_GTID_MONITOR) {
6273  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6274  "registered, or system shutdown\n"));
6275  return;
6276  } else if (gtid == KMP_GTID_DNE) {
6277  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6278  "shutdown\n"));
6279  return;
6280  /* we don't know who we are */
6281  } else if (KMP_UBER_GTID(gtid)) {
6282  /* unregister ourselves as an uber thread. gtid is no longer valid */
6283  if (__kmp_root[gtid]->r.r_active) {
6284  __kmp_global.g.g_abort = -1;
6285  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6286  KA_TRACE(10,
6287  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6288  gtid));
6289  return;
6290  } else {
6291  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6292  gtid));
6293  __kmp_unregister_root_current_thread(gtid);
6294  }
6295  } else {
6296  /* just a worker thread, let's leave */
6297  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6298 
6299  if (gtid >= 0) {
6300  __kmp_threads[gtid]->th.th_task_team = NULL;
6301  }
6302 
6303  KA_TRACE(10,
6304  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6305  gtid));
6306  return;
6307  }
6308  }
6309 #if KMP_DYNAMIC_LIB
6310  if (__kmp_pause_status != kmp_hard_paused)
6311  // AC: let's not shut down the dynamic library at the exit of an uber thread;
6312  // it is better to shut down later, in the library destructor.
6313  {
6314  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6315  return;
6316  }
6317 #endif
6318  /* synchronize the termination process */
6319  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6320 
6321  /* have we already finished */
6322  if (__kmp_global.g.g_abort) {
6323  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6324  /* TODO abort? */
6325  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6326  return;
6327  }
6328  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6329  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6330  return;
6331  }
6332 
6333  /* We need this lock to enforce mutex between this reading of
6334  __kmp_threads_capacity and the writing by __kmp_register_root.
6335  Alternatively, we can use a counter of roots that is atomically updated by
6336  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6337  __kmp_internal_end_*. */
6338 
6339  /* should we finish the run-time? are all siblings done? */
6340  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6341 
6342  for (i = 0; i < __kmp_threads_capacity; ++i) {
6343  if (KMP_UBER_GTID(i)) {
6344  KA_TRACE(
6345  10,
6346  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6347  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6348  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6349  return;
6350  }
6351  }
6352 
6353  /* now we can safely conduct the actual termination */
6354 
6355  __kmp_internal_end();
6356 
6357  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6358  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6359 
6360  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6361 
6362 #ifdef DUMP_DEBUG_ON_EXIT
6363  if (__kmp_debug_buf)
6364  __kmp_dump_debug_buffer();
6365 #endif
6366 } // __kmp_internal_end_thread
6367 
6368 // -----------------------------------------------------------------------------
6369 // Library registration stuff.
6370 
6371 static long __kmp_registration_flag = 0;
6372 // Random value used to indicate library initialization.
6373 static char *__kmp_registration_str = NULL;
6374 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6375 
6376 static inline char *__kmp_reg_status_name() {
6377  /* On RHEL 3u5 if linked statically, getpid() returns different values in
6378  each thread. If registration and unregistration go in different threads
6379  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6380  env var cannot be found, because the name will contain a different pid. */
6381  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6382 } // __kmp_reg_status_name
6383 
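// Record this copy of the runtime so that a second copy loaded into the same
// process can be detected: via a POSIX shared-memory segment for dynamic Unix
// builds, or an environment variable otherwise.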
6384 void __kmp_register_library_startup(void) {
6385 
6386  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6387  int done = 0;
6388  union {
6389  double dtime;
6390  long ltime;
6391  } time;
6392 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6393  __kmp_initialize_system_tick();
6394 #endif
6395  __kmp_read_system_time(&time.dtime);
6396  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6397  __kmp_registration_str =
6398  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6399  __kmp_registration_flag, KMP_LIBRARY_FILE);
6400 
6401  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6402  __kmp_registration_str));
6403 
6404  while (!done) {
6405 
6406  char *value = NULL; // Actual value of the environment variable.
6407 
6408 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6409  char *shm_name = __kmp_str_format("/%s", name);
6410  int shm_preexist = 0;
6411  char *data1;
6412  int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6413  if ((fd1 == -1) && (errno == EEXIST)) {
6414  // file didn't open because it already exists.
6415  // try opening existing file
6416  fd1 = shm_open(shm_name, O_RDWR, 0666);
6417  if (fd1 == -1) { // file didn't open
6418  // error out here
6419  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6420  __kmp_msg_null);
6421  } else {
6422  // able to open existing file
6423  shm_preexist = 1;
6424  }
6425  } else if (fd1 == -1) { // SHM didn't open because of an error other than
6426  // "already exists".
6427  // error out here.
6428  __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6429  __kmp_msg_null);
6430  }
6431  if (shm_preexist == 0) {
6432  // we created the SHM; now set its size
6433  if (ftruncate(fd1, SHM_SIZE) == -1) {
6434  // error occurred while setting the size
6435  __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6436  KMP_ERR(errno), __kmp_msg_null);
6437  }
6438  }
6439  data1 =
6440  (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6441  if (data1 == MAP_FAILED) {
6442  // failed to map shared memory
6443  __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6444  __kmp_msg_null);
6445  }
6446  if (shm_preexist == 0) { // set data to SHM, set value
6447  KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6448  }
6449  // Read value from either what we just wrote or existing file.
6450  value = __kmp_str_format("%s", data1); // read value from SHM
6451  munmap(data1, SHM_SIZE);
6452  close(fd1);
6453 #else // Windows and unix with static library
6454  // Set the environment variable, but do not overwrite it if it already exists.
6455  __kmp_env_set(name, __kmp_registration_str, 0);
6456  // read value to see if it got set
6457  value = __kmp_env_get(name);
6458 #endif
6459 
6460  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6461  done = 1; // Ok, environment variable set successfully, exit the loop.
6462  } else {
6463  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6464  // Check whether it is alive or dead.
6465  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6466  char *tail = value;
6467  char *flag_addr_str = NULL;
6468  char *flag_val_str = NULL;
6469  char const *file_name = NULL;
6470  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6471  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6472  file_name = tail;
6473  if (tail != NULL) {
6474  long *flag_addr = 0;
6475  long flag_val = 0;
6476  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6477  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6478  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6479  // First, check whether environment-encoded address is mapped into
6480  // addr space.
6481  // If so, dereference it to see if it still has the right value.
6482  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6483  neighbor = 1;
6484  } else {
6485  // If not, then we know the other copy of the library is no longer
6486  // running.
6487  neighbor = 2;
6488  }
6489  }
6490  }
6491  switch (neighbor) {
6492  case 0: // Cannot parse environment variable -- neighbor status unknown.
6493  // Assume it is an incompatible format from a future version of the
6494  // library. Assume the other library is alive.
6495  // WARN( ... ); // TODO: Issue a warning.
6496  file_name = "unknown library";
6497  KMP_FALLTHROUGH();
6498  // Attention! Falling through to the next case is intentional.
6499  case 1: { // Neighbor is alive.
6500  // Check it is allowed.
6501  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6502  if (!__kmp_str_match_true(duplicate_ok)) {
6503  // That's not allowed. Issue fatal error.
6504  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6505  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6506  }
6507  KMP_INTERNAL_FREE(duplicate_ok);
6508  __kmp_duplicate_library_ok = 1;
6509  done = 1; // Exit the loop.
6510  } break;
6511  case 2: { // Neighbor is dead.
6512 
6513 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6514  // remove the dead neighbor's shared memory file.
6515  shm_unlink(shm_name); // this removes file in /dev/shm
6516 #else
6517  // Clear the variable and try to register library again.
6518  __kmp_env_unset(name);
6519 #endif
6520  } break;
6521  default: { KMP_DEBUG_ASSERT(0); } break;
6522  }
6523  }
6524  KMP_INTERNAL_FREE((void *)value);
6525 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6526  KMP_INTERNAL_FREE((void *)shm_name);
6527 #endif
6528  } // while
6529  KMP_INTERNAL_FREE((void *)name);
6530 
6531 } // func __kmp_register_library_startup
6532 
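// Undo the registration performed by __kmp_register_library_startup, but only
// if the stored value still matches our own registration string.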
6533 void __kmp_unregister_library(void) {
6534 
6535  char *name = __kmp_reg_status_name();
6536  char *value = NULL;
6537 
6538 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6539  char *shm_name = __kmp_str_format("/%s", name);
6540  int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6541  if (fd1 == -1) {
6542  // file did not open. return.
6543  return;
6544  }
6545  char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6546  if (data1 != MAP_FAILED) {
6547  value = __kmp_str_format("%s", data1); // read value from SHM
6548  munmap(data1, SHM_SIZE);
6549  }
6550  close(fd1);
6551 #else
6552  value = __kmp_env_get(name);
6553 #endif
6554 
6555  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6556  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6557  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6558 // Ok, this is our variable. Delete it.
6559 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6560  shm_unlink(shm_name); // this removes file in /dev/shm
6561 #else
6562  __kmp_env_unset(name);
6563 #endif
6564  }
6565 
6566 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6567  KMP_INTERNAL_FREE(shm_name);
6568 #endif
6569 
6570  KMP_INTERNAL_FREE(__kmp_registration_str);
6571  KMP_INTERNAL_FREE(value);
6572  KMP_INTERNAL_FREE(name);
6573 
6574  __kmp_registration_flag = 0;
6575  __kmp_registration_str = NULL;
6576 
6577 } // __kmp_unregister_library
6578 
6579 // End of Library registration stuff.
6580 // -----------------------------------------------------------------------------
6581 
6582 #if KMP_MIC_SUPPORTED
6583 
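// Use the CPUID leaf-1 signature bits to classify the Intel MIC generation
// (mic2 vs. mic3); anything else is treated as non_mic.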
6584 static void __kmp_check_mic_type() {
6585  kmp_cpuid_t cpuid_state = {0};
6586  kmp_cpuid_t *cs_p = &cpuid_state;
6587  __kmp_x86_cpuid(1, 0, cs_p);
6588  // We don't support mic1 at the moment
6589  if ((cs_p->eax & 0xff0) == 0xB10) {
6590  __kmp_mic_type = mic2;
6591  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6592  __kmp_mic_type = mic3;
6593  } else {
6594  __kmp_mic_type = non_mic;
6595  }
6596 }
6597 
6598 #endif /* KMP_MIC_SUPPORTED */
6599 
6600 static void __kmp_do_serial_initialize(void) {
6601  int i, gtid;
6602  int size;
6603 
6604  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6605 
6606  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6607  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6608  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6609  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6610  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6611 
6612 #if OMPT_SUPPORT
6613  ompt_pre_init();
6614 #endif
6615 
6616  __kmp_validate_locks();
6617 
6618  /* Initialize internal memory allocator */
6619  __kmp_init_allocator();
6620 
6621  /* Register the library startup via an environment variable and check to see
6622  whether another copy of the library is already registered. */
6623 
6624  __kmp_register_library_startup();
6625 
6626  /* TODO reinitialization of library */
6627  if (TCR_4(__kmp_global.g.g_done)) {
6628  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6629  }
6630 
6631  __kmp_global.g.g_abort = 0;
6632  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6633 
6634 /* initialize the locks */
6635 #if KMP_USE_ADAPTIVE_LOCKS
6636 #if KMP_DEBUG_ADAPTIVE_LOCKS
6637  __kmp_init_speculative_stats();
6638 #endif
6639 #endif
6640 #if KMP_STATS_ENABLED
6641  __kmp_stats_init();
6642 #endif
6643  __kmp_init_lock(&__kmp_global_lock);
6644  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6645  __kmp_init_lock(&__kmp_debug_lock);
6646  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6647  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6648  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6649  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6650  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6651  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6652  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6653  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6654  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6655  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6656  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6657  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6658  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6659  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6660  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6661 #if KMP_USE_MONITOR
6662  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6663 #endif
6664  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6665 
6666  /* conduct initialization and initial setup of configuration */
6667 
6668  __kmp_runtime_initialize();
6669 
6670 #if KMP_MIC_SUPPORTED
6671  __kmp_check_mic_type();
6672 #endif
6673 
6674 // Some global variable initialization moved here from kmp_env_initialize()
6675 #ifdef KMP_DEBUG
6676  kmp_diag = 0;
6677 #endif
6678  __kmp_abort_delay = 0;
6679 
6680  // From __kmp_init_dflt_team_nth()
6681  /* assume the entire machine will be used */
6682  __kmp_dflt_team_nth_ub = __kmp_xproc;
6683  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6684  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6685  }
6686  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6687  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6688  }
6689  __kmp_max_nth = __kmp_sys_max_nth;
6690  __kmp_cg_max_nth = __kmp_sys_max_nth;
6691  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6692  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6693  __kmp_teams_max_nth = __kmp_sys_max_nth;
6694  }
6695 
6696  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6697  // part
6698  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6699 #if KMP_USE_MONITOR
6700  __kmp_monitor_wakeups =
6701  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6702  __kmp_bt_intervals =
6703  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6704 #endif
6705  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6706  __kmp_library = library_throughput;
6707  // From KMP_SCHEDULE initialization
6708  __kmp_static = kmp_sch_static_balanced;
6709 // AC: do not use the analytical schedule here, because it is non-monotonic
6710 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6711 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6712 // need to repeat assignment
6713 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6714 // bit control and barrier method control parts
6715 #if KMP_FAST_REDUCTION_BARRIER
6716 #define kmp_reduction_barrier_gather_bb ((int)1)
6717 #define kmp_reduction_barrier_release_bb ((int)1)
6718 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6719 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6720 #endif // KMP_FAST_REDUCTION_BARRIER
6721  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6722  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6723  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6724  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6725  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6726 #if KMP_FAST_REDUCTION_BARRIER
6727  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX
6728  // (lin_64) only: hyper,1
6729  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6730  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6731  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6732  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6733  }
6734 #endif // KMP_FAST_REDUCTION_BARRIER
6735  }
6736 #if KMP_FAST_REDUCTION_BARRIER
6737 #undef kmp_reduction_barrier_release_pat
6738 #undef kmp_reduction_barrier_gather_pat
6739 #undef kmp_reduction_barrier_release_bb
6740 #undef kmp_reduction_barrier_gather_bb
6741 #endif // KMP_FAST_REDUCTION_BARRIER
6742 #if KMP_MIC_SUPPORTED
6743  if (__kmp_mic_type == mic2) { // KNC
6744  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6745  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6746  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6747  1; // forkjoin release
6748  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6749  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6750  }
6751 #if KMP_FAST_REDUCTION_BARRIER
6752  if (__kmp_mic_type == mic2) { // KNC
6753  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6754  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6755  }
6756 #endif // KMP_FAST_REDUCTION_BARRIER
6757 #endif // KMP_MIC_SUPPORTED
6758 
6759 // From KMP_CHECKS initialization
6760 #ifdef KMP_DEBUG
6761  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6762 #else
6763  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6764 #endif
6765 
6766  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6767  __kmp_foreign_tp = TRUE;
6768 
6769  __kmp_global.g.g_dynamic = FALSE;
6770  __kmp_global.g.g_dynamic_mode = dynamic_default;
6771 
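  // Parse the KMP_* / OMP_* environment variables; these may override the
  // defaults established above.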
6772  __kmp_env_initialize(NULL);
6773 
6774 // Print all messages in message catalog for testing purposes.
6775 #ifdef KMP_DEBUG
6776  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6777  if (__kmp_str_match_true(val)) {
6778  kmp_str_buf_t buffer;
6779  __kmp_str_buf_init(&buffer);
6780  __kmp_i18n_dump_catalog(&buffer);
6781  __kmp_printf("%s", buffer.str);
6782  __kmp_str_buf_free(&buffer);
6783  }
6784  __kmp_env_free(&val);
6785 #endif
6786 
6787  __kmp_threads_capacity =
6788  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6789  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6790  __kmp_tp_capacity = __kmp_default_tp_capacity(
6791  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6792 
6793  // If the library is shut down properly, both pools must be NULL. Just in
6794  // case, set them to NULL -- some memory may leak, but subsequent code will
6795  // work even if pools are not freed.
6796  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6797  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6798  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6799  __kmp_thread_pool = NULL;
6800  __kmp_thread_pool_insert_pt = NULL;
6801  __kmp_team_pool = NULL;
6802 
6803  /* Allocate all of the variable sized records */
6804  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6805  * expandable */
6806  /* Since allocation is cache-aligned, just add extra padding at the end */
6807  size =
6808  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6809  CACHE_LINE;
6810  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6811  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6812  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6813 
6814  /* init thread counts */
6815  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6816  0); // Asserts fail if the library is reinitializing and
6817  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6818  __kmp_all_nth = 0;
6819  __kmp_nth = 0;
6820 
6821  /* setup the uber master thread and hierarchy */
6822  gtid = __kmp_register_root(TRUE);
6823  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6824  KMP_ASSERT(KMP_UBER_GTID(gtid));
6825  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6826 
6827  KMP_MB(); /* Flush all pending memory write invalidates. */
6828 
6829  __kmp_common_initialize();
6830 
6831 #if KMP_OS_UNIX
6832  /* invoke the child fork handler */
6833  __kmp_register_atfork();
6834 #endif
6835 
6836 #if !KMP_DYNAMIC_LIB
6837  {
6838  /* Invoke the exit handler when the program finishes, only for static
6839  library. For dynamic library, we already have _fini and DllMain. */
6840  int rc = atexit(__kmp_internal_end_atexit);
6841  if (rc != 0) {
6842  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6843  __kmp_msg_null);
6844  }
6845  }
6846 #endif
6847 
6848 #if KMP_HANDLE_SIGNALS
6849 #if KMP_OS_UNIX
6850  /* NOTE: make sure that this is called before the user installs their own
6851  signal handlers so that the user handlers are called first. this way they
6852  can return false, not call our handler, avoid terminating the library, and
6853  continue execution where they left off. */
6854  __kmp_install_signals(FALSE);
6855 #endif /* KMP_OS_UNIX */
6856 #if KMP_OS_WINDOWS
6857  __kmp_install_signals(TRUE);
6858 #endif /* KMP_OS_WINDOWS */
6859 #endif
6860 
6861  /* we have finished the serial initialization */
6862  __kmp_init_counter++;
6863 
6864  __kmp_init_serial = TRUE;
6865 
6866  if (__kmp_settings) {
6867  __kmp_env_print();
6868  }
6869 
6870  if (__kmp_display_env || __kmp_display_env_verbose) {
6871  __kmp_env_print_2();
6872  }
6873 
6874 #if OMPT_SUPPORT
6875  ompt_post_init();
6876 #endif
6877 
6878  KMP_MB();
6879 
6880  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6881 }
6882 
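// Public entry point for serial initialization: double-checked under
// __kmp_initz_lock so that concurrent callers perform the work at most once.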
6883 void __kmp_serial_initialize(void) {
6884  if (__kmp_init_serial) {
6885  return;
6886  }
6887  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6888  if (__kmp_init_serial) {
6889  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6890  return;
6891  }
6892  __kmp_do_serial_initialize();
6893  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6894 }
6895 
6896 static void __kmp_do_middle_initialize(void) {
6897  int i, j;
6898  int prev_dflt_team_nth;
6899 
6900  if (!__kmp_init_serial) {
6901  __kmp_do_serial_initialize();
6902  }
6903 
6904  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6905 
6906  // Save the previous value for the __kmp_dflt_team_nth so that
6907  // we can avoid some reinitialization if it hasn't changed.
6908  prev_dflt_team_nth = __kmp_dflt_team_nth;
6909 
6910 #if KMP_AFFINITY_SUPPORTED
6911  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6912  // number of cores on the machine.
6913  __kmp_affinity_initialize();
6914 
6915  // Run through the __kmp_threads array and set the affinity mask
6916  // for each root thread that is currently registered with the RTL.
6917  for (i = 0; i < __kmp_threads_capacity; i++) {
6918  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6919  __kmp_affinity_set_init_mask(i, TRUE);
6920  }
6921  }
6922 #endif /* KMP_AFFINITY_SUPPORTED */
6923 
6924  KMP_ASSERT(__kmp_xproc > 0);
6925  if (__kmp_avail_proc == 0) {
6926  __kmp_avail_proc = __kmp_xproc;
6927  }
6928 
6929  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6930  // correct them now
6931  j = 0;
6932  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6933  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6934  __kmp_avail_proc;
6935  j++;
6936  }
6937 
6938  if (__kmp_dflt_team_nth == 0) {
6939 #ifdef KMP_DFLT_NTH_CORES
6940  // Default #threads = #cores
6941  __kmp_dflt_team_nth = __kmp_ncores;
6942  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6943  "__kmp_ncores (%d)\n",
6944  __kmp_dflt_team_nth));
6945 #else
6946  // Default #threads = #available OS procs
6947  __kmp_dflt_team_nth = __kmp_avail_proc;
6948  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6949  "__kmp_avail_proc(%d)\n",
6950  __kmp_dflt_team_nth));
6951 #endif /* KMP_DFLT_NTH_CORES */
6952  }
6953 
6954  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6955  __kmp_dflt_team_nth = KMP_MIN_NTH;
6956  }
6957  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6958  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6959  }
6960 
6961  // There's no harm in continuing if the following check fails,
6962  // but it indicates an error in the previous logic.
6963  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6964 
6965  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6966  // Run through the __kmp_threads array and set the num threads icv for each
6967  // root thread that is currently registered with the RTL (which has not
6968  // already explicitly set its nthreads-var with a call to
6969  // omp_set_num_threads()).
6970  for (i = 0; i < __kmp_threads_capacity; i++) {
6971  kmp_info_t *thread = __kmp_threads[i];
6972  if (thread == NULL)
6973  continue;
6974  if (thread->th.th_current_task->td_icvs.nproc != 0)
6975  continue;
6976 
6977  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6978  }
6979  }
6980  KA_TRACE(
6981  20,
6982  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6983  __kmp_dflt_team_nth));
6984 
6985 #ifdef KMP_ADJUST_BLOCKTIME
6986  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6987  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6988  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6989  if (__kmp_nth > __kmp_avail_proc) {
6990  __kmp_zero_bt = TRUE;
6991  }
6992  }
6993 #endif /* KMP_ADJUST_BLOCKTIME */
6994 
6995  /* we have finished middle initialization */
6996  TCW_SYNC_4(__kmp_init_middle, TRUE);
6997 
6998  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6999 }
7000 
7001 void __kmp_middle_initialize(void) {
7002  if (__kmp_init_middle) {
7003  return;
7004  }
7005  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7006  if (__kmp_init_middle) {
7007  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7008  return;
7009  }
7010  __kmp_do_middle_initialize();
7011  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7012 }
7013 
7014 void __kmp_parallel_initialize(void) {
7015  int gtid = __kmp_entry_gtid(); // this might be a new root
7016 
7017  /* synchronize parallel initialization (for sibling) */
7018  if (TCR_4(__kmp_init_parallel))
7019  return;
7020  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7021  if (TCR_4(__kmp_init_parallel)) {
7022  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7023  return;
7024  }
7025 
7026  /* TODO reinitialization after we have already shut down */
7027  if (TCR_4(__kmp_global.g.g_done)) {
7028  KA_TRACE(
7029  10,
7030  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7031  __kmp_infinite_loop();
7032  }
7033 
7034  /* jc: The lock __kmp_initz_lock is already held, so calling
7035  __kmp_serial_initialize would cause a deadlock. So we call
7036  __kmp_do_serial_initialize directly. */
7037  if (!__kmp_init_middle) {
7038  __kmp_do_middle_initialize();
7039  }
7040  __kmp_resume_if_hard_paused();
7041 
7042  /* begin initialization */
7043  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7044  KMP_ASSERT(KMP_UBER_GTID(gtid));
7045 
7046 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7047  // Save the FP control regs.
7048  // Worker threads will set theirs to these values at thread startup.
7049  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7050  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7051  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7052 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7053 
7054 #if KMP_OS_UNIX
7055 #if KMP_HANDLE_SIGNALS
7056  /* must be after __kmp_serial_initialize */
7057  __kmp_install_signals(TRUE);
7058 #endif
7059 #endif
7060 
7061  __kmp_suspend_initialize();
7062 
7063 #if defined(USE_LOAD_BALANCE)
7064  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7065  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7066  }
7067 #else
7068  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7069  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7070  }
7071 #endif
7072 
7073  if (__kmp_version) {
7074  __kmp_print_version_2();
7075  }
7076 
7077  /* we have finished parallel initialization */
7078  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7079 
7080  KMP_MB();
7081  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7082 
7083  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7084 }
7085 
7086 /* ------------------------------------------------------------------------ */
7087 
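// Per-thread setup performed right before the outlined microtask is invoked:
// reset the single-construct counter and dispatch buffer indices, and push the
// parallel region when consistency checking is enabled.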
7088 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7089  kmp_team_t *team) {
7090  kmp_disp_t *dispatch;
7091 
7092  KMP_MB();
7093 
7094  /* none of the threads have encountered any constructs, yet. */
7095  this_thr->th.th_local.this_construct = 0;
7096 #if KMP_CACHE_MANAGE
7097  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7098 #endif /* KMP_CACHE_MANAGE */
7099  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7100  KMP_DEBUG_ASSERT(dispatch);
7101  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7102  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7103  // this_thr->th.th_info.ds.ds_tid ] );
7104 
7105  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7106  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7107  if (__kmp_env_consistency_check)
7108  __kmp_push_parallel(gtid, team->t.t_ident);
7109 
7110  KMP_MB(); /* Flush all pending memory write invalidates. */
7111 }
7112 
7113 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7114  kmp_team_t *team) {
7115  if (__kmp_env_consistency_check)
7116  __kmp_pop_parallel(gtid, team->t.t_ident);
7117 
7118  __kmp_finish_implicit_task(this_thr);
7119 }
7120 
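// Invoke the team's microtask on this thread, bracketing the call with ITT,
// OMPT, and statistics bookkeeping.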
7121 int __kmp_invoke_task_func(int gtid) {
7122  int rc;
7123  int tid = __kmp_tid_from_gtid(gtid);
7124  kmp_info_t *this_thr = __kmp_threads[gtid];
7125  kmp_team_t *team = this_thr->th.th_team;
7126 
7127  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7128 #if USE_ITT_BUILD
7129  if (__itt_stack_caller_create_ptr) {
7130  __kmp_itt_stack_callee_enter(
7131  (__itt_caller)
7132  team->t.t_stack_id); // inform ittnotify about entering user's code
7133  }
7134 #endif /* USE_ITT_BUILD */
7135 #if INCLUDE_SSC_MARKS
7136  SSC_MARK_INVOKING();
7137 #endif
7138 
7139 #if OMPT_SUPPORT
7140  void *dummy;
7141  void **exit_frame_p;
7142  ompt_data_t *my_task_data;
7143  ompt_data_t *my_parallel_data;
7144  int ompt_team_size;
7145 
7146  if (ompt_enabled.enabled) {
7147  exit_frame_p = &(
7148  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7149  } else {
7150  exit_frame_p = &dummy;
7151  }
7152 
7153  my_task_data =
7154  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7155  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7156  if (ompt_enabled.ompt_callback_implicit_task) {
7157  ompt_team_size = team->t.t_nproc;
7158  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7159  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7160  __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7161  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7162  }
7163 #endif
7164 
7165 #if KMP_STATS_ENABLED
7166  stats_state_e previous_state = KMP_GET_THREAD_STATE();
7167  if (previous_state == stats_state_e::TEAMS_REGION) {
7168  KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7169  } else {
7170  KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7171  }
7172  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7173 #endif
7174 
7175  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7176  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7177 #if OMPT_SUPPORT
7178  ,
7179  exit_frame_p
7180 #endif
7181  );
7182 #if OMPT_SUPPORT
7183  *exit_frame_p = NULL;
7184  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7185 #endif
7186 
7187 #if KMP_STATS_ENABLED
7188  if (previous_state == stats_state_e::TEAMS_REGION) {
7189  KMP_SET_THREAD_STATE(previous_state);
7190  }
7191  KMP_POP_PARTITIONED_TIMER();
7192 #endif
7193 
7194 #if USE_ITT_BUILD
7195  if (__itt_stack_caller_create_ptr) {
7196  __kmp_itt_stack_callee_leave(
7197  (__itt_caller)
7198  team->t.t_stack_id); // inform ittnotify about leaving user's code
7199  }
7200 #endif /* USE_ITT_BUILD */
7201  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7202 
7203  return rc;
7204 }
7205 
7206 void __kmp_teams_master(int gtid) {
7207  // This routine is called by all master threads in the teams construct
7208  kmp_info_t *thr = __kmp_threads[gtid];
7209  kmp_team_t *team = thr->th.th_team;
7210  ident_t *loc = team->t.t_ident;
7211  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7212  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7213  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7214  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7215  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7216 
7217  // This thread is a new CG root. Set up the proper variables.
7218  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7219  tmp->cg_root = thr; // Make thr the CG root
7220  // Init to thread limit that was stored when league masters were forked
7221  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7222  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7223  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7224  " cg_nthreads to 1\n",
7225  thr, tmp));
7226  tmp->up = thr->th.th_cg_roots;
7227  thr->th.th_cg_roots = tmp;
7228 
7229 // Launch the league of teams now, but do not let the workers execute
7230 // (they wait on the fork barrier until the next parallel region)
7231 #if INCLUDE_SSC_MARKS
7232  SSC_MARK_FORKING();
7233 #endif
7234  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7235  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7236  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7237 #if INCLUDE_SSC_MARKS
7238  SSC_MARK_JOINING();
7239 #endif
7240  // If the team size was reduced from the limit, set it to the new size
7241  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7242  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7243  // AC: the last parameter "1" eliminates the join barrier, which would not work
7244  // because worker threads wait at the fork barrier for more parallel regions
7245  __kmp_join_call(loc, gtid
7246 #if OMPT_SUPPORT
7247  ,
7248  fork_context_intel
7249 #endif
7250  ,
7251  1);
7252 }
7253 
7254 int __kmp_invoke_teams_master(int gtid) {
7255  kmp_info_t *this_thr = __kmp_threads[gtid];
7256  kmp_team_t *team = this_thr->th.th_team;
7257 #if KMP_DEBUG
7258  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7259  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7260  (void *)__kmp_teams_master);
7261 #endif
7262  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7263 #if OMPT_SUPPORT
7264  int tid = __kmp_tid_from_gtid(gtid);
7265  ompt_data_t *task_data =
7266  &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7267  ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7268  if (ompt_enabled.ompt_callback_implicit_task) {
7269  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7270  ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7271  ompt_task_initial);
7272  OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7273  }
7274 #endif
7275  __kmp_teams_master(gtid);
7276 #if OMPT_SUPPORT
7277  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7278 #endif
7279  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7280  return 1;
7281 }
7282 
7283 /* This sets the requested number of threads for the next parallel region
7284  encountered by this team. Since this should be enclosed in the forkjoin
7285  critical section, it should avoid race conditions with asymmetrical nested
7286  parallelism. */
7287 
7288 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7289  kmp_info_t *thr = __kmp_threads[gtid];
7290 
7291  if (num_threads > 0)
7292  thr->th.th_set_nproc = num_threads;
7293 }
7294 
7295 /* this sets the requested number of teams for the teams region and/or
7296  the number of threads for the next parallel region encountered */
7297 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7298  int num_threads) {
7299  kmp_info_t *thr = __kmp_threads[gtid];
7300  KMP_DEBUG_ASSERT(num_teams >= 0);
7301  KMP_DEBUG_ASSERT(num_threads >= 0);
7302 
7303  if (num_teams == 0)
7304  num_teams = 1; // default number of teams is 1.
7305  if (num_teams > __kmp_teams_max_nth) { // were too many teams requested?
7306  if (!__kmp_reserve_warn) {
7307  __kmp_reserve_warn = 1;
7308  __kmp_msg(kmp_ms_warning,
7309  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7310  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7311  }
7312  num_teams = __kmp_teams_max_nth;
7313  }
7314  // Set number of teams (number of threads in the outer "parallel" of the
7315  // teams)
7316  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7317 
7318  // Remember the number of threads for inner parallel regions
7319  if (!TCR_4(__kmp_init_middle))
7320  __kmp_middle_initialize(); // get internal globals calculated
7321  KMP_DEBUG_ASSERT(__kmp_avail_proc);
7322  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7323  if (num_threads == 0) {
7324  num_threads = __kmp_avail_proc / num_teams;
7325  // adjust num_threads without a warning, as it is not a user setting
7326  // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7327  // no thread_limit clause specified - do not change thread-limit-var ICV
7328  if (num_threads > __kmp_dflt_team_nth) {
7329  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7330  }
7331  if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7332  num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7333  } // prevent the team size from exceeding thread-limit-var
7334  if (num_teams * num_threads > __kmp_teams_max_nth) {
7335  num_threads = __kmp_teams_max_nth / num_teams;
7336  }
7337  } else {
7338  // This thread will be the master of the league masters
7339  // Store new thread limit; old limit is saved in th_cg_roots list
7340  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7341  // num_threads = min(num_threads, nthreads-var)
7342  if (num_threads > __kmp_dflt_team_nth) {
7343  num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7344  }
7345  if (num_teams * num_threads > __kmp_teams_max_nth) {
7346  int new_threads = __kmp_teams_max_nth / num_teams;
7347  if (!__kmp_reserve_warn) { // user asked for too many threads
7348  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7349  __kmp_msg(kmp_ms_warning,
7350  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7351  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7352  }
7353  num_threads = new_threads;
7354  }
7355  }
7356  thr->th.th_teams_size.nth = num_threads;
7357 }
7358 
7359 // Set the proc_bind var to use in the following parallel region.
7360 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7361  kmp_info_t *thr = __kmp_threads[gtid];
7362  thr->th.th_set_proc_bind = proc_bind;
7363 }
7364 
7365 /* Launch the worker threads into the microtask. */
7366 
7367 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7368  kmp_info_t *this_thr = __kmp_threads[gtid];
7369 
7370 #ifdef KMP_DEBUG
7371  int f;
7372 #endif /* KMP_DEBUG */
7373 
7374  KMP_DEBUG_ASSERT(team);
7375  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7376  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7377  KMP_MB(); /* Flush all pending memory write invalidates. */
7378 
7379  team->t.t_construct = 0; /* no single directives seen yet */
7380  team->t.t_ordered.dt.t_value =
7381  0; /* thread 0 enters the ordered section first */
7382 
7383  /* Reset the identifiers on the dispatch buffer */
7384  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7385  if (team->t.t_max_nproc > 1) {
7386  int i;
7387  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7388  team->t.t_disp_buffer[i].buffer_index = i;
7389  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7390  }
7391  } else {
7392  team->t.t_disp_buffer[0].buffer_index = 0;
7393  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7394  }
7395 
7396  KMP_MB(); /* Flush all pending memory write invalidates. */
7397  KMP_ASSERT(this_thr->th.th_team == team);
7398 
7399 #ifdef KMP_DEBUG
7400  for (f = 0; f < team->t.t_nproc; f++) {
7401  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7402  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7403  }
7404 #endif /* KMP_DEBUG */
7405 
7406  /* release the worker threads so they may begin working */
7407  __kmp_fork_barrier(gtid, 0);
7408 }
7409 
7410 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7411  kmp_info_t *this_thr = __kmp_threads[gtid];
7412 
7413  KMP_DEBUG_ASSERT(team);
7414  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7415  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7416  KMP_MB(); /* Flush all pending memory write invalidates. */
7417 
7418 /* Join barrier after fork */
7419 
7420 #ifdef KMP_DEBUG
7421  if (__kmp_threads[gtid] &&
7422  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7423  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7424  __kmp_threads[gtid]);
7425  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7426  "team->t.t_nproc=%d\n",
7427  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7428  team->t.t_nproc);
7429  __kmp_print_structure();
7430  }
7431  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7432  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7433 #endif /* KMP_DEBUG */
7434 
7435  __kmp_join_barrier(gtid); /* wait for everyone */
7436 #if OMPT_SUPPORT
7437  if (ompt_enabled.enabled &&
7438  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7439  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7440  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7441  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7442 #if OMPT_OPTIONAL
7443  void *codeptr = NULL;
7444  if (KMP_MASTER_TID(ds_tid) &&
7445  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7446  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7447  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7448 
7449  if (ompt_enabled.ompt_callback_sync_region_wait) {
7450  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7451  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7452  codeptr);
7453  }
7454  if (ompt_enabled.ompt_callback_sync_region) {
7455  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7456  ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7457  codeptr);
7458  }
7459 #endif
7460  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7461  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7462  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7463  }
7464  }
7465 #endif
7466 
7467  KMP_MB(); /* Flush all pending memory write invalidates. */
7468  KMP_ASSERT(this_thr->th.th_team == team);
7469 }
7470 
7471 /* ------------------------------------------------------------------------ */
7472 
7473 #ifdef USE_LOAD_BALANCE
7474 
7475 // Return the number of worker threads actively spinning in the hot team,
7476 // if we are at the outermost level of parallelism. Otherwise, return 0.
7477 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7478  int i;
7479  int retval;
7480  kmp_team_t *hot_team;
7481 
7482  if (root->r.r_active) {
7483  return 0;
7484  }
7485  hot_team = root->r.r_hot_team;
7486  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7487  return hot_team->t.t_nproc - 1; // Don't count master thread
7488  }
7489 
7490  // Skip the master thread - it is accounted for elsewhere.
7491  retval = 0;
7492  for (i = 1; i < hot_team->t.t_nproc; i++) {
7493  if (hot_team->t.t_threads[i]->th.th_active) {
7494  retval++;
7495  }
7496  }
7497  return retval;
7498 }
7499 
7500 // Perform an automatic adjustment to the number of
7501 // threads used by the next parallel region.
7502 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7503  int retval;
7504  int pool_active;
7505  int hot_team_active;
7506  int team_curr_active;
7507  int system_active;
7508 
7509  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7510  set_nproc));
7511  KMP_DEBUG_ASSERT(root);
7512  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7513  ->th.th_current_task->td_icvs.dynamic == TRUE);
7514  KMP_DEBUG_ASSERT(set_nproc > 1);
7515 
7516  if (set_nproc == 1) {
7517  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7518  return 1;
7519  }
7520 
7521  // Threads that are active in the thread pool, active in the hot team for this
7522  // particular root (if we are at the outer par level), and the currently
7523  // executing thread (to become the master) are available to add to the new
7524  // team, but are currently contributing to the system load, and must be
7525  // accounted for.
7526  pool_active = __kmp_thread_pool_active_nth;
7527  hot_team_active = __kmp_active_hot_team_nproc(root);
7528  team_curr_active = pool_active + hot_team_active + 1;
7529 
7530  // Check the system load.
7531  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7532  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7533  "hot team active = %d\n",
7534  system_active, pool_active, hot_team_active));
7535 
7536  if (system_active < 0) {
7537  // There was an error reading the necessary info from /proc, so use the
7538  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7539  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7540  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7541  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7542 
7543  // Make this call behave like the thread limit algorithm.
7544  retval = __kmp_avail_proc - __kmp_nth +
7545  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7546  if (retval > set_nproc) {
7547  retval = set_nproc;
7548  }
7549  if (retval < KMP_MIN_NTH) {
7550  retval = KMP_MIN_NTH;
7551  }
7552 
7553  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7554  retval));
7555  return retval;
7556  }
7557 
7558  // There is a slight delay in the load balance algorithm in detecting new
7559  // running procs. The real system load at this instant should be at least as
7560  // large as the number of active OMP threads available to add to the team.
7561  if (system_active < team_curr_active) {
7562  system_active = team_curr_active;
7563  }
7564  retval = __kmp_avail_proc - system_active + team_curr_active;
7565  if (retval > set_nproc) {
7566  retval = set_nproc;
7567  }
7568  if (retval < KMP_MIN_NTH) {
7569  retval = KMP_MIN_NTH;
7570  }
7571 
7572  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7573  return retval;
7574 } // __kmp_load_balance_nproc()
7575 
7576 #endif /* USE_LOAD_BALANCE */
7577 
7578 /* ------------------------------------------------------------------------ */
7579 
7580 /* NOTE: this is called with the __kmp_init_lock held */
7581 void __kmp_cleanup(void) {
7582  int f;
7583 
7584  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7585 
7586  if (TCR_4(__kmp_init_parallel)) {
7587 #if KMP_HANDLE_SIGNALS
7588  __kmp_remove_signals();
7589 #endif
7590  TCW_4(__kmp_init_parallel, FALSE);
7591  }
7592 
7593  if (TCR_4(__kmp_init_middle)) {
7594 #if KMP_AFFINITY_SUPPORTED
7595  __kmp_affinity_uninitialize();
7596 #endif /* KMP_AFFINITY_SUPPORTED */
7597  __kmp_cleanup_hierarchy();
7598  TCW_4(__kmp_init_middle, FALSE);
7599  }
7600 
7601  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7602 
7603  if (__kmp_init_serial) {
7604  __kmp_runtime_destroy();
7605  __kmp_init_serial = FALSE;
7606  }
7607 
7608  __kmp_cleanup_threadprivate_caches();
7609 
7610  for (f = 0; f < __kmp_threads_capacity; f++) {
7611  if (__kmp_root[f] != NULL) {
7612  __kmp_free(__kmp_root[f]);
7613  __kmp_root[f] = NULL;
7614  }
7615  }
7616  __kmp_free(__kmp_threads);
7617  // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7618  // there is no need to free __kmp_root separately.
7619  __kmp_threads = NULL;
7620  __kmp_root = NULL;
7621  __kmp_threads_capacity = 0;
7622 
7623 #if KMP_USE_DYNAMIC_LOCK
7624  __kmp_cleanup_indirect_user_locks();
7625 #else
7626  __kmp_cleanup_user_locks();
7627 #endif
7628 
7629 #if KMP_AFFINITY_SUPPORTED
7630  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7631  __kmp_cpuinfo_file = NULL;
7632 #endif /* KMP_AFFINITY_SUPPORTED */
7633 
7634 #if KMP_USE_ADAPTIVE_LOCKS
7635 #if KMP_DEBUG_ADAPTIVE_LOCKS
7636  __kmp_print_speculative_stats();
7637 #endif
7638 #endif
7639  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7640  __kmp_nested_nth.nth = NULL;
7641  __kmp_nested_nth.size = 0;
7642  __kmp_nested_nth.used = 0;
7643  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7644  __kmp_nested_proc_bind.bind_types = NULL;
7645  __kmp_nested_proc_bind.size = 0;
7646  __kmp_nested_proc_bind.used = 0;
7647  if (__kmp_affinity_format) {
7648  KMP_INTERNAL_FREE(__kmp_affinity_format);
7649  __kmp_affinity_format = NULL;
7650  }
7651 
7652  __kmp_i18n_catclose();
7653 
7654 #if KMP_USE_HIER_SCHED
7655  __kmp_hier_scheds.deallocate();
7656 #endif
7657 
7658 #if KMP_STATS_ENABLED
7659  __kmp_stats_fini();
7660 #endif
7661 
7662  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7663 }
7664 
7665 /* ------------------------------------------------------------------------ */
7666 
7667 int __kmp_ignore_mppbeg(void) {
7668  char *env;
7669 
7670  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7671  if (__kmp_str_match_false(env))
7672  return FALSE;
7673  }
7674  // By default __kmpc_begin() is no-op.
7675  return TRUE;
7676 }
7677 
7678 int __kmp_ignore_mppend(void) {
7679  char *env;
7680 
7681  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7682  if (__kmp_str_match_false(env))
7683  return FALSE;
7684  }
7685  // By default __kmpc_end() is no-op.
7686  return TRUE;
7687 }
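
// Editor's note: usage sketch for the two environment switches above
// (illustrative shell lines; the exact set of strings recognized as "false"
// is decided by __kmp_str_match_false()).
//
//   KMP_IGNORE_MPPBEG=false ./app   # __kmpc_begin() performs real work
//   KMP_IGNORE_MPPEND=false ./app   # __kmpc_end() performs real work
//   ./app                           # default: both entry points are no-ops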
7688 
7689 void __kmp_internal_begin(void) {
7690  int gtid;
7691  kmp_root_t *root;
7692 
7693  /* this is an important step: it registers new sibling root threads and
7694  assigns these new uber threads a new gtid */
7695  gtid = __kmp_entry_gtid();
7696  root = __kmp_threads[gtid]->th.th_root;
7697  KMP_ASSERT(KMP_UBER_GTID(gtid));
7698 
7699  if (root->r.r_begin)
7700  return;
7701  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7702  if (root->r.r_begin) {
7703  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7704  return;
7705  }
7706 
7707  root->r.r_begin = TRUE;
7708 
7709  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7710 }
7711 
7712 /* ------------------------------------------------------------------------ */
7713 
7714 void __kmp_user_set_library(enum library_type arg) {
7715  int gtid;
7716  kmp_root_t *root;
7717  kmp_info_t *thread;
7718 
7719  /* first, make sure we are initialized so we can get our gtid */
7720 
7721  gtid = __kmp_entry_gtid();
7722  thread = __kmp_threads[gtid];
7723 
7724  root = thread->th.th_root;
7725 
7726  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7727  library_serial));
7728  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7729  thread */
7730  KMP_WARNING(SetLibraryIncorrectCall);
7731  return;
7732  }
7733 
7734  switch (arg) {
7735  case library_serial:
7736  thread->th.th_set_nproc = 0;
7737  set__nproc(thread, 1);
7738  break;
7739  case library_turnaround:
7740  thread->th.th_set_nproc = 0;
7741  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7742  : __kmp_dflt_team_nth_ub);
7743  break;
7744  case library_throughput:
7745  thread->th.th_set_nproc = 0;
7746  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7747  : __kmp_dflt_team_nth_ub);
7748  break;
7749  default:
7750  KMP_FATAL(UnknownLibraryType, arg);
7751  }
7752 
7753  __kmp_aux_set_library(arg);
7754 }
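
// Editor's note: a hedged usage sketch. __kmp_user_set_library() backs the
// Intel extension entry points declared in omp.h (kmp_set_library_serial(),
// kmp_set_library_turnaround(), kmp_set_library_throughput()); the same
// choice is commonly made up front via the KMP_LIBRARY environment variable.
#include <omp.h>
static void example_select_turnaround_mode(void) {
  // Must be called from the serial part of the program; calling it while a
  // parallel region is active triggers the SetLibraryIncorrectCall warning.
  kmp_set_library_turnaround();
}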
7755 
7756 void __kmp_aux_set_stacksize(size_t arg) {
7757  if (!__kmp_init_serial)
7758  __kmp_serial_initialize();
7759 
7760 #if KMP_OS_DARWIN
7761  if (arg & (0x1000 - 1)) {
7762  arg &= ~(0x1000 - 1);
7763  if (arg + 0x1000) /* check for overflow if we round up */
7764  arg += 0x1000;
7765  }
7766 #endif
7767  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7768 
7769  /* only change the default stacksize before the first parallel region */
7770  if (!TCR_4(__kmp_init_parallel)) {
7771  size_t value = arg; /* argument is in bytes */
7772 
7773  if (value < __kmp_sys_min_stksize)
7774  value = __kmp_sys_min_stksize;
7775  else if (value > KMP_MAX_STKSIZE)
7776  value = KMP_MAX_STKSIZE;
7777 
7778  __kmp_stksize = value;
7779 
7780  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7781  }
7782 
7783  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7784 }
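
// Editor's note: hedged usage sketch for the routine above. Requests below
// __kmp_sys_min_stksize or above KMP_MAX_STKSIZE are silently clamped, and
// the setting only takes effect before the first parallel region. The entry
// point shown is the Intel extension kmp_set_stacksize_s() from omp.h.
#include <omp.h>
static void example_request_worker_stacksize(void) {
  kmp_set_stacksize_s((size_t)8 * 1024 * 1024); // ask for 8 MiB worker stacks
}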
7785 
7786 /* set the behaviour of the runtime library */
7787 /* TODO this can cause some odd behaviour with sibling parallelism... */
7788 void __kmp_aux_set_library(enum library_type arg) {
7789  __kmp_library = arg;
7790 
7791  switch (__kmp_library) {
7792  case library_serial: {
7793  KMP_INFORM(LibraryIsSerial);
7794  } break;
7795  case library_turnaround:
7796  if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
7797  __kmp_use_yield = 2; // only yield when oversubscribed
7798  break;
7799  case library_throughput:
7800  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
7801  __kmp_dflt_blocktime = 200;
7802  break;
7803  default:
7804  KMP_FATAL(UnknownLibraryType, arg);
7805  }
7806 }
7807 
7808 /* Getting team information common for all team API */
7809 // Returns NULL if not in teams construct
7810 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7811  kmp_info_t *thr = __kmp_entry_thread();
7812  teams_serialized = 0;
7813  if (thr->th.th_teams_microtask) {
7814  kmp_team_t *team = thr->th.th_team;
7815  int tlevel = thr->th.th_teams_level; // the level of the teams construct
7816  int ii = team->t.t_level;
7817  teams_serialized = team->t.t_serialized;
7818  int level = tlevel + 1;
7819  KMP_DEBUG_ASSERT(ii >= tlevel);
7820  while (ii > level) {
7821  for (teams_serialized = team->t.t_serialized;
7822  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7823  }
7824  if (team->t.t_serialized && (!teams_serialized)) {
7825  team = team->t.t_parent;
7826  continue;
7827  }
7828  if (ii > level) {
7829  team = team->t.t_parent;
7830  ii--;
7831  }
7832  }
7833  return team;
7834  }
7835  return NULL;
7836 }
7837 
7838 int __kmp_aux_get_team_num() {
7839  int serialized;
7840  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7841  if (team) {
7842  if (serialized > 1) {
7843  return 0; // teams region is serialized ( 1 team of 1 thread ).
7844  } else {
7845  return team->t.t_master_tid;
7846  }
7847  }
7848  return 0;
7849 }
7850 
7851 int __kmp_aux_get_num_teams() {
7852  int serialized;
7853  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7854  if (team) {
7855  if (serialized > 1) {
7856  return 1;
7857  } else {
7858  return team->t.t_parent->t.t_nproc;
7859  }
7860  }
7861  return 1;
7862 }
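
// Editor's note: the two helpers above back omp_get_team_num() and
// omp_get_num_teams(). A minimal sketch of how they are observed from user
// code (illustrative only):
#include <omp.h>
#include <stdio.h>
static void example_query_teams(void) {
#pragma omp teams num_teams(4)
  {
    // Inside the construct, team numbers run from 0 to omp_get_num_teams()-1;
    // a serialized teams region reports team 0 of 1.
    printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
  }
}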
7863 
7864 /* ------------------------------------------------------------------------ */
7865 
7866 /*
7867  * Affinity Format Parser
7868  *
7869  * Field is in form of: %[[[0].]size]type
7870  * % and type are required (%% means print a literal '%')
7871  * type is either single char or long name surrounded by {},
7872  * e.g., N or {num_threads}
7873  * 0 => leading zeros
7874  * . => right justified when size is specified
7875  * by default output is left justified
7876  * size is the *minimum* field length
7877  * All other characters are printed as is
7878  *
7879  * Available field types:
7880  * L {thread_level} - omp_get_level()
7881  * n {thread_num} - omp_get_thread_num()
7882  * h {host} - name of host machine
7883  * P {process_id} - process id (integer)
7884  * T {thread_identifier} - native thread identifier (integer)
7885  * N {num_threads} - omp_get_num_threads()
7886  * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
7887  * a {thread_affinity} - comma separated list of integers or integer ranges
7888  * (values of affinity mask)
7889  *
7890  * Implementation-specific field types can be added
7891  * If a type is unknown, print "undefined"
7892 */
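
// Editor's note: a few illustrative format strings for the parser described
// above (field letters as in the table that follows):
//   "%0.8n"                  -> thread number, right justified and zero
//                               padded to a minimum width of 8
//   "host=%H pid=%P"         -> host name and process id with literal text
//   "%{thread_num} of %{num_threads} on %{thread_affinity}"
//                            -> long-name form of n, N and A
//   "100%% done"             -> "%%" prints a literal '%'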
7893 
7894 // Structure holding the short name, long name, and corresponding data type
7895 // for snprintf. A table of these will represent the entire valid keyword
7896 // field types.
7897 typedef struct kmp_affinity_format_field_t {
7898  char short_name; // from spec e.g., L -> thread level
7899  const char *long_name; // from spec thread_level -> thread level
7900  char field_format; // data type for snprintf (typically 'd' or 's'
7901  // for integer or string)
7902 } kmp_affinity_format_field_t;
7903 
7904 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7905 #if KMP_AFFINITY_SUPPORTED
7906  {'A', "thread_affinity", 's'},
7907 #endif
7908  {'t', "team_num", 'd'},
7909  {'T', "num_teams", 'd'},
7910  {'L', "nesting_level", 'd'},
7911  {'n', "thread_num", 'd'},
7912  {'N', "num_threads", 'd'},
7913  {'a', "ancestor_tnum", 'd'},
7914  {'H', "host", 's'},
7915  {'P', "process_id", 'd'},
7916  {'i', "native_thread_id", 'd'}};
7917 
7918 // Return the number of characters it takes to hold the field
7919 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7920  const char **ptr,
7921  kmp_str_buf_t *field_buffer) {
7922  int rc, format_index, field_value;
7923  const char *width_left, *width_right;
7924  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7925  static const int FORMAT_SIZE = 20;
7926  char format[FORMAT_SIZE] = {0};
7927  char absolute_short_name = 0;
7928 
7929  KMP_DEBUG_ASSERT(gtid >= 0);
7930  KMP_DEBUG_ASSERT(th);
7931  KMP_DEBUG_ASSERT(**ptr == '%');
7932  KMP_DEBUG_ASSERT(field_buffer);
7933 
7934  __kmp_str_buf_clear(field_buffer);
7935 
7936  // Skip the initial %
7937  (*ptr)++;
7938 
7939  // Check for %% first
7940  if (**ptr == '%') {
7941  __kmp_str_buf_cat(field_buffer, "%", 1);
7942  (*ptr)++; // skip over the second %
7943  return 1;
7944  }
7945 
7946  // Parse field modifiers if they are present
7947  pad_zeros = false;
7948  if (**ptr == '0') {
7949  pad_zeros = true;
7950  (*ptr)++; // skip over 0
7951  }
7952  right_justify = false;
7953  if (**ptr == '.') {
7954  right_justify = true;
7955  (*ptr)++; // skip over .
7956  }
7957  // Parse width of field: [width_left, width_right)
7958  width_left = width_right = NULL;
7959  if (**ptr >= '0' && **ptr <= '9') {
7960  width_left = *ptr;
7961  SKIP_DIGITS(*ptr);
7962  width_right = *ptr;
7963  }
7964 
7965  // Create the format for KMP_SNPRINTF based on flags parsed above
7966  format_index = 0;
7967  format[format_index++] = '%';
7968  if (!right_justify)
7969  format[format_index++] = '-';
7970  if (pad_zeros)
7971  format[format_index++] = '0';
7972  if (width_left && width_right) {
7973  int i = 0;
7974  // Only allow 8 digit number widths.
7975  // This also prevents overflowing format variable
7976  while (i < 8 && width_left < width_right) {
7977  format[format_index++] = *width_left;
7978  width_left++;
7979  i++;
7980  }
7981  }
7982 
7983  // Parse a name (long or short)
7984  // Canonicalize the name into absolute_short_name
7985  found_valid_name = false;
7986  parse_long_name = (**ptr == '{');
7987  if (parse_long_name)
7988  (*ptr)++; // skip initial left brace
7989  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7990  sizeof(__kmp_affinity_format_table[0]);
7991  ++i) {
7992  char short_name = __kmp_affinity_format_table[i].short_name;
7993  const char *long_name = __kmp_affinity_format_table[i].long_name;
7994  char field_format = __kmp_affinity_format_table[i].field_format;
7995  if (parse_long_name) {
7996  int length = KMP_STRLEN(long_name);
7997  if (strncmp(*ptr, long_name, length) == 0) {
7998  found_valid_name = true;
7999  (*ptr) += length; // skip the long name
8000  }
8001  } else if (**ptr == short_name) {
8002  found_valid_name = true;
8003  (*ptr)++; // skip the short name
8004  }
8005  if (found_valid_name) {
8006  format[format_index++] = field_format;
8007  format[format_index++] = '\0';
8008  absolute_short_name = short_name;
8009  break;
8010  }
8011  }
8012  if (parse_long_name) {
8013  if (**ptr != '}') {
8014  absolute_short_name = 0;
8015  } else {
8016  (*ptr)++; // skip over the right brace
8017  }
8018  }
8019 
8020  // Attempt to fill the buffer with the requested
8021  // value using snprintf within __kmp_str_buf_print()
8022  switch (absolute_short_name) {
8023  case 't':
8024  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8025  break;
8026  case 'T':
8027  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8028  break;
8029  case 'L':
8030  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8031  break;
8032  case 'n':
8033  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8034  break;
8035  case 'H': {
8036  static const int BUFFER_SIZE = 256;
8037  char buf[BUFFER_SIZE];
8038  __kmp_expand_host_name(buf, BUFFER_SIZE);
8039  rc = __kmp_str_buf_print(field_buffer, format, buf);
8040  } break;
8041  case 'P':
8042  rc = __kmp_str_buf_print(field_buffer, format, getpid());
8043  break;
8044  case 'i':
8045  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8046  break;
8047  case 'N':
8048  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8049  break;
8050  case 'a':
8051  field_value =
8052  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8053  rc = __kmp_str_buf_print(field_buffer, format, field_value);
8054  break;
8055 #if KMP_AFFINITY_SUPPORTED
8056  case 'A': {
8057  kmp_str_buf_t buf;
8058  __kmp_str_buf_init(&buf);
8059  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8060  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8061  __kmp_str_buf_free(&buf);
8062  } break;
8063 #endif
8064  default:
8065  // According to the spec, if an implementation does not have info for the
8066  // field type, then "undefined" is printed
8067  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8068  // Skip the field
8069  if (parse_long_name) {
8070  SKIP_TOKEN(*ptr);
8071  if (**ptr == '}')
8072  (*ptr)++;
8073  } else {
8074  (*ptr)++;
8075  }
8076  }
8077 
8078  KMP_ASSERT(format_index <= FORMAT_SIZE);
8079  return rc;
8080 }
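
// Editor's note: worked example for the routine above. With *ptr pointing at
// "%0.4{thread_num}", the parser consumes '%', records pad_zeros and
// right_justify, copies the width "4", matches the long name "thread_num"
// (canonical short name 'n'), builds the snprintf format "%04d", prints
// __kmp_tid_from_gtid(gtid) into field_buffer, and leaves *ptr just past the
// closing '}'.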
8081 
8082 /*
8083  * Return number of characters needed to hold the affinity string
8084  * (not including null byte character)
8085  * The resultant string is printed to buffer, which the caller can then
8086  * handle afterwards
8087 */
8088 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8089  kmp_str_buf_t *buffer) {
8090  const char *parse_ptr;
8091  size_t retval;
8092  const kmp_info_t *th;
8093  kmp_str_buf_t field;
8094 
8095  KMP_DEBUG_ASSERT(buffer);
8096  KMP_DEBUG_ASSERT(gtid >= 0);
8097 
8098  __kmp_str_buf_init(&field);
8099  __kmp_str_buf_clear(buffer);
8100 
8101  th = __kmp_threads[gtid];
8102  retval = 0;
8103 
8104  // If format is NULL or zero-length string, then we use
8105  // affinity-format-var ICV
8106  parse_ptr = format;
8107  if (parse_ptr == NULL || *parse_ptr == '\0') {
8108  parse_ptr = __kmp_affinity_format;
8109  }
8110  KMP_DEBUG_ASSERT(parse_ptr);
8111 
8112  while (*parse_ptr != '\0') {
8113  // Parse a field
8114  if (*parse_ptr == '%') {
8115  // Put field in the buffer
8116  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8117  __kmp_str_buf_catbuf(buffer, &field);
8118  retval += rc;
8119  } else {
8120  // Put literal character in buffer
8121  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8122  retval++;
8123  parse_ptr++;
8124  }
8125  }
8126  __kmp_str_buf_free(&field);
8127  return retval;
8128 }
8129 
8130 // Displays the affinity string to stdout
8131 void __kmp_aux_display_affinity(int gtid, const char *format) {
8132  kmp_str_buf_t buf;
8133  __kmp_str_buf_init(&buf);
8134  __kmp_aux_capture_affinity(gtid, format, &buf);
8135  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8136  __kmp_str_buf_free(&buf);
8137 }
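
// Editor's note: hedged usage sketch. __kmp_aux_capture_affinity() and
// __kmp_aux_display_affinity() back the OpenMP 5.0 routines
// omp_capture_affinity() and omp_display_affinity(); a NULL or empty format
// falls back to the affinity-format-var ICV (OMP_AFFINITY_FORMAT).
#include <omp.h>
#include <stdio.h>
static void example_show_affinity(void) {
#pragma omp parallel
  {
    char buf[256];
    // Returns the number of characters needed, excluding the trailing NUL.
    size_t needed = omp_capture_affinity(buf, sizeof(buf), "tid %n on %H");
    if (needed < sizeof(buf))
      printf("%s\n", buf);
    omp_display_affinity(NULL); // print using the affinity-format-var ICV
  }
}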
8138 
8139 /* ------------------------------------------------------------------------ */
8140 
8141 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8142  int blocktime = arg; /* argument is in milliseconds */
8143 #if KMP_USE_MONITOR
8144  int bt_intervals;
8145 #endif
8146  int bt_set;
8147 
8148  __kmp_save_internal_controls(thread);
8149 
8150  /* Normalize and set blocktime for the teams */
8151  if (blocktime < KMP_MIN_BLOCKTIME)
8152  blocktime = KMP_MIN_BLOCKTIME;
8153  else if (blocktime > KMP_MAX_BLOCKTIME)
8154  blocktime = KMP_MAX_BLOCKTIME;
8155 
8156  set__blocktime_team(thread->th.th_team, tid, blocktime);
8157  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8158 
8159 #if KMP_USE_MONITOR
8160  /* Calculate and set blocktime intervals for the teams */
8161  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8162 
8163  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8164  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8165 #endif
8166 
8167  /* Set whether blocktime has been set to "TRUE" */
8168  bt_set = TRUE;
8169 
8170  set__bt_set_team(thread->th.th_team, tid, bt_set);
8171  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8172 #if KMP_USE_MONITOR
8173  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8174  "bt_intervals=%d, monitor_updates=%d\n",
8175  __kmp_gtid_from_tid(tid, thread->th.th_team),
8176  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8177  __kmp_monitor_wakeups));
8178 #else
8179  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8180  __kmp_gtid_from_tid(tid, thread->th.th_team),
8181  thread->th.th_team->t.t_id, tid, blocktime));
8182 #endif
8183 }
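
// Editor's note: hedged usage sketch. The normalization above clamps the
// request to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME]; the user-facing entry
// point is the Intel extension kmp_set_blocktime() (or the KMP_BLOCKTIME
// environment variable).
#include <omp.h>
static void example_shorten_spin_wait(void) {
  kmp_set_blocktime(0); // let idle workers sleep right after a parallel region
}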
8184 
8185 void __kmp_aux_set_defaults(char const *str, int len) {
8186  if (!__kmp_init_serial) {
8187  __kmp_serial_initialize();
8188  }
8189  __kmp_env_initialize(str);
8190 
8191  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8192  __kmp_env_print();
8193  }
8194 } // __kmp_aux_set_defaults
8195 
8196 /* ------------------------------------------------------------------------ */
8197 /* internal fast reduction routines */
8198 
8199 PACKED_REDUCTION_METHOD_T
8200 __kmp_determine_reduction_method(
8201  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8202  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8203  kmp_critical_name *lck) {
8204 
8205  // Default reduction method: critical construct ( lck != NULL, like in current
8206  // PAROPT )
8207  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8208  // can be selected by RTL
8209  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8210  // can be selected by RTL
8211  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8212  // among generated by PAROPT.
8213 
8214  PACKED_REDUCTION_METHOD_T retval;
8215 
8216  int team_size;
8217 
8218  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8219  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8220 
8221 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8222  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8223 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8224 
8225  retval = critical_reduce_block;
8226 
8227  // another choice of getting a team size (with 1 dynamic dereference) is slower
8228  team_size = __kmp_get_team_num_threads(global_tid);
8229  if (team_size == 1) {
8230 
8231  retval = empty_reduce_block;
8232 
8233  } else {
8234 
8235  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8236 
8237 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8238  KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8239 
8240 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8241  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8242 
8243  int teamsize_cutoff = 4;
8244 
8245 #if KMP_MIC_SUPPORTED
8246  if (__kmp_mic_type != non_mic) {
8247  teamsize_cutoff = 8;
8248  }
8249 #endif
8250  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8251  if (tree_available) {
8252  if (team_size <= teamsize_cutoff) {
8253  if (atomic_available) {
8254  retval = atomic_reduce_block;
8255  }
8256  } else {
8257  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8258  }
8259  } else if (atomic_available) {
8260  retval = atomic_reduce_block;
8261  }
8262 #else
8263 #error "Unknown or unsupported OS"
8264 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8265  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8266 
8267 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8268 
8269 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8270 
8271  // basic tuning
8272 
8273  if (atomic_available) {
8274  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8275  retval = atomic_reduce_block;
8276  }
8277  } // otherwise: use critical section
8278 
8279 #elif KMP_OS_DARWIN
8280 
8281  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8282  if (atomic_available && (num_vars <= 3)) {
8283  retval = atomic_reduce_block;
8284  } else if (tree_available) {
8285  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8286  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8287  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8288  }
8289  } // otherwise: use critical section
8290 
8291 #else
8292 #error "Unknown or unsupported OS"
8293 #endif
8294 
8295 #else
8296 #error "Unknown or unsupported architecture"
8297 #endif
8298  }
8299 
8300  // KMP_FORCE_REDUCTION
8301 
8302  // If the team is serialized (team_size == 1), ignore the forced reduction
8303  // method and stay with the unsynchronized method (empty_reduce_block)
8304  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8305  team_size != 1) {
8306 
8307  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8308 
8309  int atomic_available, tree_available;
8310 
8311  switch ((forced_retval = __kmp_force_reduction_method)) {
8312  case critical_reduce_block:
8313  KMP_ASSERT(lck); // lck should be != 0
8314  break;
8315 
8316  case atomic_reduce_block:
8317  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8318  if (!atomic_available) {
8319  KMP_WARNING(RedMethodNotSupported, "atomic");
8320  forced_retval = critical_reduce_block;
8321  }
8322  break;
8323 
8324  case tree_reduce_block:
8325  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8326  if (!tree_available) {
8327  KMP_WARNING(RedMethodNotSupported, "tree");
8328  forced_retval = critical_reduce_block;
8329  } else {
8330 #if KMP_FAST_REDUCTION_BARRIER
8331  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8332 #endif
8333  }
8334  break;
8335 
8336  default:
8337  KMP_ASSERT(0); // "unsupported method specified"
8338  }
8339 
8340  retval = forced_retval;
8341  }
8342 
8343  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8344 
8345 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8346 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8347 
8348  return (retval);
8349 }
8350 // this function is for testing set/get/determine reduce method
8351 kmp_int32 __kmp_get_reduce_method(void) {
8352  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8353 }
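
// Editor's note (reading aid, not a normative statement of the encoding): the
// shift by 8 above reflects the PACKED_REDUCTION_METHOD_T layout in kmp.h,
// where the barrier type occupies the low byte and the reduction method the
// bits above it, so the returned value is the small ordinal of the selected
// method (critical/atomic/tree/empty) rather than the packed word.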
8354 
8355 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8356 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8357 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8358 
8359 // Hard pause shuts down the runtime completely. Resume happens naturally when
8360 // OpenMP is used subsequently.
8361 void __kmp_hard_pause() {
8362  __kmp_pause_status = kmp_hard_paused;
8363  __kmp_internal_end_thread(-1);
8364 }
8365 
8366 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8367 void __kmp_resume_if_soft_paused() {
8368  if (__kmp_pause_status == kmp_soft_paused) {
8369  __kmp_pause_status = kmp_not_paused;
8370 
8371  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8372  kmp_info_t *thread = __kmp_threads[gtid];
8373  if (thread) { // Wake it if sleeping
8374  kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8375  if (fl.is_sleeping())
8376  fl.resume(gtid);
8377  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8378  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8379  } else { // thread holds the lock and may sleep soon
8380  do { // until either the thread sleeps, or we can get the lock
8381  if (fl.is_sleeping()) {
8382  fl.resume(gtid);
8383  break;
8384  } else if (__kmp_try_suspend_mx(thread)) {
8385  __kmp_unlock_suspend_mx(thread);
8386  break;
8387  }
8388  } while (1);
8389  }
8390  }
8391  }
8392  }
8393 }
8394 
8395 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8396 // TODO: add warning messages
8397 int __kmp_pause_resource(kmp_pause_status_t level) {
8398  if (level == kmp_not_paused) { // requesting resume
8399  if (__kmp_pause_status == kmp_not_paused) {
8400  // error message about runtime not being paused, so can't resume
8401  return 1;
8402  } else {
8403  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8404  __kmp_pause_status == kmp_hard_paused);
8405  __kmp_pause_status = kmp_not_paused;
8406  return 0;
8407  }
8408  } else if (level == kmp_soft_paused) { // requesting soft pause
8409  if (__kmp_pause_status != kmp_not_paused) {
8410  // error message about already being paused
8411  return 1;
8412  } else {
8413  __kmp_soft_pause();
8414  return 0;
8415  }
8416  } else if (level == kmp_hard_paused) { // requesting hard pause
8417  if (__kmp_pause_status != kmp_not_paused) {
8418  // error message about already being paused
8419  return 1;
8420  } else {
8421  __kmp_hard_pause();
8422  return 0;
8423  }
8424  } else {
8425  // error message about invalid level
8426  return 1;
8427  }
8428 }
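
// Editor's note: hedged usage sketch. __kmp_pause_resource() backs the
// OpenMP 5.0 routines omp_pause_resource() and omp_pause_resource_all();
// a nonzero return means the request was rejected (resuming a runtime that
// is not paused, or pausing one that already is).
#include <omp.h>
static void example_pause_between_phases(void) {
#pragma omp parallel
  { /* ... first phase ... */ }

  // Release worker-thread resources until OpenMP is used again.
  if (omp_pause_resource_all(omp_pause_soft) != 0) {
    /* request rejected; the runtime keeps its current state */
  }

#pragma omp parallel
  { /* ... second phase; the runtime resumes transparently ... */ }
}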
8429 
8430 
8431 void __kmp_omp_display_env(int verbose) {
8432  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8433  if (__kmp_init_serial == 0)
8434  __kmp_do_serial_initialize();
8435  __kmp_display_env_impl(!verbose, verbose);
8436  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8437 }
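
// Editor's note: the same listing can be requested without an API call by
// setting OMP_DISPLAY_ENV=TRUE (or VERBOSE) before the program starts; this
// routine produces it on demand, taking the bootstrap lock and performing
// serial initialization first if needed.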