LLVM OpenMP* Runtime Library
 All Classes Functions Variables Typedefs Enumerations Enumerator Modules Pages
kmp_runtime.cpp
1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30 
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34 
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37 
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41 
42 #include "tsan_annotations.h"
43 
44 #if defined(KMP_GOMP_COMPAT)
45 char const __kmp_version_alt_comp[] =
46  KMP_VERSION_PREFIX "alternative compiler support: yes";
47 #endif /* defined(KMP_GOMP_COMPAT) */
48 
49 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
50 #if OMP_50_ENABLED
51  "5.0 (201611)";
52 #elif OMP_45_ENABLED
53  "4.5 (201511)";
54 #elif OMP_40_ENABLED
55  "4.0 (201307)";
56 #else
57  "3.1 (201107)";
58 #endif
59 
60 #ifdef KMP_DEBUG
61 char const __kmp_version_lock[] =
62  KMP_VERSION_PREFIX "lock type: run time selectable";
63 #endif /* KMP_DEBUG */
64 
65 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
66 
67 /* ------------------------------------------------------------------------ */
68 
69 #if KMP_USE_MONITOR
70 kmp_info_t __kmp_monitor;
71 #endif
72 
73 /* Forward declarations */
74 
75 void __kmp_cleanup(void);
76 
77 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
78  int gtid);
79 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
80  kmp_internal_control_t *new_icvs,
81  ident_t *loc);
82 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
83 static void __kmp_partition_places(kmp_team_t *team,
84  int update_master_only = 0);
85 #endif
86 static void __kmp_do_serial_initialize(void);
87 void __kmp_fork_barrier(int gtid, int tid);
88 void __kmp_join_barrier(int gtid);
89 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
90  kmp_internal_control_t *new_icvs, ident_t *loc);
91 
92 #ifdef USE_LOAD_BALANCE
93 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
94 #endif
95 
96 static int __kmp_expand_threads(int nNeed);
97 #if KMP_OS_WINDOWS
98 static int __kmp_unregister_root_other_thread(int gtid);
99 #endif
100 static void __kmp_unregister_library(void); // called by __kmp_internal_end()
101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
103 
104 /* Calculate the identifier of the current thread */
105 /* fast (and somewhat portable) way to get unique identifier of executing
106  thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
107 int __kmp_get_global_thread_id() {
108  int i;
109  kmp_info_t **other_threads;
110  size_t stack_data;
111  char *stack_addr;
112  size_t stack_size;
113  char *stack_base;
114 
115  KA_TRACE(
116  1000,
117  ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
118  __kmp_nth, __kmp_all_nth));
119 
120  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
121  a parallel region, made it return KMP_GTID_DNE to force serial_initialize
122  by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
123  __kmp_init_gtid for this to work. */
124 
125  if (!TCR_4(__kmp_init_gtid))
126  return KMP_GTID_DNE;
127 
128 #ifdef KMP_TDATA_GTID
129  if (TCR_4(__kmp_gtid_mode) >= 3) {
130  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
131  return __kmp_gtid;
132  }
133 #endif
134  if (TCR_4(__kmp_gtid_mode) >= 2) {
135  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
136  return __kmp_gtid_get_specific();
137  }
138  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
139 
140  stack_addr = (char *)&stack_data;
141  other_threads = __kmp_threads;
142 
143  /* ATT: The code below is a source of potential bugs due to unsynchronized
144  access to __kmp_threads array. For example:
145  1. Current thread loads other_threads[i] to thr and checks it, it is
146  non-NULL.
147  2. Current thread is suspended by OS.
148  3. Another thread unregisters and finishes (debug versions of free()
149  may fill memory with something like 0xEF).
150  4. Current thread is resumed.
151  5. Current thread reads junk from *thr.
152  TODO: Fix it. --ln */
153 
154  for (i = 0; i < __kmp_threads_capacity; i++) {
155 
156  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157  if (!thr)
158  continue;
159 
160  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
161  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
162 
163  /* stack grows down -- search through all of the active threads */
164 
165  if (stack_addr <= stack_base) {
166  size_t stack_diff = stack_base - stack_addr;
167 
168  if (stack_diff <= stack_size) {
169  /* The only way we can be closer than the allocated */
170  /* stack size is if we are running on this thread. */
171  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
172  return i;
173  }
174  }
175  }
176 
177  /* get specific to try and determine our gtid */
178  KA_TRACE(1000,
179  ("*** __kmp_get_global_thread_id: internal alg. failed to find "
180  "thread, using TLS\n"));
181  i = __kmp_gtid_get_specific();
182 
183  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
184 
185  /* if we havn't been assigned a gtid, then return code */
186  if (i < 0)
187  return i;
188 
189  /* dynamically updated stack window for uber threads to avoid get_specific
190  call */
191  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
192  KMP_FATAL(StackOverflow, i);
193  }
194 
195  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
196  if (stack_addr > stack_base) {
197  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
198  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
199  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
200  stack_base);
201  } else {
202  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203  stack_base - stack_addr);
204  }
205 
206  /* Reprint stack bounds for ubermaster since they have been refined */
207  if (__kmp_storage_map) {
208  char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
209  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
210  __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
211  other_threads[i]->th.th_info.ds.ds_stacksize,
212  "th_%d stack (refinement)", i);
213  }
214  return i;
215 }
216 
217 int __kmp_get_global_thread_id_reg() {
218  int gtid;
219 
220  if (!__kmp_init_serial) {
221  gtid = KMP_GTID_DNE;
222  } else
223 #ifdef KMP_TDATA_GTID
224  if (TCR_4(__kmp_gtid_mode) >= 3) {
225  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
226  gtid = __kmp_gtid;
227  } else
228 #endif
229  if (TCR_4(__kmp_gtid_mode) >= 2) {
230  KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
231  gtid = __kmp_gtid_get_specific();
232  } else {
233  KA_TRACE(1000,
234  ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
235  gtid = __kmp_get_global_thread_id();
236  }
237 
238  /* we must be a new uber master sibling thread */
239  if (gtid == KMP_GTID_DNE) {
240  KA_TRACE(10,
241  ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
242  "Registering a new gtid.\n"));
243  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
244  if (!__kmp_init_serial) {
245  __kmp_do_serial_initialize();
246  gtid = __kmp_gtid_get_specific();
247  } else {
248  gtid = __kmp_register_root(FALSE);
249  }
250  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
251  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
252  }
253 
254  KMP_DEBUG_ASSERT(gtid >= 0);
255 
256  return gtid;
257 }
258 
259 /* caller must hold forkjoin_lock */
260 void __kmp_check_stack_overlap(kmp_info_t *th) {
261  int f;
262  char *stack_beg = NULL;
263  char *stack_end = NULL;
264  int gtid;
265 
266  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
267  if (__kmp_storage_map) {
268  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
269  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
270 
271  gtid = __kmp_gtid_from_thread(th);
272 
273  if (gtid == KMP_GTID_MONITOR) {
274  __kmp_print_storage_map_gtid(
275  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276  "th_%s stack (%s)", "mon",
277  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278  } else {
279  __kmp_print_storage_map_gtid(
280  gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281  "th_%d stack (%s)", gtid,
282  (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283  }
284  }
285 
286  /* No point in checking ubermaster threads since they use refinement and
287  * cannot overlap */
288  gtid = __kmp_gtid_from_thread(th);
289  if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
290  KA_TRACE(10,
291  ("__kmp_check_stack_overlap: performing extensive checking\n"));
292  if (stack_beg == NULL) {
293  stack_end = (char *)th->th.th_info.ds.ds_stackbase;
294  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
295  }
296 
297  for (f = 0; f < __kmp_threads_capacity; f++) {
298  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
299 
300  if (f_th && f_th != th) {
301  char *other_stack_end =
302  (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303  char *other_stack_beg =
304  other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
305  if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307 
308  /* Print the other stack values before the abort */
309  if (__kmp_storage_map)
310  __kmp_print_storage_map_gtid(
311  -1, other_stack_beg, other_stack_end,
312  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313  "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
314 
315  __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
316  __kmp_msg_null);
317  }
318  }
319  }
320  }
321  KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
322 }
323 
324 /* ------------------------------------------------------------------------ */
325 
326 void __kmp_infinite_loop(void) {
327  static int done = FALSE;
328 
329  while (!done) {
330  KMP_YIELD(1);
331  }
332 }
333 
334 #define MAX_MESSAGE 512
335 
336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
337  char const *format, ...) {
338  char buffer[MAX_MESSAGE];
339  va_list ap;
340 
341  va_start(ap, format);
342  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
343  p2, (unsigned long)size, format);
344  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
345  __kmp_vprintf(kmp_err, buffer, ap);
346 #if KMP_PRINT_DATA_PLACEMENT
347  int node;
348  if (gtid >= 0) {
349  if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
350  if (__kmp_storage_map_verbose) {
351  node = __kmp_get_host_node(p1);
352  if (node < 0) /* doesn't work, so don't try this next time */
353  __kmp_storage_map_verbose = FALSE;
354  else {
355  char *last;
356  int lastNode;
357  int localProc = __kmp_get_cpu_from_gtid(gtid);
358 
359  const int page_size = KMP_GET_PAGE_SIZE();
360 
361  p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
362  p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
363  if (localProc >= 0)
364  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
365  localProc >> 1);
366  else
367  __kmp_printf_no_lock(" GTID %d\n", gtid);
368 #if KMP_USE_PRCTL
369  /* The more elaborate format is disabled for now because of the prctl
370  * hanging bug. */
371  do {
372  last = p1;
373  lastNode = node;
374  /* This loop collates adjacent pages with the same host node. */
375  do {
376  (char *)p1 += page_size;
377  } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378  __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
379  lastNode);
380  } while (p1 <= p2);
381 #else
382  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
383  (char *)p1 + (page_size - 1),
384  __kmp_get_host_node(p1));
385  if (p1 < p2) {
386  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
387  (char *)p2 + (page_size - 1),
388  __kmp_get_host_node(p2));
389  }
390 #endif
391  }
392  }
393  } else
394  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
395  }
396 #endif /* KMP_PRINT_DATA_PLACEMENT */
397  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
398 }
399 
400 void __kmp_warn(char const *format, ...) {
401  char buffer[MAX_MESSAGE];
402  va_list ap;
403 
404  if (__kmp_generate_warnings == kmp_warnings_off) {
405  return;
406  }
407 
408  va_start(ap, format);
409 
410  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
411  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
412  __kmp_vprintf(kmp_err, buffer, ap);
413  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
414 
415  va_end(ap);
416 }
417 
418 void __kmp_abort_process() {
419  // Later threads may stall here, but that's ok because abort() will kill them.
420  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
421 
422  if (__kmp_debug_buf) {
423  __kmp_dump_debug_buffer();
424  }
425 
426  if (KMP_OS_WINDOWS) {
427  // Let other threads know of abnormal termination and prevent deadlock
428  // if abort happened during library initialization or shutdown
429  __kmp_global.g.g_abort = SIGABRT;
430 
431  /* On Windows* OS by default abort() causes pop-up error box, which stalls
432  nightly testing. Unfortunately, we cannot reliably suppress pop-up error
433  boxes. _set_abort_behavior() works well, but this function is not
434  available in VS7 (this is not problem for DLL, but it is a problem for
435  static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
436  help, at least in some versions of MS C RTL.
437 
438  It seems following sequence is the only way to simulate abort() and
439  avoid pop-up error box. */
440  raise(SIGABRT);
441  _exit(3); // Just in case, if signal ignored, exit anyway.
442  } else {
443  abort();
444  }
445 
446  __kmp_infinite_loop();
447  __kmp_release_bootstrap_lock(&__kmp_exit_lock);
448 
449 } // __kmp_abort_process
450 
451 void __kmp_abort_thread(void) {
452  // TODO: Eliminate g_abort global variable and this function.
453  // In case of abort just call abort(), it will kill all the threads.
454  __kmp_infinite_loop();
455 } // __kmp_abort_thread
456 
457 /* Print out the storage map for the major kmp_info_t thread data structures
458  that are allocated together. */
459 
460 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
461  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
462  gtid);
463 
464  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
465  sizeof(kmp_desc_t), "th_%d.th_info", gtid);
466 
467  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
468  sizeof(kmp_local_t), "th_%d.th_local", gtid);
469 
470  __kmp_print_storage_map_gtid(
471  gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
472  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
473 
474  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
475  &thr->th.th_bar[bs_plain_barrier + 1],
476  sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
477  gtid);
478 
479  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
480  &thr->th.th_bar[bs_forkjoin_barrier + 1],
481  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
482  gtid);
483 
484 #if KMP_FAST_REDUCTION_BARRIER
485  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
486  &thr->th.th_bar[bs_reduction_barrier + 1],
487  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
488  gtid);
489 #endif // KMP_FAST_REDUCTION_BARRIER
490 }
491 
492 /* Print out the storage map for the major kmp_team_t team data structures
493  that are allocated together. */
494 
495 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
496  int team_id, int num_thr) {
497  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
498  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
499  header, team_id);
500 
501  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
502  &team->t.t_bar[bs_last_barrier],
503  sizeof(kmp_balign_team_t) * bs_last_barrier,
504  "%s_%d.t_bar", header, team_id);
505 
506  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
507  &team->t.t_bar[bs_plain_barrier + 1],
508  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
509  header, team_id);
510 
511  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
512  &team->t.t_bar[bs_forkjoin_barrier + 1],
513  sizeof(kmp_balign_team_t),
514  "%s_%d.t_bar[forkjoin]", header, team_id);
515 
516 #if KMP_FAST_REDUCTION_BARRIER
517  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
518  &team->t.t_bar[bs_reduction_barrier + 1],
519  sizeof(kmp_balign_team_t),
520  "%s_%d.t_bar[reduction]", header, team_id);
521 #endif // KMP_FAST_REDUCTION_BARRIER
522 
523  __kmp_print_storage_map_gtid(
524  -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
525  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
526 
527  __kmp_print_storage_map_gtid(
528  -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
529  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
530 
531  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
532  &team->t.t_disp_buffer[num_disp_buff],
533  sizeof(dispatch_shared_info_t) * num_disp_buff,
534  "%s_%d.t_disp_buffer", header, team_id);
535 
536  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
537  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
538  team_id);
539 }
540 
541 static void __kmp_init_allocator() {
542 #if OMP_50_ENABLED
543  __kmp_init_memkind();
544 #endif
545 }
546 static void __kmp_fini_allocator() {
547 #if OMP_50_ENABLED
548  __kmp_fini_memkind();
549 #endif
550 }
551 
552 /* ------------------------------------------------------------------------ */
553 
554 #if KMP_DYNAMIC_LIB
555 #if KMP_OS_WINDOWS
556 
557 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
558  // TODO: Change to __kmp_break_bootstrap_lock().
559  __kmp_init_bootstrap_lock(lck); // make the lock released
560 }
561 
562 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
563  int i;
564  int thread_count;
565 
566  // PROCESS_DETACH is expected to be called by a thread that executes
567  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
568  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
569  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
570  // threads can be still alive here, although being about to be terminated. The
571  // threads in the array with ds_thread==0 are most suspicious. Actually, it
572  // can be not safe to access the __kmp_threads[].
573 
574  // TODO: does it make sense to check __kmp_roots[] ?
575 
576  // Let's check that there are no other alive threads registered with the OMP
577  // lib.
578  while (1) {
579  thread_count = 0;
580  for (i = 0; i < __kmp_threads_capacity; ++i) {
581  if (!__kmp_threads)
582  continue;
583  kmp_info_t *th = __kmp_threads[i];
584  if (th == NULL)
585  continue;
586  int gtid = th->th.th_info.ds.ds_gtid;
587  if (gtid == gtid_req)
588  continue;
589  if (gtid < 0)
590  continue;
591  DWORD exit_val;
592  int alive = __kmp_is_thread_alive(th, &exit_val);
593  if (alive) {
594  ++thread_count;
595  }
596  }
597  if (thread_count == 0)
598  break; // success
599  }
600 
601  // Assume that I'm alone. Now it might be safe to check and reset locks.
602  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
603  __kmp_reset_lock(&__kmp_forkjoin_lock);
604 #ifdef KMP_DEBUG
605  __kmp_reset_lock(&__kmp_stdio_lock);
606 #endif // KMP_DEBUG
607 }
608 
609 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
610  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
611 
612  switch (fdwReason) {
613 
614  case DLL_PROCESS_ATTACH:
615  KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
616 
617  return TRUE;
618 
619  case DLL_PROCESS_DETACH:
620  KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
621 
622  if (lpReserved != NULL) {
623  // lpReserved is used for telling the difference:
624  // lpReserved == NULL when FreeLibrary() was called,
625  // lpReserved != NULL when the process terminates.
626  // When FreeLibrary() is called, worker threads remain alive. So they will
627  // release the forkjoin lock by themselves. When the process terminates,
628  // worker threads disappear triggering the problem of unreleased forkjoin
629  // lock as described below.
630 
631  // A worker thread can take the forkjoin lock. The problem comes up if
632  // that worker thread becomes dead before it releases the forkjoin lock.
633  // The forkjoin lock remains taken, while the thread executing
634  // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
635  // to take the forkjoin lock and will always fail, so that the application
636  // will never finish [normally]. This scenario is possible if
637  // __kmpc_end() has not been executed. It looks like it's not a corner
638  // case, but common cases:
639  // - the main function was compiled by an alternative compiler;
640  // - the main function was compiled by icl but without /Qopenmp
641  // (application with plugins);
642  // - application terminates by calling C exit(), Fortran CALL EXIT() or
643  // Fortran STOP.
644  // - alive foreign thread prevented __kmpc_end from doing cleanup.
645  //
646  // This is a hack to work around the problem.
647  // TODO: !!! figure out something better.
648  __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
649  }
650 
651  __kmp_internal_end_library(__kmp_gtid_get_specific());
652 
653  return TRUE;
654 
655  case DLL_THREAD_ATTACH:
656  KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
657 
658  /* if we want to register new siblings all the time here call
659  * __kmp_get_gtid(); */
660  return TRUE;
661 
662  case DLL_THREAD_DETACH:
663  KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
664 
665  __kmp_internal_end_thread(__kmp_gtid_get_specific());
666  return TRUE;
667  }
668 
669  return TRUE;
670 }
671 
672 #endif /* KMP_OS_WINDOWS */
673 #endif /* KMP_DYNAMIC_LIB */
674 
675 /* Change the library type to "status" and return the old type */
676 /* called from within initialization routines where __kmp_initz_lock is held */
677 int __kmp_change_library(int status) {
678  int old_status;
679 
680  old_status = __kmp_yield_init &
681  1; // check whether KMP_LIBRARY=throughput (even init count)
682 
683  if (status) {
684  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
685  } else {
686  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
687  }
688 
689  return old_status; // return previous setting of whether
690  // KMP_LIBRARY=throughput
691 }
692 
693 /* __kmp_parallel_deo -- Wait until it's our turn. */
694 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
695  int gtid = *gtid_ref;
696 #ifdef BUILD_PARALLEL_ORDERED
697  kmp_team_t *team = __kmp_team_from_gtid(gtid);
698 #endif /* BUILD_PARALLEL_ORDERED */
699 
700  if (__kmp_env_consistency_check) {
701  if (__kmp_threads[gtid]->th.th_root->r.r_active)
702 #if KMP_USE_DYNAMIC_LOCK
703  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
704 #else
705  __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
706 #endif
707  }
708 #ifdef BUILD_PARALLEL_ORDERED
709  if (!team->t.t_serialized) {
710  KMP_MB();
711  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
712  KMP_EQ, NULL);
713  KMP_MB();
714  }
715 #endif /* BUILD_PARALLEL_ORDERED */
716 }
717 
718 /* __kmp_parallel_dxo -- Signal the next task. */
719 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
720  int gtid = *gtid_ref;
721 #ifdef BUILD_PARALLEL_ORDERED
722  int tid = __kmp_tid_from_gtid(gtid);
723  kmp_team_t *team = __kmp_team_from_gtid(gtid);
724 #endif /* BUILD_PARALLEL_ORDERED */
725 
726  if (__kmp_env_consistency_check) {
727  if (__kmp_threads[gtid]->th.th_root->r.r_active)
728  __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
729  }
730 #ifdef BUILD_PARALLEL_ORDERED
731  if (!team->t.t_serialized) {
732  KMP_MB(); /* Flush all pending memory write invalidates. */
733 
734  /* use the tid of the next thread in this team */
735  /* TODO replace with general release procedure */
736  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
737 
738  KMP_MB(); /* Flush all pending memory write invalidates. */
739  }
740 #endif /* BUILD_PARALLEL_ORDERED */
741 }
742 
743 /* ------------------------------------------------------------------------ */
744 /* The BARRIER for a SINGLE process section is always explicit */
745 
746 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
747  int status;
748  kmp_info_t *th;
749  kmp_team_t *team;
750 
751  if (!TCR_4(__kmp_init_parallel))
752  __kmp_parallel_initialize();
753 
754 #if OMP_50_ENABLED
755  __kmp_resume_if_soft_paused();
756 #endif
757 
758  th = __kmp_threads[gtid];
759  team = th->th.th_team;
760  status = 0;
761 
762  th->th.th_ident = id_ref;
763 
764  if (team->t.t_serialized) {
765  status = 1;
766  } else {
767  kmp_int32 old_this = th->th.th_local.this_construct;
768 
769  ++th->th.th_local.this_construct;
770  /* try to set team count to thread count--success means thread got the
771  single block */
772  /* TODO: Should this be acquire or release? */
773  if (team->t.t_construct == old_this) {
774  status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
775  th->th.th_local.this_construct);
776  }
777 #if USE_ITT_BUILD
778  if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
779  KMP_MASTER_GTID(gtid) &&
780 #if OMP_40_ENABLED
781  th->th.th_teams_microtask == NULL &&
782 #endif
783  team->t.t_active_level ==
784  1) { // Only report metadata by master of active team at level 1
785  __kmp_itt_metadata_single(id_ref);
786  }
787 #endif /* USE_ITT_BUILD */
788  }
789 
790  if (__kmp_env_consistency_check) {
791  if (status && push_ws) {
792  __kmp_push_workshare(gtid, ct_psingle, id_ref);
793  } else {
794  __kmp_check_workshare(gtid, ct_psingle, id_ref);
795  }
796  }
797 #if USE_ITT_BUILD
798  if (status) {
799  __kmp_itt_single_start(gtid);
800  }
801 #endif /* USE_ITT_BUILD */
802  return status;
803 }
804 
805 void __kmp_exit_single(int gtid) {
806 #if USE_ITT_BUILD
807  __kmp_itt_single_end(gtid);
808 #endif /* USE_ITT_BUILD */
809  if (__kmp_env_consistency_check)
810  __kmp_pop_workshare(gtid, ct_psingle, NULL);
811 }
812 
813 /* determine if we can go parallel or must use a serialized parallel region and
814  * how many threads we can use
815  * set_nproc is the number of threads requested for the team
816  * returns 0 if we should serialize or only use one thread,
817  * otherwise the number of threads to use
818  * The forkjoin lock is held by the caller. */
819 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
820  int master_tid, int set_nthreads
821 #if OMP_40_ENABLED
822  ,
823  int enter_teams
824 #endif /* OMP_40_ENABLED */
825  ) {
826  int capacity;
827  int new_nthreads;
828  KMP_DEBUG_ASSERT(__kmp_init_serial);
829  KMP_DEBUG_ASSERT(root && parent_team);
830  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
831 
832  // If dyn-var is set, dynamically adjust the number of desired threads,
833  // according to the method specified by dynamic_mode.
834  new_nthreads = set_nthreads;
835  if (!get__dynamic_2(parent_team, master_tid)) {
836  ;
837  }
838 #ifdef USE_LOAD_BALANCE
839  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
840  new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
841  if (new_nthreads == 1) {
842  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
843  "reservation to 1 thread\n",
844  master_tid));
845  return 1;
846  }
847  if (new_nthreads < set_nthreads) {
848  KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
849  "reservation to %d threads\n",
850  master_tid, new_nthreads));
851  }
852  }
853 #endif /* USE_LOAD_BALANCE */
854  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
855  new_nthreads = __kmp_avail_proc - __kmp_nth +
856  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
857  if (new_nthreads <= 1) {
858  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
859  "reservation to 1 thread\n",
860  master_tid));
861  return 1;
862  }
863  if (new_nthreads < set_nthreads) {
864  KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
865  "reservation to %d threads\n",
866  master_tid, new_nthreads));
867  } else {
868  new_nthreads = set_nthreads;
869  }
870  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
871  if (set_nthreads > 2) {
872  new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
873  new_nthreads = (new_nthreads % set_nthreads) + 1;
874  if (new_nthreads == 1) {
875  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
876  "reservation to 1 thread\n",
877  master_tid));
878  return 1;
879  }
880  if (new_nthreads < set_nthreads) {
881  KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
882  "reservation to %d threads\n",
883  master_tid, new_nthreads));
884  }
885  }
886  } else {
887  KMP_ASSERT(0);
888  }
889 
890  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
891  if (__kmp_nth + new_nthreads -
892  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
893  __kmp_max_nth) {
894  int tl_nthreads = __kmp_max_nth - __kmp_nth +
895  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
896  if (tl_nthreads <= 0) {
897  tl_nthreads = 1;
898  }
899 
900  // If dyn-var is false, emit a 1-time warning.
901  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
902  __kmp_reserve_warn = 1;
903  __kmp_msg(kmp_ms_warning,
904  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
905  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
906  }
907  if (tl_nthreads == 1) {
908  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
909  "reduced reservation to 1 thread\n",
910  master_tid));
911  return 1;
912  }
913  KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
914  "reservation to %d threads\n",
915  master_tid, tl_nthreads));
916  new_nthreads = tl_nthreads;
917  }
918 
919  // Respect OMP_THREAD_LIMIT
920  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
921  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
922  if (cg_nthreads + new_nthreads -
923  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
924  max_cg_threads) {
925  int tl_nthreads = max_cg_threads - cg_nthreads +
926  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
927  if (tl_nthreads <= 0) {
928  tl_nthreads = 1;
929  }
930 
931  // If dyn-var is false, emit a 1-time warning.
932  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
933  __kmp_reserve_warn = 1;
934  __kmp_msg(kmp_ms_warning,
935  KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
936  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
937  }
938  if (tl_nthreads == 1) {
939  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
940  "reduced reservation to 1 thread\n",
941  master_tid));
942  return 1;
943  }
944  KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
945  "reservation to %d threads\n",
946  master_tid, tl_nthreads));
947  new_nthreads = tl_nthreads;
948  }
949 
950  // Check if the threads array is large enough, or needs expanding.
951  // See comment in __kmp_register_root() about the adjustment if
952  // __kmp_threads[0] == NULL.
953  capacity = __kmp_threads_capacity;
954  if (TCR_PTR(__kmp_threads[0]) == NULL) {
955  --capacity;
956  }
957  if (__kmp_nth + new_nthreads -
958  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
959  capacity) {
960  // Expand the threads array.
961  int slotsRequired = __kmp_nth + new_nthreads -
962  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
963  capacity;
964  int slotsAdded = __kmp_expand_threads(slotsRequired);
965  if (slotsAdded < slotsRequired) {
966  // The threads array was not expanded enough.
967  new_nthreads -= (slotsRequired - slotsAdded);
968  KMP_ASSERT(new_nthreads >= 1);
969 
970  // If dyn-var is false, emit a 1-time warning.
971  if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
972  __kmp_reserve_warn = 1;
973  if (__kmp_tp_cached) {
974  __kmp_msg(kmp_ms_warning,
975  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
976  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
977  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
978  } else {
979  __kmp_msg(kmp_ms_warning,
980  KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
981  KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
982  }
983  }
984  }
985  }
986 
987 #ifdef KMP_DEBUG
988  if (new_nthreads == 1) {
989  KC_TRACE(10,
990  ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
991  "dead roots and rechecking; requested %d threads\n",
992  __kmp_get_gtid(), set_nthreads));
993  } else {
994  KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
995  " %d threads\n",
996  __kmp_get_gtid(), new_nthreads, set_nthreads));
997  }
998 #endif // KMP_DEBUG
999  return new_nthreads;
1000 }
1001 
1002 /* Allocate threads from the thread pool and assign them to the new team. We are
1003  assured that there are enough threads available, because we checked on that
1004  earlier within critical section forkjoin */
1005 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
1006  kmp_info_t *master_th, int master_gtid) {
1007  int i;
1008  int use_hot_team;
1009 
1010  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
1011  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
1012  KMP_MB();
1013 
1014  /* first, let's setup the master thread */
1015  master_th->th.th_info.ds.ds_tid = 0;
1016  master_th->th.th_team = team;
1017  master_th->th.th_team_nproc = team->t.t_nproc;
1018  master_th->th.th_team_master = master_th;
1019  master_th->th.th_team_serialized = FALSE;
1020  master_th->th.th_dispatch = &team->t.t_dispatch[0];
1021 
1022 /* make sure we are not the optimized hot team */
1023 #if KMP_NESTED_HOT_TEAMS
1024  use_hot_team = 0;
1025  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1026  if (hot_teams) { // hot teams array is not allocated if
1027  // KMP_HOT_TEAMS_MAX_LEVEL=0
1028  int level = team->t.t_active_level - 1; // index in array of hot teams
1029  if (master_th->th.th_teams_microtask) { // are we inside the teams?
1030  if (master_th->th.th_teams_size.nteams > 1) {
1031  ++level; // level was not increased in teams construct for
1032  // team_of_masters
1033  }
1034  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1035  master_th->th.th_teams_level == team->t.t_level) {
1036  ++level; // level was not increased in teams construct for
1037  // team_of_workers before the parallel
1038  } // team->t.t_level will be increased inside parallel
1039  }
1040  if (level < __kmp_hot_teams_max_level) {
1041  if (hot_teams[level].hot_team) {
1042  // hot team has already been allocated for given level
1043  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1044  use_hot_team = 1; // the team is ready to use
1045  } else {
1046  use_hot_team = 0; // AC: threads are not allocated yet
1047  hot_teams[level].hot_team = team; // remember new hot team
1048  hot_teams[level].hot_team_nth = team->t.t_nproc;
1049  }
1050  } else {
1051  use_hot_team = 0;
1052  }
1053  }
1054 #else
1055  use_hot_team = team == root->r.r_hot_team;
1056 #endif
1057  if (!use_hot_team) {
1058 
1059  /* install the master thread */
1060  team->t.t_threads[0] = master_th;
1061  __kmp_initialize_info(master_th, team, 0, master_gtid);
1062 
1063  /* now, install the worker threads */
1064  for (i = 1; i < team->t.t_nproc; i++) {
1065 
1066  /* fork or reallocate a new thread and install it in team */
1067  kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1068  team->t.t_threads[i] = thr;
1069  KMP_DEBUG_ASSERT(thr);
1070  KMP_DEBUG_ASSERT(thr->th.th_team == team);
1071  /* align team and thread arrived states */
1072  KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1073  "T#%d(%d:%d) join =%llu, plain=%llu\n",
1074  __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1075  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1076  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1077  team->t.t_bar[bs_plain_barrier].b_arrived));
1078 #if OMP_40_ENABLED
1079  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1080  thr->th.th_teams_level = master_th->th.th_teams_level;
1081  thr->th.th_teams_size = master_th->th.th_teams_size;
1082 #endif
1083  { // Initialize threads' barrier data.
1084  int b;
1085  kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1086  for (b = 0; b < bs_last_barrier; ++b) {
1087  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1088  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1089 #if USE_DEBUGGER
1090  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1091 #endif
1092  }
1093  }
1094  }
1095 
1096 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1097  __kmp_partition_places(team);
1098 #endif
1099  }
1100 
1101 #if OMP_50_ENABLED
1102  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1103  for (i = 0; i < team->t.t_nproc; i++) {
1104  kmp_info_t *thr = team->t.t_threads[i];
1105  if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1106  thr->th.th_prev_level != team->t.t_level) {
1107  team->t.t_display_affinity = 1;
1108  break;
1109  }
1110  }
1111  }
1112 #endif
1113 
1114  KMP_MB();
1115 }
1116 
1117 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1118 // Propagate any changes to the floating point control registers out to the team
1119 // We try to avoid unnecessary writes to the relevant cache line in the team
1120 // structure, so we don't make changes unless they are needed.
1121 inline static void propagateFPControl(kmp_team_t *team) {
1122  if (__kmp_inherit_fp_control) {
1123  kmp_int16 x87_fpu_control_word;
1124  kmp_uint32 mxcsr;
1125 
1126  // Get master values of FPU control flags (both X87 and vector)
1127  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1128  __kmp_store_mxcsr(&mxcsr);
1129  mxcsr &= KMP_X86_MXCSR_MASK;
1130 
1131  // There is no point looking at t_fp_control_saved here.
1132  // If it is TRUE, we still have to update the values if they are different
1133  // from those we now have. If it is FALSE we didn't save anything yet, but
1134  // our objective is the same. We have to ensure that the values in the team
1135  // are the same as those we have.
1136  // So, this code achieves what we need whether or not t_fp_control_saved is
1137  // true. By checking whether the value needs updating we avoid unnecessary
1138  // writes that would put the cache-line into a written state, causing all
1139  // threads in the team to have to read it again.
1140  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1141  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1142  // Although we don't use this value, other code in the runtime wants to know
1143  // whether it should restore them. So we must ensure it is correct.
1144  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1145  } else {
1146  // Similarly here. Don't write to this cache-line in the team structure
1147  // unless we have to.
1148  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1149  }
1150 }
1151 
1152 // Do the opposite, setting the hardware registers to the updated values from
1153 // the team.
1154 inline static void updateHWFPControl(kmp_team_t *team) {
1155  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1156  // Only reset the fp control regs if they have been changed in the team.
1157  // the parallel region that we are exiting.
1158  kmp_int16 x87_fpu_control_word;
1159  kmp_uint32 mxcsr;
1160  __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1161  __kmp_store_mxcsr(&mxcsr);
1162  mxcsr &= KMP_X86_MXCSR_MASK;
1163 
1164  if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1165  __kmp_clear_x87_fpu_status_word();
1166  __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1167  }
1168 
1169  if (team->t.t_mxcsr != mxcsr) {
1170  __kmp_load_mxcsr(&team->t.t_mxcsr);
1171  }
1172  }
1173 }
1174 #else
1175 #define propagateFPControl(x) ((void)0)
1176 #define updateHWFPControl(x) ((void)0)
1177 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1178 
1179 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1180  int realloc); // forward declaration
1181 
1182 /* Run a parallel region that has been serialized, so runs only in a team of the
1183  single master thread. */
1184 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1185  kmp_info_t *this_thr;
1186  kmp_team_t *serial_team;
1187 
1188  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1189 
1190  /* Skip all this code for autopar serialized loops since it results in
1191  unacceptable overhead */
1192  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1193  return;
1194 
1195  if (!TCR_4(__kmp_init_parallel))
1196  __kmp_parallel_initialize();
1197 
1198 #if OMP_50_ENABLED
1199  __kmp_resume_if_soft_paused();
1200 #endif
1201 
1202  this_thr = __kmp_threads[global_tid];
1203  serial_team = this_thr->th.th_serial_team;
1204 
1205  /* utilize the serialized team held by this thread */
1206  KMP_DEBUG_ASSERT(serial_team);
1207  KMP_MB();
1208 
1209  if (__kmp_tasking_mode != tskm_immediate_exec) {
1210  KMP_DEBUG_ASSERT(
1211  this_thr->th.th_task_team ==
1212  this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1213  KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1214  NULL);
1215  KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1216  "team %p, new task_team = NULL\n",
1217  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1218  this_thr->th.th_task_team = NULL;
1219  }
1220 
1221 #if OMP_40_ENABLED
1222  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1223  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1224  proc_bind = proc_bind_false;
1225  } else if (proc_bind == proc_bind_default) {
1226  // No proc_bind clause was specified, so use the current value
1227  // of proc-bind-var for this parallel region.
1228  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1229  }
1230  // Reset for next parallel region
1231  this_thr->th.th_set_proc_bind = proc_bind_default;
1232 #endif /* OMP_40_ENABLED */
1233 
1234 #if OMPT_SUPPORT
1235  ompt_data_t ompt_parallel_data = ompt_data_none;
1236  ompt_data_t *implicit_task_data;
1237  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1238  if (ompt_enabled.enabled &&
1239  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1240 
1241  ompt_task_info_t *parent_task_info;
1242  parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1243 
1244  parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1245  if (ompt_enabled.ompt_callback_parallel_begin) {
1246  int team_size = 1;
1247 
1248  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1249  &(parent_task_info->task_data), &(parent_task_info->frame),
1250  &ompt_parallel_data, team_size, ompt_parallel_invoker_program,
1251  codeptr);
1252  }
1253  }
1254 #endif // OMPT_SUPPORT
1255 
1256  if (this_thr->th.th_team != serial_team) {
1257  // Nested level will be an index in the nested nthreads array
1258  int level = this_thr->th.th_team->t.t_level;
1259 
1260  if (serial_team->t.t_serialized) {
1261  /* this serial team was already used
1262  TODO increase performance by making this locks more specific */
1263  kmp_team_t *new_team;
1264 
1265  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1266 
1267  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1268 #if OMPT_SUPPORT
1269  ompt_parallel_data,
1270 #endif
1271 #if OMP_40_ENABLED
1272  proc_bind,
1273 #endif
1274  &this_thr->th.th_current_task->td_icvs,
1275  0 USE_NESTED_HOT_ARG(NULL));
1276  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1277  KMP_ASSERT(new_team);
1278 
1279  /* setup new serialized team and install it */
1280  new_team->t.t_threads[0] = this_thr;
1281  new_team->t.t_parent = this_thr->th.th_team;
1282  serial_team = new_team;
1283  this_thr->th.th_serial_team = serial_team;
1284 
1285  KF_TRACE(
1286  10,
1287  ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1288  global_tid, serial_team));
1289 
1290  /* TODO the above breaks the requirement that if we run out of resources,
1291  then we can still guarantee that serialized teams are ok, since we may
1292  need to allocate a new one */
1293  } else {
1294  KF_TRACE(
1295  10,
1296  ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1297  global_tid, serial_team));
1298  }
1299 
1300  /* we have to initialize this serial team */
1301  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1302  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1303  KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1304  serial_team->t.t_ident = loc;
1305  serial_team->t.t_serialized = 1;
1306  serial_team->t.t_nproc = 1;
1307  serial_team->t.t_parent = this_thr->th.th_team;
1308  serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1309  this_thr->th.th_team = serial_team;
1310  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1311 
1312  KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid,
1313  this_thr->th.th_current_task));
1314  KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1315  this_thr->th.th_current_task->td_flags.executing = 0;
1316 
1317  __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1318 
1319  /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1320  implicit task for each serialized task represented by
1321  team->t.t_serialized? */
1322  copy_icvs(&this_thr->th.th_current_task->td_icvs,
1323  &this_thr->th.th_current_task->td_parent->td_icvs);
1324 
1325  // Thread value exists in the nested nthreads array for the next nested
1326  // level
1327  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1328  this_thr->th.th_current_task->td_icvs.nproc =
1329  __kmp_nested_nth.nth[level + 1];
1330  }
1331 
1332 #if OMP_40_ENABLED
1333  if (__kmp_nested_proc_bind.used &&
1334  (level + 1 < __kmp_nested_proc_bind.used)) {
1335  this_thr->th.th_current_task->td_icvs.proc_bind =
1336  __kmp_nested_proc_bind.bind_types[level + 1];
1337  }
1338 #endif /* OMP_40_ENABLED */
1339 
1340 #if USE_DEBUGGER
1341  serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1342 #endif
1343  this_thr->th.th_info.ds.ds_tid = 0;
1344 
1345  /* set thread cache values */
1346  this_thr->th.th_team_nproc = 1;
1347  this_thr->th.th_team_master = this_thr;
1348  this_thr->th.th_team_serialized = 1;
1349 
1350  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1351  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1352 #if OMP_50_ENABLED
1353  serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1354 #endif
1355 
1356  propagateFPControl(serial_team);
1357 
1358  /* check if we need to allocate dispatch buffers stack */
1359  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1360  if (!serial_team->t.t_dispatch->th_disp_buffer) {
1361  serial_team->t.t_dispatch->th_disp_buffer =
1362  (dispatch_private_info_t *)__kmp_allocate(
1363  sizeof(dispatch_private_info_t));
1364  }
1365  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1366 
1367  KMP_MB();
1368 
1369  } else {
1370  /* this serialized team is already being used,
1371  * that's fine, just add another nested level */
1372  KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1373  KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1374  KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1375  ++serial_team->t.t_serialized;
1376  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1377 
1378  // Nested level will be an index in the nested nthreads array
1379  int level = this_thr->th.th_team->t.t_level;
1380  // Thread value exists in the nested nthreads array for the next nested
1381  // level
1382  if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1383  this_thr->th.th_current_task->td_icvs.nproc =
1384  __kmp_nested_nth.nth[level + 1];
1385  }
1386  serial_team->t.t_level++;
1387  KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1388  "of serial team %p to %d\n",
1389  global_tid, serial_team, serial_team->t.t_level));
1390 
1391  /* allocate/push dispatch buffers stack */
1392  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1393  {
1394  dispatch_private_info_t *disp_buffer =
1395  (dispatch_private_info_t *)__kmp_allocate(
1396  sizeof(dispatch_private_info_t));
1397  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1398  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1399  }
1400  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1401 
1402  KMP_MB();
1403  }
1404 #if OMP_40_ENABLED
1405  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1406 #endif
1407 
1408 #if OMP_50_ENABLED
1409  // Perform the display affinity functionality for
1410  // serialized parallel regions
1411  if (__kmp_display_affinity) {
1412  if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1413  this_thr->th.th_prev_num_threads != 1) {
1414  // NULL means use the affinity-format-var ICV
1415  __kmp_aux_display_affinity(global_tid, NULL);
1416  this_thr->th.th_prev_level = serial_team->t.t_level;
1417  this_thr->th.th_prev_num_threads = 1;
1418  }
1419  }
1420 #endif
1421 
1422  if (__kmp_env_consistency_check)
1423  __kmp_push_parallel(global_tid, NULL);
1424 #if OMPT_SUPPORT
1425  serial_team->t.ompt_team_info.master_return_address = codeptr;
1426  if (ompt_enabled.enabled &&
1427  this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1428  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1429 
1430  ompt_lw_taskteam_t lw_taskteam;
1431  __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1432  &ompt_parallel_data, codeptr);
1433 
1434  __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1435  // don't use lw_taskteam after linking. content was swaped
1436 
1437  /* OMPT implicit task begin */
1438  implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1439  if (ompt_enabled.ompt_callback_implicit_task) {
1440  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1441  ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1442  OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1443  OMPT_CUR_TASK_INFO(this_thr)
1444  ->thread_num = __kmp_tid_from_gtid(global_tid);
1445  }
1446 
1447  /* OMPT state */
1448  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1449  OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1450  }
1451 #endif
1452 }
1453 
1454 /* most of the work for a fork */
1455 /* return true if we really went parallel, false if serialized */
1456 int __kmp_fork_call(ident_t *loc, int gtid,
1457  enum fork_context_e call_context, // Intel, GNU, ...
1458  kmp_int32 argc, microtask_t microtask, launch_t invoker,
1459 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1460 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1461  va_list *ap
1462 #else
1463  va_list ap
1464 #endif
1465  ) {
1466  void **argv;
1467  int i;
1468  int master_tid;
1469  int master_this_cons;
1470  kmp_team_t *team;
1471  kmp_team_t *parent_team;
1472  kmp_info_t *master_th;
1473  kmp_root_t *root;
1474  int nthreads;
1475  int master_active;
1476  int master_set_numthreads;
1477  int level;
1478 #if OMP_40_ENABLED
1479  int active_level;
1480  int teams_level;
1481 #endif
1482 #if KMP_NESTED_HOT_TEAMS
1483  kmp_hot_team_ptr_t **p_hot_teams;
1484 #endif
1485  { // KMP_TIME_BLOCK
1486  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1487  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1488 
1489  KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1490  if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1491  /* Some systems prefer the stack for the root thread(s) to start with */
1492  /* some gap from the parent stack to prevent false sharing. */
1493  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1494  /* These 2 lines below are so this does not get optimized out */
1495  if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1496  __kmp_stkpadding += (short)((kmp_int64)dummy);
1497  }
1498 
1499  /* initialize if needed */
1500  KMP_DEBUG_ASSERT(
1501  __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1502  if (!TCR_4(__kmp_init_parallel))
1503  __kmp_parallel_initialize();
1504 
1505 #if OMP_50_ENABLED
1506  __kmp_resume_if_soft_paused();
1507 #endif
1508 
1509  /* setup current data */
1510  master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1511  // shutdown
1512  parent_team = master_th->th.th_team;
1513  master_tid = master_th->th.th_info.ds.ds_tid;
1514  master_this_cons = master_th->th.th_local.this_construct;
1515  root = master_th->th.th_root;
1516  master_active = root->r.r_active;
1517  master_set_numthreads = master_th->th.th_set_nproc;
1518 
1519 #if OMPT_SUPPORT
1520  ompt_data_t ompt_parallel_data = ompt_data_none;
1521  ompt_data_t *parent_task_data;
1522  ompt_frame_t *ompt_frame;
1523  ompt_data_t *implicit_task_data;
1524  void *return_address = NULL;
1525 
1526  if (ompt_enabled.enabled) {
1527  __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1528  NULL, NULL);
1529  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1530  }
1531 #endif
1532 
1533  // Nested level will be an index in the nested nthreads array
1534  level = parent_team->t.t_level;
1535  // used to launch non-serial teams even if nested is not allowed
1536  active_level = parent_team->t.t_active_level;
1537 #if OMP_40_ENABLED
1538  // needed to check nesting inside the teams
1539  teams_level = master_th->th.th_teams_level;
1540 #endif
1541 #if KMP_NESTED_HOT_TEAMS
1542  p_hot_teams = &master_th->th.th_hot_teams;
1543  if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1544  *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1545  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1546  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1547  // it is either actual or not needed (when active_level > 0)
1548  (*p_hot_teams)[0].hot_team_nth = 1;
1549  }
1550 #endif
1551 
1552 #if OMPT_SUPPORT
1553  if (ompt_enabled.enabled) {
1554  if (ompt_enabled.ompt_callback_parallel_begin) {
1555  int team_size = master_set_numthreads
1556  ? master_set_numthreads
1557  : get__nproc_2(parent_team, master_tid);
1558  ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1559  parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
1560  OMPT_INVOKER(call_context), return_address);
1561  }
1562  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1563  }
1564 #endif
1565 
1566  master_th->th.th_ident = loc;
1567 
1568 #if OMP_40_ENABLED
1569  if (master_th->th.th_teams_microtask && ap &&
1570  microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1571  // AC: This is start of parallel that is nested inside teams construct.
1572  // The team is actual (hot), all workers are ready at the fork barrier.
1573  // No lock needed to initialize the team a bit, then free workers.
1574  parent_team->t.t_ident = loc;
1575  __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1576  parent_team->t.t_argc = argc;
1577  argv = (void **)parent_team->t.t_argv;
1578  for (i = argc - 1; i >= 0; --i)
1579 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1580 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1581  *argv++ = va_arg(*ap, void *);
1582 #else
1583  *argv++ = va_arg(ap, void *);
1584 #endif
1585  // Increment our nested depth levels, but not increase the serialization
1586  if (parent_team == master_th->th.th_serial_team) {
1587  // AC: we are in serialized parallel
1588  __kmpc_serialized_parallel(loc, gtid);
1589  KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1590  // AC: need this in order enquiry functions work
1591  // correctly, will restore at join time
1592  parent_team->t.t_serialized--;
1593 #if OMPT_SUPPORT
1594  void *dummy;
1595  void **exit_runtime_p;
1596 
1597  ompt_lw_taskteam_t lw_taskteam;
1598 
1599  if (ompt_enabled.enabled) {
1600  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1601  &ompt_parallel_data, return_address);
1602  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1603 
1604  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1605  // don't use lw_taskteam after linking. content was swaped
1606 
1607  /* OMPT implicit task begin */
1608  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1609  if (ompt_enabled.ompt_callback_implicit_task) {
1610  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1611  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1612  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1613  OMPT_CUR_TASK_INFO(master_th)
1614  ->thread_num = __kmp_tid_from_gtid(gtid);
1615  }
1616 
1617  /* OMPT state */
1618  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1619  } else {
1620  exit_runtime_p = &dummy;
1621  }
1622 #endif
1623 
1624  {
1625  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1626  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1627  __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1628 #if OMPT_SUPPORT
1629  ,
1630  exit_runtime_p
1631 #endif
1632  );
1633  }
1634 
1635 #if OMPT_SUPPORT
1636  *exit_runtime_p = NULL;
1637  if (ompt_enabled.enabled) {
1638  OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1639  if (ompt_enabled.ompt_callback_implicit_task) {
1640  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1641  ompt_scope_end, NULL, implicit_task_data, 1,
1642  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1643  }
1644  __ompt_lw_taskteam_unlink(master_th);
1645 
1646  if (ompt_enabled.ompt_callback_parallel_end) {
1647  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1648  OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
1649  OMPT_INVOKER(call_context), return_address);
1650  }
1651  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1652  }
1653 #endif
1654  return TRUE;
1655  }
1656 
1657  parent_team->t.t_pkfn = microtask;
1658  parent_team->t.t_invoke = invoker;
1659  KMP_ATOMIC_INC(&root->r.r_in_parallel);
1660  parent_team->t.t_active_level++;
1661  parent_team->t.t_level++;
1662 #if OMP_50_ENABLED
1663  parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1664 #endif
1665 
1666  /* Change number of threads in the team if requested */
1667  if (master_set_numthreads) { // The parallel has num_threads clause
1668  if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1669  // AC: only can reduce number of threads dynamically, can't increase
1670  kmp_info_t **other_threads = parent_team->t.t_threads;
1671  parent_team->t.t_nproc = master_set_numthreads;
1672  for (i = 0; i < master_set_numthreads; ++i) {
1673  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1674  }
1675  // Keep extra threads hot in the team for possible next parallels
1676  }
1677  master_th->th.th_set_nproc = 0;
1678  }
1679 
1680 #if USE_DEBUGGER
1681  if (__kmp_debugging) { // Let debugger override number of threads.
1682  int nth = __kmp_omp_num_threads(loc);
1683  if (nth > 0) { // 0 means debugger doesn't want to change num threads
1684  master_set_numthreads = nth;
1685  }
1686  }
1687 #endif
1688 
1689  KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1690  "master_th=%p, gtid=%d\n",
1691  root, parent_team, master_th, gtid));
1692  __kmp_internal_fork(loc, gtid, parent_team);
1693  KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1694  "master_th=%p, gtid=%d\n",
1695  root, parent_team, master_th, gtid));
1696 
1697  /* Invoke microtask for MASTER thread */
1698  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1699  parent_team->t.t_id, parent_team->t.t_pkfn));
1700 
1701  if (!parent_team->t.t_invoke(gtid)) {
1702  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1703  }
1704  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1705  parent_team->t.t_id, parent_team->t.t_pkfn));
1706  KMP_MB(); /* Flush all pending memory write invalidates. */
1707 
1708  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1709 
1710  return TRUE;
1711  } // Parallel closely nested in teams construct
1712 #endif /* OMP_40_ENABLED */
1713 
1714 #if KMP_DEBUG
1715  if (__kmp_tasking_mode != tskm_immediate_exec) {
1716  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1717  parent_team->t.t_task_team[master_th->th.th_task_state]);
1718  }
1719 #endif
1720 
1721  if (parent_team->t.t_active_level >=
1722  master_th->th.th_current_task->td_icvs.max_active_levels) {
1723  nthreads = 1;
1724  } else {
1725 #if OMP_40_ENABLED
1726  int enter_teams = ((ap == NULL && active_level == 0) ||
1727  (ap && teams_level > 0 && teams_level == level));
1728 #endif
1729  nthreads =
1730  master_set_numthreads
1731  ? master_set_numthreads
1732  : get__nproc_2(
1733  parent_team,
1734  master_tid); // TODO: get nproc directly from current task
1735 
1736  // Check if we need to take forkjoin lock? (no need for serialized
1737  // parallel out of teams construct). This code moved here from
1738  // __kmp_reserve_threads() to speedup nested serialized parallels.
1739  if (nthreads > 1) {
1740  if ((!get__nested(master_th) && (root->r.r_in_parallel
1741 #if OMP_40_ENABLED
1742  && !enter_teams
1743 #endif /* OMP_40_ENABLED */
1744  )) ||
1745  (__kmp_library == library_serial)) {
1746  KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1747  " threads\n",
1748  gtid, nthreads));
1749  nthreads = 1;
1750  }
1751  }
1752  if (nthreads > 1) {
1753  /* determine how many new threads we can use */
1754  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1755  nthreads = __kmp_reserve_threads(
1756  root, parent_team, master_tid, nthreads
1757 #if OMP_40_ENABLED
1758  /* AC: If we execute teams from parallel region (on host), then
1759  teams should be created but each can only have 1 thread if
1760  nesting is disabled. If teams called from serial region, then
1761  teams and their threads should be created regardless of the
1762  nesting setting. */
1763  ,
1764  enter_teams
1765 #endif /* OMP_40_ENABLED */
1766  );
1767  if (nthreads == 1) {
1768  // Free lock for single thread execution here; for multi-thread
1769  // execution it will be freed later after team of threads created
1770  // and initialized
1771  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1772  }
1773  }
1774  }
1775  KMP_DEBUG_ASSERT(nthreads > 0);
1776 
1777  // If we temporarily changed the set number of threads then restore it now
1778  master_th->th.th_set_nproc = 0;
1779 
1780  /* create a serialized parallel region? */
1781  if (nthreads == 1) {
1782 /* josh todo: hypothetical question: what do we do for OS X*? */
1783 #if KMP_OS_LINUX && \
1784  (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1785  void *args[argc];
1786 #else
1787  void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1788 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1789  KMP_ARCH_AARCH64) */
1790 
1791  KA_TRACE(20,
1792  ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1793 
1794  __kmpc_serialized_parallel(loc, gtid);
1795 
1796  if (call_context == fork_context_intel) {
1797  /* TODO this sucks, use the compiler itself to pass args! :) */
1798  master_th->th.th_serial_team->t.t_ident = loc;
1799 #if OMP_40_ENABLED
1800  if (!ap) {
1801  // revert change made in __kmpc_serialized_parallel()
1802  master_th->th.th_serial_team->t.t_level--;
1803 // Get args from parent team for teams construct
1804 
1805 #if OMPT_SUPPORT
1806  void *dummy;
1807  void **exit_runtime_p;
1808  ompt_task_info_t *task_info;
1809 
1810  ompt_lw_taskteam_t lw_taskteam;
1811 
1812  if (ompt_enabled.enabled) {
1813  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1814  &ompt_parallel_data, return_address);
1815 
1816  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1817  // don't use lw_taskteam after linking. content was swaped
1818 
1819  task_info = OMPT_CUR_TASK_INFO(master_th);
1820  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1821  if (ompt_enabled.ompt_callback_implicit_task) {
1822  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1823  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1824  &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1825  OMPT_CUR_TASK_INFO(master_th)
1826  ->thread_num = __kmp_tid_from_gtid(gtid);
1827  }
1828 
1829  /* OMPT state */
1830  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1831  } else {
1832  exit_runtime_p = &dummy;
1833  }
1834 #endif
1835 
1836  {
1837  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1838  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1839  __kmp_invoke_microtask(microtask, gtid, 0, argc,
1840  parent_team->t.t_argv
1841 #if OMPT_SUPPORT
1842  ,
1843  exit_runtime_p
1844 #endif
1845  );
1846  }
1847 
1848 #if OMPT_SUPPORT
1849  if (ompt_enabled.enabled) {
1850  exit_runtime_p = NULL;
1851  if (ompt_enabled.ompt_callback_implicit_task) {
1852  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1853  ompt_scope_end, NULL, &(task_info->task_data), 1,
1854  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1855  }
1856 
1857  __ompt_lw_taskteam_unlink(master_th);
1858  if (ompt_enabled.ompt_callback_parallel_end) {
1859  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1860  OMPT_CUR_TEAM_DATA(master_th), parent_task_data,
1861  OMPT_INVOKER(call_context), return_address);
1862  }
1863  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1864  }
1865 #endif
1866  } else if (microtask == (microtask_t)__kmp_teams_master) {
1867  KMP_DEBUG_ASSERT(master_th->th.th_team ==
1868  master_th->th.th_serial_team);
1869  team = master_th->th.th_team;
1870  // team->t.t_pkfn = microtask;
1871  team->t.t_invoke = invoker;
1872  __kmp_alloc_argv_entries(argc, team, TRUE);
1873  team->t.t_argc = argc;
1874  argv = (void **)team->t.t_argv;
1875  if (ap) {
1876  for (i = argc - 1; i >= 0; --i)
1877 // TODO: revert workaround for Intel(R) 64 tracker #96
1878 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1879  *argv++ = va_arg(*ap, void *);
1880 #else
1881  *argv++ = va_arg(ap, void *);
1882 #endif
1883  } else {
1884  for (i = 0; i < argc; ++i)
1885  // Get args from parent team for teams construct
1886  argv[i] = parent_team->t.t_argv[i];
1887  }
1888  // AC: revert change made in __kmpc_serialized_parallel()
1889  // because initial code in teams should have level=0
1890  team->t.t_level--;
1891  // AC: call special invoker for outer "parallel" of teams construct
1892  invoker(gtid);
1893  } else {
1894 #endif /* OMP_40_ENABLED */
1895  argv = args;
1896  for (i = argc - 1; i >= 0; --i)
1897 // TODO: revert workaround for Intel(R) 64 tracker #96
1898 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1899  *argv++ = va_arg(*ap, void *);
1900 #else
1901  *argv++ = va_arg(ap, void *);
1902 #endif
1903  KMP_MB();
1904 
1905 #if OMPT_SUPPORT
1906  void *dummy;
1907  void **exit_runtime_p;
1908  ompt_task_info_t *task_info;
1909 
1910  ompt_lw_taskteam_t lw_taskteam;
1911 
1912  if (ompt_enabled.enabled) {
1913  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1914  &ompt_parallel_data, return_address);
1915  __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1916  // don't use lw_taskteam after linking. content was swaped
1917  task_info = OMPT_CUR_TASK_INFO(master_th);
1918  exit_runtime_p = &(task_info->frame.exit_frame.ptr);
1919 
1920  /* OMPT implicit task begin */
1921  implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1922  if (ompt_enabled.ompt_callback_implicit_task) {
1923  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1924  ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1925  implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1926  OMPT_CUR_TASK_INFO(master_th)
1927  ->thread_num = __kmp_tid_from_gtid(gtid);
1928  }
1929 
1930  /* OMPT state */
1931  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1932  } else {
1933  exit_runtime_p = &dummy;
1934  }
1935 #endif
1936 
1937  {
1938  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1939  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1940  __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1941 #if OMPT_SUPPORT
1942  ,
1943  exit_runtime_p
1944 #endif
1945  );
1946  }
1947 
1948 #if OMPT_SUPPORT
1949  if (ompt_enabled.enabled) {
1950  *exit_runtime_p = NULL;
1951  if (ompt_enabled.ompt_callback_implicit_task) {
1952  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1953  ompt_scope_end, NULL, &(task_info->task_data), 1,
1954  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1955  }
1956 
1957  ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1958  __ompt_lw_taskteam_unlink(master_th);
1959  if (ompt_enabled.ompt_callback_parallel_end) {
1960  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1961  &ompt_parallel_data, parent_task_data,
1962  OMPT_INVOKER(call_context), return_address);
1963  }
1964  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1965  }
1966 #endif
1967 #if OMP_40_ENABLED
1968  }
1969 #endif /* OMP_40_ENABLED */
1970  } else if (call_context == fork_context_gnu) {
1971 #if OMPT_SUPPORT
1972  ompt_lw_taskteam_t lwt;
1973  __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1974  return_address);
1975 
1976  lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1977  __ompt_lw_taskteam_link(&lwt, master_th, 1);
1978 // don't use lw_taskteam after linking. content was swaped
1979 #endif
1980 
1981  // we were called from GNU native code
1982  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1983  return FALSE;
1984  } else {
1985  KMP_ASSERT2(call_context < fork_context_last,
1986  "__kmp_fork_call: unknown fork_context parameter");
1987  }
1988 
1989  KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1990  KMP_MB();
1991  return FALSE;
1992  } // if (nthreads == 1)
1993 
1994  // GEH: only modify the executing flag in the case when not serialized
1995  // serialized case is handled in kmpc_serialized_parallel
1996  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1997  "curtask=%p, curtask_max_aclevel=%d\n",
1998  parent_team->t.t_active_level, master_th,
1999  master_th->th.th_current_task,
2000  master_th->th.th_current_task->td_icvs.max_active_levels));
2001  // TODO: GEH - cannot do this assertion because root thread not set up as
2002  // executing
2003  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2004  master_th->th.th_current_task->td_flags.executing = 0;
2005 
2006 #if OMP_40_ENABLED
2007  if (!master_th->th.th_teams_microtask || level > teams_level)
2008 #endif /* OMP_40_ENABLED */
2009  {
2010  /* Increment our nested depth level */
2011  KMP_ATOMIC_INC(&root->r.r_in_parallel);
2012  }
2013 
2014  // See if we need to make a copy of the ICVs.
2015  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2016  if ((level + 1 < __kmp_nested_nth.used) &&
2017  (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2018  nthreads_icv = __kmp_nested_nth.nth[level + 1];
2019  } else {
2020  nthreads_icv = 0; // don't update
2021  }
2022 
2023 #if OMP_40_ENABLED
2024  // Figure out the proc_bind_policy for the new team.
2025  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2026  kmp_proc_bind_t proc_bind_icv =
2027  proc_bind_default; // proc_bind_default means don't update
2028  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2029  proc_bind = proc_bind_false;
2030  } else {
2031  if (proc_bind == proc_bind_default) {
2032  // No proc_bind clause specified; use current proc-bind-var for this
2033  // parallel region
2034  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2035  }
2036  /* else: The proc_bind policy was specified explicitly on parallel clause.
2037  This overrides proc-bind-var for this parallel region, but does not
2038  change proc-bind-var. */
2039  // Figure the value of proc-bind-var for the child threads.
2040  if ((level + 1 < __kmp_nested_proc_bind.used) &&
2041  (__kmp_nested_proc_bind.bind_types[level + 1] !=
2042  master_th->th.th_current_task->td_icvs.proc_bind)) {
2043  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2044  }
2045  }
2046 
2047  // Reset for next parallel region
2048  master_th->th.th_set_proc_bind = proc_bind_default;
2049 #endif /* OMP_40_ENABLED */
2050 
2051  if ((nthreads_icv > 0)
2052 #if OMP_40_ENABLED
2053  || (proc_bind_icv != proc_bind_default)
2054 #endif /* OMP_40_ENABLED */
2055  ) {
2056  kmp_internal_control_t new_icvs;
2057  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2058  new_icvs.next = NULL;
2059  if (nthreads_icv > 0) {
2060  new_icvs.nproc = nthreads_icv;
2061  }
2062 
2063 #if OMP_40_ENABLED
2064  if (proc_bind_icv != proc_bind_default) {
2065  new_icvs.proc_bind = proc_bind_icv;
2066  }
2067 #endif /* OMP_40_ENABLED */
2068 
2069  /* allocate a new parallel team */
2070  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2071  team = __kmp_allocate_team(root, nthreads, nthreads,
2072 #if OMPT_SUPPORT
2073  ompt_parallel_data,
2074 #endif
2075 #if OMP_40_ENABLED
2076  proc_bind,
2077 #endif
2078  &new_icvs, argc USE_NESTED_HOT_ARG(master_th));
2079  } else {
2080  /* allocate a new parallel team */
2081  KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2082  team = __kmp_allocate_team(root, nthreads, nthreads,
2083 #if OMPT_SUPPORT
2084  ompt_parallel_data,
2085 #endif
2086 #if OMP_40_ENABLED
2087  proc_bind,
2088 #endif
2089  &master_th->th.th_current_task->td_icvs,
2090  argc USE_NESTED_HOT_ARG(master_th));
2091  }
2092  KF_TRACE(
2093  10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2094 
2095  /* setup the new team */
2096  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2097  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2098  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2099  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2100  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2101 #if OMPT_SUPPORT
2102  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2103  return_address);
2104 #endif
2105  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2106 // TODO: parent_team->t.t_level == INT_MAX ???
2107 #if OMP_40_ENABLED
2108  if (!master_th->th.th_teams_microtask || level > teams_level) {
2109 #endif /* OMP_40_ENABLED */
2110  int new_level = parent_team->t.t_level + 1;
2111  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2112  new_level = parent_team->t.t_active_level + 1;
2113  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2114 #if OMP_40_ENABLED
2115  } else {
2116  // AC: Do not increase parallel level at start of the teams construct
2117  int new_level = parent_team->t.t_level;
2118  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2119  new_level = parent_team->t.t_active_level;
2120  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2121  }
2122 #endif /* OMP_40_ENABLED */
2123  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2124  // set master's schedule as new run-time schedule
2125  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2126 
2127 #if OMP_40_ENABLED
2128  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2129 #endif
2130 #if OMP_50_ENABLED
2131  KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2132 #endif
2133 
2134  // Update the floating point rounding in the team if required.
2135  propagateFPControl(team);
2136 
2137  if (__kmp_tasking_mode != tskm_immediate_exec) {
2138  // Set master's task team to team's task team. Unless this is hot team, it
2139  // should be NULL.
2140  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2141  parent_team->t.t_task_team[master_th->th.th_task_state]);
2142  KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2143  "%p, new task_team %p / team %p\n",
2144  __kmp_gtid_from_thread(master_th),
2145  master_th->th.th_task_team, parent_team,
2146  team->t.t_task_team[master_th->th.th_task_state], team));
2147 
2148  if (active_level || master_th->th.th_task_team) {
2149  // Take a memo of master's task_state
2150  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2151  if (master_th->th.th_task_state_top >=
2152  master_th->th.th_task_state_stack_sz) { // increase size
2153  kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2154  kmp_uint8 *old_stack, *new_stack;
2155  kmp_uint32 i;
2156  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2157  for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2158  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2159  }
2160  for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2161  ++i) { // zero-init rest of stack
2162  new_stack[i] = 0;
2163  }
2164  old_stack = master_th->th.th_task_state_memo_stack;
2165  master_th->th.th_task_state_memo_stack = new_stack;
2166  master_th->th.th_task_state_stack_sz = new_size;
2167  __kmp_free(old_stack);
2168  }
2169  // Store master's task_state on stack
2170  master_th->th
2171  .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2172  master_th->th.th_task_state;
2173  master_th->th.th_task_state_top++;
2174 #if KMP_NESTED_HOT_TEAMS
2175  if (master_th->th.th_hot_teams &&
2176  active_level < __kmp_hot_teams_max_level &&
2177  team == master_th->th.th_hot_teams[active_level].hot_team) {
2178  // Restore master's nested state if nested hot team
2179  master_th->th.th_task_state =
2180  master_th->th
2181  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2182  } else {
2183 #endif
2184  master_th->th.th_task_state = 0;
2185 #if KMP_NESTED_HOT_TEAMS
2186  }
2187 #endif
2188  }
2189 #if !KMP_NESTED_HOT_TEAMS
2190  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2191  (team == root->r.r_hot_team));
2192 #endif
2193  }
2194 
2195  KA_TRACE(
2196  20,
2197  ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2198  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2199  team->t.t_nproc));
2200  KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2201  (team->t.t_master_tid == 0 &&
2202  (team->t.t_parent == root->r.r_root_team ||
2203  team->t.t_parent->t.t_serialized)));
2204  KMP_MB();
2205 
2206  /* now, setup the arguments */
2207  argv = (void **)team->t.t_argv;
2208 #if OMP_40_ENABLED
2209  if (ap) {
2210 #endif /* OMP_40_ENABLED */
2211  for (i = argc - 1; i >= 0; --i) {
2212 // TODO: revert workaround for Intel(R) 64 tracker #96
2213 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2214  void *new_argv = va_arg(*ap, void *);
2215 #else
2216  void *new_argv = va_arg(ap, void *);
2217 #endif
2218  KMP_CHECK_UPDATE(*argv, new_argv);
2219  argv++;
2220  }
2221 #if OMP_40_ENABLED
2222  } else {
2223  for (i = 0; i < argc; ++i) {
2224  // Get args from parent team for teams construct
2225  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2226  }
2227  }
2228 #endif /* OMP_40_ENABLED */
2229 
2230  /* now actually fork the threads */
2231  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2232  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2233  root->r.r_active = TRUE;
2234 
2235  __kmp_fork_team_threads(root, team, master_th, gtid);
2236  __kmp_setup_icv_copy(team, nthreads,
2237  &master_th->th.th_current_task->td_icvs, loc);
2238 
2239 #if OMPT_SUPPORT
2240  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2241 #endif
2242 
2243  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2244 
2245 #if USE_ITT_BUILD
2246  if (team->t.t_active_level == 1 // only report frames at level 1
2247 #if OMP_40_ENABLED
2248  && !master_th->th.th_teams_microtask // not in teams construct
2249 #endif /* OMP_40_ENABLED */
2250  ) {
2251 #if USE_ITT_NOTIFY
2252  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2253  (__kmp_forkjoin_frames_mode == 3 ||
2254  __kmp_forkjoin_frames_mode == 1)) {
2255  kmp_uint64 tmp_time = 0;
2256  if (__itt_get_timestamp_ptr)
2257  tmp_time = __itt_get_timestamp();
2258  // Internal fork - report frame begin
2259  master_th->th.th_frame_time = tmp_time;
2260  if (__kmp_forkjoin_frames_mode == 3)
2261  team->t.t_region_time = tmp_time;
2262  } else
2263 // only one notification scheme (either "submit" or "forking/joined", not both)
2264 #endif /* USE_ITT_NOTIFY */
2265  if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2266  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2267  // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2268  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2269  }
2270  }
2271 #endif /* USE_ITT_BUILD */
2272 
2273  /* now go on and do the work */
2274  KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2275  KMP_MB();
2276  KF_TRACE(10,
2277  ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2278  root, team, master_th, gtid));
2279 
2280 #if USE_ITT_BUILD
2281  if (__itt_stack_caller_create_ptr) {
2282  team->t.t_stack_id =
2283  __kmp_itt_stack_caller_create(); // create new stack stitching id
2284  // before entering fork barrier
2285  }
2286 #endif /* USE_ITT_BUILD */
2287 
2288 #if OMP_40_ENABLED
2289  // AC: skip __kmp_internal_fork at teams construct, let only master
2290  // threads execute
2291  if (ap)
2292 #endif /* OMP_40_ENABLED */
2293  {
2294  __kmp_internal_fork(loc, gtid, team);
2295  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2296  "master_th=%p, gtid=%d\n",
2297  root, team, master_th, gtid));
2298  }
2299 
2300  if (call_context == fork_context_gnu) {
2301  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2302  return TRUE;
2303  }
2304 
2305  /* Invoke microtask for MASTER thread */
2306  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2307  team->t.t_id, team->t.t_pkfn));
2308  } // END of timer KMP_fork_call block
2309 
2310  if (!team->t.t_invoke(gtid)) {
2311  KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2312  }
2313  KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2314  team->t.t_id, team->t.t_pkfn));
2315  KMP_MB(); /* Flush all pending memory write invalidates. */
2316 
2317  KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2318 
2319 #if OMPT_SUPPORT
2320  if (ompt_enabled.enabled) {
2321  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2322  }
2323 #endif
2324 
2325  return TRUE;
2326 }
2327 
2328 #if OMPT_SUPPORT
2329 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2330  kmp_team_t *team) {
2331  // restore state outside the region
2332  thread->th.ompt_thread_info.state =
2333  ((team->t.t_serialized) ? ompt_state_work_serial
2334  : ompt_state_work_parallel);
2335 }
2336 
2337 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2338  kmp_team_t *team, ompt_data_t *parallel_data,
2339  fork_context_e fork_context, void *codeptr) {
2340  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2341  if (ompt_enabled.ompt_callback_parallel_end) {
2342  ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2343  parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context),
2344  codeptr);
2345  }
2346 
2347  task_info->frame.enter_frame = ompt_data_none;
2348  __kmp_join_restore_state(thread, team);
2349 }
2350 #endif
2351 
2352 void __kmp_join_call(ident_t *loc, int gtid
2353 #if OMPT_SUPPORT
2354  ,
2355  enum fork_context_e fork_context
2356 #endif
2357 #if OMP_40_ENABLED
2358  ,
2359  int exit_teams
2360 #endif /* OMP_40_ENABLED */
2361  ) {
2362  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2363  kmp_team_t *team;
2364  kmp_team_t *parent_team;
2365  kmp_info_t *master_th;
2366  kmp_root_t *root;
2367  int master_active;
2368 
2369  KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2370 
2371  /* setup current data */
2372  master_th = __kmp_threads[gtid];
2373  root = master_th->th.th_root;
2374  team = master_th->th.th_team;
2375  parent_team = team->t.t_parent;
2376 
2377  master_th->th.th_ident = loc;
2378 
2379 #if OMPT_SUPPORT
2380  if (ompt_enabled.enabled) {
2381  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2382  }
2383 #endif
2384 
2385 #if KMP_DEBUG
2386  if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2387  KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2388  "th_task_team = %p\n",
2389  __kmp_gtid_from_thread(master_th), team,
2390  team->t.t_task_team[master_th->th.th_task_state],
2391  master_th->th.th_task_team));
2392  KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2393  team->t.t_task_team[master_th->th.th_task_state]);
2394  }
2395 #endif
2396 
2397  if (team->t.t_serialized) {
2398 #if OMP_40_ENABLED
2399  if (master_th->th.th_teams_microtask) {
2400  // We are in teams construct
2401  int level = team->t.t_level;
2402  int tlevel = master_th->th.th_teams_level;
2403  if (level == tlevel) {
2404  // AC: we haven't incremented it earlier at start of teams construct,
2405  // so do it here - at the end of teams construct
2406  team->t.t_level++;
2407  } else if (level == tlevel + 1) {
2408  // AC: we are exiting parallel inside teams, need to increment
2409  // serialization in order to restore it in the next call to
2410  // __kmpc_end_serialized_parallel
2411  team->t.t_serialized++;
2412  }
2413  }
2414 #endif /* OMP_40_ENABLED */
2415  __kmpc_end_serialized_parallel(loc, gtid);
2416 
2417 #if OMPT_SUPPORT
2418  if (ompt_enabled.enabled) {
2419  __kmp_join_restore_state(master_th, parent_team);
2420  }
2421 #endif
2422 
2423  return;
2424  }
2425 
2426  master_active = team->t.t_master_active;
2427 
2428 #if OMP_40_ENABLED
2429  if (!exit_teams)
2430 #endif /* OMP_40_ENABLED */
2431  {
2432  // AC: No barrier for internal teams at exit from teams construct.
2433  // But there is barrier for external team (league).
2434  __kmp_internal_join(loc, gtid, team);
2435  }
2436 #if OMP_40_ENABLED
2437  else {
2438  master_th->th.th_task_state =
2439  0; // AC: no tasking in teams (out of any parallel)
2440  }
2441 #endif /* OMP_40_ENABLED */
2442 
2443  KMP_MB();
2444 
2445 #if OMPT_SUPPORT
2446  ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2447  void *codeptr = team->t.ompt_team_info.master_return_address;
2448 #endif
2449 
2450 #if USE_ITT_BUILD
2451  if (__itt_stack_caller_create_ptr) {
2452  __kmp_itt_stack_caller_destroy(
2453  (__itt_caller)team->t
2454  .t_stack_id); // destroy the stack stitching id after join barrier
2455  }
2456 
2457  // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2458  if (team->t.t_active_level == 1
2459 #if OMP_40_ENABLED
2460  && !master_th->th.th_teams_microtask /* not in teams construct */
2461 #endif /* OMP_40_ENABLED */
2462  ) {
2463  master_th->th.th_ident = loc;
2464  // only one notification scheme (either "submit" or "forking/joined", not
2465  // both)
2466  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2467  __kmp_forkjoin_frames_mode == 3)
2468  __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2469  master_th->th.th_frame_time, 0, loc,
2470  master_th->th.th_team_nproc, 1);
2471  else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2472  !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2473  __kmp_itt_region_joined(gtid);
2474  } // active_level == 1
2475 #endif /* USE_ITT_BUILD */
2476 
2477 #if OMP_40_ENABLED
2478  if (master_th->th.th_teams_microtask && !exit_teams &&
2479  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2480  team->t.t_level == master_th->th.th_teams_level + 1) {
2481  // AC: We need to leave the team structure intact at the end of parallel
2482  // inside the teams construct, so that at the next parallel same (hot) team
2483  // works, only adjust nesting levels
2484 
2485  /* Decrement our nested depth level */
2486  team->t.t_level--;
2487  team->t.t_active_level--;
2488  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2489 
2490  // Restore number of threads in the team if needed. This code relies on
2491  // the proper adjustment of th_teams_size.nth after the fork in
2492  // __kmp_teams_master on each teams master in the case that
2493  // __kmp_reserve_threads reduced it.
2494  if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2495  int old_num = master_th->th.th_team_nproc;
2496  int new_num = master_th->th.th_teams_size.nth;
2497  kmp_info_t **other_threads = team->t.t_threads;
2498  team->t.t_nproc = new_num;
2499  for (int i = 0; i < old_num; ++i) {
2500  other_threads[i]->th.th_team_nproc = new_num;
2501  }
2502  // Adjust states of non-used threads of the team
2503  for (int i = old_num; i < new_num; ++i) {
2504  // Re-initialize thread's barrier data.
2505  KMP_DEBUG_ASSERT(other_threads[i]);
2506  kmp_balign_t *balign = other_threads[i]->th.th_bar;
2507  for (int b = 0; b < bs_last_barrier; ++b) {
2508  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2509  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2510 #if USE_DEBUGGER
2511  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2512 #endif
2513  }
2514  if (__kmp_tasking_mode != tskm_immediate_exec) {
2515  // Synchronize thread's task state
2516  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2517  }
2518  }
2519  }
2520 
2521 #if OMPT_SUPPORT
2522  if (ompt_enabled.enabled) {
2523  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2524  codeptr);
2525  }
2526 #endif
2527 
2528  return;
2529  }
2530 #endif /* OMP_40_ENABLED */
2531 
2532  /* do cleanup and restore the parent team */
2533  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2534  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2535 
2536  master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2537 
2538  /* jc: The following lock has instructions with REL and ACQ semantics,
2539  separating the parallel user code called in this parallel region
2540  from the serial user code called after this function returns. */
2541  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2542 
2543 #if OMP_40_ENABLED
2544  if (!master_th->th.th_teams_microtask ||
2545  team->t.t_level > master_th->th.th_teams_level)
2546 #endif /* OMP_40_ENABLED */
2547  {
2548  /* Decrement our nested depth level */
2549  KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2550  }
2551  KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2552 
2553 #if OMPT_SUPPORT
2554  if (ompt_enabled.enabled) {
2555  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2556  if (ompt_enabled.ompt_callback_implicit_task) {
2557  int ompt_team_size = team->t.t_nproc;
2558  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2559  ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2560  OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2561  }
2562 
2563  task_info->frame.exit_frame = ompt_data_none;
2564  task_info->task_data = ompt_data_none;
2565  }
2566 #endif
2567 
2568  KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2569  master_th, team));
2570  __kmp_pop_current_task_from_thread(master_th);
2571 
2572 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2573  // Restore master thread's partition.
2574  master_th->th.th_first_place = team->t.t_first_place;
2575  master_th->th.th_last_place = team->t.t_last_place;
2576 #endif /* OMP_40_ENABLED */
2577 #if OMP_50_ENABLED
2578  master_th->th.th_def_allocator = team->t.t_def_allocator;
2579 #endif
2580 
2581  updateHWFPControl(team);
2582 
2583  if (root->r.r_active != master_active)
2584  root->r.r_active = master_active;
2585 
2586  __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2587  master_th)); // this will free worker threads
2588 
2589  /* this race was fun to find. make sure the following is in the critical
2590  region otherwise assertions may fail occasionally since the old team may be
2591  reallocated and the hierarchy appears inconsistent. it is actually safe to
2592  run and won't cause any bugs, but will cause those assertion failures. it's
2593  only one deref&assign so might as well put this in the critical region */
2594  master_th->th.th_team = parent_team;
2595  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2596  master_th->th.th_team_master = parent_team->t.t_threads[0];
2597  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2598 
2599  /* restore serialized team, if need be */
2600  if (parent_team->t.t_serialized &&
2601  parent_team != master_th->th.th_serial_team &&
2602  parent_team != root->r.r_root_team) {
2603  __kmp_free_team(root,
2604  master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2605  master_th->th.th_serial_team = parent_team;
2606  }
2607 
2608  if (__kmp_tasking_mode != tskm_immediate_exec) {
2609  if (master_th->th.th_task_state_top >
2610  0) { // Restore task state from memo stack
2611  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2612  // Remember master's state if we re-use this nested hot team
2613  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2614  master_th->th.th_task_state;
2615  --master_th->th.th_task_state_top; // pop
2616  // Now restore state at this level
2617  master_th->th.th_task_state =
2618  master_th->th
2619  .th_task_state_memo_stack[master_th->th.th_task_state_top];
2620  }
2621  // Copy the task team from the parent team to the master thread
2622  master_th->th.th_task_team =
2623  parent_team->t.t_task_team[master_th->th.th_task_state];
2624  KA_TRACE(20,
2625  ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2626  __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2627  parent_team));
2628  }
2629 
2630  // TODO: GEH - cannot do this assertion because root thread not set up as
2631  // executing
2632  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2633  master_th->th.th_current_task->td_flags.executing = 1;
2634 
2635  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2636 
2637 #if OMPT_SUPPORT
2638  if (ompt_enabled.enabled) {
2639  __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context,
2640  codeptr);
2641  }
2642 #endif
2643 
2644  KMP_MB();
2645  KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2646 }
2647 
2648 /* Check whether we should push an internal control record onto the
2649  serial team stack. If so, do it. */
2650 void __kmp_save_internal_controls(kmp_info_t *thread) {
2651 
2652  if (thread->th.th_team != thread->th.th_serial_team) {
2653  return;
2654  }
2655  if (thread->th.th_team->t.t_serialized > 1) {
2656  int push = 0;
2657 
2658  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2659  push = 1;
2660  } else {
2661  if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2662  thread->th.th_team->t.t_serialized) {
2663  push = 1;
2664  }
2665  }
2666  if (push) { /* push a record on the serial team's stack */
2667  kmp_internal_control_t *control =
2668  (kmp_internal_control_t *)__kmp_allocate(
2669  sizeof(kmp_internal_control_t));
2670 
2671  copy_icvs(control, &thread->th.th_current_task->td_icvs);
2672 
2673  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2674 
2675  control->next = thread->th.th_team->t.t_control_stack_top;
2676  thread->th.th_team->t.t_control_stack_top = control;
2677  }
2678  }
2679 }
2680 
2681 /* Changes set_nproc */
2682 void __kmp_set_num_threads(int new_nth, int gtid) {
2683  kmp_info_t *thread;
2684  kmp_root_t *root;
2685 
2686  KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2687  KMP_DEBUG_ASSERT(__kmp_init_serial);
2688 
2689  if (new_nth < 1)
2690  new_nth = 1;
2691  else if (new_nth > __kmp_max_nth)
2692  new_nth = __kmp_max_nth;
2693 
2694  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2695  thread = __kmp_threads[gtid];
2696  if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2697  return; // nothing to do
2698 
2699  __kmp_save_internal_controls(thread);
2700 
2701  set__nproc(thread, new_nth);
2702 
2703  // If this omp_set_num_threads() call will cause the hot team size to be
2704  // reduced (in the absence of a num_threads clause), then reduce it now,
2705  // rather than waiting for the next parallel region.
2706  root = thread->th.th_root;
2707  if (__kmp_init_parallel && (!root->r.r_active) &&
2708  (root->r.r_hot_team->t.t_nproc > new_nth)
2709 #if KMP_NESTED_HOT_TEAMS
2710  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2711 #endif
2712  ) {
2713  kmp_team_t *hot_team = root->r.r_hot_team;
2714  int f;
2715 
2716  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2717 
2718  // Release the extra threads we don't need any more.
2719  for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2720  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2721  if (__kmp_tasking_mode != tskm_immediate_exec) {
2722  // When decreasing team size, threads no longer in the team should unref
2723  // task team.
2724  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2725  }
2726  __kmp_free_thread(hot_team->t.t_threads[f]);
2727  hot_team->t.t_threads[f] = NULL;
2728  }
2729  hot_team->t.t_nproc = new_nth;
2730 #if KMP_NESTED_HOT_TEAMS
2731  if (thread->th.th_hot_teams) {
2732  KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2733  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2734  }
2735 #endif
2736 
2737  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2738 
2739  // Update the t_nproc field in the threads that are still active.
2740  for (f = 0; f < new_nth; f++) {
2741  KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2742  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2743  }
2744  // Special flag in case omp_set_num_threads() call
2745  hot_team->t.t_size_changed = -1;
2746  }
2747 }
2748 
2749 /* Changes max_active_levels */
2750 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2751  kmp_info_t *thread;
2752 
2753  KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2754  "%d = (%d)\n",
2755  gtid, max_active_levels));
2756  KMP_DEBUG_ASSERT(__kmp_init_serial);
2757 
2758  // validate max_active_levels
2759  if (max_active_levels < 0) {
2760  KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2761  // We ignore this call if the user has specified a negative value.
2762  // The current setting won't be changed. The last valid setting will be
2763  // used. A warning will be issued (if warnings are allowed as controlled by
2764  // the KMP_WARNINGS env var).
2765  KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2766  "max_active_levels for thread %d = (%d)\n",
2767  gtid, max_active_levels));
2768  return;
2769  }
2770  if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2771  // it's OK, the max_active_levels is within the valid range: [ 0;
2772  // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2773  // We allow a zero value. (implementation defined behavior)
2774  } else {
2775  KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2776  KMP_MAX_ACTIVE_LEVELS_LIMIT);
2777  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2778  // Current upper limit is MAX_INT. (implementation defined behavior)
2779  // If the input exceeds the upper limit, we correct the input to be the
2780  // upper limit. (implementation defined behavior)
2781  // Actually, the flow should never get here until we use MAX_INT limit.
2782  }
2783  KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2784  "max_active_levels for thread %d = (%d)\n",
2785  gtid, max_active_levels));
2786 
2787  thread = __kmp_threads[gtid];
2788 
2789  __kmp_save_internal_controls(thread);
2790 
2791  set__max_active_levels(thread, max_active_levels);
2792 }
2793 
2794 /* Gets max_active_levels */
2795 int __kmp_get_max_active_levels(int gtid) {
2796  kmp_info_t *thread;
2797 
2798  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2799  KMP_DEBUG_ASSERT(__kmp_init_serial);
2800 
2801  thread = __kmp_threads[gtid];
2802  KMP_DEBUG_ASSERT(thread->th.th_current_task);
2803  KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2804  "curtask_maxaclevel=%d\n",
2805  gtid, thread->th.th_current_task,
2806  thread->th.th_current_task->td_icvs.max_active_levels));
2807  return thread->th.th_current_task->td_icvs.max_active_levels;
2808 }
2809 
2810 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2811 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2812  kmp_info_t *thread;
2813  // kmp_team_t *team;
2814 
2815  KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2816  gtid, (int)kind, chunk));
2817  KMP_DEBUG_ASSERT(__kmp_init_serial);
2818 
2819  // Check if the kind parameter is valid, correct if needed.
2820  // Valid parameters should fit in one of two intervals - standard or extended:
2821  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2822  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2823  if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2824  (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2825  // TODO: Hint needs attention in case we change the default schedule.
2826  __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2827  KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2828  __kmp_msg_null);
2829  kind = kmp_sched_default;
2830  chunk = 0; // ignore chunk value in case of bad kind
2831  }
2832 
2833  thread = __kmp_threads[gtid];
2834 
2835  __kmp_save_internal_controls(thread);
2836 
2837  if (kind < kmp_sched_upper_std) {
2838  if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2839  // differ static chunked vs. unchunked: chunk should be invalid to
2840  // indicate unchunked schedule (which is the default)
2841  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2842  } else {
2843  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2844  __kmp_sch_map[kind - kmp_sched_lower - 1];
2845  }
2846  } else {
2847  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2848  // kmp_sched_lower - 2 ];
2849  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2850  __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2851  kmp_sched_lower - 2];
2852  }
2853  if (kind == kmp_sched_auto || chunk < 1) {
2854  // ignore parameter chunk for schedule auto
2855  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2856  } else {
2857  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2858  }
2859 }
2860 
2861 /* Gets def_sched_var ICV values */
2862 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2863  kmp_info_t *thread;
2864  enum sched_type th_type;
2865 
2866  KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2867  KMP_DEBUG_ASSERT(__kmp_init_serial);
2868 
2869  thread = __kmp_threads[gtid];
2870 
2871  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2872 
2873  switch (th_type) {
2874  case kmp_sch_static:
2875  case kmp_sch_static_greedy:
2876  case kmp_sch_static_balanced:
2877  *kind = kmp_sched_static;
2878  *chunk = 0; // chunk was not set, try to show this fact via zero value
2879  return;
2880  case kmp_sch_static_chunked:
2881  *kind = kmp_sched_static;
2882  break;
2883  case kmp_sch_dynamic_chunked:
2884  *kind = kmp_sched_dynamic;
2885  break;
2887  case kmp_sch_guided_iterative_chunked:
2888  case kmp_sch_guided_analytical_chunked:
2889  *kind = kmp_sched_guided;
2890  break;
2891  case kmp_sch_auto:
2892  *kind = kmp_sched_auto;
2893  break;
2894  case kmp_sch_trapezoidal:
2895  *kind = kmp_sched_trapezoidal;
2896  break;
2897 #if KMP_STATIC_STEAL_ENABLED
2898  case kmp_sch_static_steal:
2899  *kind = kmp_sched_static_steal;
2900  break;
2901 #endif
2902  default:
2903  KMP_FATAL(UnknownSchedulingType, th_type);
2904  }
2905 
2906  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2907 }
2908 
2909 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2910 
2911  int ii, dd;
2912  kmp_team_t *team;
2913  kmp_info_t *thr;
2914 
2915  KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2916  KMP_DEBUG_ASSERT(__kmp_init_serial);
2917 
2918  // validate level
2919  if (level == 0)
2920  return 0;
2921  if (level < 0)
2922  return -1;
2923  thr = __kmp_threads[gtid];
2924  team = thr->th.th_team;
2925  ii = team->t.t_level;
2926  if (level > ii)
2927  return -1;
2928 
2929 #if OMP_40_ENABLED
2930  if (thr->th.th_teams_microtask) {
2931  // AC: we are in teams region where multiple nested teams have same level
2932  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2933  if (level <=
2934  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2935  KMP_DEBUG_ASSERT(ii >= tlevel);
2936  // AC: As we need to pass by the teams league, we need to artificially
2937  // increase ii
2938  if (ii == tlevel) {
2939  ii += 2; // three teams have same level
2940  } else {
2941  ii++; // two teams have same level
2942  }
2943  }
2944  }
2945 #endif
2946 
2947  if (ii == level)
2948  return __kmp_tid_from_gtid(gtid);
2949 
2950  dd = team->t.t_serialized;
2951  level++;
2952  while (ii > level) {
2953  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2954  }
2955  if ((team->t.t_serialized) && (!dd)) {
2956  team = team->t.t_parent;
2957  continue;
2958  }
2959  if (ii > level) {
2960  team = team->t.t_parent;
2961  dd = team->t.t_serialized;
2962  ii--;
2963  }
2964  }
2965 
2966  return (dd > 1) ? (0) : (team->t.t_master_tid);
2967 }
2968 
2969 int __kmp_get_team_size(int gtid, int level) {
2970 
2971  int ii, dd;
2972  kmp_team_t *team;
2973  kmp_info_t *thr;
2974 
2975  KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2976  KMP_DEBUG_ASSERT(__kmp_init_serial);
2977 
2978  // validate level
2979  if (level == 0)
2980  return 1;
2981  if (level < 0)
2982  return -1;
2983  thr = __kmp_threads[gtid];
2984  team = thr->th.th_team;
2985  ii = team->t.t_level;
2986  if (level > ii)
2987  return -1;
2988 
2989 #if OMP_40_ENABLED
2990  if (thr->th.th_teams_microtask) {
2991  // AC: we are in teams region where multiple nested teams have same level
2992  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2993  if (level <=
2994  tlevel) { // otherwise usual algorithm works (will not touch the teams)
2995  KMP_DEBUG_ASSERT(ii >= tlevel);
2996  // AC: As we need to pass by the teams league, we need to artificially
2997  // increase ii
2998  if (ii == tlevel) {
2999  ii += 2; // three teams have same level
3000  } else {
3001  ii++; // two teams have same level
3002  }
3003  }
3004  }
3005 #endif
3006 
3007  while (ii > level) {
3008  for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3009  }
3010  if (team->t.t_serialized && (!dd)) {
3011  team = team->t.t_parent;
3012  continue;
3013  }
3014  if (ii > level) {
3015  team = team->t.t_parent;
3016  ii--;
3017  }
3018  }
3019 
3020  return team->t.t_nproc;
3021 }
3022 
3023 kmp_r_sched_t __kmp_get_schedule_global() {
3024  // This routine created because pairs (__kmp_sched, __kmp_chunk) and
3025  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3026  // independently. So one can get the updated schedule here.
3027 
3028  kmp_r_sched_t r_sched;
3029 
3030  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3031  // __kmp_guided. __kmp_sched should keep original value, so that user can set
3032  // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3033  // different roots (even in OMP 2.5)
3034  if (__kmp_sched == kmp_sch_static) {
3035  // replace STATIC with more detailed schedule (balanced or greedy)
3036  r_sched.r_sched_type = __kmp_static;
3037  } else if (__kmp_sched == kmp_sch_guided_chunked) {
3038  // replace GUIDED with more detailed schedule (iterative or analytical)
3039  r_sched.r_sched_type = __kmp_guided;
3040  } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3041  r_sched.r_sched_type = __kmp_sched;
3042  }
3043 
3044  if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3045  // __kmp_chunk may be wrong here (if it was not ever set)
3046  r_sched.chunk = KMP_DEFAULT_CHUNK;
3047  } else {
3048  r_sched.chunk = __kmp_chunk;
3049  }
3050 
3051  return r_sched;
3052 }
3053 
3054 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE)
3055  at least argc number of *t_argv entries for the requested team. */
3056 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3057 
3058  KMP_DEBUG_ASSERT(team);
3059  if (!realloc || argc > team->t.t_max_argc) {
3060 
3061  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3062  "current entries=%d\n",
3063  team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3064  /* if previously allocated heap space for args, free them */
3065  if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3066  __kmp_free((void *)team->t.t_argv);
3067 
3068  if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3069  /* use unused space in the cache line for arguments */
3070  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3071  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3072  "argv entries\n",
3073  team->t.t_id, team->t.t_max_argc));
3074  team->t.t_argv = &team->t.t_inline_argv[0];
3075  if (__kmp_storage_map) {
3076  __kmp_print_storage_map_gtid(
3077  -1, &team->t.t_inline_argv[0],
3078  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3079  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3080  team->t.t_id);
3081  }
3082  } else {
3083  /* allocate space for arguments in the heap */
3084  team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3085  ? KMP_MIN_MALLOC_ARGV_ENTRIES
3086  : 2 * argc;
3087  KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3088  "argv entries\n",
3089  team->t.t_id, team->t.t_max_argc));
3090  team->t.t_argv =
3091  (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3092  if (__kmp_storage_map) {
3093  __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3094  &team->t.t_argv[team->t.t_max_argc],
3095  sizeof(void *) * team->t.t_max_argc,
3096  "team_%d.t_argv", team->t.t_id);
3097  }
3098  }
3099  }
3100 }
3101 
3102 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3103  int i;
3104  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3105  team->t.t_threads =
3106  (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3107  team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3108  sizeof(dispatch_shared_info_t) * num_disp_buff);
3109  team->t.t_dispatch =
3110  (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3111  team->t.t_implicit_task_taskdata =
3112  (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3113  team->t.t_max_nproc = max_nth;
3114 
3115  /* setup dispatch buffers */
3116  for (i = 0; i < num_disp_buff; ++i) {
3117  team->t.t_disp_buffer[i].buffer_index = i;
3118 #if OMP_45_ENABLED
3119  team->t.t_disp_buffer[i].doacross_buf_idx = i;
3120 #endif
3121  }
3122 }
3123 
3124 static void __kmp_free_team_arrays(kmp_team_t *team) {
3125  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3126  int i;
3127  for (i = 0; i < team->t.t_max_nproc; ++i) {
3128  if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3129  __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3130  team->t.t_dispatch[i].th_disp_buffer = NULL;
3131  }
3132  }
3133 #if KMP_USE_HIER_SCHED
3134  __kmp_dispatch_free_hierarchies(team);
3135 #endif
3136  __kmp_free(team->t.t_threads);
3137  __kmp_free(team->t.t_disp_buffer);
3138  __kmp_free(team->t.t_dispatch);
3139  __kmp_free(team->t.t_implicit_task_taskdata);
3140  team->t.t_threads = NULL;
3141  team->t.t_disp_buffer = NULL;
3142  team->t.t_dispatch = NULL;
3143  team->t.t_implicit_task_taskdata = 0;
3144 }
3145 
3146 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3147  kmp_info_t **oldThreads = team->t.t_threads;
3148 
3149  __kmp_free(team->t.t_disp_buffer);
3150  __kmp_free(team->t.t_dispatch);
3151  __kmp_free(team->t.t_implicit_task_taskdata);
3152  __kmp_allocate_team_arrays(team, max_nth);
3153 
3154  KMP_MEMCPY(team->t.t_threads, oldThreads,
3155  team->t.t_nproc * sizeof(kmp_info_t *));
3156 
3157  __kmp_free(oldThreads);
3158 }
3159 
3160 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3161 
3162  kmp_r_sched_t r_sched =
3163  __kmp_get_schedule_global(); // get current state of scheduling globals
3164 
3165 #if OMP_40_ENABLED
3166  KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3167 #endif /* OMP_40_ENABLED */
3168 
3169  kmp_internal_control_t g_icvs = {
3170  0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3171  (kmp_int8)__kmp_dflt_nested, // int nested; //internal control
3172  // for nested parallelism (per thread)
3173  (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3174  // adjustment of threads (per thread)
3175  (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3176  // whether blocktime is explicitly set
3177  __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3178 #if KMP_USE_MONITOR
3179  __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3180 // intervals
3181 #endif
3182  __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3183  // next parallel region (per thread)
3184  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3185  __kmp_cg_max_nth, // int thread_limit;
3186  __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3187  // for max_active_levels
3188  r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3189 // {sched,chunk} pair
3190 #if OMP_40_ENABLED
3191  __kmp_nested_proc_bind.bind_types[0],
3192  __kmp_default_device,
3193 #endif /* OMP_40_ENABLED */
3194  NULL // struct kmp_internal_control *next;
3195  };
3196 
3197  return g_icvs;
3198 }
3199 
3200 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3201 
3202  kmp_internal_control_t gx_icvs;
3203  gx_icvs.serial_nesting_level =
3204  0; // probably =team->t.t_serial like in save_inter_controls
3205  copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3206  gx_icvs.next = NULL;
3207 
3208  return gx_icvs;
3209 }
3210 
3211 static void __kmp_initialize_root(kmp_root_t *root) {
3212  int f;
3213  kmp_team_t *root_team;
3214  kmp_team_t *hot_team;
3215  int hot_team_max_nth;
3216  kmp_r_sched_t r_sched =
3217  __kmp_get_schedule_global(); // get current state of scheduling globals
3218  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3219  KMP_DEBUG_ASSERT(root);
3220  KMP_ASSERT(!root->r.r_begin);
3221 
3222  /* setup the root state structure */
3223  __kmp_init_lock(&root->r.r_begin_lock);
3224  root->r.r_begin = FALSE;
3225  root->r.r_active = FALSE;
3226  root->r.r_in_parallel = 0;
3227  root->r.r_blocktime = __kmp_dflt_blocktime;
3228  root->r.r_nested = __kmp_dflt_nested;
3229 
3230  /* setup the root team for this task */
3231  /* allocate the root team structure */
3232  KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3233 
3234  root_team =
3235  __kmp_allocate_team(root,
3236  1, // new_nproc
3237  1, // max_nproc
3238 #if OMPT_SUPPORT
3239  ompt_data_none, // root parallel id
3240 #endif
3241 #if OMP_40_ENABLED
3242  __kmp_nested_proc_bind.bind_types[0],
3243 #endif
3244  &r_icvs,
3245  0 // argc
3246  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3247  );
3248 #if USE_DEBUGGER
3249  // Non-NULL value should be assigned to make the debugger display the root
3250  // team.
3251  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3252 #endif
3253 
3254  KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3255 
3256  root->r.r_root_team = root_team;
3257  root_team->t.t_control_stack_top = NULL;
3258 
3259  /* initialize root team */
3260  root_team->t.t_threads[0] = NULL;
3261  root_team->t.t_nproc = 1;
3262  root_team->t.t_serialized = 1;
3263  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3264  root_team->t.t_sched.sched = r_sched.sched;
3265  KA_TRACE(
3266  20,
3267  ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3268  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3269 
3270  /* setup the hot team for this task */
3271  /* allocate the hot team structure */
3272  KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3273 
3274  hot_team =
3275  __kmp_allocate_team(root,
3276  1, // new_nproc
3277  __kmp_dflt_team_nth_ub * 2, // max_nproc
3278 #if OMPT_SUPPORT
3279  ompt_data_none, // root parallel id
3280 #endif
3281 #if OMP_40_ENABLED
3282  __kmp_nested_proc_bind.bind_types[0],
3283 #endif
3284  &r_icvs,
3285  0 // argc
3286  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3287  );
3288  KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3289 
3290  root->r.r_hot_team = hot_team;
3291  root_team->t.t_control_stack_top = NULL;
3292 
3293  /* first-time initialization */
3294  hot_team->t.t_parent = root_team;
3295 
3296  /* initialize hot team */
3297  hot_team_max_nth = hot_team->t.t_max_nproc;
3298  for (f = 0; f < hot_team_max_nth; ++f) {
3299  hot_team->t.t_threads[f] = NULL;
3300  }
3301  hot_team->t.t_nproc = 1;
3302  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3303  hot_team->t.t_sched.sched = r_sched.sched;
3304  hot_team->t.t_size_changed = 0;
3305 }
3306 
3307 #ifdef KMP_DEBUG
3308 
3309 typedef struct kmp_team_list_item {
3310  kmp_team_p const *entry;
3311  struct kmp_team_list_item *next;
3312 } kmp_team_list_item_t;
3313 typedef kmp_team_list_item_t *kmp_team_list_t;
3314 
3315 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3316  kmp_team_list_t list, // List of teams.
3317  kmp_team_p const *team // Team to add.
3318  ) {
3319 
3320  // List must terminate with item where both entry and next are NULL.
3321  // Team is added to the list only once.
3322  // List is sorted in ascending order by team id.
3323  // Team id is *not* a key.
3324 
3325  kmp_team_list_t l;
3326 
3327  KMP_DEBUG_ASSERT(list != NULL);
3328  if (team == NULL) {
3329  return;
3330  }
3331 
3332  __kmp_print_structure_team_accum(list, team->t.t_parent);
3333  __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3334 
3335  // Search list for the team.
3336  l = list;
3337  while (l->next != NULL && l->entry != team) {
3338  l = l->next;
3339  }
3340  if (l->next != NULL) {
3341  return; // Team has been added before, exit.
3342  }
3343 
3344  // Team is not found. Search list again for insertion point.
3345  l = list;
3346  while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3347  l = l->next;
3348  }
3349 
3350  // Insert team.
3351  {
3352  kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3353  sizeof(kmp_team_list_item_t));
3354  *item = *l;
3355  l->entry = team;
3356  l->next = item;
3357  }
3358 }
3359 
3360 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team
3361 
3362  ) {
3363  __kmp_printf("%s", title);
3364  if (team != NULL) {
3365  __kmp_printf("%2x %p\n", team->t.t_id, team);
3366  } else {
3367  __kmp_printf(" - (nil)\n");
3368  }
3369 }
3370 
3371 static void __kmp_print_structure_thread(char const *title,
3372  kmp_info_p const *thread) {
3373  __kmp_printf("%s", title);
3374  if (thread != NULL) {
3375  __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3376  } else {
3377  __kmp_printf(" - (nil)\n");
3378  }
3379 }
3380 
3381 void __kmp_print_structure(void) {
3382 
3383  kmp_team_list_t list;
3384 
3385  // Initialize list of teams.
3386  list =
3387  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3388  list->entry = NULL;
3389  list->next = NULL;
3390 
3391  __kmp_printf("\n------------------------------\nGlobal Thread "
3392  "Table\n------------------------------\n");
3393  {
3394  int gtid;
3395  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3396  __kmp_printf("%2d", gtid);
3397  if (__kmp_threads != NULL) {
3398  __kmp_printf(" %p", __kmp_threads[gtid]);
3399  }
3400  if (__kmp_root != NULL) {
3401  __kmp_printf(" %p", __kmp_root[gtid]);
3402  }
3403  __kmp_printf("\n");
3404  }
3405  }
3406 
3407  // Print out __kmp_threads array.
3408  __kmp_printf("\n------------------------------\nThreads\n--------------------"
3409  "----------\n");
3410  if (__kmp_threads != NULL) {
3411  int gtid;
3412  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3413  kmp_info_t const *thread = __kmp_threads[gtid];
3414  if (thread != NULL) {
3415  __kmp_printf("GTID %2d %p:\n", gtid, thread);
3416  __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3417  __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3418  __kmp_print_structure_team(" Serial Team: ",
3419  thread->th.th_serial_team);
3420  __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3421  __kmp_print_structure_thread(" Master: ",
3422  thread->th.th_team_master);
3423  __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3424  __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3425 #if OMP_40_ENABLED
3426  __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3427 #endif
3428  __kmp_print_structure_thread(" Next in pool: ",
3429  thread->th.th_next_pool);
3430  __kmp_printf("\n");
3431  __kmp_print_structure_team_accum(list, thread->th.th_team);
3432  __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3433  }
3434  }
3435  } else {
3436  __kmp_printf("Threads array is not allocated.\n");
3437  }
3438 
3439  // Print out __kmp_root array.
3440  __kmp_printf("\n------------------------------\nUbers\n----------------------"
3441  "--------\n");
3442  if (__kmp_root != NULL) {
3443  int gtid;
3444  for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3445  kmp_root_t const *root = __kmp_root[gtid];
3446  if (root != NULL) {
3447  __kmp_printf("GTID %2d %p:\n", gtid, root);
3448  __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3449  __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3450  __kmp_print_structure_thread(" Uber Thread: ",
3451  root->r.r_uber_thread);
3452  __kmp_printf(" Active?: %2d\n", root->r.r_active);
3453  __kmp_printf(" Nested?: %2d\n", root->r.r_nested);
3454  __kmp_printf(" In Parallel: %2d\n",
3455  KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3456  __kmp_printf("\n");
3457  __kmp_print_structure_team_accum(list, root->r.r_root_team);
3458  __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3459  }
3460  }
3461  } else {
3462  __kmp_printf("Ubers array is not allocated.\n");
3463  }
3464 
3465  __kmp_printf("\n------------------------------\nTeams\n----------------------"
3466  "--------\n");
3467  while (list->next != NULL) {
3468  kmp_team_p const *team = list->entry;
3469  int i;
3470  __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3471  __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3472  __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3473  __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3474  __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3475  __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3476  for (i = 0; i < team->t.t_nproc; ++i) {
3477  __kmp_printf(" Thread %2d: ", i);
3478  __kmp_print_structure_thread("", team->t.t_threads[i]);
3479  }
3480  __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3481  __kmp_printf("\n");
3482  list = list->next;
3483  }
3484 
3485  // Print out __kmp_thread_pool and __kmp_team_pool.
3486  __kmp_printf("\n------------------------------\nPools\n----------------------"
3487  "--------\n");
3488  __kmp_print_structure_thread("Thread pool: ",
3489  CCAST(kmp_info_t *, __kmp_thread_pool));
3490  __kmp_print_structure_team("Team pool: ",
3491  CCAST(kmp_team_t *, __kmp_team_pool));
3492  __kmp_printf("\n");
3493 
3494  // Free team list.
3495  while (list != NULL) {
3496  kmp_team_list_item_t *item = list;
3497  list = list->next;
3498  KMP_INTERNAL_FREE(item);
3499  }
3500 }
3501 
3502 #endif
3503 
3504 //---------------------------------------------------------------------------
3505 // Stuff for per-thread fast random number generator
3506 // Table of primes
3507 static const unsigned __kmp_primes[] = {
3508  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3509  0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3510  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3511  0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3512  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3513  0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3514  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3515  0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3516  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3517  0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3518  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3519 
3520 //---------------------------------------------------------------------------
3521 // __kmp_get_random: Get a random number using a linear congruential method.
3522 unsigned short __kmp_get_random(kmp_info_t *thread) {
3523  unsigned x = thread->th.th_x;
3524  unsigned short r = x >> 16;
3525 
3526  thread->th.th_x = x * thread->th.th_a + 1;
3527 
3528  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3529  thread->th.th_info.ds.ds_tid, r));
3530 
3531  return r;
3532 }
3533 //--------------------------------------------------------
3534 // __kmp_init_random: Initialize a random number generator
3535 void __kmp_init_random(kmp_info_t *thread) {
3536  unsigned seed = thread->th.th_info.ds.ds_tid;
3537 
3538  thread->th.th_a =
3539  __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3540  thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3541  KA_TRACE(30,
3542  ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3543 }
3544 
3545 #if KMP_OS_WINDOWS
3546 /* reclaim array entries for root threads that are already dead, returns number
3547  * reclaimed */
3548 static int __kmp_reclaim_dead_roots(void) {
3549  int i, r = 0;
3550 
3551  for (i = 0; i < __kmp_threads_capacity; ++i) {
3552  if (KMP_UBER_GTID(i) &&
3553  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3554  !__kmp_root[i]
3555  ->r.r_active) { // AC: reclaim only roots died in non-active state
3556  r += __kmp_unregister_root_other_thread(i);
3557  }
3558  }
3559  return r;
3560 }
3561 #endif
3562 
3563 /* This function attempts to create free entries in __kmp_threads and
3564  __kmp_root, and returns the number of free entries generated.
3565 
3566  For Windows* OS static library, the first mechanism used is to reclaim array
3567  entries for root threads that are already dead.
3568 
3569  On all platforms, expansion is attempted on the arrays __kmp_threads_ and
3570  __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3571  capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3572  threadprivate cache array has been created. Synchronization with
3573  __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3574 
3575  After any dead root reclamation, if the clipping value allows array expansion
3576  to result in the generation of a total of nNeed free slots, the function does
3577  that expansion. If not, nothing is done beyond the possible initial root
3578  thread reclamation.
3579 
3580  If any argument is negative, the behavior is undefined. */
3581 static int __kmp_expand_threads(int nNeed) {
3582  int added = 0;
3583  int minimumRequiredCapacity;
3584  int newCapacity;
3585  kmp_info_t **newThreads;
3586  kmp_root_t **newRoot;
3587 
3588 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3589 // resizing __kmp_threads does not need additional protection if foreign
3590 // threads are present
3591 
3592 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3593  /* only for Windows static library */
3594  /* reclaim array entries for root threads that are already dead */
3595  added = __kmp_reclaim_dead_roots();
3596 
3597  if (nNeed) {
3598  nNeed -= added;
3599  if (nNeed < 0)
3600  nNeed = 0;
3601  }
3602 #endif
3603  if (nNeed <= 0)
3604  return added;
3605 
3606  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3607  // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3608  // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3609  // > __kmp_max_nth in one of two ways:
3610  //
3611  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3612  // may not be resused by another thread, so we may need to increase
3613  // __kmp_threads_capacity to __kmp_max_nth + 1.
3614  //
3615  // 2) New foreign root(s) are encountered. We always register new foreign
3616  // roots. This may cause a smaller # of threads to be allocated at
3617  // subsequent parallel regions, but the worker threads hang around (and
3618  // eventually go to sleep) and need slots in the __kmp_threads[] array.
3619  //
3620  // Anyway, that is the reason for moving the check to see if
3621  // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3622  // instead of having it performed here. -BB
3623 
3624  KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3625 
3626  /* compute expansion headroom to check if we can expand */
3627  if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3628  /* possible expansion too small -- give up */
3629  return added;
3630  }
3631  minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3632 
3633  newCapacity = __kmp_threads_capacity;
3634  do {
3635  newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3636  : __kmp_sys_max_nth;
3637  } while (newCapacity < minimumRequiredCapacity);
3638  newThreads = (kmp_info_t **)__kmp_allocate(
3639  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3640  newRoot =
3641  (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3642  KMP_MEMCPY(newThreads, __kmp_threads,
3643  __kmp_threads_capacity * sizeof(kmp_info_t *));
3644  KMP_MEMCPY(newRoot, __kmp_root,
3645  __kmp_threads_capacity * sizeof(kmp_root_t *));
3646 
3647  kmp_info_t **temp_threads = __kmp_threads;
3648  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3649  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3650  __kmp_free(temp_threads);
3651  added += newCapacity - __kmp_threads_capacity;
3652  *(volatile int *)&__kmp_threads_capacity = newCapacity;
3653 
3654  if (newCapacity > __kmp_tp_capacity) {
3655  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3656  if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3657  __kmp_threadprivate_resize_cache(newCapacity);
3658  } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3659  *(volatile int *)&__kmp_tp_capacity = newCapacity;
3660  }
3661  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3662  }
3663 
3664  return added;
3665 }
3666 
3667 /* Register the current thread as a root thread and obtain our gtid. We must
3668  have the __kmp_initz_lock held at this point. Argument TRUE only if are the
3669  thread that calls from __kmp_do_serial_initialize() */
3670 int __kmp_register_root(int initial_thread) {
3671  kmp_info_t *root_thread;
3672  kmp_root_t *root;
3673  int gtid;
3674  int capacity;
3675  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3676  KA_TRACE(20, ("__kmp_register_root: entered\n"));
3677  KMP_MB();
3678 
3679  /* 2007-03-02:
3680  If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3681  initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3682  work as expected -- it may return false (that means there is at least one
3683  empty slot in __kmp_threads array), but it is possible the only free slot
3684  is #0, which is reserved for initial thread and so cannot be used for this
3685  one. Following code workarounds this bug.
3686 
3687  However, right solution seems to be not reserving slot #0 for initial
3688  thread because:
3689  (1) there is no magic in slot #0,
3690  (2) we cannot detect initial thread reliably (the first thread which does
3691  serial initialization may be not a real initial thread).
3692  */
3693  capacity = __kmp_threads_capacity;
3694  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3695  --capacity;
3696  }
3697 
3698  /* see if there are too many threads */
3699  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3700  if (__kmp_tp_cached) {
3701  __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3702  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3703  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3704  } else {
3705  __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3706  __kmp_msg_null);
3707  }
3708  }
3709 
3710  /* find an available thread slot */
3711  /* Don't reassign the zero slot since we need that to only be used by initial
3712  thread */
3713  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3714  gtid++)
3715  ;
3716  KA_TRACE(1,
3717  ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3718  KMP_ASSERT(gtid < __kmp_threads_capacity);
3719 
3720  /* update global accounting */
3721  __kmp_all_nth++;
3722  TCW_4(__kmp_nth, __kmp_nth + 1);
3723 
3724  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3725  // numbers of procs, and method #2 (keyed API call) for higher numbers.
3726  if (__kmp_adjust_gtid_mode) {
3727  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3728  if (TCR_4(__kmp_gtid_mode) != 2) {
3729  TCW_4(__kmp_gtid_mode, 2);
3730  }
3731  } else {
3732  if (TCR_4(__kmp_gtid_mode) != 1) {
3733  TCW_4(__kmp_gtid_mode, 1);
3734  }
3735  }
3736  }
3737 
3738 #ifdef KMP_ADJUST_BLOCKTIME
3739  /* Adjust blocktime to zero if necessary */
3740  /* Middle initialization might not have occurred yet */
3741  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3742  if (__kmp_nth > __kmp_avail_proc) {
3743  __kmp_zero_bt = TRUE;
3744  }
3745  }
3746 #endif /* KMP_ADJUST_BLOCKTIME */
3747 
3748  /* setup this new hierarchy */
3749  if (!(root = __kmp_root[gtid])) {
3750  root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3751  KMP_DEBUG_ASSERT(!root->r.r_root_team);
3752  }
3753 
3754 #if KMP_STATS_ENABLED
3755  // Initialize stats as soon as possible (right after gtid assignment).
3756  __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3757  __kmp_stats_thread_ptr->startLife();
3758  KMP_SET_THREAD_STATE(SERIAL_REGION);
3759  KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3760 #endif
3761  __kmp_initialize_root(root);
3762 
3763  /* setup new root thread structure */
3764  if (root->r.r_uber_thread) {
3765  root_thread = root->r.r_uber_thread;
3766  } else {
3767  root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3768  if (__kmp_storage_map) {
3769  __kmp_print_thread_storage_map(root_thread, gtid);
3770  }
3771  root_thread->th.th_info.ds.ds_gtid = gtid;
3772 #if OMPT_SUPPORT
3773  root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3774 #endif
3775  root_thread->th.th_root = root;
3776  if (__kmp_env_consistency_check) {
3777  root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3778  }
3779 #if USE_FAST_MEMORY
3780  __kmp_initialize_fast_memory(root_thread);
3781 #endif /* USE_FAST_MEMORY */
3782 
3783 #if KMP_USE_BGET
3784  KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3785  __kmp_initialize_bget(root_thread);
3786 #endif
3787  __kmp_init_random(root_thread); // Initialize random number generator
3788  }
3789 
3790  /* setup the serial team held in reserve by the root thread */
3791  if (!root_thread->th.th_serial_team) {
3792  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3793  KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3794  root_thread->th.th_serial_team =
3795  __kmp_allocate_team(root, 1, 1,
3796 #if OMPT_SUPPORT
3797  ompt_data_none, // root parallel id
3798 #endif
3799 #if OMP_40_ENABLED
3800  proc_bind_default,
3801 #endif
3802  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3803  }
3804  KMP_ASSERT(root_thread->th.th_serial_team);
3805  KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3806  root_thread->th.th_serial_team));
3807 
3808  /* drop root_thread into place */
3809  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3810 
3811  root->r.r_root_team->t.t_threads[0] = root_thread;
3812  root->r.r_hot_team->t.t_threads[0] = root_thread;
3813  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3814  // AC: the team created in reserve, not for execution (it is unused for now).
3815  root_thread->th.th_serial_team->t.t_serialized = 0;
3816  root->r.r_uber_thread = root_thread;
3817 
3818  /* initialize the thread, get it ready to go */
3819  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3820  TCW_4(__kmp_init_gtid, TRUE);
3821 
3822  /* prepare the master thread for get_gtid() */
3823  __kmp_gtid_set_specific(gtid);
3824 
3825 #if USE_ITT_BUILD
3826  __kmp_itt_thread_name(gtid);
3827 #endif /* USE_ITT_BUILD */
3828 
3829 #ifdef KMP_TDATA_GTID
3830  __kmp_gtid = gtid;
3831 #endif
3832  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3833  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3834 
3835  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3836  "plain=%u\n",
3837  gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3838  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3839  KMP_INIT_BARRIER_STATE));
3840  { // Initialize barrier data.
3841  int b;
3842  for (b = 0; b < bs_last_barrier; ++b) {
3843  root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3844 #if USE_DEBUGGER
3845  root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3846 #endif
3847  }
3848  }
3849  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3850  KMP_INIT_BARRIER_STATE);
3851 
3852 #if KMP_AFFINITY_SUPPORTED
3853 #if OMP_40_ENABLED
3854  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3855  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3856  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3857  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3858 #endif
3859  if (TCR_4(__kmp_init_middle)) {
3860  __kmp_affinity_set_init_mask(gtid, TRUE);
3861  }
3862 #endif /* KMP_AFFINITY_SUPPORTED */
3863 #if OMP_50_ENABLED
3864  root_thread->th.th_def_allocator = __kmp_def_allocator;
3865  root_thread->th.th_prev_level = 0;
3866  root_thread->th.th_prev_num_threads = 1;
3867 #endif
3868 
3869  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3870  tmp->cg_root = root_thread;
3871  tmp->cg_thread_limit = __kmp_cg_max_nth;
3872  tmp->cg_nthreads = 1;
3873  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3874  " cg_nthreads init to 1\n",
3875  root_thread, tmp));
3876  tmp->up = NULL;
3877  root_thread->th.th_cg_roots = tmp;
3878 
3879  __kmp_root_counter++;
3880 
3881 #if OMPT_SUPPORT
3882  if (!initial_thread && ompt_enabled.enabled) {
3883 
3884  kmp_info_t *root_thread = ompt_get_thread();
3885 
3886  ompt_set_thread_state(root_thread, ompt_state_overhead);
3887 
3888  if (ompt_enabled.ompt_callback_thread_begin) {
3889  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3890  ompt_thread_initial, __ompt_get_thread_data_internal());
3891  }
3892  ompt_data_t *task_data;
3893  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
3894  if (ompt_enabled.ompt_callback_task_create) {
3895  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
3896  NULL, NULL, task_data, ompt_task_initial, 0, NULL);
3897  // initial task has nothing to return to
3898  }
3899 
3900  ompt_set_thread_state(root_thread, ompt_state_work_serial);
3901  }
3902 #endif
3903 
3904  KMP_MB();
3905  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3906 
3907  return gtid;
3908 }
3909 
3910 #if KMP_NESTED_HOT_TEAMS
3911 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3912  const int max_level) {
3913  int i, n, nth;
3914  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3915  if (!hot_teams || !hot_teams[level].hot_team) {
3916  return 0;
3917  }
3918  KMP_DEBUG_ASSERT(level < max_level);
3919  kmp_team_t *team = hot_teams[level].hot_team;
3920  nth = hot_teams[level].hot_team_nth;
3921  n = nth - 1; // master is not freed
3922  if (level < max_level - 1) {
3923  for (i = 0; i < nth; ++i) {
3924  kmp_info_t *th = team->t.t_threads[i];
3925  n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3926  if (i > 0 && th->th.th_hot_teams) {
3927  __kmp_free(th->th.th_hot_teams);
3928  th->th.th_hot_teams = NULL;
3929  }
3930  }
3931  }
3932  __kmp_free_team(root, team, NULL);
3933  return n;
3934 }
3935 #endif
3936 
3937 // Resets a root thread and clear its root and hot teams.
3938 // Returns the number of __kmp_threads entries directly and indirectly freed.
3939 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3940  kmp_team_t *root_team = root->r.r_root_team;
3941  kmp_team_t *hot_team = root->r.r_hot_team;
3942  int n = hot_team->t.t_nproc;
3943  int i;
3944 
3945  KMP_DEBUG_ASSERT(!root->r.r_active);
3946 
3947  root->r.r_root_team = NULL;
3948  root->r.r_hot_team = NULL;
3949  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3950  // before call to __kmp_free_team().
3951  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3952 #if KMP_NESTED_HOT_TEAMS
3953  if (__kmp_hot_teams_max_level >
3954  0) { // need to free nested hot teams and their threads if any
3955  for (i = 0; i < hot_team->t.t_nproc; ++i) {
3956  kmp_info_t *th = hot_team->t.t_threads[i];
3957  if (__kmp_hot_teams_max_level > 1) {
3958  n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3959  }
3960  if (th->th.th_hot_teams) {
3961  __kmp_free(th->th.th_hot_teams);
3962  th->th.th_hot_teams = NULL;
3963  }
3964  }
3965  }
3966 #endif
3967  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3968 
3969  // Before we can reap the thread, we need to make certain that all other
3970  // threads in the teams that had this root as ancestor have stopped trying to
3971  // steal tasks.
3972  if (__kmp_tasking_mode != tskm_immediate_exec) {
3973  __kmp_wait_to_unref_task_teams();
3974  }
3975 
3976 #if KMP_OS_WINDOWS
3977  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3978  KA_TRACE(
3979  10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3980  "\n",
3981  (LPVOID) & (root->r.r_uber_thread->th),
3982  root->r.r_uber_thread->th.th_info.ds.ds_thread));
3983  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3984 #endif /* KMP_OS_WINDOWS */
3985 
3986 #if OMPT_SUPPORT
3987  if (ompt_enabled.ompt_callback_thread_end) {
3988  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3989  &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3990  }
3991 #endif
3992 
3993  TCW_4(__kmp_nth,
3994  __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3995  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3996  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3997  " to %d\n",
3998  root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3999  root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4000 
4001  __kmp_reap_thread(root->r.r_uber_thread, 1);
4002 
4003  // We canot put root thread to __kmp_thread_pool, so we have to reap it istead
4004  // of freeing.
4005  root->r.r_uber_thread = NULL;
4006  /* mark root as no longer in use */
4007  root->r.r_begin = FALSE;
4008 
4009  return n;
4010 }
4011 
4012 void __kmp_unregister_root_current_thread(int gtid) {
4013  KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4014  /* this lock should be ok, since unregister_root_current_thread is never
4015  called during an abort, only during a normal close. furthermore, if you
4016  have the forkjoin lock, you should never try to get the initz lock */
4017  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4018  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4019  KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4020  "exiting T#%d\n",
4021  gtid));
4022  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4023  return;
4024  }
4025  kmp_root_t *root = __kmp_root[gtid];
4026 
4027  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4028  KMP_ASSERT(KMP_UBER_GTID(gtid));
4029  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4030  KMP_ASSERT(root->r.r_active == FALSE);
4031 
4032  KMP_MB();
4033 
4034 #if OMP_45_ENABLED
4035  kmp_info_t *thread = __kmp_threads[gtid];
4036  kmp_team_t *team = thread->th.th_team;
4037  kmp_task_team_t *task_team = thread->th.th_task_team;
4038 
4039  // we need to wait for the proxy tasks before finishing the thread
4040  if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
4041 #if OMPT_SUPPORT
4042  // the runtime is shutting down so we won't report any events
4043  thread->th.ompt_thread_info.state = ompt_state_undefined;
4044 #endif
4045  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4046  }
4047 #endif
4048 
4049  __kmp_reset_root(gtid, root);
4050 
4051  /* free up this thread slot */
4052  __kmp_gtid_set_specific(KMP_GTID_DNE);
4053 #ifdef KMP_TDATA_GTID
4054  __kmp_gtid = KMP_GTID_DNE;
4055 #endif
4056 
4057  KMP_MB();
4058  KC_TRACE(10,
4059  ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4060 
4061  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4062 }
4063 
4064 #if KMP_OS_WINDOWS
4065 /* __kmp_forkjoin_lock must be already held
4066  Unregisters a root thread that is not the current thread. Returns the number
4067  of __kmp_threads entries freed as a result. */
4068 static int __kmp_unregister_root_other_thread(int gtid) {
4069  kmp_root_t *root = __kmp_root[gtid];
4070  int r;
4071 
4072  KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4073  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4074  KMP_ASSERT(KMP_UBER_GTID(gtid));
4075  KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4076  KMP_ASSERT(root->r.r_active == FALSE);
4077 
4078  r = __kmp_reset_root(gtid, root);
4079  KC_TRACE(10,
4080  ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4081  return r;
4082 }
4083 #endif
4084 
4085 #if KMP_DEBUG
4086 void __kmp_task_info() {
4087 
4088  kmp_int32 gtid = __kmp_entry_gtid();
4089  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4090  kmp_info_t *this_thr = __kmp_threads[gtid];
4091  kmp_team_t *steam = this_thr->th.th_serial_team;
4092  kmp_team_t *team = this_thr->th.th_team;
4093 
4094  __kmp_printf(
4095  "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4096  "ptask=%p\n",
4097  gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4098  team->t.t_implicit_task_taskdata[tid].td_parent);
4099 }
4100 #endif // KMP_DEBUG
4101 
4102 /* TODO optimize with one big memclr, take out what isn't needed, split
4103  responsibility to workers as much as possible, and delay initialization of
4104  features as much as possible */
4105 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4106  int tid, int gtid) {
4107  /* this_thr->th.th_info.ds.ds_gtid is setup in
4108  kmp_allocate_thread/create_worker.
4109  this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4110  kmp_info_t *master = team->t.t_threads[0];
4111  KMP_DEBUG_ASSERT(this_thr != NULL);
4112  KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4113  KMP_DEBUG_ASSERT(team);
4114  KMP_DEBUG_ASSERT(team->t.t_threads);
4115  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4116  KMP_DEBUG_ASSERT(master);
4117  KMP_DEBUG_ASSERT(master->th.th_root);
4118 
4119  KMP_MB();
4120 
4121  TCW_SYNC_PTR(this_thr->th.th_team, team);
4122 
4123  this_thr->th.th_info.ds.ds_tid = tid;
4124  this_thr->th.th_set_nproc = 0;
4125  if (__kmp_tasking_mode != tskm_immediate_exec)
4126  // When tasking is possible, threads are not safe to reap until they are
4127  // done tasking; this will be set when tasking code is exited in wait
4128  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4129  else // no tasking --> always safe to reap
4130  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4131 #if OMP_40_ENABLED
4132  this_thr->th.th_set_proc_bind = proc_bind_default;
4133 #if KMP_AFFINITY_SUPPORTED
4134  this_thr->th.th_new_place = this_thr->th.th_current_place;
4135 #endif
4136 #endif
4137  this_thr->th.th_root = master->th.th_root;
4138 
4139  /* setup the thread's cache of the team structure */
4140  this_thr->th.th_team_nproc = team->t.t_nproc;
4141  this_thr->th.th_team_master = master;
4142  this_thr->th.th_team_serialized = team->t.t_serialized;
4143  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4144 
4145  KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4146 
4147  KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4148  tid, gtid, this_thr, this_thr->th.th_current_task));
4149 
4150  __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4151  team, tid, TRUE);
4152 
4153  KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4154  tid, gtid, this_thr, this_thr->th.th_current_task));
4155  // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4156  // __kmp_initialize_team()?
4157 
4158  /* TODO no worksharing in speculative threads */
4159  this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4160 
4161  this_thr->th.th_local.this_construct = 0;
4162 
4163  if (!this_thr->th.th_pri_common) {
4164  this_thr->th.th_pri_common =
4165  (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4166  if (__kmp_storage_map) {
4167  __kmp_print_storage_map_gtid(
4168  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4169  sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4170  }
4171  this_thr->th.th_pri_head = NULL;
4172  }
4173 
4174  if (this_thr != master && // Master's CG root is initialized elsewhere
4175  this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4176  // Make new thread's CG root same as master's
4177  KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4178  this_thr->th.th_cg_roots = master->th.th_cg_roots;
4179  // Increment new thread's CG root's counter to add the new thread
4180  this_thr->th.th_cg_roots->cg_nthreads++;
4181  KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4182  " node %p of thread %p to %d\n",
4183  this_thr, this_thr->th.th_cg_roots,
4184  this_thr->th.th_cg_roots->cg_root,
4185  this_thr->th.th_cg_roots->cg_nthreads));
4186  this_thr->th.th_current_task->td_icvs.thread_limit =
4187  this_thr->th.th_cg_roots->cg_thread_limit;
4188  }
4189 
4190  /* Initialize dynamic dispatch */
4191  {
4192  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4193  // Use team max_nproc since this will never change for the team.
4194  size_t disp_size =
4195  sizeof(dispatch_private_info_t) *
4196  (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4197  KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4198  team->t.t_max_nproc));
4199  KMP_ASSERT(dispatch);
4200  KMP_DEBUG_ASSERT(team->t.t_dispatch);
4201  KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4202 
4203  dispatch->th_disp_index = 0;
4204 #if OMP_45_ENABLED
4205  dispatch->th_doacross_buf_idx = 0;
4206 #endif
4207  if (!dispatch->th_disp_buffer) {
4208  dispatch->th_disp_buffer =
4209  (dispatch_private_info_t *)__kmp_allocate(disp_size);
4210 
4211  if (__kmp_storage_map) {
4212  __kmp_print_storage_map_gtid(
4213  gtid, &dispatch->th_disp_buffer[0],
4214  &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4215  ? 1
4216  : __kmp_dispatch_num_buffers],
4217  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4218  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4219  gtid, team->t.t_id, gtid);
4220  }
4221  } else {
4222  memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4223  }
4224 
4225  dispatch->th_dispatch_pr_current = 0;
4226  dispatch->th_dispatch_sh_current = 0;
4227 
4228  dispatch->th_deo_fcn = 0; /* ORDERED */
4229  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4230  }
4231 
4232  this_thr->th.th_next_pool = NULL;
4233 
4234  if (!this_thr->th.th_task_state_memo_stack) {
4235  size_t i;
4236  this_thr->th.th_task_state_memo_stack =
4237  (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4238  this_thr->th.th_task_state_top = 0;
4239  this_thr->th.th_task_state_stack_sz = 4;
4240  for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4241  ++i) // zero init the stack
4242  this_thr->th.th_task_state_memo_stack[i] = 0;
4243  }
4244 
4245  KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4246  KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4247 
4248  KMP_MB();
4249 }
4250 
4251 /* allocate a new thread for the requesting team. this is only called from
4252  within a forkjoin critical section. we will first try to get an available
4253  thread from the thread pool. if none is available, we will fork a new one
4254  assuming we are able to create a new one. this should be assured, as the
4255  caller should check on this first. */
4256 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4257  int new_tid) {
4258  kmp_team_t *serial_team;
4259  kmp_info_t *new_thr;
4260  int new_gtid;
4261 
4262  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4263  KMP_DEBUG_ASSERT(root && team);
4264 #if !KMP_NESTED_HOT_TEAMS
4265  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4266 #endif
4267  KMP_MB();
4268 
4269  /* first, try to get one from the thread pool */
4270  if (__kmp_thread_pool) {
4271  new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4272  __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4273  if (new_thr == __kmp_thread_pool_insert_pt) {
4274  __kmp_thread_pool_insert_pt = NULL;
4275  }
4276  TCW_4(new_thr->th.th_in_pool, FALSE);
4277  // Don't touch th_active_in_pool or th_active.
4278  // The worker thread adjusts those flags as it sleeps/awakens.
4279  __kmp_thread_pool_nth--;
4280 
4281  KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4282  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4283  KMP_ASSERT(!new_thr->th.th_team);
4284  KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4285  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0);
4286 
4287  /* setup the thread structure */
4288  __kmp_initialize_info(new_thr, team, new_tid,
4289  new_thr->th.th_info.ds.ds_gtid);
4290  KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4291 
4292  TCW_4(__kmp_nth, __kmp_nth + 1);
4293 
4294  new_thr->th.th_task_state = 0;
4295  new_thr->th.th_task_state_top = 0;
4296  new_thr->th.th_task_state_stack_sz = 4;
4297 
4298 #ifdef KMP_ADJUST_BLOCKTIME
4299  /* Adjust blocktime back to zero if necessary */
4300  /* Middle initialization might not have occurred yet */
4301  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4302  if (__kmp_nth > __kmp_avail_proc) {
4303  __kmp_zero_bt = TRUE;
4304  }
4305  }
4306 #endif /* KMP_ADJUST_BLOCKTIME */
4307 
4308 #if KMP_DEBUG
4309  // If thread entered pool via __kmp_free_thread, wait_flag should !=
4310  // KMP_BARRIER_PARENT_FLAG.
4311  int b;
4312  kmp_balign_t *balign = new_thr->th.th_bar;
4313  for (b = 0; b < bs_last_barrier; ++b)
4314  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4315 #endif
4316 
4317  KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4318  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4319 
4320  KMP_MB();
4321  return new_thr;
4322  }
4323 
4324  /* no, well fork a new one */
4325  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4326  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4327 
4328 #if KMP_USE_MONITOR
4329  // If this is the first worker thread the RTL is creating, then also
4330  // launch the monitor thread. We try to do this as early as possible.
4331  if (!TCR_4(__kmp_init_monitor)) {
4332  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4333  if (!TCR_4(__kmp_init_monitor)) {
4334  KF_TRACE(10, ("before __kmp_create_monitor\n"));
4335  TCW_4(__kmp_init_monitor, 1);
4336  __kmp_create_monitor(&__kmp_monitor);
4337  KF_TRACE(10, ("after __kmp_create_monitor\n"));
4338 #if KMP_OS_WINDOWS
4339  // AC: wait until monitor has started. This is a fix for CQ232808.
4340  // The reason is that if the library is loaded/unloaded in a loop with
4341  // small (parallel) work in between, then there is high probability that
4342  // monitor thread started after the library shutdown. At shutdown it is
4343  // too late to cope with the problem, because when the master is in
4344  // DllMain (process detach) the monitor has no chances to start (it is
4345  // blocked), and master has no means to inform the monitor that the
4346  // library has gone, because all the memory which the monitor can access
4347  // is going to be released/reset.
4348  while (TCR_4(__kmp_init_monitor) < 2) {
4349  KMP_YIELD(TRUE);
4350  }
4351  KF_TRACE(10, ("after monitor thread has started\n"));
4352 #endif
4353  }
4354  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4355  }
4356 #endif
4357 
4358  KMP_MB();
4359  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4360  KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4361  }
4362 
4363  /* allocate space for it. */
4364  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4365 
4366  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4367 
4368  if (__kmp_storage_map) {
4369  __kmp_print_thread_storage_map(new_thr, new_gtid);
4370  }
4371 
4372  // add the reserve serialized team, initialized from the team's master thread
4373  {
4374  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4375  KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4376  new_thr->th.th_serial_team = serial_team =
4377  (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4378 #if OMPT_SUPPORT
4379  ompt_data_none, // root parallel id
4380 #endif
4381 #if OMP_40_ENABLED
4382  proc_bind_default,
4383 #endif
4384  &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
4385  }
4386  KMP_ASSERT(serial_team);
4387  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4388  // execution (it is unused for now).
4389  serial_team->t.t_threads[0] = new_thr;
4390  KF_TRACE(10,
4391  ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4392  new_thr));
4393 
4394  /* setup the thread structures */
4395  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4396 
4397 #if USE_FAST_MEMORY
4398  __kmp_initialize_fast_memory(new_thr);
4399 #endif /* USE_FAST_MEMORY */
4400 
4401 #if KMP_USE_BGET
4402  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4403  __kmp_initialize_bget(new_thr);
4404 #endif
4405 
4406  __kmp_init_random(new_thr); // Initialize random number generator
4407 
4408  /* Initialize these only once when thread is grabbed for a team allocation */
4409  KA_TRACE(20,
4410  ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4411  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4412 
4413  int b;
4414  kmp_balign_t *balign = new_thr->th.th_bar;
4415  for (b = 0; b < bs_last_barrier; ++b) {
4416  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4417  balign[b].bb.team = NULL;
4418  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4419  balign[b].bb.use_oncore_barrier = 0;
4420  }
4421 
4422  new_thr->th.th_spin_here = FALSE;
4423  new_thr->th.th_next_waiting = 0;
4424 #if KMP_OS_UNIX
4425  new_thr->th.th_blocking = false;
4426 #endif
4427 
4428 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4429  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4430  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4431  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4432  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4433 #endif
4434 #if OMP_50_ENABLED
4435  new_thr->th.th_def_allocator = __kmp_def_allocator;
4436  new_thr->th.th_prev_level = 0;
4437  new_thr->th.th_prev_num_threads = 1;
4438 #endif
4439 
4440  TCW_4(new_thr->th.th_in_pool, FALSE);
4441  new_thr->th.th_active_in_pool = FALSE;
4442  TCW_4(new_thr->th.th_active, TRUE);
4443 
4444  /* adjust the global counters */
4445  __kmp_all_nth++;
4446  __kmp_nth++;
4447 
4448  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4449  // numbers of procs, and method #2 (keyed API call) for higher numbers.
4450  if (__kmp_adjust_gtid_mode) {
4451  if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4452  if (TCR_4(__kmp_gtid_mode) != 2) {
4453  TCW_4(__kmp_gtid_mode, 2);
4454  }
4455  } else {
4456  if (TCR_4(__kmp_gtid_mode) != 1) {
4457  TCW_4(__kmp_gtid_mode, 1);
4458  }
4459  }
4460  }
4461 
4462 #ifdef KMP_ADJUST_BLOCKTIME
4463  /* Adjust blocktime back to zero if necessary */
4464  /* Middle initialization might not have occurred yet */
4465  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4466  if (__kmp_nth > __kmp_avail_proc) {
4467  __kmp_zero_bt = TRUE;
4468  }
4469  }
4470 #endif /* KMP_ADJUST_BLOCKTIME */
4471 
4472  /* actually fork it and create the new worker thread */
4473  KF_TRACE(
4474  10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4475  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4476  KF_TRACE(10,
4477  ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4478 
4479  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4480  new_gtid));
4481  KMP_MB();
4482  return new_thr;
4483 }
4484 
4485 /* Reinitialize team for reuse.
4486  The hot team code calls this case at every fork barrier, so EPCC barrier
4487  test are extremely sensitive to changes in it, esp. writes to the team
4488  struct, which cause a cache invalidation in all threads.
4489  IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4490 static void __kmp_reinitialize_team(kmp_team_t *team,
4491  kmp_internal_control_t *new_icvs,
4492  ident_t *loc) {
4493  KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4494  team->t.t_threads[0], team));
4495  KMP_DEBUG_ASSERT(team && new_icvs);
4496  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4497  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4498 
4499  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4500  // Copy ICVs to the master thread's implicit taskdata
4501  __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4502  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4503 
4504  KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4505  team->t.t_threads[0], team));
4506 }
4507 
4508 /* Initialize the team data structure.
4509  This assumes the t_threads and t_max_nproc are already set.
4510  Also, we don't touch the arguments */
4511 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4512  kmp_internal_control_t *new_icvs,
4513  ident_t *loc) {
4514  KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4515 
4516  /* verify */
4517  KMP_DEBUG_ASSERT(team);
4518  KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4519  KMP_DEBUG_ASSERT(team->t.t_threads);
4520  KMP_MB();
4521 
4522  team->t.t_master_tid = 0; /* not needed */
4523  /* team->t.t_master_bar; not needed */
4524  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4525  team->t.t_nproc = new_nproc;
4526 
4527  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4528  team->t.t_next_pool = NULL;
4529  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4530  * up hot team */
4531 
4532  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4533  team->t.t_invoke = NULL; /* not needed */
4534 
4535  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4536  team->t.t_sched.sched = new_icvs->sched.sched;
4537 
4538 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4539  team->t.t_fp_control_saved = FALSE; /* not needed */
4540  team->t.t_x87_fpu_control_word = 0; /* not needed */
4541  team->t.t_mxcsr = 0; /* not needed */
4542 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4543 
4544  team->t.t_construct = 0;
4545 
4546  team->t.t_ordered.dt.t_value = 0;
4547  team->t.t_master_active = FALSE;
4548 
4549  memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t));
4550 
4551 #ifdef KMP_DEBUG
4552  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4553 #endif
4554 #if KMP_OS_WINDOWS
4555  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4556 #endif
4557 
4558  team->t.t_control_stack_top = NULL;
4559 
4560  __kmp_reinitialize_team(team, new_icvs, loc);
4561 
4562  KMP_MB();
4563  KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4564 }
4565 
4566 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4567 /* Sets full mask for thread and returns old mask, no changes to structures. */
4568 static void
4569 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4570  if (KMP_AFFINITY_CAPABLE()) {
4571  int status;
4572  if (old_mask != NULL) {
4573  status = __kmp_get_system_affinity(old_mask, TRUE);
4574  int error = errno;
4575  if (status != 0) {
4576  __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4577  __kmp_msg_null);
4578  }
4579  }
4580  __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4581  }
4582 }
4583 #endif
4584 
4585 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4586 
4587 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4588 // It calculats the worker + master thread's partition based upon the parent
4589 // thread's partition, and binds each worker to a thread in their partition.
4590 // The master thread's partition should already include its current binding.
4591 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4592  // Copy the master thread's place partion to the team struct
4593  kmp_info_t *master_th = team->t.t_threads[0];
4594  KMP_DEBUG_ASSERT(master_th != NULL);
4595  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4596  int first_place = master_th->th.th_first_place;
4597  int last_place = master_th->th.th_last_place;
4598  int masters_place = master_th->th.th_current_place;
4599  team->t.t_first_place = first_place;
4600  team->t.t_last_place = last_place;
4601 
4602  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4603  "bound to place %d partition = [%d,%d]\n",
4604  proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4605  team->t.t_id, masters_place, first_place, last_place));
4606 
4607  switch (proc_bind) {
4608 
4609  case proc_bind_default:
4610  // serial teams might have the proc_bind policy set to proc_bind_default. It
4611  // doesn't matter, as we don't rebind master thread for any proc_bind policy
4612  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4613  break;
4614 
4615  case proc_bind_master: {
4616  int f;
4617  int n_th = team->t.t_nproc;
4618  for (f = 1; f < n_th; f++) {
4619  kmp_info_t *th = team->t.t_threads[f];
4620  KMP_DEBUG_ASSERT(th != NULL);
4621  th->th.th_first_place = first_place;
4622  th->th.th_last_place = last_place;
4623  th->th.th_new_place = masters_place;
4624 #if OMP_50_ENABLED
4625  if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4626  team->t.t_display_affinity != 1) {
4627  team->t.t_display_affinity = 1;
4628  }
4629 #endif
4630 
4631  KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4632  "partition = [%d,%d]\n",
4633  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4634  f, masters_place, first_place, last_place));
4635  }
4636  } break;
4637 
4638  case proc_bind_close: {
4639  int f;
4640  int n_th = team->t.t_nproc;
4641  int n_places;
4642  if (first_place <= last_place) {
4643  n_places = last_place - first_place + 1;
4644  } else {
4645  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4646  }
4647  if (n_th <= n_places) {
4648  int place = masters_place;
4649  for (f = 1; f < n_th; f++) {
4650  kmp_info_t *th = team->t.t_threads[f];
4651  KMP_DEBUG_ASSERT(th != NULL);
4652 
4653  if (place == last_place) {
4654  place = first_place;
4655  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4656  place = 0;
4657  } else {
4658  place++;
4659  }
4660  th->th.th_first_place = first_place;
4661  th->th.th_last_place = last_place;
4662  th->th.th_new_place = place;
4663 #if OMP_50_ENABLED
4664  if (__kmp_display_affinity && place != th->th.th_current_place &&
4665  team->t.t_display_affinity != 1) {
4666  team->t.t_display_affinity = 1;
4667  }
4668 #endif
4669 
4670  KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4671  "partition = [%d,%d]\n",
4672  __kmp_gtid_from_thread(team->t.t_threads[f]),
4673  team->t.t_id, f, place, first_place, last_place));
4674  }
4675  } else {
4676  int S, rem, gap, s_count;
4677  S = n_th / n_places;
4678  s_count = 0;
4679  rem = n_th - (S * n_places);
4680  gap = rem > 0 ? n_places / rem : n_places;
4681  int place = masters_place;
4682  int gap_ct = gap;
4683  for (f = 0; f < n_th; f++) {
4684  kmp_info_t *th = team->t.t_threads[f];
4685  KMP_DEBUG_ASSERT(th != NULL);
4686 
4687  th->th.th_first_place = first_place;
4688  th->th.th_last_place = last_place;
4689  th->th.th_new_place = place;
4690 #if OMP_50_ENABLED
4691  if (__kmp_display_affinity && place != th->th.th_current_place &&
4692  team->t.t_display_affinity != 1) {
4693  team->t.t_display_affinity = 1;
4694  }
4695 #endif
4696  s_count++;
4697 
4698  if ((s_count == S) && rem && (gap_ct == gap)) {
4699  // do nothing, add an extra thread to place on next iteration
4700  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4701  // we added an extra thread to this place; move to next place
4702  if (place == last_place) {
4703  place = first_place;
4704  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4705  place = 0;
4706  } else {
4707  place++;
4708  }
4709  s_count = 0;
4710  gap_ct = 1;
4711  rem--;
4712  } else if (s_count == S) { // place full; don't add extra
4713  if (place == last_place) {
4714  place = first_place;
4715  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4716  place = 0;
4717  } else {
4718  place++;
4719  }
4720  gap_ct++;
4721  s_count = 0;
4722  }
4723 
4724  KA_TRACE(100,
4725  ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4726  "partition = [%d,%d]\n",
4727  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4728  th->th.th_new_place, first_place, last_place));
4729  }
4730  KMP_DEBUG_ASSERT(place == masters_place);
4731  }
4732  } break;
4733 
4734  case proc_bind_spread: {
4735  int f;
4736  int n_th = team->t.t_nproc;
4737  int n_places;
4738  int thidx;
4739  if (first_place <= last_place) {
4740  n_places = last_place - first_place + 1;
4741  } else {
4742  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4743  }
4744  if (n_th <= n_places) {
4745  int place = -1;
4746 
4747  if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4748  int S = n_places / n_th;
4749  int s_count, rem, gap, gap_ct;
4750 
4751  place = masters_place;
4752  rem = n_places - n_th * S;
4753  gap = rem ? n_th / rem : 1;
4754  gap_ct = gap;
4755  thidx = n_th;
4756  if (update_master_only == 1)
4757  thidx = 1;
4758  for (f = 0; f < thidx; f++) {
4759  kmp_info_t *th = team->t.t_threads[f];
4760  KMP_DEBUG_ASSERT(th != NULL);
4761 
4762  th->th.th_first_place = place;
4763  th->th.th_new_place = place;
4764 #if OMP_50_ENABLED
4765  if (__kmp_display_affinity && place != th->th.th_current_place &&
4766  team->t.t_display_affinity != 1) {
4767  team->t.t_display_affinity = 1;
4768  }
4769 #endif
4770  s_count = 1;
4771  while (s_count < S) {
4772  if (place == last_place) {
4773  place = first_place;
4774  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4775  place = 0;
4776  } else {
4777  place++;
4778  }
4779  s_count++;
4780  }
4781  if (rem && (gap_ct == gap)) {
4782  if (place == last_place) {
4783  place = first_place;
4784  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4785  place = 0;
4786  } else {
4787  place++;
4788  }
4789  rem--;
4790  gap_ct = 0;
4791  }
4792  th->th.th_last_place = place;
4793  gap_ct++;
4794 
4795  if (place == last_place) {
4796  place = first_place;
4797  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4798  place = 0;
4799  } else {
4800  place++;
4801  }
4802 
4803  KA_TRACE(100,
4804  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4805  "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4806  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4807  f, th->th.th_new_place, th->th.th_first_place,
4808  th->th.th_last_place, __kmp_affinity_num_masks));
4809  }
4810  } else {
4811  /* Having uniform space of available computation places I can create
4812  T partitions of round(P/T) size and put threads into the first
4813  place of each partition. */
4814  double current = static_cast<double>(masters_place);
4815  double spacing =
4816  (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4817  int first, last;
4818  kmp_info_t *th;
4819 
4820  thidx = n_th + 1;
4821  if (update_master_only == 1)
4822  thidx = 1;
4823  for (f = 0; f < thidx; f++) {
4824  first = static_cast<int>(current);
4825  last = static_cast<int>(current + spacing) - 1;
4826  KMP_DEBUG_ASSERT(last >= first);
4827  if (first >= n_places) {
4828  if (masters_place) {
4829  first -= n_places;
4830  last -= n_places;
4831  if (first == (masters_place + 1)) {
4832  KMP_DEBUG_ASSERT(f == n_th);
4833  first--;
4834  }
4835  if (last == masters_place) {
4836  KMP_DEBUG_ASSERT(f == (n_th - 1));
4837  last--;
4838  }
4839  } else {
4840  KMP_DEBUG_ASSERT(f == n_th);
4841  first = 0;
4842  last = 0;
4843  }
4844  }
4845  if (last >= n_places) {
4846  last = (n_places - 1);
4847  }
4848  place = first;
4849  current += spacing;
4850  if (f < n_th) {
4851  KMP_DEBUG_ASSERT(0 <= first);
4852  KMP_DEBUG_ASSERT(n_places > first);
4853  KMP_DEBUG_ASSERT(0 <= last);
4854  KMP_DEBUG_ASSERT(n_places > last);
4855  KMP_DEBUG_ASSERT(last_place >= first_place);
4856  th = team->t.t_threads[f];
4857  KMP_DEBUG_ASSERT(th);
4858  th->th.th_first_place = first;
4859  th->th.th_new_place = place;
4860  th->th.th_last_place = last;
4861 #if OMP_50_ENABLED
4862  if (__kmp_display_affinity && place != th->th.th_current_place &&
4863  team->t.t_display_affinity != 1) {
4864  team->t.t_display_affinity = 1;
4865  }
4866 #endif
4867  KA_TRACE(100,
4868  ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4869  "partition = [%d,%d], spacing = %.4f\n",
4870  __kmp_gtid_from_thread(team->t.t_threads[f]),
4871  team->t.t_id, f, th->th.th_new_place,
4872  th->th.th_first_place, th->th.th_last_place, spacing));
4873  }
4874  }
4875  }
4876  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4877  } else {
4878  int S, rem, gap, s_count;
4879  S = n_th / n_places;
4880  s_count = 0;
4881  rem = n_th - (S * n_places);
4882  gap = rem > 0 ? n_places / rem : n_places;
4883  int place = masters_place;
4884  int gap_ct = gap;
4885  thidx = n_th;
4886  if (update_master_only == 1)
4887  thidx = 1;
4888  for (f = 0; f < thidx; f++) {
4889  kmp_info_t *th = team->t.t_threads[f];
4890  KMP_DEBUG_ASSERT(th != NULL);
4891 
4892  th->th.th_first_place = place;
4893  th->th.th_last_place = place;
4894  th->th.th_new_place = place;
4895 #if OMP_50_ENABLED
4896  if (__kmp_display_affinity && place != th->th.th_current_place &&
4897  team->t.t_display_affinity != 1) {
4898  team->t.t_display_affinity = 1;
4899  }
4900 #endif
4901  s_count++;
4902 
4903  if ((s_count == S) && rem && (gap_ct == gap)) {
4904  // do nothing, add an extra thread to place on next iteration
4905  } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4906  // we added an extra thread to this place; move on to next place
4907  if (place == last_place) {
4908  place = first_place;
4909  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4910  place = 0;
4911  } else {
4912  place++;
4913  }
4914  s_count = 0;
4915  gap_ct = 1;
4916  rem--;
4917  } else if (s_count == S) { // place is full; don't add extra thread
4918  if (place == last_place) {
4919  place = first_place;
4920  } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4921  place = 0;
4922  } else {
4923  place++;
4924  }
4925  gap_ct++;
4926  s_count = 0;
4927  }
4928 
4929  KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4930  "partition = [%d,%d]\n",
4931  __kmp_gtid_from_thread(team->t.t_threads[f]),
4932  team->t.t_id, f, th->th.th_new_place,
4933  th->th.th_first_place, th->th.th_last_place));
4934  }
4935  KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4936  }
4937  } break;
4938 
4939  default:
4940  break;
4941  }
4942 
4943  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4944 }
4945 
4946 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4947 
4948 /* allocate a new team data structure to use. take one off of the free pool if
4949  available */
4950 kmp_team_t *
4951 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4952 #if OMPT_SUPPORT
4953  ompt_data_t ompt_parallel_data,
4954 #endif
4955 #if OMP_40_ENABLED
4956  kmp_proc_bind_t new_proc_bind,
4957 #endif
4958  kmp_internal_control_t *new_icvs,
4959  int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4960  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4961  int f;
4962  kmp_team_t *team;
4963  int use_hot_team = !root->r.r_active;
4964  int level = 0;
4965 
4966  KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4967  KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4968  KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4969  KMP_MB();
4970 
4971 #if KMP_NESTED_HOT_TEAMS
4972  kmp_hot_team_ptr_t *hot_teams;
4973  if (master) {
4974  team = master->th.th_team;
4975  level = team->t.t_active_level;
4976  if (master->th.th_teams_microtask) { // in teams construct?
4977  if (master->th.th_teams_size.nteams > 1 &&
4978  ( // #teams > 1
4979  team->t.t_pkfn ==
4980  (microtask_t)__kmp_teams_master || // inner fork of the teams
4981  master->th.th_teams_level <
4982  team->t.t_level)) { // or nested parallel inside the teams
4983  ++level; // not increment if #teams==1, or for outer fork of the teams;
4984  // increment otherwise
4985  }
4986  }
4987  hot_teams = master->th.th_hot_teams;
4988  if (level < __kmp_hot_teams_max_level && hot_teams &&
4989  hot_teams[level]
4990  .hot_team) { // hot team has already been allocated for given level
4991  use_hot_team = 1;
4992  } else {
4993  use_hot_team = 0;
4994  }
4995  }
4996 #endif
4997  // Optimization to use a "hot" team
4998  if (use_hot_team && new_nproc > 1) {
4999  KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5000 #if KMP_NESTED_HOT_TEAMS
5001  team = hot_teams[level].hot_team;
5002 #else
5003  team = root->r.r_hot_team;
5004 #endif
5005 #if KMP_DEBUG
5006  if (__kmp_tasking_mode != tskm_immediate_exec) {
5007  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5008  "task_team[1] = %p before reinit\n",
5009  team->t.t_task_team[0], team->t.t_task_team[1]));
5010  }
5011 #endif
5012 
5013  // Has the number of threads changed?
5014  /* Let's assume the most common case is that the number of threads is
5015  unchanged, and put that case first. */
5016  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5017  KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5018  // This case can mean that omp_set_num_threads() was called and the hot
5019  // team size was already reduced, so we check the special flag
5020  if (team->t.t_size_changed == -1) {
5021  team->t.t_size_changed = 1;
5022  } else {
5023  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5024  }
5025 
5026  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5027  kmp_r_sched_t new_sched = new_icvs->sched;
5028  // set master's schedule as new run-time schedule
5029  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5030 
5031  __kmp_reinitialize_team(team, new_icvs,
5032  root->r.r_uber_thread->th.th_ident);
5033 
5034  KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5035  team->t.t_threads[0], team));
5036  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5037 
5038 #if OMP_40_ENABLED
5039 #if KMP_AFFINITY_SUPPORTED
5040  if ((team->t.t_size_changed == 0) &&
5041  (team->t.t_proc_bind == new_proc_bind)) {
5042  if (new_proc_bind == proc_bind_spread) {
5043  __kmp_partition_places(
5044  team, 1); // add flag to update only master for spread
5045  }
5046  KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5047  "proc_bind = %d, partition = [%d,%d]\n",
5048  team->t.t_id, new_proc_bind, team->t.t_first_place,
5049  team->t.t_last_place));
5050  } else {
5051  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5052  __kmp_partition_places(team);
5053  }
5054 #else
5055  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5056 #endif /* KMP_AFFINITY_SUPPORTED */
5057 #endif /* OMP_40_ENABLED */
5058  } else if (team->t.t_nproc > new_nproc) {
5059  KA_TRACE(20,
5060  ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5061  new_nproc));
5062 
5063  team->t.t_size_changed = 1;
5064 #if KMP_NESTED_HOT_TEAMS
5065  if (__kmp_hot_teams_mode == 0) {
5066  // AC: saved number of threads should correspond to team's value in this
5067  // mode, can be bigger in mode 1, when hot team has threads in reserve
5068  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5069  hot_teams[level].hot_team_nth = new_nproc;
5070 #endif // KMP_NESTED_HOT_TEAMS
5071  /* release the extra threads we don't need any more */
5072  for (f = new_nproc; f < team->t.t_nproc; f++) {
5073  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5074  if (__kmp_tasking_mode != tskm_immediate_exec) {
5075  // When decreasing team size, threads no longer in the team should
5076  // unref task team.
5077  team->t.t_threads[f]->th.th_task_team = NULL;
5078  }
5079  __kmp_free_thread(team->t.t_threads[f]);
5080  team->t.t_threads[f] = NULL;
5081  }
5082 #if KMP_NESTED_HOT_TEAMS
5083  } // (__kmp_hot_teams_mode == 0)
5084  else {
5085  // When keeping extra threads in team, switch threads to wait on own
5086  // b_go flag
5087  for (f = new_nproc; f < team->t.t_nproc; ++f) {
5088  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5089  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5090  for (int b = 0; b < bs_last_barrier; ++b) {
5091  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5092  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5093  }
5094  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5095  }
5096  }
5097  }
5098 #endif // KMP_NESTED_HOT_TEAMS
5099  team->t.t_nproc = new_nproc;
5100  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5101  KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5102  __kmp_reinitialize_team(team, new_icvs,
5103  root->r.r_uber_thread->th.th_ident);
5104 
5105  // Update remaining threads
5106  for (f = 0; f < new_nproc; ++f) {
5107  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5108  }
5109 
5110  // restore the current task state of the master thread: should be the
5111  // implicit task
5112  KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5113  team->t.t_threads[0], team));
5114 
5115  __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5116 
5117 #ifdef KMP_DEBUG
5118  for (f = 0; f < team->t.t_nproc; f++) {
5119  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5120  team->t.t_threads[f]->th.th_team_nproc ==
5121  team->t.t_nproc);
5122  }
5123 #endif
5124 
5125 #if OMP_40_ENABLED
5126  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5127 #if KMP_AFFINITY_SUPPORTED
5128  __kmp_partition_places(team);
5129 #endif
5130 #endif
5131  } else { // team->t.t_nproc < new_nproc
5132 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5133  kmp_affin_mask_t *old_mask;
5134  if (KMP_AFFINITY_CAPABLE()) {
5135  KMP_CPU_ALLOC(old_mask);
5136  }
5137 #endif
5138 
5139  KA_TRACE(20,
5140  ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5141  new_nproc));
5142 
5143  team->t.t_size_changed = 1;
5144 
5145 #if KMP_NESTED_HOT_TEAMS
5146  int avail_threads = hot_teams[level].hot_team_nth;
5147  if (new_nproc < avail_threads)
5148  avail_threads = new_nproc;
5149  kmp_info_t **other_threads = team->t.t_threads;
5150  for (f = team->t.t_nproc; f < avail_threads; ++f) {
5151  // Adjust barrier data of reserved threads (if any) of the team
5152  // Other data will be set in __kmp_initialize_info() below.
5153  int b;
5154  kmp_balign_t *balign = other_threads[f]->th.th_bar;
5155  for (b = 0; b < bs_last_barrier; ++b) {
5156  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5157  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5158 #if USE_DEBUGGER
5159  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5160 #endif
5161  }
5162  }
5163  if (hot_teams[level].hot_team_nth >= new_nproc) {
5164  // we have all needed threads in reserve, no need to allocate any
5165  // this only possible in mode 1, cannot have reserved threads in mode 0
5166  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5167  team->t.t_nproc = new_nproc; // just get reserved threads involved
5168  } else {
5169  // we may have some threads in reserve, but not enough
5170  team->t.t_nproc =
5171  hot_teams[level]
5172  .hot_team_nth; // get reserved threads involved if any
5173  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5174 #endif // KMP_NESTED_HOT_TEAMS
5175  if (team->t.t_max_nproc < new_nproc) {
5176  /* reallocate larger arrays */
5177  __kmp_reallocate_team_arrays(team, new_nproc);
5178  __kmp_reinitialize_team(team, new_icvs, NULL);
5179  }
5180 
5181 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5182  /* Temporarily set full mask for master thread before creation of
5183  workers. The reason is that workers inherit the affinity from master,
5184  so if a lot of workers are created on the single core quickly, they
5185  don't get a chance to set their own affinity for a long time. */
5186  __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5187 #endif
5188 
5189  /* allocate new threads for the hot team */
5190  for (f = team->t.t_nproc; f < new_nproc; f++) {
5191  kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5192  KMP_DEBUG_ASSERT(new_worker);
5193  team->t.t_threads[f] = new_worker;
5194 
5195  KA_TRACE(20,
5196  ("__kmp_allocate_team: team %d init T#%d arrived: "
5197  "join=%llu, plain=%llu\n",
5198  team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5199  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5200  team->t.t_bar[bs_plain_barrier].b_arrived));
5201 
5202  { // Initialize barrier data for new threads.
5203  int b;
5204  kmp_balign_t *balign = new_worker->th.th_bar;
5205  for (b = 0; b < bs_last_barrier; ++b) {
5206  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5207  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5208  KMP_BARRIER_PARENT_FLAG);
5209 #if USE_DEBUGGER
5210  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5211 #endif
5212  }
5213  }
5214  }
5215 
5216 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5217  if (KMP_AFFINITY_CAPABLE()) {
5218  /* Restore initial master thread's affinity mask */
5219  __kmp_set_system_affinity(old_mask, TRUE);
5220  KMP_CPU_FREE(old_mask);
5221  }
5222 #endif
5223 #if KMP_NESTED_HOT_TEAMS
5224  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5225 #endif // KMP_NESTED_HOT_TEAMS
5226  /* make sure everyone is syncronized */
5227  int old_nproc = team->t.t_nproc; // save old value and use to update only
5228  // new threads below
5229  __kmp_initialize_team(team, new_nproc, new_icvs,
5230  root->r.r_uber_thread->th.th_ident);
5231 
5232  /* reinitialize the threads */
5233  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5234  for (f = 0; f < team->t.t_nproc; ++f)
5235  __kmp_initialize_info(team->t.t_threads[f], team, f,
5236  __kmp_gtid_from_tid(f, team));
5237 
5238  if (level) { // set th_task_state for new threads in nested hot team
5239  // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5240  // only need to set the th_task_state for the new threads. th_task_state
5241  // for master thread will not be accurate until after this in
5242  // __kmp_fork_call(), so we look to the master's memo_stack to get the
5243  // correct value.
5244  for (f = old_nproc; f < team->t.t_nproc; ++f)
5245  team->t.t_threads[f]->th.th_task_state =
5246  team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5247  } else { // set th_task_state for new threads in non-nested hot team
5248  int old_state =
5249  team->t.t_threads[0]->th.th_task_state; // copy master's state
5250  for (f = old_nproc; f < team->t.t_nproc; ++f)
5251  team->t.t_threads[f]->th.th_task_state = old_state;
5252  }
5253 
5254 #ifdef KMP_DEBUG
5255  for (f = 0; f < team->t.t_nproc; ++f) {
5256  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5257  team->t.t_threads[f]->th.th_team_nproc ==
5258  team->t.t_nproc);
5259  }
5260 #endif
5261 
5262 #if OMP_40_ENABLED
5263  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5264 #if KMP_AFFINITY_SUPPORTED
5265  __kmp_partition_places(team);
5266 #endif
5267 #endif
5268  } // Check changes in number of threads
5269 
5270 #if OMP_40_ENABLED
5271  kmp_info_t *master = team->t.t_threads[0];
5272  if (master->th.th_teams_microtask) {
5273  for (f = 1; f < new_nproc; ++f) {
5274  // propagate teams construct specific info to workers
5275  kmp_info_t *thr = team->t.t_threads[f];
5276  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5277  thr->th.th_teams_level = master->th.th_teams_level;
5278  thr->th.th_teams_size = master->th.th_teams_size;
5279  }
5280  }
5281 #endif /* OMP_40_ENABLED */
5282 #if KMP_NESTED_HOT_TEAMS
5283  if (level) {
5284  // Sync barrier state for nested hot teams, not needed for outermost hot
5285  // team.
5286  for (f = 1; f < new_nproc; ++f) {
5287  kmp_info_t *thr = team->t.t_threads[f];
5288  int b;
5289  kmp_balign_t *balign = thr->th.th_bar;
5290  for (b = 0; b < bs_last_barrier; ++b) {
5291  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5292  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5293 #if USE_DEBUGGER
5294  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5295 #endif
5296  }
5297  }
5298  }
5299 #endif // KMP_NESTED_HOT_TEAMS
5300 
5301  /* reallocate space for arguments if necessary */
5302  __kmp_alloc_argv_entries(argc, team, TRUE);
5303  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5304  // The hot team re-uses the previous task team,
5305  // if untouched during the previous release->gather phase.
5306 
5307  KF_TRACE(10, (" hot_team = %p\n", team));
5308 
5309 #if KMP_DEBUG
5310  if (__kmp_tasking_mode != tskm_immediate_exec) {
5311  KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5312  "task_team[1] = %p after reinit\n",
5313  team->t.t_task_team[0], team->t.t_task_team[1]));
5314  }
5315 #endif
5316 
5317 #if OMPT_SUPPORT
5318  __ompt_team_assign_id(team, ompt_parallel_data);
5319 #endif
5320 
5321  KMP_MB();
5322 
5323  return team;
5324  }
5325 
5326  /* next, let's try to take one from the team pool */
5327  KMP_MB();
5328  for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5329  /* TODO: consider resizing undersized teams instead of reaping them, now
5330  that we have a resizing mechanism */
5331  if (team->t.t_max_nproc >= max_nproc) {
5332  /* take this team from the team pool */
5333  __kmp_team_pool = team->t.t_next_pool;
5334 
5335  /* setup the team for fresh use */
5336  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5337 
5338  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5339  "task_team[1] %p to NULL\n",
5340  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5341  team->t.t_task_team[0] = NULL;
5342  team->t.t_task_team[1] = NULL;
5343 
5344  /* reallocate space for arguments if necessary */
5345  __kmp_alloc_argv_entries(argc, team, TRUE);
5346  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5347 
5348  KA_TRACE(
5349  20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5350  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5351  { // Initialize barrier data.
5352  int b;
5353  for (b = 0; b < bs_last_barrier; ++b) {
5354  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5355 #if USE_DEBUGGER
5356  team->t.t_bar[b].b_master_arrived = 0;
5357  team->t.t_bar[b].b_team_arrived = 0;
5358 #endif
5359  }
5360  }
5361 
5362 #if OMP_40_ENABLED
5363  team->t.t_proc_bind = new_proc_bind;
5364 #endif
5365 
5366  KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5367  team->t.t_id));
5368 
5369 #if OMPT_SUPPORT
5370  __ompt_team_assign_id(team, ompt_parallel_data);
5371 #endif
5372 
5373  KMP_MB();
5374 
5375  return team;
5376  }
5377 
5378  /* reap team if it is too small, then loop back and check the next one */
5379  // not sure if this is wise, but, will be redone during the hot-teams
5380  // rewrite.
5381  /* TODO: Use technique to find the right size hot-team, don't reap them */
5382  team = __kmp_reap_team(team);
5383  __kmp_team_pool = team;
5384  }
5385 
5386  /* nothing available in the pool, no matter, make a new team! */
5387  KMP_MB();
5388  team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5389 
5390  /* and set it up */
5391  team->t.t_max_nproc = max_nproc;
5392  /* NOTE well, for some reason allocating one big buffer and dividing it up
5393  seems to really hurt performance a lot on the P4, so, let's not use this */
5394  __kmp_allocate_team_arrays(team, max_nproc);
5395 
5396  KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5397  __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5398 
5399  KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5400  "%p to NULL\n",
5401  &team->t.t_task_team[0], &team->t.t_task_team[1]));
5402  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5403  // memory, no need to duplicate
5404  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5405  // memory, no need to duplicate
5406 
5407  if (__kmp_storage_map) {
5408  __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5409  }
5410 
5411  /* allocate space for arguments */
5412  __kmp_alloc_argv_entries(argc, team, FALSE);
5413  team->t.t_argc = argc;
5414 
5415  KA_TRACE(20,
5416  ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5417  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5418  { // Initialize barrier data.
5419  int b;
5420  for (b = 0; b < bs_last_barrier; ++b) {
5421  team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5422 #if USE_DEBUGGER
5423  team->t.t_bar[b].b_master_arrived = 0;
5424  team->t.t_bar[b].b_team_arrived = 0;
5425 #endif
5426  }
5427  }
5428 
5429 #if OMP_40_ENABLED
5430  team->t.t_proc_bind = new_proc_bind;
5431 #endif
5432 
5433 #if OMPT_SUPPORT
5434  __ompt_team_assign_id(team, ompt_parallel_data);
5435  team->t.ompt_serialized_team_info = NULL;
5436 #endif
5437 
5438  KMP_MB();
5439 
5440  KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5441  team->t.t_id));
5442 
5443  return team;
5444 }
5445 
5446 /* TODO implement hot-teams at all levels */
5447 /* TODO implement lazy thread release on demand (disband request) */
5448 
5449 /* free the team. return it to the team pool. release all the threads
5450  * associated with it */
5451 void __kmp_free_team(kmp_root_t *root,
5452  kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5453  int f;
5454  KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5455  team->t.t_id));
5456 
5457  /* verify state */
5458  KMP_DEBUG_ASSERT(root);
5459  KMP_DEBUG_ASSERT(team);
5460  KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5461  KMP_DEBUG_ASSERT(team->t.t_threads);
5462 
5463  int use_hot_team = team == root->r.r_hot_team;
5464 #if KMP_NESTED_HOT_TEAMS
5465  int level;
5466  kmp_hot_team_ptr_t *hot_teams;
5467  if (master) {
5468  level = team->t.t_active_level - 1;
5469  if (master->th.th_teams_microtask) { // in teams construct?
5470  if (master->th.th_teams_size.nteams > 1) {
5471  ++level; // level was not increased in teams construct for
5472  // team_of_masters
5473  }
5474  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5475  master->th.th_teams_level == team->t.t_level) {
5476  ++level; // level was not increased in teams construct for
5477  // team_of_workers before the parallel
5478  } // team->t.t_level will be increased inside parallel
5479  }
5480  hot_teams = master->th.th_hot_teams;
5481  if (level < __kmp_hot_teams_max_level) {
5482  KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5483  use_hot_team = 1;
5484  }
5485  }
5486 #endif // KMP_NESTED_HOT_TEAMS
5487 
5488  /* team is done working */
5489  TCW_SYNC_PTR(team->t.t_pkfn,
5490  NULL); // Important for Debugging Support Library.
5491 #if KMP_OS_WINDOWS
5492  team->t.t_copyin_counter = 0; // init counter for possible reuse
5493 #endif
5494  // Do not reset pointer to parent team to NULL for hot teams.
5495 
5496  /* if we are non-hot team, release our threads */
5497  if (!use_hot_team) {
5498  if (__kmp_tasking_mode != tskm_immediate_exec) {
5499  // Wait for threads to reach reapable state
5500  for (f = 1; f < team->t.t_nproc; ++f) {
5501  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5502  kmp_info_t *th = team->t.t_threads[f];
5503  volatile kmp_uint32 *state = &th->th.th_reap_state;
5504  while (*state != KMP_SAFE_TO_REAP) {
5505 #if KMP_OS_WINDOWS
5506  // On Windows a thread can be killed at any time, check this
5507  DWORD ecode;
5508  if (!__kmp_is_thread_alive(th, &ecode)) {
5509  *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5510  break;
5511  }
5512 #endif
5513  // first check if thread is sleeping
5514  kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5515  if (fl.is_sleeping())
5516  fl.resume(__kmp_gtid_from_thread(th));
5517  KMP_CPU_PAUSE();
5518  }
5519  }
5520 
5521  // Delete task teams
5522  int tt_idx;
5523  for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5524  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5525  if (task_team != NULL) {
5526  for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5527  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5528  team->t.t_threads[f]->th.th_task_team = NULL;
5529  }
5530  KA_TRACE(
5531  20,
5532  ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5533  __kmp_get_gtid(), task_team, team->t.t_id));
5534 #if KMP_NESTED_HOT_TEAMS
5535  __kmp_free_task_team(master, task_team);
5536 #endif
5537  team->t.t_task_team[tt_idx] = NULL;
5538  }
5539  }
5540  }
5541 
5542  // Reset pointer to parent team only for non-hot teams.
5543  team->t.t_parent = NULL;
5544  team->t.t_level = 0;
5545  team->t.t_active_level = 0;
5546 
5547  /* free the worker threads */
5548  for (f = 1; f < team->t.t_nproc; ++f) {
5549  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5550  __kmp_free_thread(team->t.t_threads[f]);
5551  team->t.t_threads[f] = NULL;
5552  }
5553 
5554  /* put the team back in the team pool */
5555  /* TODO limit size of team pool, call reap_team if pool too large */
5556  team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5557  __kmp_team_pool = (volatile kmp_team_t *)team;
5558  } else { // Check if team was created for the masters in a teams construct
5559  // See if first worker is a CG root
5560  KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5561  team->t.t_threads[1]->th.th_cg_roots);
5562  if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5563  // Clean up the CG root nodes on workers so that this team can be re-used
5564  for (f = 1; f < team->t.t_nproc; ++f) {
5565  kmp_info_t *thr = team->t.t_threads[f];
5566  KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5567  thr->th.th_cg_roots->cg_root == thr);
5568  // Pop current CG root off list
5569  kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5570  thr->th.th_cg_roots = tmp->up;
5571  KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5572  " up to node %p. cg_nthreads was %d\n",
5573  thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5574  __kmp_free(tmp);
5575  // Restore current task's thread_limit from CG root
5576  if (thr->th.th_cg_roots)
5577  thr->th.th_current_task->td_icvs.thread_limit =
5578  thr->th.th_cg_roots->cg_thread_limit;
5579  }
5580  }
5581  }
5582 
5583  KMP_MB();
5584 }
5585 
5586 /* reap the team. destroy it, reclaim all its resources and free its memory */
5587 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5588  kmp_team_t *next_pool = team->t.t_next_pool;
5589 
5590  KMP_DEBUG_ASSERT(team);
5591  KMP_DEBUG_ASSERT(team->t.t_dispatch);
5592  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5593  KMP_DEBUG_ASSERT(team->t.t_threads);
5594  KMP_DEBUG_ASSERT(team->t.t_argv);
5595 
5596  /* TODO clean the threads that are a part of this? */
5597 
5598  /* free stuff */
5599  __kmp_free_team_arrays(team);
5600  if (team->t.t_argv != &team->t.t_inline_argv[0])
5601  __kmp_free((void *)team->t.t_argv);
5602  __kmp_free(team);
5603 
5604  KMP_MB();
5605  return next_pool;
5606 }
5607 
5608 // Free the thread. Don't reap it, just place it on the pool of available
5609 // threads.
5610 //
5611 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5612 // binding for the affinity mechanism to be useful.
5613 //
5614 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5615 // However, we want to avoid a potential performance problem by always
5616 // scanning through the list to find the correct point at which to insert
5617 // the thread (potential N**2 behavior). To do this we keep track of the
5618 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5619 // With single-level parallelism, threads will always be added to the tail
5620 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5621 // parallelism, all bets are off and we may need to scan through the entire
5622 // free list.
5623 //
5624 // This change also has a potentially large performance benefit, for some
5625 // applications. Previously, as threads were freed from the hot team, they
5626 // would be placed back on the free list in inverse order. If the hot team
5627 // grew back to it's original size, then the freed thread would be placed
5628 // back on the hot team in reverse order. This could cause bad cache
5629 // locality problems on programs where the size of the hot team regularly
5630 // grew and shrunk.
5631 //
5632 // Now, for single-level parallelism, the OMP tid is alway == gtid.
5633 void __kmp_free_thread(kmp_info_t *this_th) {
5634  int gtid;
5635  kmp_info_t **scan;
5636 
5637  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5638  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5639 
5640  KMP_DEBUG_ASSERT(this_th);
5641 
5642  // When moving thread to pool, switch thread to wait on own b_go flag, and
5643  // uninitialized (NULL team).
5644  int b;
5645  kmp_balign_t *balign = this_th->th.th_bar;
5646  for (b = 0; b < bs_last_barrier; ++b) {
5647  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5648  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5649  balign[b].bb.team = NULL;
5650  balign[b].bb.leaf_kids = 0;
5651  }
5652  this_th->th.th_task_state = 0;
5653  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5654 
5655  /* put thread back on the free pool */
5656  TCW_PTR(this_th->th.th_team, NULL);
5657  TCW_PTR(this_th->th.th_root, NULL);
5658  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5659 
5660  while (this_th->th.th_cg_roots) {
5661  this_th->th.th_cg_roots->cg_nthreads--;
5662  KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5663  " %p of thread %p to %d\n",
5664  this_th, this_th->th.th_cg_roots,
5665  this_th->th.th_cg_roots->cg_root,
5666  this_th->th.th_cg_roots->cg_nthreads));
5667  kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5668  if (tmp->cg_root == this_th) { // Thread is a cg_root
5669  KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5670  KA_TRACE(
5671  5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5672  this_th->th.th_cg_roots = tmp->up;
5673  __kmp_free(tmp);
5674  } else { // Worker thread
5675  this_th->th.th_cg_roots = NULL;
5676  break;
5677  }
5678  }
5679 
5680  /* If the implicit task assigned to this thread can be used by other threads
5681  * -> multiple threads can share the data and try to free the task at
5682  * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5683  * with higher probability when hot team is disabled but can occurs even when
5684  * the hot team is enabled */
5685  __kmp_free_implicit_task(this_th);
5686  this_th->th.th_current_task = NULL;
5687 
5688  // If the __kmp_thread_pool_insert_pt is already past the new insert
5689  // point, then we need to re-scan the entire list.
5690  gtid = this_th->th.th_info.ds.ds_gtid;
5691  if (__kmp_thread_pool_insert_pt != NULL) {
5692  KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5693  if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5694  __kmp_thread_pool_insert_pt = NULL;
5695  }
5696  }
5697 
5698  // Scan down the list to find the place to insert the thread.
5699  // scan is the address of a link in the list, possibly the address of
5700  // __kmp_thread_pool itself.
5701  //
5702  // In the absence of nested parallism, the for loop will have 0 iterations.
5703  if (__kmp_thread_pool_insert_pt != NULL) {
5704  scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5705  } else {
5706  scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5707  }
5708  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5709  scan = &((*scan)->th.th_next_pool))
5710  ;
5711 
5712  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5713  // to its address.
5714  TCW_PTR(this_th->th.th_next_pool, *scan);
5715  __kmp_thread_pool_insert_pt = *scan = this_th;
5716  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5717  (this_th->th.th_info.ds.ds_gtid <
5718  this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5719  TCW_4(this_th->th.th_in_pool, TRUE);
5720  __kmp_thread_pool_nth++;
5721 
5722  TCW_4(__kmp_nth, __kmp_nth - 1);
5723 
5724 #ifdef KMP_ADJUST_BLOCKTIME
5725  /* Adjust blocktime back to user setting or default if necessary */
5726  /* Middle initialization might never have occurred */
5727  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5728  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5729  if (__kmp_nth <= __kmp_avail_proc) {
5730  __kmp_zero_bt = FALSE;
5731  }
5732  }
5733 #endif /* KMP_ADJUST_BLOCKTIME */
5734 
5735  KMP_MB();
5736 }
5737 
5738 /* ------------------------------------------------------------------------ */
5739 
5740 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5741  int gtid = this_thr->th.th_info.ds.ds_gtid;
5742  /* void *stack_data;*/
5743  kmp_team_t *(*volatile pteam);
5744 
5745  KMP_MB();
5746  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5747 
5748  if (__kmp_env_consistency_check) {
5749  this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5750  }
5751 
5752 #if OMPT_SUPPORT
5753  ompt_data_t *thread_data;
5754  if (ompt_enabled.enabled) {
5755  thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5756  *thread_data = ompt_data_none;
5757 
5758  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5759  this_thr->th.ompt_thread_info.wait_id = 0;
5760  this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5761  if (ompt_enabled.ompt_callback_thread_begin) {
5762  ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5763  ompt_thread_worker, thread_data);
5764  }
5765  }
5766 #endif
5767 
5768 #if OMPT_SUPPORT
5769  if (ompt_enabled.enabled) {
5770  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5771  }
5772 #endif
5773  /* This is the place where threads wait for work */
5774  while (!TCR_4(__kmp_global.g.g_done)) {
5775  KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5776  KMP_MB();
5777 
5778  /* wait for work to do */
5779  KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5780 
5781  /* No tid yet since not part of a team */
5782  __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5783 
5784 #if OMPT_SUPPORT
5785  if (ompt_enabled.enabled) {
5786  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5787  }
5788 #endif
5789 
5790  pteam = (kmp_team_t * (*))(&this_thr->th.th_team);
5791 
5792  /* have we been allocated? */
5793  if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5794  /* we were just woken up, so run our new task */
5795  if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5796  int rc;
5797  KA_TRACE(20,
5798  ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5799  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5800  (*pteam)->t.t_pkfn));
5801 
5802  updateHWFPControl(*pteam);
5803 
5804 #if OMPT_SUPPORT
5805  if (ompt_enabled.enabled) {
5806  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5807  }
5808 #endif
5809 
5810  rc = (*pteam)->t.t_invoke(gtid);
5811  KMP_ASSERT(rc);
5812 
5813  KMP_MB();
5814  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5815  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5816  (*pteam)->t.t_pkfn));
5817  }
5818 #if OMPT_SUPPORT
5819  if (ompt_enabled.enabled) {
5820  /* no frame set while outside task */
5821  __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5822 
5823  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5824  }
5825 #endif
5826  /* join barrier after parallel region */
5827  __kmp_join_barrier(gtid);
5828  }
5829  }
5830  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5831 
5832 #if OMPT_SUPPORT
5833  if (ompt_enabled.ompt_callback_thread_end) {
5834  ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5835  }
5836 #endif
5837 
5838  this_thr->th.th_task_team = NULL;
5839  /* run the destructors for the threadprivate data for this thread */
5840  __kmp_common_destroy_gtid(gtid);
5841 
5842  KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5843  KMP_MB();
5844  return this_thr;
5845 }
5846 
5847 /* ------------------------------------------------------------------------ */
5848 
5849 void __kmp_internal_end_dest(void *specific_gtid) {
5850 #if KMP_COMPILER_ICC
5851 #pragma warning(push)
5852 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5853 // significant bits
5854 #endif
5855  // Make sure no significant bits are lost
5856  int gtid = (kmp_intptr_t)specific_gtid - 1;
5857 #if KMP_COMPILER_ICC
5858 #pragma warning(pop)
5859 #endif
5860 
5861  KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5862  /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage
5863  * this is because 0 is reserved for the nothing-stored case */
5864 
5865  /* josh: One reason for setting the gtid specific data even when it is being
5866  destroyed by pthread is to allow gtid lookup through thread specific data
5867  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5868  that gets executed in the call to __kmp_internal_end_thread, actually
5869  gets the gtid through the thread specific data. Setting it here seems
5870  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5871  to run smoothly.
5872  todo: get rid of this after we remove the dependence on
5873  __kmp_gtid_get_specific */
5874  if (gtid >= 0 && KMP_UBER_GTID(gtid))
5875  __kmp_gtid_set_specific(gtid);
5876 #ifdef KMP_TDATA_GTID
5877  __kmp_gtid = gtid;
5878 #endif
5879  __kmp_internal_end_thread(gtid);
5880 }
5881 
5882 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5883 
5884 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases
5885 // destructors work perfectly, but in real libomp.so I have no evidence it is
5886 // ever called. However, -fini linker option in makefile.mk works fine.
5887 
5888 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5889  __kmp_internal_end_atexit();
5890 }
5891 
5892 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); }
5893 
5894 #endif
5895 
5896 /* [Windows] josh: when the atexit handler is called, there may still be more
5897  than one thread alive */
5898 void __kmp_internal_end_atexit(void) {
5899  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5900  /* [Windows]
5901  josh: ideally, we want to completely shutdown the library in this atexit
5902  handler, but stat code that depends on thread specific data for gtid fails
5903  because that data becomes unavailable at some point during the shutdown, so
5904  we call __kmp_internal_end_thread instead. We should eventually remove the
5905  dependency on __kmp_get_specific_gtid in the stat code and use
5906  __kmp_internal_end_library to cleanly shutdown the library.
5907 
5908  // TODO: Can some of this comment about GVS be removed?
5909  I suspect that the offending stat code is executed when the calling thread
5910  tries to clean up a dead root thread's data structures, resulting in GVS
5911  code trying to close the GVS structures for that thread, but since the stat
5912  code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5913  the calling thread is cleaning up itself instead of another thread, it get
5914  confused. This happens because allowing a thread to unregister and cleanup
5915  another thread is a recent modification for addressing an issue.
5916  Based on the current design (20050722), a thread may end up
5917  trying to unregister another thread only if thread death does not trigger
5918  the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5919  thread specific data destructor function to detect thread death. For
5920  Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5921  is nothing. Thus, the workaround is applicable only for Windows static
5922  stat library. */
5923  __kmp_internal_end_library(-1);
5924 #if KMP_OS_WINDOWS
5925  __kmp_close_console();
5926 #endif
5927 }
5928 
5929 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5930  // It is assumed __kmp_forkjoin_lock is acquired.
5931 
5932  int gtid;
5933 
5934  KMP_DEBUG_ASSERT(thread != NULL);
5935 
5936  gtid = thread->th.th_info.ds.ds_gtid;
5937 
5938  if (!is_root) {
5939  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5940  /* Assume the threads are at the fork barrier here */
5941  KA_TRACE(
5942  20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5943  gtid));
5944  /* Need release fence here to prevent seg faults for tree forkjoin barrier
5945  * (GEH) */
5946  ANNOTATE_HAPPENS_BEFORE(thread);
5947  kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
5948  __kmp_release_64(&flag);
5949  }
5950 
5951  // Terminate OS thread.
5952  __kmp_reap_worker(thread);
5953 
5954  // The thread was killed asynchronously. If it was actively
5955  // spinning in the thread pool, decrement the global count.
5956  //
5957  // There is a small timing hole here - if the worker thread was just waking
5958  // up after sleeping in the pool, had reset it's th_active_in_pool flag but
5959  // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5960  // the global counter might not get updated.
5961  //
5962  // Currently, this can only happen as the library is unloaded,
5963  // so there are no harmful side effects.
5964  if (thread->th.th_active_in_pool) {
5965  thread->th.th_active_in_pool = FALSE;
5966  KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5967  KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5968  }
5969 
5970  // Decrement # of [worker] threads in the pool.
5971  KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0);
5972  --__kmp_thread_pool_nth;
5973  }
5974 
5975  __kmp_free_implicit_task(thread);
5976 
5977 // Free the fast memory for tasking
5978 #if USE_FAST_MEMORY
5979  __kmp_free_fast_memory(thread);
5980 #endif /* USE_FAST_MEMORY */
5981 
5982  __kmp_suspend_uninitialize_thread(thread);
5983 
5984  KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5985  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5986 
5987  --__kmp_all_nth;
5988 // __kmp_nth was decremented when thread is added to the pool.
5989 
5990 #ifdef KMP_ADJUST_BLOCKTIME
5991  /* Adjust blocktime back to user setting or default if necessary */
5992  /* Middle initialization might never have occurred */
5993  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5994  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5995  if (__kmp_nth <= __kmp_avail_proc) {
5996  __kmp_zero_bt = FALSE;
5997  }
5998  }
5999 #endif /* KMP_ADJUST_BLOCKTIME */
6000 
6001  /* free the memory being used */
6002  if (__kmp_env_consistency_check) {
6003  if (thread->th.th_cons) {
6004  __kmp_free_cons_stack(thread->th.th_cons);
6005  thread->th.th_cons = NULL;
6006  }
6007  }
6008 
6009  if (thread->th.th_pri_common != NULL) {
6010  __kmp_free(thread->th.th_pri_common);
6011  thread->th.th_pri_common = NULL;
6012  }
6013 
6014  if (thread->th.th_task_state_memo_stack != NULL) {
6015  __kmp_free(thread->th.th_task_state_memo_stack);
6016  thread->th.th_task_state_memo_stack = NULL;
6017  }
6018 
6019 #if KMP_USE_BGET
6020  if (thread->th.th_local.bget_data != NULL) {
6021  __kmp_finalize_bget(thread);
6022  }
6023 #endif
6024 
6025 #if KMP_AFFINITY_SUPPORTED
6026  if (thread->th.th_affin_mask != NULL) {
6027  KMP_CPU_FREE(thread->th.th_affin_mask);
6028  thread->th.th_affin_mask = NULL;
6029  }
6030 #endif /* KMP_AFFINITY_SUPPORTED */
6031 
6032 #if KMP_USE_HIER_SCHED
6033  if (thread->th.th_hier_bar_data != NULL) {
6034  __kmp_free(thread->th.th_hier_bar_data);
6035  thread->th.th_hier_bar_data = NULL;
6036  }
6037 #endif
6038 
6039  __kmp_reap_team(thread->th.th_serial_team);
6040  thread->th.th_serial_team = NULL;
6041  __kmp_free(thread);
6042 
6043  KMP_MB();
6044 
6045 } // __kmp_reap_thread
6046 
6047 static void __kmp_internal_end(void) {
6048  int i;
6049 
6050  /* First, unregister the library */
6051  __kmp_unregister_library();
6052 
6053 #if KMP_OS_WINDOWS
6054  /* In Win static library, we can't tell when a root actually dies, so we
6055  reclaim the data structures for any root threads that have died but not
6056  unregistered themselves, in order to shut down cleanly.
6057  In Win dynamic library we also can't tell when a thread dies. */
6058  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6059 // dead roots
6060 #endif
6061 
6062  for (i = 0; i < __kmp_threads_capacity; i++)
6063  if (__kmp_root[i])
6064  if (__kmp_root[i]->r.r_active)
6065  break;
6066  KMP_MB(); /* Flush all pending memory write invalidates. */
6067  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6068 
6069  if (i < __kmp_threads_capacity) {
6070 #if KMP_USE_MONITOR
6071  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6072  KMP_MB(); /* Flush all pending memory write invalidates. */
6073 
6074  // Need to check that monitor was initialized before reaping it. If we are
6075  // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6076  // __kmp_monitor will appear to contain valid data, but it is only valid in
6077  // the parent process, not the child.
6078  // New behavior (201008): instead of keying off of the flag
6079  // __kmp_init_parallel, the monitor thread creation is keyed off
6080  // of the new flag __kmp_init_monitor.
6081  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6082  if (TCR_4(__kmp_init_monitor)) {
6083  __kmp_reap_monitor(&__kmp_monitor);
6084  TCW_4(__kmp_init_monitor, 0);
6085  }
6086  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6087  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6088 #endif // KMP_USE_MONITOR
6089  } else {
6090 /* TODO move this to cleanup code */
6091 #ifdef KMP_DEBUG
6092  /* make sure that everything has properly ended */
6093  for (i = 0; i < __kmp_threads_capacity; i++) {
6094  if (__kmp_root[i]) {
6095  // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6096  // there can be uber threads alive here
6097  KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6098  }
6099  }
6100 #endif
6101 
6102  KMP_MB();
6103 
6104  // Reap the worker threads.
6105  // This is valid for now, but be careful if threads are reaped sooner.
6106  while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool.
6107  // Get the next thread from the pool.
6108  kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6109  __kmp_thread_pool = thread->th.th_next_pool;
6110  // Reap it.
6111  KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6112  thread->th.th_next_pool = NULL;
6113  thread->th.th_in_pool = FALSE;
6114  __kmp_reap_thread(thread, 0);
6115  }
6116  __kmp_thread_pool_insert_pt = NULL;
6117 
6118  // Reap teams.
6119  while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool.
6120  // Get the next team from the pool.
6121  kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6122  __kmp_team_pool = team->t.t_next_pool;
6123  // Reap it.
6124  team->t.t_next_pool = NULL;
6125  __kmp_reap_team(team);
6126  }
6127 
6128  __kmp_reap_task_teams();
6129 
6130 #if KMP_OS_UNIX
6131  // Threads that are not reaped should not access any resources since they
6132  // are going to be deallocated soon, so the shutdown sequence should wait
6133  // until all threads either exit the final spin-waiting loop or begin
6134  // sleeping after the given blocktime.
6135  for (i = 0; i < __kmp_threads_capacity; i++) {
6136  kmp_info_t *thr = __kmp_threads[i];
6137  while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6138  KMP_CPU_PAUSE();
6139  }
6140 #endif
6141 
6142  for (i = 0; i < __kmp_threads_capacity; ++i) {
6143  // TBD: Add some checking...
6144  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6145  }
6146 
6147  /* Make sure all threadprivate destructors get run by joining with all
6148  worker threads before resetting this flag */
6149  TCW_SYNC_4(__kmp_init_common, FALSE);
6150 
6151  KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6152  KMP_MB();
6153 
6154 #if KMP_USE_MONITOR
6155  // See note above: One of the possible fixes for CQ138434 / CQ140126
6156  //
6157  // FIXME: push both code fragments down and CSE them?
6158  // push them into __kmp_cleanup() ?
6159  __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6160  if (TCR_4(__kmp_init_monitor)) {
6161  __kmp_reap_monitor(&__kmp_monitor);
6162  TCW_4(__kmp_init_monitor, 0);
6163  }
6164  __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6165  KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6166 #endif
6167  } /* else !__kmp_global.t_active */
6168  TCW_4(__kmp_init_gtid, FALSE);
6169  KMP_MB(); /* Flush all pending memory write invalidates. */
6170 
6171  __kmp_cleanup();
6172 #if OMPT_SUPPORT
6173  ompt_fini();
6174 #endif
6175 }
6176 
6177 void __kmp_internal_end_library(int gtid_req) {
6178  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6179  /* this shouldn't be a race condition because __kmp_internal_end() is the
6180  only place to clear __kmp_serial_init */
6181  /* we'll check this later too, after we get the lock */
6182  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6183  // redundaant, because the next check will work in any case.
6184  if (__kmp_global.g.g_abort) {
6185  KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6186  /* TODO abort? */
6187  return;
6188  }
6189  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6190  KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6191  return;
6192  }
6193 
6194  KMP_MB(); /* Flush all pending memory write invalidates. */
6195 
6196  /* find out who we are and what we should do */
6197  {
6198  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6199  KA_TRACE(
6200  10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6201  if (gtid == KMP_GTID_SHUTDOWN) {
6202  KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6203  "already shutdown\n"));
6204  return;
6205  } else if (gtid == KMP_GTID_MONITOR) {
6206  KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6207  "registered, or system shutdown\n"));
6208  return;
6209  } else if (gtid == KMP_GTID_DNE) {
6210  KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6211  "shutdown\n"));
6212  /* we don't know who we are, but we may still shutdown the library */
6213  } else if (KMP_UBER_GTID(gtid)) {
6214  /* unregister ourselves as an uber thread. gtid is no longer valid */
6215  if (__kmp_root[gtid]->r.r_active) {
6216  __kmp_global.g.g_abort = -1;
6217  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6218  KA_TRACE(10,
6219  ("__kmp_internal_end_library: root still active, abort T#%d\n",
6220  gtid));
6221  return;
6222  } else {
6223  KA_TRACE(
6224  10,
6225  ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6226  __kmp_unregister_root_current_thread(gtid);
6227  }
6228  } else {
6229 /* worker threads may call this function through the atexit handler, if they
6230  * call exit() */
6231 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6232  TODO: do a thorough shutdown instead */
6233 #ifdef DUMP_DEBUG_ON_EXIT
6234  if (__kmp_debug_buf)
6235  __kmp_dump_debug_buffer();
6236 #endif
6237  return;
6238  }
6239  }
6240  /* synchronize the termination process */
6241  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6242 
6243  /* have we already finished */
6244  if (__kmp_global.g.g_abort) {
6245  KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6246  /* TODO abort? */
6247  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6248  return;
6249  }
6250  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6251  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6252  return;
6253  }
6254 
6255  /* We need this lock to enforce mutex between this reading of
6256  __kmp_threads_capacity and the writing by __kmp_register_root.
6257  Alternatively, we can use a counter of roots that is atomically updated by
6258  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6259  __kmp_internal_end_*. */
6260  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6261 
6262  /* now we can safely conduct the actual termination */
6263  __kmp_internal_end();
6264 
6265  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6266  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6267 
6268  KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6269 
6270 #ifdef DUMP_DEBUG_ON_EXIT
6271  if (__kmp_debug_buf)
6272  __kmp_dump_debug_buffer();
6273 #endif
6274 
6275 #if KMP_OS_WINDOWS
6276  __kmp_close_console();
6277 #endif
6278 
6279  __kmp_fini_allocator();
6280 
6281 } // __kmp_internal_end_library
6282 
6283 void __kmp_internal_end_thread(int gtid_req) {
6284  int i;
6285 
6286  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6287  /* this shouldn't be a race condition because __kmp_internal_end() is the
6288  * only place to clear __kmp_serial_init */
6289  /* we'll check this later too, after we get the lock */
6290  // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6291  // redundant, because the next check will work in any case.
6292  if (__kmp_global.g.g_abort) {
6293  KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6294  /* TODO abort? */
6295  return;
6296  }
6297  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6298  KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6299  return;
6300  }
6301 
6302  KMP_MB(); /* Flush all pending memory write invalidates. */
6303 
6304  /* find out who we are and what we should do */
6305  {
6306  int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6307  KA_TRACE(10,
6308  ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6309  if (gtid == KMP_GTID_SHUTDOWN) {
6310  KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6311  "already shutdown\n"));
6312  return;
6313  } else if (gtid == KMP_GTID_MONITOR) {
6314  KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6315  "registered, or system shutdown\n"));
6316  return;
6317  } else if (gtid == KMP_GTID_DNE) {
6318  KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6319  "shutdown\n"));
6320  return;
6321  /* we don't know who we are */
6322  } else if (KMP_UBER_GTID(gtid)) {
6323  /* unregister ourselves as an uber thread. gtid is no longer valid */
6324  if (__kmp_root[gtid]->r.r_active) {
6325  __kmp_global.g.g_abort = -1;
6326  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6327  KA_TRACE(10,
6328  ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6329  gtid));
6330  return;
6331  } else {
6332  KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6333  gtid));
6334  __kmp_unregister_root_current_thread(gtid);
6335  }
6336  } else {
6337  /* just a worker thread, let's leave */
6338  KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6339 
6340  if (gtid >= 0) {
6341  __kmp_threads[gtid]->th.th_task_team = NULL;
6342  }
6343 
6344  KA_TRACE(10,
6345  ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6346  gtid));
6347  return;
6348  }
6349  }
6350 #if KMP_DYNAMIC_LIB
6351  // AC: lets not shutdown the Linux* OS dynamic library at the exit of uber
6352  // thread, because we will better shutdown later in the library destructor.
6353  // The reason of this change is performance problem when non-openmp thread in
6354  // a loop forks and joins many openmp threads. We can save a lot of time
6355  // keeping worker threads alive until the program shutdown.
6356  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
6357  // and Windows(DPD200287443) that occurs when using critical sections from
6358  // foreign threads.
6359  if (__kmp_pause_status != kmp_hard_paused) {
6360  KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6361  return;
6362  }
6363 #endif
6364  /* synchronize the termination process */
6365  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6366 
6367  /* have we already finished */
6368  if (__kmp_global.g.g_abort) {
6369  KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6370  /* TODO abort? */
6371  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6372  return;
6373  }
6374  if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6375  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6376  return;
6377  }
6378 
6379  /* We need this lock to enforce mutex between this reading of
6380  __kmp_threads_capacity and the writing by __kmp_register_root.
6381  Alternatively, we can use a counter of roots that is atomically updated by
6382  __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6383  __kmp_internal_end_*. */
6384 
6385  /* should we finish the run-time? are all siblings done? */
6386  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6387 
6388  for (i = 0; i < __kmp_threads_capacity; ++i) {
6389  if (KMP_UBER_GTID(i)) {
6390  KA_TRACE(
6391  10,
6392  ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6393  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6394  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6395  return;
6396  }
6397  }
6398 
6399  /* now we can safely conduct the actual termination */
6400 
6401  __kmp_internal_end();
6402 
6403  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6404  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6405 
6406  KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6407 
6408 #ifdef DUMP_DEBUG_ON_EXIT
6409  if (__kmp_debug_buf)
6410  __kmp_dump_debug_buffer();
6411 #endif
6412 } // __kmp_internal_end_thread
6413 
6414 // -----------------------------------------------------------------------------
6415 // Library registration stuff.
6416 
6417 static long __kmp_registration_flag = 0;
6418 // Random value used to indicate library initialization.
6419 static char *__kmp_registration_str = NULL;
6420 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6421 
6422 static inline char *__kmp_reg_status_name() {
6423  /* On RHEL 3u5 if linked statically, getpid() returns different values in
6424  each thread. If registration and unregistration go in different threads
6425  (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env
6426  env var can not be found, because the name will contain different pid. */
6427  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6428 } // __kmp_reg_status_get
6429 
6430 void __kmp_register_library_startup(void) {
6431 
6432  char *name = __kmp_reg_status_name(); // Name of the environment variable.
6433  int done = 0;
6434  union {
6435  double dtime;
6436  long ltime;
6437  } time;
6438 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6439  __kmp_initialize_system_tick();
6440 #endif
6441  __kmp_read_system_time(&time.dtime);
6442  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6443  __kmp_registration_str =
6444  __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6445  __kmp_registration_flag, KMP_LIBRARY_FILE);
6446 
6447  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6448  __kmp_registration_str));
6449 
6450  while (!done) {
6451 
6452  char *value = NULL; // Actual value of the environment variable.
6453 
6454  // Set environment variable, but do not overwrite if it is exist.
6455  __kmp_env_set(name, __kmp_registration_str, 0);
6456  // Check the variable is written.
6457  value = __kmp_env_get(name);
6458  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6459 
6460  done = 1; // Ok, environment variable set successfully, exit the loop.
6461 
6462  } else {
6463 
6464  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
6465  // Check whether it alive or dead.
6466  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6467  char *tail = value;
6468  char *flag_addr_str = NULL;
6469  char *flag_val_str = NULL;
6470  char const *file_name = NULL;
6471  __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6472  __kmp_str_split(tail, '-', &flag_val_str, &tail);
6473  file_name = tail;
6474  if (tail != NULL) {
6475  long *flag_addr = 0;
6476  long flag_val = 0;
6477  KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6478  KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6479  if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6480  // First, check whether environment-encoded address is mapped into
6481  // addr space.
6482  // If so, dereference it to see if it still has the right value.
6483  if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6484  neighbor = 1;
6485  } else {
6486  // If not, then we know the other copy of the library is no longer
6487  // running.
6488  neighbor = 2;
6489  }
6490  }
6491  }
6492  switch (neighbor) {
6493  case 0: // Cannot parse environment variable -- neighbor status unknown.
6494  // Assume it is the incompatible format of future version of the
6495  // library. Assume the other library is alive.
6496  // WARN( ... ); // TODO: Issue a warning.
6497  file_name = "unknown library";
6498  KMP_FALLTHROUGH();
6499  // Attention! Falling to the next case. That's intentional.
6500  case 1: { // Neighbor is alive.
6501  // Check it is allowed.
6502  char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6503  if (!__kmp_str_match_true(duplicate_ok)) {
6504  // That's not allowed. Issue fatal error.
6505  __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6506  KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6507  }
6508  KMP_INTERNAL_FREE(duplicate_ok);
6509  __kmp_duplicate_library_ok = 1;
6510  done = 1; // Exit the loop.
6511  } break;
6512  case 2: { // Neighbor is dead.
6513  // Clear the variable and try to register library again.
6514  __kmp_env_unset(name);
6515  } break;
6516  default: { KMP_DEBUG_ASSERT(0); } break;
6517  }
6518  }
6519  KMP_INTERNAL_FREE((void *)value);
6520  }
6521  KMP_INTERNAL_FREE((void *)name);
6522 
6523 } // func __kmp_register_library_startup
6524 
6525 void __kmp_unregister_library(void) {
6526 
6527  char *name = __kmp_reg_status_name();
6528  char *value = __kmp_env_get(name);
6529 
6530  KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6531  KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6532  if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6533  // Ok, this is our variable. Delete it.
6534  __kmp_env_unset(name);
6535  }
6536 
6537  KMP_INTERNAL_FREE(__kmp_registration_str);
6538  KMP_INTERNAL_FREE(value);
6539  KMP_INTERNAL_FREE(name);
6540 
6541  __kmp_registration_flag = 0;
6542  __kmp_registration_str = NULL;
6543 
6544 } // __kmp_unregister_library
6545 
6546 // End of Library registration stuff.
6547 // -----------------------------------------------------------------------------
6548 
6549 #if KMP_MIC_SUPPORTED
6550 
6551 static void __kmp_check_mic_type() {
6552  kmp_cpuid_t cpuid_state = {0};
6553  kmp_cpuid_t *cs_p = &cpuid_state;
6554  __kmp_x86_cpuid(1, 0, cs_p);
6555  // We don't support mic1 at the moment
6556  if ((cs_p->eax & 0xff0) == 0xB10) {
6557  __kmp_mic_type = mic2;
6558  } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6559  __kmp_mic_type = mic3;
6560  } else {
6561  __kmp_mic_type = non_mic;
6562  }
6563 }
6564 
6565 #endif /* KMP_MIC_SUPPORTED */
6566 
6567 static void __kmp_do_serial_initialize(void) {
6568  int i, gtid;
6569  int size;
6570 
6571  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6572 
6573  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6574  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6575  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6576  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6577  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6578 
6579 #if OMPT_SUPPORT
6580  ompt_pre_init();
6581 #endif
6582 
6583  __kmp_validate_locks();
6584 
6585  /* Initialize internal memory allocator */
6586  __kmp_init_allocator();
6587 
6588  /* Register the library startup via an environment variable and check to see
6589  whether another copy of the library is already registered. */
6590 
6591  __kmp_register_library_startup();
6592 
6593  /* TODO reinitialization of library */
6594  if (TCR_4(__kmp_global.g.g_done)) {
6595  KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6596  }
6597 
6598  __kmp_global.g.g_abort = 0;
6599  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6600 
6601 /* initialize the locks */
6602 #if KMP_USE_ADAPTIVE_LOCKS
6603 #if KMP_DEBUG_ADAPTIVE_LOCKS
6604  __kmp_init_speculative_stats();
6605 #endif
6606 #endif
6607 #if KMP_STATS_ENABLED
6608  __kmp_stats_init();
6609 #endif
6610  __kmp_init_lock(&__kmp_global_lock);
6611  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6612  __kmp_init_lock(&__kmp_debug_lock);
6613  __kmp_init_atomic_lock(&__kmp_atomic_lock);
6614  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6615  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6616  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6617  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6618  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6619  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6620  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6621  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6622  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6623  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6624  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6625  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6626  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6627  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6628 #if KMP_USE_MONITOR
6629  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6630 #endif
6631  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6632 
6633  /* conduct initialization and initial setup of configuration */
6634 
6635  __kmp_runtime_initialize();
6636 
6637 #if KMP_MIC_SUPPORTED
6638  __kmp_check_mic_type();
6639 #endif
6640 
6641 // Some global variable initialization moved here from kmp_env_initialize()
6642 #ifdef KMP_DEBUG
6643  kmp_diag = 0;
6644 #endif
6645  __kmp_abort_delay = 0;
6646 
6647  // From __kmp_init_dflt_team_nth()
6648  /* assume the entire machine will be used */
6649  __kmp_dflt_team_nth_ub = __kmp_xproc;
6650  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6651  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6652  }
6653  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6654  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6655  }
6656  __kmp_max_nth = __kmp_sys_max_nth;
6657  __kmp_cg_max_nth = __kmp_sys_max_nth;
6658  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6659  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6660  __kmp_teams_max_nth = __kmp_sys_max_nth;
6661  }
6662 
6663  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6664  // part
6665  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6666 #if KMP_USE_MONITOR
6667  __kmp_monitor_wakeups =
6668  KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6669  __kmp_bt_intervals =
6670  KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6671 #endif
6672  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6673  __kmp_library = library_throughput;
6674  // From KMP_SCHEDULE initialization
6675  __kmp_static = kmp_sch_static_balanced;
6676 // AC: do not use analytical here, because it is non-monotonous
6677 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6678 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6679 // need to repeat assignment
6680 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6681 // bit control and barrier method control parts
6682 #if KMP_FAST_REDUCTION_BARRIER
6683 #define kmp_reduction_barrier_gather_bb ((int)1)
6684 #define kmp_reduction_barrier_release_bb ((int)1)
6685 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6686 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6687 #endif // KMP_FAST_REDUCTION_BARRIER
6688  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6689  __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6690  __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6691  __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6692  __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6693 #if KMP_FAST_REDUCTION_BARRIER
6694  if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6695  // lin_64 ): hyper,1
6696  __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6697  __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6698  __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6699  __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6700  }
6701 #endif // KMP_FAST_REDUCTION_BARRIER
6702  }
6703 #if KMP_FAST_REDUCTION_BARRIER
6704 #undef kmp_reduction_barrier_release_pat
6705 #undef kmp_reduction_barrier_gather_pat
6706 #undef kmp_reduction_barrier_release_bb
6707 #undef kmp_reduction_barrier_gather_bb
6708 #endif // KMP_FAST_REDUCTION_BARRIER
6709 #if KMP_MIC_SUPPORTED
6710  if (__kmp_mic_type == mic2) { // KNC
6711  // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6712  __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6713  __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6714  1; // forkjoin release
6715  __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6716  __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6717  }
6718 #if KMP_FAST_REDUCTION_BARRIER
6719  if (__kmp_mic_type == mic2) { // KNC
6720  __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6721  __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6722  }
6723 #endif // KMP_FAST_REDUCTION_BARRIER
6724 #endif // KMP_MIC_SUPPORTED
6725 
6726 // From KMP_CHECKS initialization
6727 #ifdef KMP_DEBUG
6728  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6729 #else
6730  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6731 #endif
6732 
6733  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6734  __kmp_foreign_tp = TRUE;
6735 
6736  __kmp_global.g.g_dynamic = FALSE;
6737  __kmp_global.g.g_dynamic_mode = dynamic_default;
6738 
6739  __kmp_env_initialize(NULL);
6740 
6741 // Print all messages in message catalog for testing purposes.
6742 #ifdef KMP_DEBUG
6743  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6744  if (__kmp_str_match_true(val)) {
6745  kmp_str_buf_t buffer;
6746  __kmp_str_buf_init(&buffer);
6747  __kmp_i18n_dump_catalog(&buffer);
6748  __kmp_printf("%s", buffer.str);
6749  __kmp_str_buf_free(&buffer);
6750  }
6751  __kmp_env_free(&val);
6752 #endif
6753 
6754  __kmp_threads_capacity =
6755  __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6756  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6757  __kmp_tp_capacity = __kmp_default_tp_capacity(
6758  __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6759 
6760  // If the library is shut down properly, both pools must be NULL. Just in
6761  // case, set them to NULL -- some memory may leak, but subsequent code will
6762  // work even if pools are not freed.
6763  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6764  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6765  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6766  __kmp_thread_pool = NULL;
6767  __kmp_thread_pool_insert_pt = NULL;
6768  __kmp_team_pool = NULL;
6769 
6770  /* Allocate all of the variable sized records */
6771  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6772  * expandable */
6773  /* Since allocation is cache-aligned, just add extra padding at the end */
6774  size =
6775  (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6776  CACHE_LINE;
6777  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6778  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6779  sizeof(kmp_info_t *) * __kmp_threads_capacity);
6780 
6781  /* init thread counts */
6782  KMP_DEBUG_ASSERT(__kmp_all_nth ==
6783  0); // Asserts fail if the library is reinitializing and
6784  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6785  __kmp_all_nth = 0;
6786  __kmp_nth = 0;
6787 
6788  /* setup the uber master thread and hierarchy */
6789  gtid = __kmp_register_root(TRUE);
6790  KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6791  KMP_ASSERT(KMP_UBER_GTID(gtid));
6792  KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6793 
6794  KMP_MB(); /* Flush all pending memory write invalidates. */
6795 
6796  __kmp_common_initialize();
6797 
6798 #if KMP_OS_UNIX
6799  /* invoke the child fork handler */
6800  __kmp_register_atfork();
6801 #endif
6802 
6803 #if !KMP_DYNAMIC_LIB
6804  {
6805  /* Invoke the exit handler when the program finishes, only for static
6806  library. For dynamic library, we already have _fini and DllMain. */
6807  int rc = atexit(__kmp_internal_end_atexit);
6808  if (rc != 0) {
6809  __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6810  __kmp_msg_null);
6811  }
6812  }
6813 #endif
6814 
6815 #if KMP_HANDLE_SIGNALS
6816 #if KMP_OS_UNIX
6817  /* NOTE: make sure that this is called before the user installs their own
6818  signal handlers so that the user handlers are called first. this way they
6819  can return false, not call our handler, avoid terminating the library, and
6820  continue execution where they left off. */
6821  __kmp_install_signals(FALSE);
6822 #endif /* KMP_OS_UNIX */
6823 #if KMP_OS_WINDOWS
6824  __kmp_install_signals(TRUE);
6825 #endif /* KMP_OS_WINDOWS */
6826 #endif
6827 
6828  /* we have finished the serial initialization */
6829  __kmp_init_counter++;
6830 
6831  __kmp_init_serial = TRUE;
6832 
6833  if (__kmp_settings) {
6834  __kmp_env_print();
6835  }
6836 
6837 #if OMP_40_ENABLED
6838  if (__kmp_display_env || __kmp_display_env_verbose) {
6839  __kmp_env_print_2();
6840  }
6841 #endif // OMP_40_ENABLED
6842 
6843 #if OMPT_SUPPORT
6844  ompt_post_init();
6845 #endif
6846 
6847  KMP_MB();
6848 
6849  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6850 }
6851 
6852 void __kmp_serial_initialize(void) {
6853  if (__kmp_init_serial) {
6854  return;
6855  }
6856  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6857  if (__kmp_init_serial) {
6858  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6859  return;
6860  }
6861  __kmp_do_serial_initialize();
6862  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6863 }
6864 
6865 static void __kmp_do_middle_initialize(void) {
6866  int i, j;
6867  int prev_dflt_team_nth;
6868 
6869  if (!__kmp_init_serial) {
6870  __kmp_do_serial_initialize();
6871  }
6872 
6873  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6874 
6875  // Save the previous value for the __kmp_dflt_team_nth so that
6876  // we can avoid some reinitialization if it hasn't changed.
6877  prev_dflt_team_nth = __kmp_dflt_team_nth;
6878 
6879 #if KMP_AFFINITY_SUPPORTED
6880  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6881  // number of cores on the machine.
6882  __kmp_affinity_initialize();
6883 
6884  // Run through the __kmp_threads array and set the affinity mask
6885  // for each root thread that is currently registered with the RTL.
6886  for (i = 0; i < __kmp_threads_capacity; i++) {
6887  if (TCR_PTR(__kmp_threads[i]) != NULL) {
6888  __kmp_affinity_set_init_mask(i, TRUE);
6889  }
6890  }
6891 #endif /* KMP_AFFINITY_SUPPORTED */
6892 
6893  KMP_ASSERT(__kmp_xproc > 0);
6894  if (__kmp_avail_proc == 0) {
6895  __kmp_avail_proc = __kmp_xproc;
6896  }
6897 
6898  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6899  // correct them now
6900  j = 0;
6901  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6902  __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6903  __kmp_avail_proc;
6904  j++;
6905  }
6906 
6907  if (__kmp_dflt_team_nth == 0) {
6908 #ifdef KMP_DFLT_NTH_CORES
6909  // Default #threads = #cores
6910  __kmp_dflt_team_nth = __kmp_ncores;
6911  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6912  "__kmp_ncores (%d)\n",
6913  __kmp_dflt_team_nth));
6914 #else
6915  // Default #threads = #available OS procs
6916  __kmp_dflt_team_nth = __kmp_avail_proc;
6917  KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6918  "__kmp_avail_proc(%d)\n",
6919  __kmp_dflt_team_nth));
6920 #endif /* KMP_DFLT_NTH_CORES */
6921  }
6922 
6923  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6924  __kmp_dflt_team_nth = KMP_MIN_NTH;
6925  }
6926  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6927  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6928  }
6929 
6930  // There's no harm in continuing if the following check fails,
6931  // but it indicates an error in the previous logic.
6932  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6933 
6934  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6935  // Run through the __kmp_threads array and set the num threads icv for each
6936  // root thread that is currently registered with the RTL (which has not
6937  // already explicitly set its nthreads-var with a call to
6938  // omp_set_num_threads()).
6939  for (i = 0; i < __kmp_threads_capacity; i++) {
6940  kmp_info_t *thread = __kmp_threads[i];
6941  if (thread == NULL)
6942  continue;
6943  if (thread->th.th_current_task->td_icvs.nproc != 0)
6944  continue;
6945 
6946  set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
6947  }
6948  }
6949  KA_TRACE(
6950  20,
6951  ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6952  __kmp_dflt_team_nth));
6953 
6954 #ifdef KMP_ADJUST_BLOCKTIME
6955  /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
6956  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6957  KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6958  if (__kmp_nth > __kmp_avail_proc) {
6959  __kmp_zero_bt = TRUE;
6960  }
6961  }
6962 #endif /* KMP_ADJUST_BLOCKTIME */
6963 
6964  /* we have finished middle initialization */
6965  TCW_SYNC_4(__kmp_init_middle, TRUE);
6966 
6967  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
6968 }
6969 
6970 void __kmp_middle_initialize(void) {
6971  if (__kmp_init_middle) {
6972  return;
6973  }
6974  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6975  if (__kmp_init_middle) {
6976  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6977  return;
6978  }
6979  __kmp_do_middle_initialize();
6980  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6981 }
6982 
6983 void __kmp_parallel_initialize(void) {
6984  int gtid = __kmp_entry_gtid(); // this might be a new root
6985 
6986  /* synchronize parallel initialization (for sibling) */
6987  if (TCR_4(__kmp_init_parallel))
6988  return;
6989  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6990  if (TCR_4(__kmp_init_parallel)) {
6991  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6992  return;
6993  }
6994 
6995  /* TODO reinitialization after we have already shut down */
6996  if (TCR_4(__kmp_global.g.g_done)) {
6997  KA_TRACE(
6998  10,
6999  ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7000  __kmp_infinite_loop();
7001  }
7002 
7003  /* jc: The lock __kmp_initz_lock is already held, so calling
7004  __kmp_serial_initialize would cause a deadlock. So we call
7005  __kmp_do_serial_initialize directly. */
7006  if (!__kmp_init_middle) {
7007  __kmp_do_middle_initialize();
7008  }
7009 
7010 #if OMP_50_ENABLED
7011  __kmp_resume_if_hard_paused();
7012 #endif
7013 
7014  /* begin initialization */
7015  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7016  KMP_ASSERT(KMP_UBER_GTID(gtid));
7017 
7018 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7019  // Save the FP control regs.
7020  // Worker threads will set theirs to these values at thread startup.
7021  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7022  __kmp_store_mxcsr(&__kmp_init_mxcsr);
7023  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7024 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7025 
7026 #if KMP_OS_UNIX
7027 #if KMP_HANDLE_SIGNALS
7028  /* must be after __kmp_serial_initialize */
7029  __kmp_install_signals(TRUE);
7030 #endif
7031 #endif
7032 
7033  __kmp_suspend_initialize();
7034 
7035 #if defined(USE_LOAD_BALANCE)
7036  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7037  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7038  }
7039 #else
7040  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7041  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7042  }
7043 #endif
7044 
7045  if (__kmp_version) {
7046  __kmp_print_version_2();
7047  }
7048 
7049  /* we have finished parallel initialization */
7050  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7051 
7052  KMP_MB();
7053  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7054 
7055  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7056 }
7057 
7058 /* ------------------------------------------------------------------------ */
7059 
7060 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7061  kmp_team_t *team) {
7062  kmp_disp_t *dispatch;
7063 
7064  KMP_MB();
7065 
7066  /* none of the threads have encountered any constructs, yet. */
7067  this_thr->th.th_local.this_construct = 0;
7068 #if KMP_CACHE_MANAGE
7069  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7070 #endif /* KMP_CACHE_MANAGE */
7071  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7072  KMP_DEBUG_ASSERT(dispatch);
7073  KMP_DEBUG_ASSERT(team->t.t_dispatch);
7074  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7075  // this_thr->th.th_info.ds.ds_tid ] );
7076 
7077  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7078 #if OMP_45_ENABLED
7079  dispatch->th_doacross_buf_idx =
7080  0; /* reset the doacross dispatch buffer counter */
7081 #endif
7082  if (__kmp_env_consistency_check)
7083  __kmp_push_parallel(gtid, team->t.t_ident);
7084 
7085  KMP_MB(); /* Flush all pending memory write invalidates. */
7086 }
7087 
7088 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7089  kmp_team_t *team) {
7090  if (__kmp_env_consistency_check)
7091  __kmp_pop_parallel(gtid, team->t.t_ident);
7092 
7093  __kmp_finish_implicit_task(this_thr);
7094 }
7095 
7096 int __kmp_invoke_task_func(int gtid) {
7097  int rc;
7098  int tid = __kmp_tid_from_gtid(gtid);
7099  kmp_info_t *this_thr = __kmp_threads[gtid];
7100  kmp_team_t *team = this_thr->th.th_team;
7101 
7102  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7103 #if USE_ITT_BUILD
7104  if (__itt_stack_caller_create_ptr) {
7105  __kmp_itt_stack_callee_enter(
7106  (__itt_caller)
7107  team->t.t_stack_id); // inform ittnotify about entering user's code
7108  }
7109 #endif /* USE_ITT_BUILD */
7110 #if INCLUDE_SSC_MARKS
7111  SSC_MARK_INVOKING();
7112 #endif
7113 
7114 #if OMPT_SUPPORT
7115  void *dummy;
7116  void **exit_runtime_p;
7117  ompt_data_t *my_task_data;
7118  ompt_data_t *my_parallel_data;
7119  int ompt_team_size;
7120 
7121  if (ompt_enabled.enabled) {
7122  exit_runtime_p = &(
7123  team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7124  } else {
7125  exit_runtime_p = &dummy;
7126  }
7127 
7128  my_task_data =
7129  &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7130  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7131  if (ompt_enabled.ompt_callback_implicit_task) {
7132  ompt_team_size = team->t.t_nproc;
7133  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7134  ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7135  __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7136  OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7137  }
7138 #endif
7139 
7140  {
7141  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
7142  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
7143  rc =
7144  __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7145  tid, (int)team->t.t_argc, (void **)team->t.t_argv
7146 #if OMPT_SUPPORT
7147  ,
7148  exit_runtime_p
7149 #endif
7150  );
7151 #if OMPT_SUPPORT
7152  *exit_runtime_p = NULL;
7153 #endif
7154  }
7155 
7156 #if USE_ITT_BUILD
7157  if (__itt_stack_caller_create_ptr) {
7158  __kmp_itt_stack_callee_leave(
7159  (__itt_caller)
7160  team->t.t_stack_id); // inform ittnotify about leaving user's code
7161  }
7162 #endif /* USE_ITT_BUILD */
7163  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7164 
7165  return rc;
7166 }
7167 
7168 #if OMP_40_ENABLED
7169 void __kmp_teams_master(int gtid) {
7170  // This routine is called by all master threads in teams construct
7171  kmp_info_t *thr = __kmp_threads[gtid];
7172  kmp_team_t *team = thr->th.th_team;
7173  ident_t *loc = team->t.t_ident;
7174  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7175  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7176  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7177  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7178  __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7179 
7180  // This thread is a new CG root. Set up the proper variables.
7181  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7182  tmp->cg_root = thr; // Make thr the CG root
7183  // Init to thread limit that was stored when league masters were forked
7184  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7185  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7186  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7187  " cg_threads to 1\n",
7188  thr, tmp));
7189  tmp->up = thr->th.th_cg_roots;
7190  thr->th.th_cg_roots = tmp;
7191 
7192 // Launch league of teams now, but not let workers execute
7193 // (they hang on fork barrier until next parallel)
7194 #if INCLUDE_SSC_MARKS
7195  SSC_MARK_FORKING();
7196 #endif
7197  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7198  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7199  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7200 #if INCLUDE_SSC_MARKS
7201  SSC_MARK_JOINING();
7202 #endif
7203  // If the team size was reduced from the limit, set it to the new size
7204  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7205  thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7206  // AC: last parameter "1" eliminates join barrier which won't work because
7207  // worker threads are in a fork barrier waiting for more parallel regions
7208  __kmp_join_call(loc, gtid
7209 #if OMPT_SUPPORT
7210  ,
7211  fork_context_intel
7212 #endif
7213  ,
7214  1);
7215 }
7216 
7217 int __kmp_invoke_teams_master(int gtid) {
7218  kmp_info_t *this_thr = __kmp_threads[gtid];
7219  kmp_team_t *team = this_thr->th.th_team;
7220 #if KMP_DEBUG
7221  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7222  KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7223  (void *)__kmp_teams_master);
7224 #endif
7225  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7226  __kmp_teams_master(gtid);
7227  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7228  return 1;
7229 }
7230 #endif /* OMP_40_ENABLED */
7231 
7232 /* this sets the requested number of threads for the next parallel region
7233  encountered by this team. since this should be enclosed in the forkjoin
7234  critical section it should avoid race conditions with assymmetrical nested
7235  parallelism */
7236 
7237 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7238  kmp_info_t *thr = __kmp_threads[gtid];
7239 
7240  if (num_threads > 0)
7241  thr->th.th_set_nproc = num_threads;
7242 }
7243 
7244 #if OMP_40_ENABLED
7245 
7246 /* this sets the requested number of teams for the teams region and/or
7247  the number of threads for the next parallel region encountered */
7248 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7249  int num_threads) {
7250  kmp_info_t *thr = __kmp_threads[gtid];
7251  KMP_DEBUG_ASSERT(num_teams >= 0);
7252  KMP_DEBUG_ASSERT(num_threads >= 0);
7253 
7254  if (num_teams == 0)
7255  num_teams = 1; // default number of teams is 1.
7256  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
7257  if (!__kmp_reserve_warn) {
7258  __kmp_reserve_warn = 1;
7259  __kmp_msg(kmp_ms_warning,
7260  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7261  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7262  }
7263  num_teams = __kmp_teams_max_nth;
7264  }
7265  // Set number of teams (number of threads in the outer "parallel" of the
7266  // teams)
7267  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7268 
7269  // Remember the number of threads for inner parallel regions
7270  if (num_threads == 0) {
7271  if (!TCR_4(__kmp_init_middle))
7272  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
7273  num_threads = __kmp_avail_proc / num_teams;
7274  if (num_teams * num_threads > __kmp_teams_max_nth) {
7275  // adjust num_threads w/o warning as it is not user setting
7276  num_threads = __kmp_teams_max_nth / num_teams;
7277  }
7278  } else {
7279  // This thread will be the master of the league masters
7280  // Store new thread limit; old limit is saved in th_cg_roots list
7281  thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7282 
7283  if (num_teams * num_threads > __kmp_teams_max_nth) {
7284  int new_threads = __kmp_teams_max_nth / num_teams;
7285  if (!__kmp_reserve_warn) { // user asked for too many threads
7286  __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7287  __kmp_msg(kmp_ms_warning,
7288  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7289  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7290  }
7291  num_threads = new_threads;
7292  }
7293  }
7294  thr->th.th_teams_size.nth = num_threads;
7295 }
7296 
7297 // Set the proc_bind var to use in the following parallel region.
7298 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7299  kmp_info_t *thr = __kmp_threads[gtid];
7300  thr->th.th_set_proc_bind = proc_bind;
7301 }
7302 
7303 #endif /* OMP_40_ENABLED */
7304 
7305 /* Launch the worker threads into the microtask. */
7306 
7307 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7308  kmp_info_t *this_thr = __kmp_threads[gtid];
7309 
7310 #ifdef KMP_DEBUG
7311  int f;
7312 #endif /* KMP_DEBUG */
7313 
7314  KMP_DEBUG_ASSERT(team);
7315  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7316  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7317  KMP_MB(); /* Flush all pending memory write invalidates. */
7318 
7319  team->t.t_construct = 0; /* no single directives seen yet */
7320  team->t.t_ordered.dt.t_value =
7321  0; /* thread 0 enters the ordered section first */
7322 
7323  /* Reset the identifiers on the dispatch buffer */
7324  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7325  if (team->t.t_max_nproc > 1) {
7326  int i;
7327  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7328  team->t.t_disp_buffer[i].buffer_index = i;
7329 #if OMP_45_ENABLED
7330  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7331 #endif
7332  }
7333  } else {
7334  team->t.t_disp_buffer[0].buffer_index = 0;
7335 #if OMP_45_ENABLED
7336  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7337 #endif
7338  }
7339 
7340  KMP_MB(); /* Flush all pending memory write invalidates. */
7341  KMP_ASSERT(this_thr->th.th_team == team);
7342 
7343 #ifdef KMP_DEBUG
7344  for (f = 0; f < team->t.t_nproc; f++) {
7345  KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7346  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7347  }
7348 #endif /* KMP_DEBUG */
7349 
7350  /* release the worker threads so they may begin working */
7351  __kmp_fork_barrier(gtid, 0);
7352 }
7353 
7354 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7355  kmp_info_t *this_thr = __kmp_threads[gtid];
7356 
7357  KMP_DEBUG_ASSERT(team);
7358  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7359  KMP_ASSERT(KMP_MASTER_GTID(gtid));
7360  KMP_MB(); /* Flush all pending memory write invalidates. */
7361 
7362 /* Join barrier after fork */
7363 
7364 #ifdef KMP_DEBUG
7365  if (__kmp_threads[gtid] &&
7366  __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7367  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7368  __kmp_threads[gtid]);
7369  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7370  "team->t.t_nproc=%d\n",
7371  gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7372  team->t.t_nproc);
7373  __kmp_print_structure();
7374  }
7375  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7376  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7377 #endif /* KMP_DEBUG */
7378 
7379  __kmp_join_barrier(gtid); /* wait for everyone */
7380 #if OMPT_SUPPORT
7381  if (ompt_enabled.enabled &&
7382  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7383  int ds_tid = this_thr->th.th_info.ds.ds_tid;
7384  ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7385  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7386 #if OMPT_OPTIONAL
7387  void *codeptr = NULL;
7388  if (KMP_MASTER_TID(ds_tid) &&
7389  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7390  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7391  codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7392 
7393  if (ompt_enabled.ompt_callback_sync_region_wait) {
7394  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7395  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7396  }
7397  if (ompt_enabled.ompt_callback_sync_region) {
7398  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7399  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
7400  }
7401 #endif
7402  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7403  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7404  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7405  }
7406  }
7407 #endif
7408 
7409  KMP_MB(); /* Flush all pending memory write invalidates. */
7410  KMP_ASSERT(this_thr->th.th_team == team);
7411 }
7412 
7413 /* ------------------------------------------------------------------------ */
7414 
7415 #ifdef USE_LOAD_BALANCE
7416 
7417 // Return the worker threads actively spinning in the hot team, if we
7418 // are at the outermost level of parallelism. Otherwise, return 0.
7419 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7420  int i;
7421  int retval;
7422  kmp_team_t *hot_team;
7423 
7424  if (root->r.r_active) {
7425  return 0;
7426  }
7427  hot_team = root->r.r_hot_team;
7428  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7429  return hot_team->t.t_nproc - 1; // Don't count master thread
7430  }
7431 
7432  // Skip the master thread - it is accounted for elsewhere.
7433  retval = 0;
7434  for (i = 1; i < hot_team->t.t_nproc; i++) {
7435  if (hot_team->t.t_threads[i]->th.th_active) {
7436  retval++;
7437  }
7438  }
7439  return retval;
7440 }
7441 
7442 // Perform an automatic adjustment to the number of
7443 // threads used by the next parallel region.
7444 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7445  int retval;
7446  int pool_active;
7447  int hot_team_active;
7448  int team_curr_active;
7449  int system_active;
7450 
7451  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7452  set_nproc));
7453  KMP_DEBUG_ASSERT(root);
7454  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7455  ->th.th_current_task->td_icvs.dynamic == TRUE);
7456  KMP_DEBUG_ASSERT(set_nproc > 1);
7457 
7458  if (set_nproc == 1) {
7459  KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7460  return 1;
7461  }
7462 
7463  // Threads that are active in the thread pool, active in the hot team for this
7464  // particular root (if we are at the outer par level), and the currently
7465  // executing thread (to become the master) are available to add to the new
7466  // team, but are currently contributing to the system load, and must be
7467  // accounted for.
7468  pool_active = __kmp_thread_pool_active_nth;
7469  hot_team_active = __kmp_active_hot_team_nproc(root);
7470  team_curr_active = pool_active + hot_team_active + 1;
7471 
7472  // Check the system load.
7473  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7474  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7475  "hot team active = %d\n",
7476  system_active, pool_active, hot_team_active));
7477 
7478  if (system_active < 0) {
7479  // There was an error reading the necessary info from /proc, so use the
7480  // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7481  // = dynamic_thread_limit, we shouldn't wind up getting back here.
7482  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7483  KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7484 
7485  // Make this call behave like the thread limit algorithm.
7486  retval = __kmp_avail_proc - __kmp_nth +
7487  (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7488  if (retval > set_nproc) {
7489  retval = set_nproc;
7490  }
7491  if (retval < KMP_MIN_NTH) {
7492  retval = KMP_MIN_NTH;
7493  }
7494 
7495  KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7496  retval));
7497  return retval;
7498  }
7499 
7500  // There is a slight delay in the load balance algorithm in detecting new
7501  // running procs. The real system load at this instant should be at least as
7502  // large as the #active omp thread that are available to add to the team.
7503  if (system_active < team_curr_active) {
7504  system_active = team_curr_active;
7505  }
7506  retval = __kmp_avail_proc - system_active + team_curr_active;
7507  if (retval > set_nproc) {
7508  retval = set_nproc;
7509  }
7510  if (retval < KMP_MIN_NTH) {
7511  retval = KMP_MIN_NTH;
7512  }
7513 
7514  KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7515  return retval;
7516 } // __kmp_load_balance_nproc()
7517 
7518 #endif /* USE_LOAD_BALANCE */
7519 
7520 /* ------------------------------------------------------------------------ */
7521 
7522 /* NOTE: this is called with the __kmp_init_lock held */
7523 void __kmp_cleanup(void) {
7524  int f;
7525 
7526  KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7527 
7528  if (TCR_4(__kmp_init_parallel)) {
7529 #if KMP_HANDLE_SIGNALS
7530  __kmp_remove_signals();
7531 #endif
7532  TCW_4(__kmp_init_parallel, FALSE);
7533  }
7534 
7535  if (TCR_4(__kmp_init_middle)) {
7536 #if KMP_AFFINITY_SUPPORTED
7537  __kmp_affinity_uninitialize();
7538 #endif /* KMP_AFFINITY_SUPPORTED */
7539  __kmp_cleanup_hierarchy();
7540  TCW_4(__kmp_init_middle, FALSE);
7541  }
7542 
7543  KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7544 
7545  if (__kmp_init_serial) {
7546  __kmp_runtime_destroy();
7547  __kmp_init_serial = FALSE;
7548  }
7549 
7550  __kmp_cleanup_threadprivate_caches();
7551 
7552  for (f = 0; f < __kmp_threads_capacity; f++) {
7553  if (__kmp_root[f] != NULL) {
7554  __kmp_free(__kmp_root[f]);
7555  __kmp_root[f] = NULL;
7556  }
7557  }
7558  __kmp_free(__kmp_threads);
7559  // __kmp_threads and __kmp_root were allocated at once, as single block, so
7560  // there is no need in freeing __kmp_root.
7561  __kmp_threads = NULL;
7562  __kmp_root = NULL;
7563  __kmp_threads_capacity = 0;
7564 
7565 #if KMP_USE_DYNAMIC_LOCK
7566  __kmp_cleanup_indirect_user_locks();
7567 #else
7568  __kmp_cleanup_user_locks();
7569 #endif
7570 
7571 #if KMP_AFFINITY_SUPPORTED
7572  KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7573  __kmp_cpuinfo_file = NULL;
7574 #endif /* KMP_AFFINITY_SUPPORTED */
7575 
7576 #if KMP_USE_ADAPTIVE_LOCKS
7577 #if KMP_DEBUG_ADAPTIVE_LOCKS
7578  __kmp_print_speculative_stats();
7579 #endif
7580 #endif
7581  KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7582  __kmp_nested_nth.nth = NULL;
7583  __kmp_nested_nth.size = 0;
7584  __kmp_nested_nth.used = 0;
7585  KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7586  __kmp_nested_proc_bind.bind_types = NULL;
7587  __kmp_nested_proc_bind.size = 0;
7588  __kmp_nested_proc_bind.used = 0;
7589 #if OMP_50_ENABLED
7590  if (__kmp_affinity_format) {
7591  KMP_INTERNAL_FREE(__kmp_affinity_format);
7592  __kmp_affinity_format = NULL;
7593  }
7594 #endif
7595 
7596  __kmp_i18n_catclose();
7597 
7598 #if KMP_USE_HIER_SCHED
7599  __kmp_hier_scheds.deallocate();
7600 #endif
7601 
7602 #if KMP_STATS_ENABLED
7603  __kmp_stats_fini();
7604 #endif
7605 
7606  KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7607 }
7608 
7609 /* ------------------------------------------------------------------------ */
7610 
7611 int __kmp_ignore_mppbeg(void) {
7612  char *env;
7613 
7614  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
7615  if (__kmp_str_match_false(env))
7616  return FALSE;
7617  }
7618  // By default __kmpc_begin() is no-op.
7619  return TRUE;
7620 }
7621 
7622 int __kmp_ignore_mppend(void) {
7623  char *env;
7624 
7625  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
7626  if (__kmp_str_match_false(env))
7627  return FALSE;
7628  }
7629  // By default __kmpc_end() is no-op.
7630  return TRUE;
7631 }
7632 
7633 void __kmp_internal_begin(void) {
7634  int gtid;
7635  kmp_root_t *root;
7636 
7637  /* this is a very important step as it will register new sibling threads
7638  and assign these new uber threads a new gtid */
7639  gtid = __kmp_entry_gtid();
7640  root = __kmp_threads[gtid]->th.th_root;
7641  KMP_ASSERT(KMP_UBER_GTID(gtid));
7642 
7643  if (root->r.r_begin)
7644  return;
7645  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
7646  if (root->r.r_begin) {
7647  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7648  return;
7649  }
7650 
7651  root->r.r_begin = TRUE;
7652 
7653  __kmp_release_lock(&root->r.r_begin_lock, gtid);
7654 }
7655 
7656 /* ------------------------------------------------------------------------ */
7657 
7658 void __kmp_user_set_library(enum library_type arg) {
7659  int gtid;
7660  kmp_root_t *root;
7661  kmp_info_t *thread;
7662 
7663  /* first, make sure we are initialized so we can get our gtid */
7664 
7665  gtid = __kmp_entry_gtid();
7666  thread = __kmp_threads[gtid];
7667 
7668  root = thread->th.th_root;
7669 
7670  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
7671  library_serial));
7672  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
7673  thread */
7674  KMP_WARNING(SetLibraryIncorrectCall);
7675  return;
7676  }
7677 
7678  switch (arg) {
7679  case library_serial:
7680  thread->th.th_set_nproc = 0;
7681  set__nproc(thread, 1);
7682  break;
7683  case library_turnaround:
7684  thread->th.th_set_nproc = 0;
7685  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7686  : __kmp_dflt_team_nth_ub);
7687  break;
7688  case library_throughput:
7689  thread->th.th_set_nproc = 0;
7690  set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
7691  : __kmp_dflt_team_nth_ub);
7692  break;
7693  default:
7694  KMP_FATAL(UnknownLibraryType, arg);
7695  }
7696 
7697  __kmp_aux_set_library(arg);
7698 }
7699 
7700 void __kmp_aux_set_stacksize(size_t arg) {
7701  if (!__kmp_init_serial)
7702  __kmp_serial_initialize();
7703 
7704 #if KMP_OS_DARWIN
7705  if (arg & (0x1000 - 1)) {
7706  arg &= ~(0x1000 - 1);
7707  if (arg + 0x1000) /* check for overflow if we round up */
7708  arg += 0x1000;
7709  }
7710 #endif
7711  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7712 
7713  /* only change the default stacksize before the first parallel region */
7714  if (!TCR_4(__kmp_init_parallel)) {
7715  size_t value = arg; /* argument is in bytes */
7716 
7717  if (value < __kmp_sys_min_stksize)
7718  value = __kmp_sys_min_stksize;
7719  else if (value > KMP_MAX_STKSIZE)
7720  value = KMP_MAX_STKSIZE;
7721 
7722  __kmp_stksize = value;
7723 
7724  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7725  }
7726 
7727  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7728 }
7729 
7730 /* set the behaviour of the runtime library */
7731 /* TODO this can cause some odd behaviour with sibling parallelism... */
7732 void __kmp_aux_set_library(enum library_type arg) {
7733  __kmp_library = arg;
7734 
7735  switch (__kmp_library) {
7736  case library_serial: {
7737  KMP_INFORM(LibraryIsSerial);
7738  (void)__kmp_change_library(TRUE);
7739  } break;
7740  case library_turnaround:
7741  (void)__kmp_change_library(TRUE);
7742  break;
7743  case library_throughput:
7744  (void)__kmp_change_library(FALSE);
7745  break;
7746  default:
7747  KMP_FATAL(UnknownLibraryType, arg);
7748  }
7749 }
7750 
7751 /* Getting team information common for all team API */
7752 // Returns NULL if not in teams construct
7753 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
7754  kmp_info_t *thr = __kmp_entry_thread();
7755  teams_serialized = 0;
7756  if (thr->th.th_teams_microtask) {
7757  kmp_team_t *team = thr->th.th_team;
7758  int tlevel = thr->th.th_teams_level; // the level of the teams construct
7759  int ii = team->t.t_level;
7760  teams_serialized = team->t.t_serialized;
7761  int level = tlevel + 1;
7762  KMP_DEBUG_ASSERT(ii >= tlevel);
7763  while (ii > level) {
7764  for (teams_serialized = team->t.t_serialized;
7765  (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
7766  }
7767  if (team->t.t_serialized && (!teams_serialized)) {
7768  team = team->t.t_parent;
7769  continue;
7770  }
7771  if (ii > level) {
7772  team = team->t.t_parent;
7773  ii--;
7774  }
7775  }
7776  return team;
7777  }
7778  return NULL;
7779 }
7780 
7781 int __kmp_aux_get_team_num() {
7782  int serialized;
7783  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7784  if (team) {
7785  if (serialized > 1) {
7786  return 0; // teams region is serialized ( 1 team of 1 thread ).
7787  } else {
7788  return team->t.t_master_tid;
7789  }
7790  }
7791  return 0;
7792 }
7793 
7794 int __kmp_aux_get_num_teams() {
7795  int serialized;
7796  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
7797  if (team) {
7798  if (serialized > 1) {
7799  return 1;
7800  } else {
7801  return team->t.t_parent->t.t_nproc;
7802  }
7803  }
7804  return 1;
7805 }
7806 
7807 /* ------------------------------------------------------------------------ */
7808 
7809 #if OMP_50_ENABLED
7810 /*
7811  * Affinity Format Parser
7812  *
7813  * Field is in form of: %[[[0].]size]type
7814  * % and type are required (%% means print a literal '%')
7815  * type is either single char or long name surrounded by {},
7816  * e.g., N or {num_threads}
7817  * 0 => leading zeros
7818  * . => right justified when size is specified
7819  * by default output is left justified
7820  * size is the *minimum* field length
7821  * All other characters are printed as is
7822  *
7823  * Available field types:
7824  * L {thread_level} - omp_get_level()
7825  * n {thread_num} - omp_get_thread_num()
7826  * h {host} - name of host machine
7827  * P {process_id} - process id (integer)
7828  * T {thread_identifier} - native thread identifier (integer)
7829  * N {num_threads} - omp_get_num_threads()
7830  * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
7831  * a {thread_affinity} - comma separated list of integers or integer ranges
7832  * (values of affinity mask)
7833  *
7834  * Implementation-specific field types can be added
7835  * If a type is unknown, print "undefined"
7836 */
7837 
7838 // Structure holding the short name, long name, and corresponding data type
7839 // for snprintf. A table of these will represent the entire valid keyword
7840 // field types.
7841 typedef struct kmp_affinity_format_field_t {
7842  char short_name; // from spec e.g., L -> thread level
7843  const char *long_name; // from spec thread_level -> thread level
7844  char field_format; // data type for snprintf (typically 'd' or 's'
7845  // for integer or string)
7846 } kmp_affinity_format_field_t;
7847 
7848 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
7849 #if KMP_AFFINITY_SUPPORTED
7850  {'A', "thread_affinity", 's'},
7851 #endif
7852  {'t', "team_num", 'd'},
7853  {'T', "num_teams", 'd'},
7854  {'L', "nesting_level", 'd'},
7855  {'n', "thread_num", 'd'},
7856  {'N', "num_threads", 'd'},
7857  {'a', "ancestor_tnum", 'd'},
7858  {'H', "host", 's'},
7859  {'P', "process_id", 'd'},
7860  {'i', "native_thread_id", 'd'}};
7861 
7862 // Return the number of characters it takes to hold field
7863 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
7864  const char **ptr,
7865  kmp_str_buf_t *field_buffer) {
7866  int rc, format_index, field_value;
7867  const char *width_left, *width_right;
7868  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
7869  static const int FORMAT_SIZE = 20;
7870  char format[FORMAT_SIZE] = {0};
7871  char absolute_short_name = 0;
7872 
7873  KMP_DEBUG_ASSERT(gtid >= 0);
7874  KMP_DEBUG_ASSERT(th);
7875  KMP_DEBUG_ASSERT(**ptr == '%');
7876  KMP_DEBUG_ASSERT(field_buffer);
7877 
7878  __kmp_str_buf_clear(field_buffer);
7879 
7880  // Skip the initial %
7881  (*ptr)++;
7882 
7883  // Check for %% first
7884  if (**ptr == '%') {
7885  __kmp_str_buf_cat(field_buffer, "%", 1);
7886  (*ptr)++; // skip over the second %
7887  return 1;
7888  }
7889 
7890  // Parse field modifiers if they are present
7891  pad_zeros = false;
7892  if (**ptr == '0') {
7893  pad_zeros = true;
7894  (*ptr)++; // skip over 0
7895  }
7896  right_justify = false;
7897  if (**ptr == '.') {
7898  right_justify = true;
7899  (*ptr)++; // skip over .
7900  }
7901  // Parse width of field: [width_left, width_right)
7902  width_left = width_right = NULL;
7903  if (**ptr >= '0' && **ptr <= '9') {
7904  width_left = *ptr;
7905  SKIP_DIGITS(*ptr);
7906  width_right = *ptr;
7907  }
7908 
7909  // Create the format for KMP_SNPRINTF based on flags parsed above
7910  format_index = 0;
7911  format[format_index++] = '%';
7912  if (!right_justify)
7913  format[format_index++] = '-';
7914  if (pad_zeros)
7915  format[format_index++] = '0';
7916  if (width_left && width_right) {
7917  int i = 0;
7918  // Only allow 8 digit number widths.
7919  // This also prevents overflowing format variable
7920  while (i < 8 && width_left < width_right) {
7921  format[format_index++] = *width_left;
7922  width_left++;
7923  i++;
7924  }
7925  }
7926 
7927  // Parse a name (long or short)
7928  // Canonicalize the name into absolute_short_name
7929  found_valid_name = false;
7930  parse_long_name = (**ptr == '{');
7931  if (parse_long_name)
7932  (*ptr)++; // skip initial left brace
7933  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
7934  sizeof(__kmp_affinity_format_table[0]);
7935  ++i) {
7936  char short_name = __kmp_affinity_format_table[i].short_name;
7937  const char *long_name = __kmp_affinity_format_table[i].long_name;
7938  char field_format = __kmp_affinity_format_table[i].field_format;
7939  if (parse_long_name) {
7940  int length = KMP_STRLEN(long_name);
7941  if (strncmp(*ptr, long_name, length) == 0) {
7942  found_valid_name = true;
7943  (*ptr) += length; // skip the long name
7944  }
7945  } else if (**ptr == short_name) {
7946  found_valid_name = true;
7947  (*ptr)++; // skip the short name
7948  }
7949  if (found_valid_name) {
7950  format[format_index++] = field_format;
7951  format[format_index++] = '\0';
7952  absolute_short_name = short_name;
7953  break;
7954  }
7955  }
7956  if (parse_long_name) {
7957  if (**ptr != '}') {
7958  absolute_short_name = 0;
7959  } else {
7960  (*ptr)++; // skip over the right brace
7961  }
7962  }
7963 
7964  // Attempt to fill the buffer with the requested
7965  // value using snprintf within __kmp_str_buf_print()
7966  switch (absolute_short_name) {
7967  case 't':
7968  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
7969  break;
7970  case 'T':
7971  rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
7972  break;
7973  case 'L':
7974  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
7975  break;
7976  case 'n':
7977  rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
7978  break;
7979  case 'H': {
7980  static const int BUFFER_SIZE = 256;
7981  char buf[BUFFER_SIZE];
7982  __kmp_expand_host_name(buf, BUFFER_SIZE);
7983  rc = __kmp_str_buf_print(field_buffer, format, buf);
7984  } break;
7985  case 'P':
7986  rc = __kmp_str_buf_print(field_buffer, format, getpid());
7987  break;
7988  case 'i':
7989  rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
7990  break;
7991  case 'N':
7992  rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
7993  break;
7994  case 'a':
7995  field_value =
7996  __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
7997  rc = __kmp_str_buf_print(field_buffer, format, field_value);
7998  break;
7999 #if KMP_AFFINITY_SUPPORTED
8000  case 'A': {
8001  kmp_str_buf_t buf;
8002  __kmp_str_buf_init(&buf);
8003  __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8004  rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8005  __kmp_str_buf_free(&buf);
8006  } break;
8007 #endif
8008  default:
8009  // According to spec, If an implementation does not have info for field
8010  // type, then "undefined" is printed
8011  rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8012  // Skip the field
8013  if (parse_long_name) {
8014  SKIP_TOKEN(*ptr);
8015  if (**ptr == '}')
8016  (*ptr)++;
8017  } else {
8018  (*ptr)++;
8019  }
8020  }
8021 
8022  KMP_ASSERT(format_index <= FORMAT_SIZE);
8023  return rc;
8024 }
8025 
8026 /*
8027  * Return number of characters needed to hold the affinity string
8028  * (not including null byte character)
8029  * The resultant string is printed to buffer, which the caller can then
8030  * handle afterwards
8031 */
8032 size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8033  kmp_str_buf_t *buffer) {
8034  const char *parse_ptr;
8035  size_t retval;
8036  const kmp_info_t *th;
8037  kmp_str_buf_t field;
8038 
8039  KMP_DEBUG_ASSERT(buffer);
8040  KMP_DEBUG_ASSERT(gtid >= 0);
8041 
8042  __kmp_str_buf_init(&field);
8043  __kmp_str_buf_clear(buffer);
8044 
8045  th = __kmp_threads[gtid];
8046  retval = 0;
8047 
8048  // If format is NULL or zero-length string, then we use
8049  // affinity-format-var ICV
8050  parse_ptr = format;
8051  if (parse_ptr == NULL || *parse_ptr == '\0') {
8052  parse_ptr = __kmp_affinity_format;
8053  }
8054  KMP_DEBUG_ASSERT(parse_ptr);
8055 
8056  while (*parse_ptr != '\0') {
8057  // Parse a field
8058  if (*parse_ptr == '%') {
8059  // Put field in the buffer
8060  int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8061  __kmp_str_buf_catbuf(buffer, &field);
8062  retval += rc;
8063  } else {
8064  // Put literal character in buffer
8065  __kmp_str_buf_cat(buffer, parse_ptr, 1);
8066  retval++;
8067  parse_ptr++;
8068  }
8069  }
8070  __kmp_str_buf_free(&field);
8071  return retval;
8072 }
8073 
8074 // Displays the affinity string to stdout
8075 void __kmp_aux_display_affinity(int gtid, const char *format) {
8076  kmp_str_buf_t buf;
8077  __kmp_str_buf_init(&buf);
8078  __kmp_aux_capture_affinity(gtid, format, &buf);
8079  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8080  __kmp_str_buf_free(&buf);
8081 }
8082 #endif // OMP_50_ENABLED
8083 
8084 /* ------------------------------------------------------------------------ */
8085 
8086 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8087  int blocktime = arg; /* argument is in milliseconds */
8088 #if KMP_USE_MONITOR
8089  int bt_intervals;
8090 #endif
8091  int bt_set;
8092 
8093  __kmp_save_internal_controls(thread);
8094 
8095  /* Normalize and set blocktime for the teams */
8096  if (blocktime < KMP_MIN_BLOCKTIME)
8097  blocktime = KMP_MIN_BLOCKTIME;
8098  else if (blocktime > KMP_MAX_BLOCKTIME)
8099  blocktime = KMP_MAX_BLOCKTIME;
8100 
8101  set__blocktime_team(thread->th.th_team, tid, blocktime);
8102  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8103 
8104 #if KMP_USE_MONITOR
8105  /* Calculate and set blocktime intervals for the teams */
8106  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8107 
8108  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8109  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8110 #endif
8111 
8112  /* Set whether blocktime has been set to "TRUE" */
8113  bt_set = TRUE;
8114 
8115  set__bt_set_team(thread->th.th_team, tid, bt_set);
8116  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8117 #if KMP_USE_MONITOR
8118  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8119  "bt_intervals=%d, monitor_updates=%d\n",
8120  __kmp_gtid_from_tid(tid, thread->th.th_team),
8121  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8122  __kmp_monitor_wakeups));
8123 #else
8124  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8125  __kmp_gtid_from_tid(tid, thread->th.th_team),
8126  thread->th.th_team->t.t_id, tid, blocktime));
8127 #endif
8128 }
8129 
8130 void __kmp_aux_set_defaults(char const *str, int len) {
8131  if (!__kmp_init_serial) {
8132  __kmp_serial_initialize();
8133  }
8134  __kmp_env_initialize(str);
8135 
8136  if (__kmp_settings
8137 #if OMP_40_ENABLED
8138  || __kmp_display_env || __kmp_display_env_verbose
8139 #endif // OMP_40_ENABLED
8140  ) {
8141  __kmp_env_print();
8142  }
8143 } // __kmp_aux_set_defaults
8144 
8145 /* ------------------------------------------------------------------------ */
8146 /* internal fast reduction routines */
8147 
8148 PACKED_REDUCTION_METHOD_T
8149 __kmp_determine_reduction_method(
8150  ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8151  void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8152  kmp_critical_name *lck) {
8153 
8154  // Default reduction method: critical construct ( lck != NULL, like in current
8155  // PAROPT )
8156  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8157  // can be selected by RTL
8158  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8159  // can be selected by RTL
8160  // Finally, it's up to OpenMP RTL to make a decision on which method to select
8161  // among generated by PAROPT.
8162 
8163  PACKED_REDUCTION_METHOD_T retval;
8164 
8165  int team_size;
8166 
8167  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8168  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8169 
8170 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8171  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8172 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8173 
8174  retval = critical_reduce_block;
8175 
8176  // another choice of getting a team size (with 1 dynamic deference) is slower
8177  team_size = __kmp_get_team_num_threads(global_tid);
8178  if (team_size == 1) {
8179 
8180  retval = empty_reduce_block;
8181 
8182  } else {
8183 
8184  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8185 
8186 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
8187 
8188 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8189  KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD || KMP_OS_KFREEBSD
8190 
8191  int teamsize_cutoff = 4;
8192 
8193 #if KMP_MIC_SUPPORTED
8194  if (__kmp_mic_type != non_mic) {
8195  teamsize_cutoff = 8;
8196  }
8197 #endif
8198  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8199  if (tree_available) {
8200  if (team_size <= teamsize_cutoff) {
8201  if (atomic_available) {
8202  retval = atomic_reduce_block;
8203  }
8204  } else {
8205  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8206  }
8207  } else if (atomic_available) {
8208  retval = atomic_reduce_block;
8209  }
8210 #else
8211 #error "Unknown or unsupported OS"
8212 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8213  // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8214 
8215 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8216 
8217 #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD || KMP_OS_KFREEBSD
8218 
8219  // basic tuning
8220 
8221  if (atomic_available) {
8222  if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8223  retval = atomic_reduce_block;
8224  }
8225  } // otherwise: use critical section
8226 
8227 #elif KMP_OS_DARWIN
8228 
8229  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8230  if (atomic_available && (num_vars <= 3)) {
8231  retval = atomic_reduce_block;
8232  } else if (tree_available) {
8233  if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8234  (reduce_size < (2000 * sizeof(kmp_real64)))) {
8235  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8236  }
8237  } // otherwise: use critical section
8238 
8239 #else
8240 #error "Unknown or unsupported OS"
8241 #endif
8242 
8243 #else
8244 #error "Unknown or unsupported architecture"
8245 #endif
8246  }
8247 
8248  // KMP_FORCE_REDUCTION
8249 
8250  // If the team is serialized (team_size == 1), ignore the forced reduction
8251  // method and stay with the unsynchronized method (empty_reduce_block)
8252  if (__kmp_force_reduction_method != reduction_method_not_defined &&
8253  team_size != 1) {
8254 
8255  PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8256 
8257  int atomic_available, tree_available;
8258 
8259  switch ((forced_retval = __kmp_force_reduction_method)) {
8260  case critical_reduce_block:
8261  KMP_ASSERT(lck); // lck should be != 0
8262  break;
8263 
8264  case atomic_reduce_block:
8265  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8266  if (!atomic_available) {
8267  KMP_WARNING(RedMethodNotSupported, "atomic");
8268  forced_retval = critical_reduce_block;
8269  }
8270  break;
8271 
8272  case tree_reduce_block:
8273  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8274  if (!tree_available) {
8275  KMP_WARNING(RedMethodNotSupported, "tree");
8276  forced_retval = critical_reduce_block;
8277  } else {
8278 #if KMP_FAST_REDUCTION_BARRIER
8279  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8280 #endif
8281  }
8282  break;
8283 
8284  default:
8285  KMP_ASSERT(0); // "unsupported method specified"
8286  }
8287 
8288  retval = forced_retval;
8289  }
8290 
8291  KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8292 
8293 #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8294 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8295 
8296  return (retval);
8297 }
8298 
8299 // this function is for testing set/get/determine reduce method
8300 kmp_int32 __kmp_get_reduce_method(void) {
8301  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8302 }
8303 
8304 #if OMP_50_ENABLED
8305 
8306 // Soft pause sets up threads to ignore blocktime and just go to sleep.
8307 // Spin-wait code checks __kmp_pause_status and reacts accordingly.
8308 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8309 
8310 // Hard pause shuts down the runtime completely. Resume happens naturally when
8311 // OpenMP is used subsequently.
8312 void __kmp_hard_pause() {
8313  __kmp_pause_status = kmp_hard_paused;
8314  __kmp_internal_end_thread(-1);
8315 }
8316 
8317 // Soft resume sets __kmp_pause_status, and wakes up all threads.
8318 void __kmp_resume_if_soft_paused() {
8319  if (__kmp_pause_status == kmp_soft_paused) {
8320  __kmp_pause_status = kmp_not_paused;
8321 
8322  for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8323  kmp_info_t *thread = __kmp_threads[gtid];
8324  if (thread) { // Wake it if sleeping
8325  kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
8326  if (fl.is_sleeping())
8327  fl.resume(gtid);
8328  else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8329  __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8330  } else { // thread holds the lock and may sleep soon
8331  do { // until either the thread sleeps, or we can get the lock
8332  if (fl.is_sleeping()) {
8333  fl.resume(gtid);
8334  break;
8335  } else if (__kmp_try_suspend_mx(thread)) {
8336  __kmp_unlock_suspend_mx(thread);
8337  break;
8338  }
8339  } while (1);
8340  }
8341  }
8342  }
8343  }
8344 }
8345 
8346 // This function is called via __kmpc_pause_resource. Returns 0 if successful.
8347 // TODO: add warning messages
8348 int __kmp_pause_resource(kmp_pause_status_t level) {
8349  if (level == kmp_not_paused) { // requesting resume
8350  if (__kmp_pause_status == kmp_not_paused) {
8351  // error message about runtime not being paused, so can't resume
8352  return 1;
8353  } else {
8354  KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
8355  __kmp_pause_status == kmp_hard_paused);
8356  __kmp_pause_status = kmp_not_paused;
8357  return 0;
8358  }
8359  } else if (level == kmp_soft_paused) { // requesting soft pause
8360  if (__kmp_pause_status != kmp_not_paused) {
8361  // error message about already being paused
8362  return 1;
8363  } else {
8364  __kmp_soft_pause();
8365  return 0;
8366  }
8367  } else if (level == kmp_hard_paused) { // requesting hard pause
8368  if (__kmp_pause_status != kmp_not_paused) {
8369  // error message about already being paused
8370  return 1;
8371  } else {
8372  __kmp_hard_pause();
8373  return 0;
8374  }
8375  } else {
8376  // error message about invalid level
8377  return 1;
8378  }
8379 }
8380 
8381 #endif // OMP_50_ENABLED
#define KMP_COUNT_VALUE(name, value)
Adds value to specified timer (name).
Definition: kmp_stats.h:876
KMP_EXPORT void __kmpc_end_serialized_parallel(ident_t *, kmp_int32 global_tid)
#define KMP_INIT_PARTITIONED_TIMERS(name)
Initializes the paritioned timers to begin with name.
Definition: kmp_stats.h:918
sched_type
Definition: kmp.h:336
Definition: kmp.h:223
KMP_EXPORT void __kmpc_serialized_parallel(ident_t *, kmp_int32 global_tid)
kmp_int32 flags
Definition: kmp.h:225