1/*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5//===----------------------------------------------------------------------===//
6//
7// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8// See https://llvm.org/LICENSE.txt for license information.
9// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10//
11//===----------------------------------------------------------------------===//
12
13#include "kmp.h"
14#include "kmp_affinity.h"
15#include "kmp_atomic.h"
16#include "kmp_environment.h"
17#include "kmp_error.h"
18#include "kmp_i18n.h"
19#include "kmp_io.h"
20#include "kmp_itt.h"
21#include "kmp_settings.h"
22#include "kmp_stats.h"
23#include "kmp_str.h"
24#include "kmp_wait_release.h"
25#include "kmp_wrapper_getpid.h"
26#include "kmp_dispatch.h"
27#if KMP_USE_HIER_SCHED
28#include "kmp_dispatch_hier.h"
29#endif
30
31#if OMPT_SUPPORT
32#include "ompt-specific.h"
33#endif
34#if OMPD_SUPPORT
35#include "ompd-specific.h"
36#endif
37
38#if OMP_PROFILING_SUPPORT
39#include "llvm/Support/TimeProfiler.h"
40static char *ProfileTraceFile = nullptr;
41#endif
42
43/* these are temporary issues to be dealt with */
44#define KMP_USE_PRCTL 0
45
46#if KMP_OS_WINDOWS
47#include <process.h>
48#endif
49
50#if KMP_OS_WINDOWS
51// Windows does not need these include files because it doesn't use shared memory
52#else
53#include <sys/mman.h>
54#include <sys/stat.h>
55#include <fcntl.h>
56#define SHM_SIZE 1024
57#endif
58
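// [Illustrative sketch, not part of the original source] The POSIX headers
// above are pulled in because, on non-Windows systems, the runtime maps a
// small shared-memory segment of SHM_SIZE bytes (the exact use is an
// assumption here; the segment name "/example_shm" and the helper below are
// hypothetical). A minimal, self-contained way to create and map such a
// segment:
#if 0
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

static void *example_map_shm(void) {
  int fd = shm_open("/example_shm", O_CREAT | O_RDWR, 0600);
  if (fd < 0)
    return NULL;
  if (ftruncate(fd, SHM_SIZE) != 0) { // size the segment to SHM_SIZE bytes
    close(fd);
    return NULL;
  }
  void *p = mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  close(fd); // the mapping remains valid after the descriptor is closed
  return (p == MAP_FAILED) ? NULL : p;
}
#endif
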
59#if defined(KMP_GOMP_COMPAT)
60char const __kmp_version_alt_comp[] =
61 KMP_VERSION_PREFIX "alternative compiler support: yes";
62#endif /* defined(KMP_GOMP_COMPAT) */
63
64char const __kmp_version_omp_api[] =
65 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
66
67#ifdef KMP_DEBUG
68char const __kmp_version_lock[] =
69 KMP_VERSION_PREFIX "lock type: run time selectable";
70#endif /* KMP_DEBUG */
71
72#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
73
74/* ------------------------------------------------------------------------ */
75
76#if KMP_USE_MONITOR
77kmp_info_t __kmp_monitor;
78#endif
79
80/* Forward declarations */
81
82void __kmp_cleanup(void);
83
84static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
85 int gtid);
86static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
87 kmp_internal_control_t *new_icvs,
88 ident_t *loc);
89#if KMP_AFFINITY_SUPPORTED
90static void __kmp_partition_places(kmp_team_t *team,
91 int update_master_only = 0);
92#endif
93static void __kmp_do_serial_initialize(void);
94void __kmp_fork_barrier(int gtid, int tid);
95void __kmp_join_barrier(int gtid);
96void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
97 kmp_internal_control_t *new_icvs, ident_t *loc);
98
99#ifdef USE_LOAD_BALANCE
100static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
101#endif
102
103static int __kmp_expand_threads(int nNeed);
104#if KMP_OS_WINDOWS
105static int __kmp_unregister_root_other_thread(int gtid);
106#endif
107static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
108kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
109
110void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
111 int new_nthreads);
112void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
113
114/* Calculate the identifier of the current thread */
115/* fast (and somewhat portable) way to get unique identifier of executing
116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
117int __kmp_get_global_thread_id() {
118 int i;
119 kmp_info_t **other_threads;
120 size_t stack_data;
121 char *stack_addr;
122 size_t stack_size;
123 char *stack_base;
124
125 KA_TRACE(
126 1000,
127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
128 __kmp_nth, __kmp_all_nth));
129
130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
133 __kmp_init_gtid for this to work. */
134
135 if (!TCR_4(__kmp_init_gtid))
136 return KMP_GTID_DNE;
137
138#ifdef KMP_TDATA_GTID
139 if (TCR_4(__kmp_gtid_mode) >= 3) {
140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
141 return __kmp_gtid;
142 }
143#endif
144 if (TCR_4(__kmp_gtid_mode) >= 2) {
145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
146 return __kmp_gtid_get_specific();
147 }
148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
149
150 stack_addr = (char *)&stack_data;
151 other_threads = __kmp_threads;
152
153 /* ATT: The code below is a source of potential bugs due to unsynchronized
154 access to __kmp_threads array. For example:
155 1. Current thread loads other_threads[i] to thr and checks it, it is
156 non-NULL.
157 2. Current thread is suspended by OS.
158 3. Another thread unregisters and finishes (debug versions of free()
159 may fill memory with something like 0xEF).
160 4. Current thread is resumed.
161 5. Current thread reads junk from *thr.
162 TODO: Fix it. --ln */
163
164 for (i = 0; i < __kmp_threads_capacity; i++) {
165
166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
167 if (!thr)
168 continue;
169
170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
172
173 /* stack grows down -- search through all of the active threads */
174
175 if (stack_addr <= stack_base) {
176 size_t stack_diff = stack_base - stack_addr;
177
178 if (stack_diff <= stack_size) {
179 /* The only way we can be closer than the allocated */
180 /* stack size is if we are running on this thread. */
181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
182 return i;
183 }
184 }
185 }
186
187 /* get specific to try and determine our gtid */
188 KA_TRACE(1000,
189 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
190 "thread, using TLS\n"));
191 i = __kmp_gtid_get_specific();
192
193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
194
195 /* if we haven't been assigned a gtid, return the (negative) code as-is */
196 if (i < 0)
197 return i;
198
199 /* dynamically updated stack window for uber threads to avoid get_specific
200 call */
201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
202 KMP_FATAL(StackOverflow, i);
203 }
204
205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
206 if (stack_addr > stack_base) {
207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
210 stack_base);
211 } else {
212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
213 stack_base - stack_addr);
214 }
215
216 /* Reprint stack bounds for ubermaster since they have been refined */
217 if (__kmp_storage_map) {
218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
221 other_threads[i]->th.th_info.ds.ds_stacksize,
222 "th_%d stack (refinement)", i);
223 }
224 return i;
225}
226
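// [Illustrative sketch, not part of the original source] The internal
// algorithm above identifies the calling thread by testing whether the
// address of a local variable falls inside a registered thread's stack
// window; since stacks grow down, an address belongs to a window when
// base - size <= addr <= base. A standalone rendering of that containment
// test, with a hypothetical struct standing in for ds_stackbase/ds_stacksize:
#if 0
#include <cstddef>

struct example_stack_window {
  char *base;       // highest address of the stack
  std::size_t size; // number of bytes below base
};

static int example_find_owner(const example_stack_window *w, int n,
                              const void *addr) {
  const char *p = (const char *)addr;
  for (int i = 0; i < n; ++i) {
    if (p <= w[i].base && (std::size_t)(w[i].base - p) <= w[i].size)
      return i; // addr lies within thread i's stack window
  }
  return -1; // not found; the runtime then falls back to the TLS lookup
}
#endif
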
227int __kmp_get_global_thread_id_reg() {
228 int gtid;
229
230 if (!__kmp_init_serial) {
231 gtid = KMP_GTID_DNE;
232 } else
233#ifdef KMP_TDATA_GTID
234 if (TCR_4(__kmp_gtid_mode) >= 3) {
235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
236 gtid = __kmp_gtid;
237 } else
238#endif
239 if (TCR_4(__kmp_gtid_mode) >= 2) {
240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
241 gtid = __kmp_gtid_get_specific();
242 } else {
243 KA_TRACE(1000,
244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
245 gtid = __kmp_get_global_thread_id();
246 }
247
248 /* we must be a new uber master sibling thread */
249 if (gtid == KMP_GTID_DNE) {
250 KA_TRACE(10,
251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
252 "Registering a new gtid.\n"));
253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
254 if (!__kmp_init_serial) {
255 __kmp_do_serial_initialize();
256 gtid = __kmp_gtid_get_specific();
257 } else {
258 gtid = __kmp_register_root(FALSE);
259 }
260 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
262 }
263
264 KMP_DEBUG_ASSERT(gtid >= 0);
265
266 return gtid;
267}
268
269/* caller must hold forkjoin_lock */
270void __kmp_check_stack_overlap(kmp_info_t *th) {
271 int f;
272 char *stack_beg = NULL;
273 char *stack_end = NULL;
274 int gtid;
275
276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
277 if (__kmp_storage_map) {
278 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
280
281 gtid = __kmp_gtid_from_thread(th);
282
283 if (gtid == KMP_GTID_MONITOR) {
284 __kmp_print_storage_map_gtid(
285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
286 "th_%s stack (%s)", "mon",
287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
288 } else {
289 __kmp_print_storage_map_gtid(
290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
291 "th_%d stack (%s)", gtid,
292 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
293 }
294 }
295
296 /* No point in checking ubermaster threads since they use refinement and
297 * cannot overlap */
298 gtid = __kmp_gtid_from_thread(th);
299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
300 KA_TRACE(10,
301 ("__kmp_check_stack_overlap: performing extensive checking\n"));
302 if (stack_beg == NULL) {
303 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
305 }
306
307 for (f = 0; f < __kmp_threads_capacity; f++) {
308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
309
310 if (f_th && f_th != th) {
311 char *other_stack_end =
312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
313 char *other_stack_beg =
314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
316 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
317
318 /* Print the other stack values before the abort */
319 if (__kmp_storage_map)
320 __kmp_print_storage_map_gtid(
321 -1, other_stack_beg, other_stack_end,
322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
324
325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
326 __kmp_msg_null);
327 }
328 }
329 }
330 }
331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
332}
333
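// [Illustrative sketch, not part of the original source] The collision test
// above asks whether either endpoint of this thread's stack lies strictly
// inside another thread's stack range. The classic half-open interval form of
// the same idea is shown below as a simplification, not a drop-in replacement
// for the checks above:
#if 0
// Two ranges [beg1, end1) and [beg2, end2) intersect iff each one begins
// before the other ends.
static int example_ranges_intersect(const char *beg1, const char *end1,
                                    const char *beg2, const char *end2) {
  return beg1 < end2 && beg2 < end1;
}
#endif
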
334/* ------------------------------------------------------------------------ */
335
336void __kmp_infinite_loop(void) {
337 static int done = FALSE;
338
339 while (!done) {
340 KMP_YIELD(TRUE);
341 }
342}
343
344#define MAX_MESSAGE 512
345
346void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
347 char const *format, ...) {
348 char buffer[MAX_MESSAGE];
349 va_list ap;
350
351 va_start(ap, format);
352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
353 p2, (unsigned long)size, format);
354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
355 __kmp_vprintf(kmp_err, buffer, ap);
356#if KMP_PRINT_DATA_PLACEMENT
357 int node;
358 if (gtid >= 0) {
359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
360 if (__kmp_storage_map_verbose) {
361 node = __kmp_get_host_node(p1);
362 if (node < 0) /* doesn't work, so don't try this next time */
363 __kmp_storage_map_verbose = FALSE;
364 else {
365 char *last;
366 int lastNode;
367 int localProc = __kmp_get_cpu_from_gtid(gtid);
368
369 const int page_size = KMP_GET_PAGE_SIZE();
370
371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
373 if (localProc >= 0)
374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
375 localProc >> 1);
376 else
377 __kmp_printf_no_lock(" GTID %d\n", gtid);
378#if KMP_USE_PRCTL
379 /* The more elaborate format is disabled for now because of the prctl
380 * hanging bug. */
381 do {
382 last = p1;
383 lastNode = node;
384 /* This loop collates adjacent pages with the same host node. */
385 do {
386 (char *)p1 += page_size;
387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
389 lastNode);
390 } while (p1 <= p2);
391#else
392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
393 (char *)p1 + (page_size - 1),
394 __kmp_get_host_node(p1));
395 if (p1 < p2) {
396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
397 (char *)p2 + (page_size - 1),
398 __kmp_get_host_node(p2));
399 }
400#endif
401 }
402 }
403 } else
404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
405 }
406#endif /* KMP_PRINT_DATA_PLACEMENT */
407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
408}
409
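// [Illustrative sketch, not part of the original source] The data-placement
// report above rounds addresses to page boundaries with the usual
// power-of-two mask trick before querying the host node of each page. The
// helpers below are hypothetical, standalone versions of that arithmetic:
#if 0
#include <cstddef>
#include <cstdint>

// Round an address down to the start of its page (page_size must be a power
// of two).
static void *example_page_floor(void *p, std::size_t page_size) {
  return (void *)((std::uintptr_t)p & ~(std::uintptr_t)(page_size - 1));
}

// Round an address up to the next page boundary.
static void *example_page_ceil(void *p, std::size_t page_size) {
  std::uintptr_t v = (std::uintptr_t)p + (page_size - 1);
  return (void *)(v & ~(std::uintptr_t)(page_size - 1));
}
#endif
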
410void __kmp_warn(char const *format, ...) {
411 char buffer[MAX_MESSAGE];
412 va_list ap;
413
414 if (__kmp_generate_warnings == kmp_warnings_off) {
415 return;
416 }
417
418 va_start(ap, format);
419
420 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
421 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
422 __kmp_vprintf(kmp_err, buffer, ap);
423 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
424
425 va_end(ap);
426}
427
428void __kmp_abort_process() {
429 // Later threads may stall here, but that's ok because abort() will kill them.
430 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
431
432 if (__kmp_debug_buf) {
433 __kmp_dump_debug_buffer();
434 }
435
436 if (KMP_OS_WINDOWS) {
437 // Let other threads know of abnormal termination and prevent deadlock
438 // if abort happened during library initialization or shutdown
439 __kmp_global.g.g_abort = SIGABRT;
440
441 /* On Windows* OS, abort() by default causes a pop-up error box, which stalls
442 nightly testing. Unfortunately, we cannot reliably suppress the pop-up error
443 boxes. _set_abort_behavior() works well, but this function is not
444 available in VS7 (not a problem for the DLL, but a problem for the
445 static OpenMP RTL). SetErrorMode (and so the timelimit utility) does not
446 help, at least in some versions of the MS C RTL.
447
448 It seems the following sequence is the only way to simulate abort() and
449 avoid the pop-up error box. */
450 raise(SIGABRT);
451 _exit(3); // Just in case, if signal ignored, exit anyway.
452 } else {
453 __kmp_unregister_library();
454 abort();
455 }
456
457 __kmp_infinite_loop();
458 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
459
460} // __kmp_abort_process
461
462void __kmp_abort_thread(void) {
463 // TODO: Eliminate g_abort global variable and this function.
464 // In case of abort just call abort(), it will kill all the threads.
465 __kmp_infinite_loop();
466} // __kmp_abort_thread
467
468/* Print out the storage map for the major kmp_info_t thread data structures
469 that are allocated together. */
470
471static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
472 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
473 gtid);
474
475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
476 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
477
478 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
479 sizeof(kmp_local_t), "th_%d.th_local", gtid);
480
481 __kmp_print_storage_map_gtid(
482 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
483 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
484
485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
486 &thr->th.th_bar[bs_plain_barrier + 1],
487 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
488 gtid);
489
490 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
491 &thr->th.th_bar[bs_forkjoin_barrier + 1],
492 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
493 gtid);
494
495#if KMP_FAST_REDUCTION_BARRIER
496 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
497 &thr->th.th_bar[bs_reduction_barrier + 1],
498 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
499 gtid);
500#endif // KMP_FAST_REDUCTION_BARRIER
501}
502
503/* Print out the storage map for the major kmp_team_t team data structures
504 that are allocated together. */
505
506static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
507 int team_id, int num_thr) {
508 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
509 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
510 header, team_id);
511
512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
513 &team->t.t_bar[bs_last_barrier],
514 sizeof(kmp_balign_team_t) * bs_last_barrier,
515 "%s_%d.t_bar", header, team_id);
516
517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
518 &team->t.t_bar[bs_plain_barrier + 1],
519 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
520 header, team_id);
521
522 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
523 &team->t.t_bar[bs_forkjoin_barrier + 1],
524 sizeof(kmp_balign_team_t),
525 "%s_%d.t_bar[forkjoin]", header, team_id);
526
527#if KMP_FAST_REDUCTION_BARRIER
528 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
529 &team->t.t_bar[bs_reduction_barrier + 1],
530 sizeof(kmp_balign_team_t),
531 "%s_%d.t_bar[reduction]", header, team_id);
532#endif // KMP_FAST_REDUCTION_BARRIER
533
534 __kmp_print_storage_map_gtid(
535 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
536 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
537
538 __kmp_print_storage_map_gtid(
539 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
540 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
541
542 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
543 &team->t.t_disp_buffer[num_disp_buff],
544 sizeof(dispatch_shared_info_t) * num_disp_buff,
545 "%s_%d.t_disp_buffer", header, team_id);
546}
547
548static void __kmp_init_allocator() {
549 __kmp_init_memkind();
550 __kmp_init_target_mem();
551}
552static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
553
554/* ------------------------------------------------------------------------ */
555
556#if KMP_DYNAMIC_LIB
557#if KMP_OS_WINDOWS
558
559BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
560 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
561
562 switch (fdwReason) {
563
564 case DLL_PROCESS_ATTACH:
565 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
566
567 return TRUE;
568
569 case DLL_PROCESS_DETACH:
570 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
571
572 // According to Windows* documentation for DllMain entry point:
573 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
574 // lpReserved == NULL when FreeLibrary() is called,
575 // lpReserved != NULL when the process is terminated.
576 // When FreeLibrary() is called, worker threads remain alive. So the
577 // runtime's state is consistent and executing proper shutdown is OK.
578 // When the process is terminated, worker threads have exited or been
579 // forcefully terminated by the OS and only the shutdown thread remains.
580 // This can leave the runtime in an inconsistent state.
581 // Hence, only attempt proper cleanup when FreeLibrary() is called.
582 // Otherwise, rely on OS to reclaim resources.
583 if (lpReserved == NULL)
584 __kmp_internal_end_library(__kmp_gtid_get_specific());
585
586 return TRUE;
587
588 case DLL_THREAD_ATTACH:
589 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
590
591 /* if we want to register new siblings all the time here call
592 * __kmp_get_gtid(); */
593 return TRUE;
594
595 case DLL_THREAD_DETACH:
596 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
597
598 __kmp_internal_end_thread(__kmp_gtid_get_specific());
599 return TRUE;
600 }
601
602 return TRUE;
603}
604
605#endif /* KMP_OS_WINDOWS */
606#endif /* KMP_DYNAMIC_LIB */
607
608/* __kmp_parallel_deo -- Wait until it's our turn. */
609void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
610 int gtid = *gtid_ref;
611#ifdef BUILD_PARALLEL_ORDERED
612 kmp_team_t *team = __kmp_team_from_gtid(gtid);
613#endif /* BUILD_PARALLEL_ORDERED */
614
615 if (__kmp_env_consistency_check) {
616 if (__kmp_threads[gtid]->th.th_root->r.r_active)
617#if KMP_USE_DYNAMIC_LOCK
618 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
619#else
620 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
621#endif
622 }
623#ifdef BUILD_PARALLEL_ORDERED
624 if (!team->t.t_serialized) {
625 KMP_MB();
626 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
627 NULL);
628 KMP_MB();
629 }
630#endif /* BUILD_PARALLEL_ORDERED */
631}
632
633/* __kmp_parallel_dxo -- Signal the next task. */
634void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
635 int gtid = *gtid_ref;
636#ifdef BUILD_PARALLEL_ORDERED
637 int tid = __kmp_tid_from_gtid(gtid);
638 kmp_team_t *team = __kmp_team_from_gtid(gtid);
639#endif /* BUILD_PARALLEL_ORDERED */
640
641 if (__kmp_env_consistency_check) {
642 if (__kmp_threads[gtid]->th.th_root->r.r_active)
643 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
644 }
645#ifdef BUILD_PARALLEL_ORDERED
646 if (!team->t.t_serialized) {
647 KMP_MB(); /* Flush all pending memory write invalidates. */
648
649 /* use the tid of the next thread in this team */
650 /* TODO replace with general release procedure */
651 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
652
653 KMP_MB(); /* Flush all pending memory write invalidates. */
654 }
655#endif /* BUILD_PARALLEL_ORDERED */
656}
657
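// [Illustrative sketch, not part of the original source] __kmp_parallel_deo /
// __kmp_parallel_dxo implement a simple turn counter: each thread spins until
// the shared value equals its team-local tid, runs its ordered chunk, then
// passes the turn to (tid + 1) % nproc. A condensed standalone model using
// std::atomic (the real code uses KMP_WAIT and explicit memory fences):
#if 0
#include <atomic>
#include <thread>

static std::atomic<int> example_turn{0};

static void example_ordered_section(int tid, int nproc, void (*body)(int)) {
  while (example_turn.load(std::memory_order_acquire) != tid)
    std::this_thread::yield(); // deo: wait until it's our turn
  body(tid);                   // the ordered region itself
  example_turn.store((tid + 1) % nproc,
                     std::memory_order_release); // dxo: signal the next thread
}
#endif
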
658/* ------------------------------------------------------------------------ */
659/* The BARRIER for a SINGLE process section is always explicit */
660
661int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
662 int status;
663 kmp_info_t *th;
664 kmp_team_t *team;
665
666 if (!TCR_4(__kmp_init_parallel))
667 __kmp_parallel_initialize();
668 __kmp_resume_if_soft_paused();
669
670 th = __kmp_threads[gtid];
671 team = th->th.th_team;
672 status = 0;
673
674 th->th.th_ident = id_ref;
675
676 if (team->t.t_serialized) {
677 status = 1;
678 } else {
679 kmp_int32 old_this = th->th.th_local.this_construct;
680
681 ++th->th.th_local.this_construct;
682 /* try to set team count to thread count--success means thread got the
683 single block */
684 /* TODO: Should this be acquire or release? */
685 if (team->t.t_construct == old_this) {
686 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
687 th->th.th_local.this_construct);
688 }
689#if USE_ITT_BUILD
690 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
691 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
692 team->t.t_active_level == 1) {
693 // Only report metadata by primary thread of active team at level 1
694 __kmp_itt_metadata_single(id_ref);
695 }
696#endif /* USE_ITT_BUILD */
697 }
698
699 if (__kmp_env_consistency_check) {
700 if (status && push_ws) {
701 __kmp_push_workshare(gtid, ct_psingle, id_ref);
702 } else {
703 __kmp_check_workshare(gtid, ct_psingle, id_ref);
704 }
705 }
706#if USE_ITT_BUILD
707 if (status) {
708 __kmp_itt_single_start(gtid);
709 }
710#endif /* USE_ITT_BUILD */
711 return status;
712}
713
714void __kmp_exit_single(int gtid) {
715#if USE_ITT_BUILD
716 __kmp_itt_single_end(gtid);
717#endif /* USE_ITT_BUILD */
718 if (__kmp_env_consistency_check)
719 __kmp_pop_workshare(gtid, ct_psingle, NULL);
720}
721
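// [Illustrative sketch, not part of the original source] __kmp_enter_single
// picks the winner of a SINGLE construct with a compare-and-swap on the
// team-wide construct counter: the first thread to advance the counter past
// its expected old value executes the block, everyone else skips it. A
// condensed standalone model with std::atomic and hypothetical arguments:
#if 0
#include <atomic>

// Returns 1 for the one thread that should execute the single block.
static int example_enter_single(std::atomic<int> &team_construct,
                                int &my_construct_count) {
  int expected = my_construct_count++; // every thread expects the same old value
  return team_construct.compare_exchange_strong(expected, expected + 1,
                                                std::memory_order_acq_rel)
             ? 1
             : 0;
}
#endif
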
722/* determine if we can go parallel or must use a serialized parallel region and
723 * how many threads we can use
724 * set_nproc is the number of threads requested for the team
725 * returns 1 if we should serialize or use only one thread,
726 * otherwise the number of threads to use
727 * The forkjoin lock is held by the caller. */
728static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
729 int master_tid, int set_nthreads,
730 int enter_teams) {
731 int capacity;
732 int new_nthreads;
733 KMP_DEBUG_ASSERT(__kmp_init_serial);
734 KMP_DEBUG_ASSERT(root && parent_team);
735 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
736
737 // If dyn-var is set, dynamically adjust the number of desired threads,
738 // according to the method specified by dynamic_mode.
739 new_nthreads = set_nthreads;
740 if (!get__dynamic_2(parent_team, master_tid)) {
741 ;
742 }
743#ifdef USE_LOAD_BALANCE
744 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
745 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
746 if (new_nthreads == 1) {
747 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
748 "reservation to 1 thread\n",
749 master_tid));
750 return 1;
751 }
752 if (new_nthreads < set_nthreads) {
753 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
754 "reservation to %d threads\n",
755 master_tid, new_nthreads));
756 }
757 }
758#endif /* USE_LOAD_BALANCE */
759 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
760 new_nthreads = __kmp_avail_proc - __kmp_nth +
761 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
762 if (new_nthreads <= 1) {
763 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
764 "reservation to 1 thread\n",
765 master_tid));
766 return 1;
767 }
768 if (new_nthreads < set_nthreads) {
769 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
770 "reservation to %d threads\n",
771 master_tid, new_nthreads));
772 } else {
773 new_nthreads = set_nthreads;
774 }
775 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
776 if (set_nthreads > 2) {
777 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
778 new_nthreads = (new_nthreads % set_nthreads) + 1;
779 if (new_nthreads == 1) {
780 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
781 "reservation to 1 thread\n",
782 master_tid));
783 return 1;
784 }
785 if (new_nthreads < set_nthreads) {
786 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
787 "reservation to %d threads\n",
788 master_tid, new_nthreads));
789 }
790 }
791 } else {
792 KMP_ASSERT(0);
793 }
794
795 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
796 if (__kmp_nth + new_nthreads -
797 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
798 __kmp_max_nth) {
799 int tl_nthreads = __kmp_max_nth - __kmp_nth +
800 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
801 if (tl_nthreads <= 0) {
802 tl_nthreads = 1;
803 }
804
805 // If dyn-var is false, emit a 1-time warning.
806 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
807 __kmp_reserve_warn = 1;
808 __kmp_msg(kmp_ms_warning,
809 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
810 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
811 }
812 if (tl_nthreads == 1) {
813 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
814 "reduced reservation to 1 thread\n",
815 master_tid));
816 return 1;
817 }
818 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
819 "reservation to %d threads\n",
820 master_tid, tl_nthreads));
821 new_nthreads = tl_nthreads;
822 }
823
824 // Respect OMP_THREAD_LIMIT
825 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
826 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
827 if (cg_nthreads + new_nthreads -
828 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
829 max_cg_threads) {
830 int tl_nthreads = max_cg_threads - cg_nthreads +
831 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
832 if (tl_nthreads <= 0) {
833 tl_nthreads = 1;
834 }
835
836 // If dyn-var is false, emit a 1-time warning.
837 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
838 __kmp_reserve_warn = 1;
839 __kmp_msg(kmp_ms_warning,
840 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
841 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
842 }
843 if (tl_nthreads == 1) {
844 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
845 "reduced reservation to 1 thread\n",
846 master_tid));
847 return 1;
848 }
849 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
850 "reservation to %d threads\n",
851 master_tid, tl_nthreads));
852 new_nthreads = tl_nthreads;
853 }
854
855 // Check if the threads array is large enough, or needs expanding.
856 // See comment in __kmp_register_root() about the adjustment if
857 // __kmp_threads[0] == NULL.
858 capacity = __kmp_threads_capacity;
859 if (TCR_PTR(__kmp_threads[0]) == NULL) {
860 --capacity;
861 }
862 // If it is not for initializing the hidden helper team, we need to take
863 // __kmp_hidden_helper_threads_num out of the capacity because it is included
864 // in __kmp_threads_capacity.
865 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
866 capacity -= __kmp_hidden_helper_threads_num;
867 }
868 if (__kmp_nth + new_nthreads -
869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
870 capacity) {
871 // Expand the threads array.
872 int slotsRequired = __kmp_nth + new_nthreads -
873 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
874 capacity;
875 int slotsAdded = __kmp_expand_threads(slotsRequired);
876 if (slotsAdded < slotsRequired) {
877 // The threads array was not expanded enough.
878 new_nthreads -= (slotsRequired - slotsAdded);
879 KMP_ASSERT(new_nthreads >= 1);
880
881 // If dyn-var is false, emit a 1-time warning.
882 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
883 __kmp_reserve_warn = 1;
884 if (__kmp_tp_cached) {
885 __kmp_msg(kmp_ms_warning,
886 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
887 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
888 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
889 } else {
890 __kmp_msg(kmp_ms_warning,
891 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
892 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
893 }
894 }
895 }
896 }
897
898#ifdef KMP_DEBUG
899 if (new_nthreads == 1) {
900 KC_TRACE(10,
901 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
902 "dead roots and rechecking; requested %d threads\n",
903 __kmp_get_gtid(), set_nthreads));
904 } else {
905 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
906 " %d threads\n",
907 __kmp_get_gtid(), new_nthreads, set_nthreads));
908 }
909#endif // KMP_DEBUG
910 return new_nthreads;
911}
912
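// [Illustrative sketch, not part of the original source] __kmp_reserve_threads
// applies a series of caps to the requested team size: the dynamic-adjustment
// mode first, then the device-wide limit (KMP_DEVICE_THREAD_LIMIT /
// __kmp_max_nth), then the contention-group limit (OMP_THREAD_LIMIT), then the
// capacity of the __kmp_threads array. A much simplified standalone model of
// that clamping order, with hypothetical parameters:
#if 0
static int example_reserve(int requested, int dyn_adjusted, int device_limit,
                           int cg_limit, int array_capacity) {
  int n = dyn_adjusted > 0 ? dyn_adjusted : requested; // dyn-var adjustment
  if (n > device_limit)
    n = device_limit; // respect KMP_DEVICE_THREAD_LIMIT
  if (n > cg_limit)
    n = cg_limit; // respect OMP_THREAD_LIMIT
  if (n > array_capacity)
    n = array_capacity; // limited by the (possibly expanded) threads array
  return n < 1 ? 1 : n; // a team always keeps at least the primary thread
}
#endif
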
913/* Allocate threads from the thread pool and assign them to the new team. We are
914 assured that there are enough threads available, because we checked that
915 earlier while holding the forkjoin critical section. */
916static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
917 kmp_info_t *master_th, int master_gtid,
918 int fork_teams_workers) {
919 int i;
920 int use_hot_team;
921
922 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
923 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
924 KMP_MB();
925
926 /* first, let's setup the primary thread */
927 master_th->th.th_info.ds.ds_tid = 0;
928 master_th->th.th_team = team;
929 master_th->th.th_team_nproc = team->t.t_nproc;
930 master_th->th.th_team_master = master_th;
931 master_th->th.th_team_serialized = FALSE;
932 master_th->th.th_dispatch = &team->t.t_dispatch[0];
933
934/* make sure we are not the optimized hot team */
935#if KMP_NESTED_HOT_TEAMS
936 use_hot_team = 0;
937 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
938 if (hot_teams) { // hot teams array is not allocated if
939 // KMP_HOT_TEAMS_MAX_LEVEL=0
940 int level = team->t.t_active_level - 1; // index in array of hot teams
941 if (master_th->th.th_teams_microtask) { // are we inside the teams?
942 if (master_th->th.th_teams_size.nteams > 1) {
943 ++level; // level was not increased in teams construct for
944 // team_of_masters
945 }
946 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
947 master_th->th.th_teams_level == team->t.t_level) {
948 ++level; // level was not increased in teams construct for
949 // team_of_workers before the parallel
950 } // team->t.t_level will be increased inside parallel
951 }
952 if (level < __kmp_hot_teams_max_level) {
953 if (hot_teams[level].hot_team) {
954 // hot team has already been allocated for given level
955 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
956 use_hot_team = 1; // the team is ready to use
957 } else {
958 use_hot_team = 0; // AC: threads are not allocated yet
959 hot_teams[level].hot_team = team; // remember new hot team
960 hot_teams[level].hot_team_nth = team->t.t_nproc;
961 }
962 } else {
963 use_hot_team = 0;
964 }
965 }
966#else
967 use_hot_team = team == root->r.r_hot_team;
968#endif
969 if (!use_hot_team) {
970
971 /* install the primary thread */
972 team->t.t_threads[0] = master_th;
973 __kmp_initialize_info(master_th, team, 0, master_gtid);
974
975 /* now, install the worker threads */
976 for (i = 1; i < team->t.t_nproc; i++) {
977
978 /* fork or reallocate a new thread and install it in team */
979 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
980 team->t.t_threads[i] = thr;
981 KMP_DEBUG_ASSERT(thr);
982 KMP_DEBUG_ASSERT(thr->th.th_team == team);
983 /* align team and thread arrived states */
984 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
985 "T#%d(%d:%d) join =%llu, plain=%llu\n",
986 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
987 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
988 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
989 team->t.t_bar[bs_plain_barrier].b_arrived));
990 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
991 thr->th.th_teams_level = master_th->th.th_teams_level;
992 thr->th.th_teams_size = master_th->th.th_teams_size;
993 { // Initialize threads' barrier data.
994 int b;
995 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
996 for (b = 0; b < bs_last_barrier; ++b) {
997 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
998 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
999#if USE_DEBUGGER
1000 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1001#endif
1002 }
1003 }
1004 }
1005
1006#if KMP_AFFINITY_SUPPORTED
1007 // Do not partition the places list for teams construct workers who
1008 // haven't actually been forked to do real work yet. This partitioning
1009 // will take place in the parallel region nested within the teams construct.
1010 if (!fork_teams_workers) {
1011 __kmp_partition_places(team);
1012 }
1013#endif
1014
1015 if (team->t.t_nproc > 1 &&
1016 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1017 team->t.b->update_num_threads(team->t.t_nproc);
1018 __kmp_add_threads_to_team(team, team->t.t_nproc);
1019 }
1020 }
1021
1022 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1023 for (i = 0; i < team->t.t_nproc; i++) {
1024 kmp_info_t *thr = team->t.t_threads[i];
1025 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1026 thr->th.th_prev_level != team->t.t_level) {
1027 team->t.t_display_affinity = 1;
1028 break;
1029 }
1030 }
1031 }
1032
1033 KMP_MB();
1034}
1035
1036#if KMP_ARCH_X86 || KMP_ARCH_X86_64
1037// Propagate any changes to the floating-point control registers out to the team.
1038// We try to avoid unnecessary writes to the relevant cache line in the team
1039// structure, so we don't make changes unless they are needed.
1040inline static void propagateFPControl(kmp_team_t *team) {
1041 if (__kmp_inherit_fp_control) {
1042 kmp_int16 x87_fpu_control_word;
1043 kmp_uint32 mxcsr;
1044
1045 // Get primary thread's values of FPU control flags (both X87 and vector)
1046 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1047 __kmp_store_mxcsr(&mxcsr);
1048 mxcsr &= KMP_X86_MXCSR_MASK;
1049
1050 // There is no point looking at t_fp_control_saved here.
1051 // If it is TRUE, we still have to update the values if they are different
1052 // from those we now have. If it is FALSE we didn't save anything yet, but
1053 // our objective is the same. We have to ensure that the values in the team
1054 // are the same as those we have.
1055 // So, this code achieves what we need whether or not t_fp_control_saved is
1056 // true. By checking whether the value needs updating we avoid unnecessary
1057 // writes that would put the cache-line into a written state, causing all
1058 // threads in the team to have to read it again.
1059 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1060 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1061 // Although we don't use this value, other code in the runtime wants to know
1062 // whether it should restore them. So we must ensure it is correct.
1063 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1064 } else {
1065 // Similarly here. Don't write to this cache-line in the team structure
1066 // unless we have to.
1067 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1068 }
1069}
1070
1071// Do the opposite, setting the hardware registers to the updated values from
1072// the team.
1073inline static void updateHWFPControl(kmp_team_t *team) {
1074 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1075 // Only reset the fp control regs if they have been changed in the team,
1076 // i.e. by the parallel region that we are exiting.
1077 kmp_int16 x87_fpu_control_word;
1078 kmp_uint32 mxcsr;
1079 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1080 __kmp_store_mxcsr(&mxcsr);
1081 mxcsr &= KMP_X86_MXCSR_MASK;
1082
1083 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1084 __kmp_clear_x87_fpu_status_word();
1085 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1086 }
1087
1088 if (team->t.t_mxcsr != mxcsr) {
1089 __kmp_load_mxcsr(&team->t.t_mxcsr);
1090 }
1091 }
1092}
1093#else
1094#define propagateFPControl(x) ((void)0)
1095#define updateHWFPControl(x) ((void)0)
1096#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1097
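// [Illustrative sketch, not part of the original source] propagateFPControl
// captures the primary thread's floating-point control state (x87 control
// word and MXCSR) into the team so the workers can adopt it, and
// updateHWFPControl writes it back only if it differs. The MXCSR half of that
// round trip can be expressed with the standard SSE intrinsics; the
// example_saved_mxcsr variable below is a hypothetical stand-in for
// team->t.t_mxcsr:
#if 0
#include <xmmintrin.h>

static unsigned int example_saved_mxcsr;

static void example_capture_mxcsr(void) {
  example_saved_mxcsr = _mm_getcsr(); // analogous to __kmp_store_mxcsr()
}

static void example_restore_mxcsr(void) {
  if (_mm_getcsr() != example_saved_mxcsr) // avoid a needless register write
    _mm_setcsr(example_saved_mxcsr);       // analogous to __kmp_load_mxcsr()
}
#endif
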
1098static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1099 int realloc); // forward declaration
1100
1101/* Run a parallel region that has been serialized, so it runs only in a team of
1102 the single primary thread. */
1103void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1104 kmp_info_t *this_thr;
1105 kmp_team_t *serial_team;
1106
1107 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1108
1109 /* Skip all this code for autopar serialized loops since it results in
1110 unacceptable overhead */
1111 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1112 return;
1113
1114 if (!TCR_4(__kmp_init_parallel))
1115 __kmp_parallel_initialize();
1116 __kmp_resume_if_soft_paused();
1117
1118 this_thr = __kmp_threads[global_tid];
1119 serial_team = this_thr->th.th_serial_team;
1120
1121 /* utilize the serialized team held by this thread */
1122 KMP_DEBUG_ASSERT(serial_team);
1123 KMP_MB();
1124
1125 if (__kmp_tasking_mode != tskm_immediate_exec) {
1126 KMP_DEBUG_ASSERT(
1127 this_thr->th.th_task_team ==
1128 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1129 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1130 NULL);
1131 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1132 "team %p, new task_team = NULL\n",
1133 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1134 this_thr->th.th_task_team = NULL;
1135 }
1136
1137 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1138 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1139 proc_bind = proc_bind_false;
1140 } else if (proc_bind == proc_bind_default) {
1141 // No proc_bind clause was specified, so use the current value
1142 // of proc-bind-var for this parallel region.
1143 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1144 }
1145 // Reset for next parallel region
1146 this_thr->th.th_set_proc_bind = proc_bind_default;
1147
1148#if OMPT_SUPPORT
1149 ompt_data_t ompt_parallel_data = ompt_data_none;
1150 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1151 if (ompt_enabled.enabled &&
1152 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1153
1154 ompt_task_info_t *parent_task_info;
1155 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1156
1157 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1158 if (ompt_enabled.ompt_callback_parallel_begin) {
1159 int team_size = 1;
1160
1161 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1162 &(parent_task_info->task_data), &(parent_task_info->frame),
1163 &ompt_parallel_data, team_size,
1164 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1165 }
1166 }
1167#endif // OMPT_SUPPORT
1168
1169 if (this_thr->th.th_team != serial_team) {
1170 // Nested level will be an index in the nested nthreads array
1171 int level = this_thr->th.th_team->t.t_level;
1172
1173 if (serial_team->t.t_serialized) {
1174 /* this serial team was already used
1175 TODO: improve performance by making these locks more specific */
1176 kmp_team_t *new_team;
1177
1178 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1179
1180 new_team =
1181 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1182#if OMPT_SUPPORT
1183 ompt_parallel_data,
1184#endif
1185 proc_bind, &this_thr->th.th_current_task->td_icvs,
1186 0 USE_NESTED_HOT_ARG(NULL));
1187 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1188 KMP_ASSERT(new_team);
1189
1190 /* setup new serialized team and install it */
1191 new_team->t.t_threads[0] = this_thr;
1192 new_team->t.t_parent = this_thr->th.th_team;
1193 serial_team = new_team;
1194 this_thr->th.th_serial_team = serial_team;
1195
1196 KF_TRACE(
1197 10,
1198 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1199 global_tid, serial_team));
1200
1201 /* TODO the above breaks the requirement that if we run out of resources,
1202 then we can still guarantee that serialized teams are ok, since we may
1203 need to allocate a new one */
1204 } else {
1205 KF_TRACE(
1206 10,
1207 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1208 global_tid, serial_team));
1209 }
1210
1211 /* we have to initialize this serial team */
1212 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1213 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1214 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1215 serial_team->t.t_ident = loc;
1216 serial_team->t.t_serialized = 1;
1217 serial_team->t.t_nproc = 1;
1218 serial_team->t.t_parent = this_thr->th.th_team;
1219 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1220 this_thr->th.th_team = serial_team;
1221 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1222
1223 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1224 this_thr->th.th_current_task));
1225 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1226 this_thr->th.th_current_task->td_flags.executing = 0;
1227
1228 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1229
1230 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1231 implicit task for each serialized task represented by
1232 team->t.t_serialized? */
1233 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1234 &this_thr->th.th_current_task->td_parent->td_icvs);
1235
1236 // Thread value exists in the nested nthreads array for the next nested
1237 // level
1238 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1239 this_thr->th.th_current_task->td_icvs.nproc =
1240 __kmp_nested_nth.nth[level + 1];
1241 }
1242
1243 if (__kmp_nested_proc_bind.used &&
1244 (level + 1 < __kmp_nested_proc_bind.used)) {
1245 this_thr->th.th_current_task->td_icvs.proc_bind =
1246 __kmp_nested_proc_bind.bind_types[level + 1];
1247 }
1248
1249#if USE_DEBUGGER
1250 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1251#endif
1252 this_thr->th.th_info.ds.ds_tid = 0;
1253
1254 /* set thread cache values */
1255 this_thr->th.th_team_nproc = 1;
1256 this_thr->th.th_team_master = this_thr;
1257 this_thr->th.th_team_serialized = 1;
1258
1259 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1260 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1261 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1262
1263 propagateFPControl(serial_team);
1264
1265 /* check if we need to allocate dispatch buffers stack */
1266 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1267 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1268 serial_team->t.t_dispatch->th_disp_buffer =
1269 (dispatch_private_info_t *)__kmp_allocate(
1270 sizeof(dispatch_private_info_t));
1271 }
1272 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1273
1274 KMP_MB();
1275
1276 } else {
1277 /* this serialized team is already being used,
1278 * that's fine, just add another nested level */
1279 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1280 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1281 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1282 ++serial_team->t.t_serialized;
1283 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1284
1285 // Nested level will be an index in the nested nthreads array
1286 int level = this_thr->th.th_team->t.t_level;
1287 // Thread value exists in the nested nthreads array for the next nested
1288 // level
1289 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1290 this_thr->th.th_current_task->td_icvs.nproc =
1291 __kmp_nested_nth.nth[level + 1];
1292 }
1293 serial_team->t.t_level++;
1294 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1295 "of serial team %p to %d\n",
1296 global_tid, serial_team, serial_team->t.t_level));
1297
1298 /* allocate/push dispatch buffers stack */
1299 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1300 {
1301 dispatch_private_info_t *disp_buffer =
1302 (dispatch_private_info_t *)__kmp_allocate(
1303 sizeof(dispatch_private_info_t));
1304 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1305 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1306 }
1307 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1308
1309 KMP_MB();
1310 }
1311 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1312
1313 // Perform the display affinity functionality for
1314 // serialized parallel regions
1315 if (__kmp_display_affinity) {
1316 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1317 this_thr->th.th_prev_num_threads != 1) {
1318 // NULL means use the affinity-format-var ICV
1319 __kmp_aux_display_affinity(global_tid, NULL);
1320 this_thr->th.th_prev_level = serial_team->t.t_level;
1321 this_thr->th.th_prev_num_threads = 1;
1322 }
1323 }
1324
1325 if (__kmp_env_consistency_check)
1326 __kmp_push_parallel(global_tid, NULL);
1327#if OMPT_SUPPORT
1328 serial_team->t.ompt_team_info.master_return_address = codeptr;
1329 if (ompt_enabled.enabled &&
1330 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1331 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1332 OMPT_GET_FRAME_ADDRESS(0);
1333
1334 ompt_lw_taskteam_t lw_taskteam;
1335 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1336 &ompt_parallel_data, codeptr);
1337
1338 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1339 // don't use lw_taskteam after linking. Content was swapped.
1340
1341 /* OMPT implicit task begin */
1342 if (ompt_enabled.ompt_callback_implicit_task) {
1343 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1344 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1345 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
1346 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1347 OMPT_CUR_TASK_INFO(this_thr)->thread_num =
1348 __kmp_tid_from_gtid(global_tid);
1349 }
1350
1351 /* OMPT state */
1352 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1353 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
1354 OMPT_GET_FRAME_ADDRESS(0);
1355 }
1356#endif
1357}
1358
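// [Illustrative sketch, not part of the original source] From user code, the
// serialized path above is what a parallel region reduces to when, for
// example, an if clause evaluates to false: the "team" is just the
// encountering thread, nested one level deeper. A tiny OpenMP program that
// would take this path:
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  int work = 0;
  #pragma omp parallel if (work > 0) num_threads(4)
  {
    // With work == 0 the if clause is false, so the region is serialized:
    // omp_get_num_threads() reports 1 and only the encountering thread runs.
    printf("threads in region: %d\n", omp_get_num_threads());
  }
  return 0;
}
#endif
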
1359// Test if this fork is for a team closely nested in a teams construct
1360static inline bool __kmp_is_fork_in_teams(kmp_info_t *master_th,
1361 microtask_t microtask, int level,
1362 int teams_level, kmp_va_list ap) {
1363 return (master_th->th.th_teams_microtask && ap &&
1364 microtask != (microtask_t)__kmp_teams_master && level == teams_level);
1365}
1366
1367// Test if this fork is for the teams construct, i.e. to form the outer league
1368// of teams
1369static inline bool __kmp_is_entering_teams(int active_level, int level,
1370 int teams_level, kmp_va_list ap) {
1371 return ((ap == NULL && active_level == 0) ||
1372 (ap && teams_level > 0 && teams_level == level));
1373}
1374
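// [Illustrative sketch, not part of the original source] The two predicates
// above separate the fork that creates the league of teams from the fork of a
// parallel region nested directly inside a teams construct. In user code the
// two cases look like this (a host teams example; the clause values are
// arbitrary):
#if 0
#include <omp.h>

void example_teams(void) {
  #pragma omp teams num_teams(2) thread_limit(4) // outer fork: entering teams
  {
    #pragma omp parallel num_threads(4) // inner fork: parallel nested in teams
    {
      int team = omp_get_team_num();
      int tid = omp_get_thread_num();
      (void)team;
      (void)tid;
    }
  }
}
#endif
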
1375// AC: This is start of parallel that is nested inside teams construct.
1376// The team is actual (hot), all workers are ready at the fork barrier.
1377// No lock needed to initialize the team a bit, then free workers.
1378static inline int
1379__kmp_fork_in_teams(ident_t *loc, int gtid, kmp_team_t *parent_team,
1380 kmp_int32 argc, kmp_info_t *master_th, kmp_root_t *root,
1381 enum fork_context_e call_context, microtask_t microtask,
1382 launch_t invoker, int master_set_numthreads, int level,
1383#if OMPT_SUPPORT
1384 ompt_data_t ompt_parallel_data, void *return_address,
1385#endif
1386 kmp_va_list ap) {
1387 void **argv;
1388 int i;
1389
1390 parent_team->t.t_ident = loc;
1391 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1392 parent_team->t.t_argc = argc;
1393 argv = (void **)parent_team->t.t_argv;
1394 for (i = argc - 1; i >= 0; --i) {
1395 *argv++ = va_arg(kmp_va_deref(ap), void *);
1396 }
1397 // Increment our nested depth level, but do not increase the serialization count
1398 if (parent_team == master_th->th.th_serial_team) {
1399 // AC: we are in serialized parallel
1400 __kmpc_serialized_parallel(loc, gtid);
1401 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1402
1403 if (call_context == fork_context_gnu) {
1404 // AC: need to decrement t_serialized for enquiry functions to work
1405 // correctly, will restore at join time
1406 parent_team->t.t_serialized--;
1407 return TRUE;
1408 }
1409
1410#if OMPD_SUPPORT
1411 parent_team->t.t_pkfn = microtask;
1412#endif
1413
1414#if OMPT_SUPPORT
1415 void *dummy;
1416 void **exit_frame_p;
1417 ompt_data_t *implicit_task_data;
1418 ompt_lw_taskteam_t lw_taskteam;
1419
1420 if (ompt_enabled.enabled) {
1421 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1422 &ompt_parallel_data, return_address);
1423 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1424
1425 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1426 // Don't use lw_taskteam after linking. Content was swapped.
1427
1428 /* OMPT implicit task begin */
1429 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1430 if (ompt_enabled.ompt_callback_implicit_task) {
1431 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1432 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1433 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), implicit_task_data,
1434 1, OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1435 }
1436
1437 /* OMPT state */
1438 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1439 } else {
1440 exit_frame_p = &dummy;
1441 }
1442#endif
1443
1444 // AC: need to decrement t_serialized for enquiry functions to work
1445 // correctly, will restore at join time
1446 parent_team->t.t_serialized--;
1447
1448 {
1449 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1450 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1451 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1452#if OMPT_SUPPORT
1453 ,
1454 exit_frame_p
1455#endif
1456 );
1457 }
1458
1459#if OMPT_SUPPORT
1460 if (ompt_enabled.enabled) {
1461 *exit_frame_p = NULL;
1462 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1463 if (ompt_enabled.ompt_callback_implicit_task) {
1464 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1465 ompt_scope_end, NULL, implicit_task_data, 1,
1466 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1467 }
1468 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1469 __ompt_lw_taskteam_unlink(master_th);
1470 if (ompt_enabled.ompt_callback_parallel_end) {
1471 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1472 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1473 OMPT_INVOKER(call_context) | ompt_parallel_team, return_address);
1474 }
1475 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1476 }
1477#endif
1478 return TRUE;
1479 }
1480
1481 parent_team->t.t_pkfn = microtask;
1482 parent_team->t.t_invoke = invoker;
1483 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1484 parent_team->t.t_active_level++;
1485 parent_team->t.t_level++;
1486 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1487
1488 // If the threads allocated to the team are less than the thread limit, update
1489 // the thread limit here. th_teams_size.nth is specific to this team nested
1490 // in a teams construct, the team is fully created, and we're about to do
1491 // the actual fork. Best to do this here so that the subsequent uses below
1492 // and in the join have the correct value.
1493 master_th->th.th_teams_size.nth = parent_team->t.t_nproc;
1494
1495#if OMPT_SUPPORT
1496 if (ompt_enabled.enabled) {
1497 ompt_lw_taskteam_t lw_taskteam;
1498 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, &ompt_parallel_data,
1499 return_address);
1500 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1501 }
1502#endif
1503
1504 /* Change number of threads in the team if requested */
1505 if (master_set_numthreads) { // The parallel has num_threads clause
1506 if (master_set_numthreads <= master_th->th.th_teams_size.nth) {
1507 // AC: we can only reduce the number of threads dynamically; we can't increase it
1508 kmp_info_t **other_threads = parent_team->t.t_threads;
1509 // NOTE: if using distributed barrier, we need to run this code block
1510 // even when the team size appears not to have changed from the max.
1511 int old_proc = master_th->th.th_teams_size.nth;
1512 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
1513 __kmp_resize_dist_barrier(parent_team, old_proc, master_set_numthreads);
1514 __kmp_add_threads_to_team(parent_team, master_set_numthreads);
1515 }
1516 parent_team->t.t_nproc = master_set_numthreads;
1517 for (i = 0; i < master_set_numthreads; ++i) {
1518 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1519 }
1520 }
1521 // Keep extra threads hot in the team for possible next parallels
1522 master_th->th.th_set_nproc = 0;
1523 }
1524
1525#if USE_DEBUGGER
1526 if (__kmp_debugging) { // Let debugger override number of threads.
1527 int nth = __kmp_omp_num_threads(loc);
1528 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1529 master_set_numthreads = nth;
1530 }
1531 }
1532#endif
1533
1534 // Figure out the proc_bind policy for the nested parallel within teams
1535 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1536 // proc_bind_default means don't update
1537 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
1538 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1539 proc_bind = proc_bind_false;
1540 } else {
1541 // No proc_bind clause specified; use current proc-bind-var
1542 if (proc_bind == proc_bind_default) {
1543 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1544 }
1545 /* else: The proc_bind policy was specified explicitly on parallel clause.
1546 This overrides proc-bind-var for this parallel region, but does not
1547 change proc-bind-var. */
1548 // Figure the value of proc-bind-var for the child threads.
1549 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1550 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1551 master_th->th.th_current_task->td_icvs.proc_bind)) {
1552 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
1553 }
1554 }
1555 KMP_CHECK_UPDATE(parent_team->t.t_proc_bind, proc_bind);
1556 // Need to change the bind-var ICV to correct value for each implicit task
1557 if (proc_bind_icv != proc_bind_default &&
1558 master_th->th.th_current_task->td_icvs.proc_bind != proc_bind_icv) {
1559 kmp_info_t **other_threads = parent_team->t.t_threads;
1560 for (i = 0; i < master_th->th.th_team_nproc; ++i) {
1561 other_threads[i]->th.th_current_task->td_icvs.proc_bind = proc_bind_icv;
1562 }
1563 }
1564 // Reset for next parallel region
1565 master_th->th.th_set_proc_bind = proc_bind_default;
1566
1567#if USE_ITT_BUILD && USE_ITT_NOTIFY
1568 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1569 KMP_ITT_DEBUG) &&
1570 __kmp_forkjoin_frames_mode == 3 &&
1571 parent_team->t.t_active_level == 1 // only report frames at level 1
1572 && master_th->th.th_teams_size.nteams == 1) {
1573 kmp_uint64 tmp_time = __itt_get_timestamp();
1574 master_th->th.th_frame_time = tmp_time;
1575 parent_team->t.t_region_time = tmp_time;
1576 }
1577 if (__itt_stack_caller_create_ptr) {
1578 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
1579 // create new stack stitching id before entering fork barrier
1580 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1581 }
1582#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1583#if KMP_AFFINITY_SUPPORTED
1584 __kmp_partition_places(parent_team);
1585#endif
1586
1587 KF_TRACE(10, ("__kmp_fork_in_teams: before internal fork: root=%p, team=%p, "
1588 "master_th=%p, gtid=%d\n",
1589 root, parent_team, master_th, gtid));
1590 __kmp_internal_fork(loc, gtid, parent_team);
1591 KF_TRACE(10, ("__kmp_fork_in_teams: after internal fork: root=%p, team=%p, "
1592 "master_th=%p, gtid=%d\n",
1593 root, parent_team, master_th, gtid));
1594
1595 if (call_context == fork_context_gnu)
1596 return TRUE;
1597
1598 /* Invoke microtask for PRIMARY thread */
1599 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) invoke microtask = %p\n", gtid,
1600 parent_team->t.t_id, parent_team->t.t_pkfn));
1601
1602 if (!parent_team->t.t_invoke(gtid)) {
1603 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
1604 }
1605 KA_TRACE(20, ("__kmp_fork_in_teams: T#%d(%d:0) done microtask = %p\n", gtid,
1606 parent_team->t.t_id, parent_team->t.t_pkfn));
1607 KMP_MB(); /* Flush all pending memory write invalidates. */
1608
1609 KA_TRACE(20, ("__kmp_fork_in_teams: parallel exit T#%d\n", gtid));
1610
1611 return TRUE;
1612}
1613
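/* Usage sketch (illustration only, not part of the runtime): a parallel
   construct closely nested inside a teams construct takes the path above.
   Assuming a compiler with host-teams support (OpenMP 5.0), code like the
   following reaches __kmp_fork_in_teams, and its num_threads clause can only
   shrink the team below th_teams_size.nth, never grow it:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
     #pragma omp teams num_teams(2) thread_limit(8)
     #pragma omp parallel num_threads(4) // <= thread_limit, so it is honored
       printf("team %d, thread %d\n", omp_get_team_num(),
              omp_get_thread_num());
       return 0;
     }

   A num_threads value above th_teams_size.nth is ignored and the team keeps
   its current size. */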
1614// Create a serialized parallel region
1615static inline int
1616__kmp_serial_fork_call(ident_t *loc, int gtid, enum fork_context_e call_context,
1617 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1618 kmp_info_t *master_th, kmp_team_t *parent_team,
1619#if OMPT_SUPPORT
1620 ompt_data_t *ompt_parallel_data, void **return_address,
1621 ompt_data_t **parent_task_data,
1622#endif
1623 kmp_va_list ap) {
1624 kmp_team_t *team;
1625 int i;
1626 void **argv;
1627
1628/* josh todo: hypothetical question: what do we do for OS X*? */
1629#if KMP_OS_LINUX && \
1630 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1631 void *args[argc];
1632#else
1633 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1634#endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1635 KMP_ARCH_AARCH64) */
1636
1637 KA_TRACE(
1638 20, ("__kmp_serial_fork_call: T#%d serializing parallel region\n", gtid));
1639
1640 __kmpc_serialized_parallel(loc, gtid);
1641
1642#if OMPD_SUPPORT
1643 master_th->th.th_serial_team->t.t_pkfn = microtask;
1644#endif
1645
1646 if (call_context == fork_context_intel) {
1647 /* TODO this sucks, use the compiler itself to pass args! :) */
1648 master_th->th.th_serial_team->t.t_ident = loc;
1649 if (!ap) {
1650 // revert change made in __kmpc_serialized_parallel()
1651 master_th->th.th_serial_team->t.t_level--;
1652// Get args from parent team for teams construct
1653
1654#if OMPT_SUPPORT
1655 void *dummy;
1656 void **exit_frame_p;
1657 ompt_task_info_t *task_info;
1658 ompt_lw_taskteam_t lw_taskteam;
1659
1660 if (ompt_enabled.enabled) {
1661 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1662 ompt_parallel_data, *return_address);
1663
1664 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1665 // don't use lw_taskteam after linking. content was swapped
1666 task_info = OMPT_CUR_TASK_INFO(master_th);
1667 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1668 if (ompt_enabled.ompt_callback_implicit_task) {
1669 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1670 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1671 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1672 &(task_info->task_data), 1,
1673 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1674 }
1675
1676 /* OMPT state */
1677 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1678 } else {
1679 exit_frame_p = &dummy;
1680 }
1681#endif
1682
1683 {
1684 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1685 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1686 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1687#if OMPT_SUPPORT
1688 ,
1689 exit_frame_p
1690#endif
1691 );
1692 }
1693
1694#if OMPT_SUPPORT
1695 if (ompt_enabled.enabled) {
1696 *exit_frame_p = NULL;
1697 if (ompt_enabled.ompt_callback_implicit_task) {
1698 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1699 ompt_scope_end, NULL, &(task_info->task_data), 1,
1700 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1701 }
1702 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1703 __ompt_lw_taskteam_unlink(master_th);
1704 if (ompt_enabled.ompt_callback_parallel_end) {
1705 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1706 ompt_parallel_data, *parent_task_data,
1707 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1708 }
1709 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1710 }
1711#endif
1712 } else if (microtask == (microtask_t)__kmp_teams_master) {
1713 KMP_DEBUG_ASSERT(master_th->th.th_team == master_th->th.th_serial_team);
1714 team = master_th->th.th_team;
1715 // team->t.t_pkfn = microtask;
1716 team->t.t_invoke = invoker;
1717 __kmp_alloc_argv_entries(argc, team, TRUE);
1718 team->t.t_argc = argc;
1719 argv = (void **)team->t.t_argv;
1720 if (ap) {
1721 for (i = argc - 1; i >= 0; --i)
1722 *argv++ = va_arg(kmp_va_deref(ap), void *);
1723 } else {
1724 for (i = 0; i < argc; ++i)
1725 // Get args from parent team for teams construct
1726 argv[i] = parent_team->t.t_argv[i];
1727 }
1728 // AC: revert change made in __kmpc_serialized_parallel()
1729 // because initial code in teams should have level=0
1730 team->t.t_level--;
1731 // AC: call special invoker for outer "parallel" of teams construct
1732 invoker(gtid);
1733#if OMPT_SUPPORT
1734 if (ompt_enabled.enabled) {
1735 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1736 if (ompt_enabled.ompt_callback_implicit_task) {
1737 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1738 ompt_scope_end, NULL, &(task_info->task_data), 0,
1739 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1740 }
1741 if (ompt_enabled.ompt_callback_parallel_end) {
1742 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1743 ompt_parallel_data, *parent_task_data,
1744 OMPT_INVOKER(call_context) | ompt_parallel_league,
1745 *return_address);
1746 }
1747 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1748 }
1749#endif
1750 } else {
1751 argv = args;
1752 for (i = argc - 1; i >= 0; --i)
1753 *argv++ = va_arg(kmp_va_deref(ap), void *);
1754 KMP_MB();
1755
1756#if OMPT_SUPPORT
1757 void *dummy;
1758 void **exit_frame_p;
1759 ompt_task_info_t *task_info;
1760 ompt_lw_taskteam_t lw_taskteam;
1761 ompt_data_t *implicit_task_data;
1762
1763 if (ompt_enabled.enabled) {
1764 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1765 ompt_parallel_data, *return_address);
1766 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1767 // don't use lw_taskteam after linking. content was swapped
1768 task_info = OMPT_CUR_TASK_INFO(master_th);
1769 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1770
1771 /* OMPT implicit task begin */
1772 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1773 if (ompt_enabled.ompt_callback_implicit_task) {
1774 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1777 ompt_task_implicit);
1778 OMPT_CUR_TASK_INFO(master_th)->thread_num = __kmp_tid_from_gtid(gtid);
1779 }
1780
1781 /* OMPT state */
1782 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783 } else {
1784 exit_frame_p = &dummy;
1785 }
1786#endif
1787
1788 {
1789 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1792#if OMPT_SUPPORT
1793 ,
1794 exit_frame_p
1795#endif
1796 );
1797 }
1798
1799#if OMPT_SUPPORT
1800 if (ompt_enabled.enabled) {
1801 *exit_frame_p = NULL;
1802 if (ompt_enabled.ompt_callback_implicit_task) {
1803 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1804 ompt_scope_end, NULL, &(task_info->task_data), 1,
1805 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1806 }
1807
1808 *ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1809 __ompt_lw_taskteam_unlink(master_th);
1810 if (ompt_enabled.ompt_callback_parallel_end) {
1811 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1812 ompt_parallel_data, *parent_task_data,
1813 OMPT_INVOKER(call_context) | ompt_parallel_team, *return_address);
1814 }
1815 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1816 }
1817#endif
1818 }
1819 } else if (call_context == fork_context_gnu) {
1820#if OMPT_SUPPORT
1821 if (ompt_enabled.enabled) {
1822 ompt_lw_taskteam_t lwt;
1823 __ompt_lw_taskteam_init(&lwt, master_th, gtid, ompt_parallel_data,
1824 *return_address);
1825
1826 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1827 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1828 }
1829// don't use lw_taskteam after linking. content was swapped
1830#endif
1831
1832 // we were called from GNU native code
1833 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1834 return FALSE;
1835 } else {
1836 KMP_ASSERT2(call_context < fork_context_last,
1837 "__kmp_serial_fork_call: unknown fork_context parameter");
1838 }
1839
1840 KA_TRACE(20, ("__kmp_serial_fork_call: T#%d serial exit\n", gtid));
1841 KMP_MB();
1842 return FALSE;
1843}
1844
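/* Usage sketch (illustration only): __kmp_serial_fork_call is reached when
   __kmp_fork_call settles on a team of one thread, for example when the
   active-levels limit has already been reached. With nested parallelism left
   at its default (disabled), the inner region below is serialized and its
   microtask runs directly on the primary thread:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
     #pragma omp parallel num_threads(2)   // outer region: real fork
     #pragma omp parallel num_threads(2)   // inner region: serialized
       printf("inner team size = %d\n", omp_get_num_threads()); // prints 1
       return 0;
     }
*/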
1845/* most of the work for a fork */
1846/* return true if we really went parallel, false if serialized */
1847int __kmp_fork_call(ident_t *loc, int gtid,
1848 enum fork_context_e call_context, // Intel, GNU, ...
1849 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1850 kmp_va_list ap) {
1851 void **argv;
1852 int i;
1853 int master_tid;
1854 int master_this_cons;
1855 kmp_team_t *team;
1856 kmp_team_t *parent_team;
1857 kmp_info_t *master_th;
1858 kmp_root_t *root;
1859 int nthreads;
1860 int master_active;
1861 int master_set_numthreads;
1862 int level;
1863 int active_level;
1864 int teams_level;
1865#if KMP_NESTED_HOT_TEAMS
1866 kmp_hot_team_ptr_t **p_hot_teams;
1867#endif
1868 { // KMP_TIME_BLOCK
1869 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1870 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1871
1872 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1873 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1874 /* Some systems prefer the stack for the root thread(s) to start with */
1875 /* some gap from the parent stack to prevent false sharing. */
1876 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1877 /* These 2 lines below are so this does not get optimized out */
1878 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1879 __kmp_stkpadding += (short)((kmp_int64)dummy);
1880 }
1881
1882 /* initialize if needed */
1883 KMP_DEBUG_ASSERT(
1884 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1885 if (!TCR_4(__kmp_init_parallel))
1886 __kmp_parallel_initialize();
1887 __kmp_resume_if_soft_paused();
1888
1889 /* setup current data */
1890 // AC: potentially unsafe, not in sync with library shutdown,
1891 // __kmp_threads can be freed
1892 master_th = __kmp_threads[gtid];
1893
1894 parent_team = master_th->th.th_team;
1895 master_tid = master_th->th.th_info.ds.ds_tid;
1896 master_this_cons = master_th->th.th_local.this_construct;
1897 root = master_th->th.th_root;
1898 master_active = root->r.r_active;
1899 master_set_numthreads = master_th->th.th_set_nproc;
1900
1901#if OMPT_SUPPORT
1902 ompt_data_t ompt_parallel_data = ompt_data_none;
1903 ompt_data_t *parent_task_data;
1904 ompt_frame_t *ompt_frame;
1905 void *return_address = NULL;
1906
1907 if (ompt_enabled.enabled) {
1908 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1909 NULL, NULL);
1910 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1911 }
1912#endif
1913
1914 // Assign affinity to root thread if it hasn't happened yet
1915 __kmp_assign_root_init_mask();
1916
1917 // Nested level will be an index in the nested nthreads array
1918 level = parent_team->t.t_level;
1919 // used to launch non-serial teams even if nested is not allowed
1920 active_level = parent_team->t.t_active_level;
1921 // needed to check nesting inside the teams
1922 teams_level = master_th->th.th_teams_level;
1923#if KMP_NESTED_HOT_TEAMS
1924 p_hot_teams = &master_th->th.th_hot_teams;
1925 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1926 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1927 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1928 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1929 // this is either the actual value or not needed (when active_level > 0)
1930 (*p_hot_teams)[0].hot_team_nth = 1;
1931 }
1932#endif
1933
1934#if OMPT_SUPPORT
1935 if (ompt_enabled.enabled) {
1936 if (ompt_enabled.ompt_callback_parallel_begin) {
1937 int team_size = master_set_numthreads
1938 ? master_set_numthreads
1939 : get__nproc_2(parent_team, master_tid);
1940 int flags = OMPT_INVOKER(call_context) |
1941 ((microtask == (microtask_t)__kmp_teams_master)
1942 ? ompt_parallel_league
1943 : ompt_parallel_team);
1944 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1945 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1946 return_address);
1947 }
1948 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1949 }
1950#endif
1951
1952 master_th->th.th_ident = loc;
1953
1954 // Parallel closely nested in teams construct:
1955 if (__kmp_is_fork_in_teams(master_th, microtask, level, teams_level, ap)) {
1956 return __kmp_fork_in_teams(loc, gtid, parent_team, argc, master_th, root,
1957 call_context, microtask, invoker,
1958 master_set_numthreads, level,
1959#if OMPT_SUPPORT
1960 ompt_parallel_data, return_address,
1961#endif
1962 ap);
1963 } // End parallel closely nested in teams construct
1964
1965#if KMP_DEBUG
1966 if (__kmp_tasking_mode != tskm_immediate_exec) {
1967 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1968 parent_team->t.t_task_team[master_th->th.th_task_state]);
1969 }
1970#endif
1971
1972 // Need this to happen before we determine the number of threads, not while
1973 // we are allocating the team
1974 //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
1975
1976 // Determine the number of threads
1977 int enter_teams =
1978 __kmp_is_entering_teams(active_level, level, teams_level, ap);
1979 if ((!enter_teams &&
1980 (parent_team->t.t_active_level >=
1981 master_th->th.th_current_task->td_icvs.max_active_levels)) ||
1982 (__kmp_library == library_serial)) {
1983 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team\n", gtid));
1984 nthreads = 1;
1985 } else {
1986 nthreads = master_set_numthreads
1987 ? master_set_numthreads
1988 // TODO: get nproc directly from current task
1989 : get__nproc_2(parent_team, master_tid);
1990 // Check if we need to take the forkjoin lock (no need for a serialized
1991 // parallel outside of a teams construct).
1992 if (nthreads > 1) {
1993 /* determine how many new threads we can use */
1994 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1995 /* AC: If we execute teams from parallel region (on host), then teams
1996 should be created but each can only have 1 thread if nesting is
1997 disabled. If teams called from serial region, then teams and their
1998 threads should be created regardless of the nesting setting. */
1999 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
2000 nthreads, enter_teams);
2001 if (nthreads == 1) {
2002 // Free lock for single thread execution here; for multi-thread
2003 // execution it will be freed later after team of threads created
2004 // and initialized
2005 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2006 }
2007 }
2008 }
2009 KMP_DEBUG_ASSERT(nthreads > 0);
2010
2011 // If we temporarily changed the set number of threads then restore it now
2012 master_th->th.th_set_nproc = 0;
2013
2014 if (nthreads == 1) {
2015 return __kmp_serial_fork_call(loc, gtid, call_context, argc, microtask,
2016 invoker, master_th, parent_team,
2017#if OMPT_SUPPORT
2018 &ompt_parallel_data, &return_address,
2019 &parent_task_data,
2020#endif
2021 ap);
2022 } // if (nthreads == 1)
2023
2024 // GEH: only modify the executing flag in the case when not serialized;
2025 // the serialized case is handled in kmpc_serialized_parallel
2026 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
2027 "curtask=%p, curtask_max_aclevel=%d\n",
2028 parent_team->t.t_active_level, master_th,
2029 master_th->th.th_current_task,
2030 master_th->th.th_current_task->td_icvs.max_active_levels));
2031 // TODO: GEH - cannot do this assertion because root thread not set up as
2032 // executing
2033 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2034 master_th->th.th_current_task->td_flags.executing = 0;
2035
2036 if (!master_th->th.th_teams_microtask || level > teams_level) {
2037 /* Increment our nested depth level */
2038 KMP_ATOMIC_INC(&root->r.r_in_parallel);
2039 }
2040
2041 // See if we need to make a copy of the ICVs.
2042 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2043 if ((level + 1 < __kmp_nested_nth.used) &&
2044 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
2045 nthreads_icv = __kmp_nested_nth.nth[level + 1];
2046 } else {
2047 nthreads_icv = 0; // don't update
2048 }
2049
2050 // Figure out the proc_bind_policy for the new team.
2051 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2052 // proc_bind_default means don't update
2053 kmp_proc_bind_t proc_bind_icv = proc_bind_default;
2054 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
2055 proc_bind = proc_bind_false;
2056 } else {
2057 // No proc_bind clause specified; use current proc-bind-var for this
2058 // parallel region
2059 if (proc_bind == proc_bind_default) {
2060 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
2061 }
2062 // Have teams construct take proc_bind value from KMP_TEAMS_PROC_BIND
2063 if (master_th->th.th_teams_microtask &&
2064 microtask == (microtask_t)__kmp_teams_master) {
2065 proc_bind = __kmp_teams_proc_bind;
2066 }
2067 /* else: The proc_bind policy was specified explicitly on parallel clause.
2068 This overrides proc-bind-var for this parallel region, but does not
2069 change proc-bind-var. */
2070 // Figure the value of proc-bind-var for the child threads.
2071 if ((level + 1 < __kmp_nested_proc_bind.used) &&
2072 (__kmp_nested_proc_bind.bind_types[level + 1] !=
2073 master_th->th.th_current_task->td_icvs.proc_bind)) {
2074 // Do not modify the proc bind icv for the two teams construct forks
2075 // They just let the proc bind icv pass through
2076 if (!master_th->th.th_teams_microtask ||
2077 !(microtask == (microtask_t)__kmp_teams_master || ap == NULL))
2078 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2079 }
2080 }
2081
2082 // Reset for next parallel region
2083 master_th->th.th_set_proc_bind = proc_bind_default;
2084
2085 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2086 kmp_internal_control_t new_icvs;
2087 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2088 new_icvs.next = NULL;
2089 if (nthreads_icv > 0) {
2090 new_icvs.nproc = nthreads_icv;
2091 }
2092 if (proc_bind_icv != proc_bind_default) {
2093 new_icvs.proc_bind = proc_bind_icv;
2094 }
2095
2096 /* allocate a new parallel team */
2097 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2098 team = __kmp_allocate_team(root, nthreads, nthreads,
2099#if OMPT_SUPPORT
2100 ompt_parallel_data,
2101#endif
2102 proc_bind, &new_icvs,
2103 argc USE_NESTED_HOT_ARG(master_th));
2104 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2105 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs);
2106 } else {
2107 /* allocate a new parallel team */
2108 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2109 team = __kmp_allocate_team(root, nthreads, nthreads,
2110#if OMPT_SUPPORT
2111 ompt_parallel_data,
2112#endif
2113 proc_bind,
2114 &master_th->th.th_current_task->td_icvs,
2115 argc USE_NESTED_HOT_ARG(master_th));
2116 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar)
2117 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs,
2118 &master_th->th.th_current_task->td_icvs);
2119 }
2120 KF_TRACE(
2121 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2122
2123 /* setup the new team */
2124 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2125 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2126 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2127 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2128 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2129#if OMPT_SUPPORT
2130 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2131 return_address);
2132#endif
2133 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2134 // TODO: parent_team->t.t_level == INT_MAX ???
2135 if (!master_th->th.th_teams_microtask || level > teams_level) {
2136 int new_level = parent_team->t.t_level + 1;
2137 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2138 new_level = parent_team->t.t_active_level + 1;
2139 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2140 } else {
2141 // AC: Do not increase parallel level at start of the teams construct
2142 int new_level = parent_team->t.t_level;
2143 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2144 new_level = parent_team->t.t_active_level;
2145 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2146 }
2147 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2148 // set primary thread's schedule as new run-time schedule
2149 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2150
2151 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2152 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2153
2154 // Update the floating point rounding in the team if required.
2155 propagateFPControl(team);
2156#if OMPD_SUPPORT
2157 if (ompd_state & OMPD_ENABLE_BP)
2158 ompd_bp_parallel_begin();
2159#endif
2160
2161 if (__kmp_tasking_mode != tskm_immediate_exec) {
2162 // Set the primary thread's task team to the team's task team. Unless this
2163 // is a hot team, it should be NULL.
2164 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2165 parent_team->t.t_task_team[master_th->th.th_task_state]);
2166 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
2167 "%p, new task_team %p / team %p\n",
2168 __kmp_gtid_from_thread(master_th),
2169 master_th->th.th_task_team, parent_team,
2170 team->t.t_task_team[master_th->th.th_task_state], team));
2171
2172 if (active_level || master_th->th.th_task_team) {
2173 // Take a memo of primary thread's task_state
2174 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2175 if (master_th->th.th_task_state_top >=
2176 master_th->th.th_task_state_stack_sz) { // increase size
2177 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2178 kmp_uint8 *old_stack, *new_stack;
2179 kmp_uint32 i;
2180 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2181 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2182 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2183 }
2184 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2185 ++i) { // zero-init rest of stack
2186 new_stack[i] = 0;
2187 }
2188 old_stack = master_th->th.th_task_state_memo_stack;
2189 master_th->th.th_task_state_memo_stack = new_stack;
2190 master_th->th.th_task_state_stack_sz = new_size;
2191 __kmp_free(old_stack);
2192 }
2193 // Store primary thread's task_state on stack
2194 master_th->th
2195 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2196 master_th->th.th_task_state;
2197 master_th->th.th_task_state_top++;
2198#if KMP_NESTED_HOT_TEAMS
2199 if (master_th->th.th_hot_teams &&
2200 active_level < __kmp_hot_teams_max_level &&
2201 team == master_th->th.th_hot_teams[active_level].hot_team) {
2202 // Restore primary thread's nested state if nested hot team
2203 master_th->th.th_task_state =
2204 master_th->th
2205 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2206 } else {
2207#endif
2208 master_th->th.th_task_state = 0;
2209#if KMP_NESTED_HOT_TEAMS
2210 }
2211#endif
2212 }
2213#if !KMP_NESTED_HOT_TEAMS
2214 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2215 (team == root->r.r_hot_team));
2216#endif
2217 }
2218
2219 KA_TRACE(
2220 20,
2221 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2222 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2223 team->t.t_nproc));
2224 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2225 (team->t.t_master_tid == 0 &&
2226 (team->t.t_parent == root->r.r_root_team ||
2227 team->t.t_parent->t.t_serialized)));
2228 KMP_MB();
2229
2230 /* now, setup the arguments */
2231 argv = (void **)team->t.t_argv;
2232 if (ap) {
2233 for (i = argc - 1; i >= 0; --i) {
2234 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2235 KMP_CHECK_UPDATE(*argv, new_argv);
2236 argv++;
2237 }
2238 } else {
2239 for (i = 0; i < argc; ++i) {
2240 // Get args from parent team for teams construct
2241 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2242 }
2243 }
2244
2245 /* now actually fork the threads */
2246 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2247 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2248 root->r.r_active = TRUE;
2249
2250 __kmp_fork_team_threads(root, team, master_th, gtid, !ap);
2251 __kmp_setup_icv_copy(team, nthreads,
2252 &master_th->th.th_current_task->td_icvs, loc);
2253
2254#if OMPT_SUPPORT
2255 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2256#endif
2257
2258 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2259
2260#if USE_ITT_BUILD
2261 if (team->t.t_active_level == 1 // only report frames at level 1
2262 && !master_th->th.th_teams_microtask) { // not in teams construct
2263#if USE_ITT_NOTIFY
2264 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2265 (__kmp_forkjoin_frames_mode == 3 ||
2266 __kmp_forkjoin_frames_mode == 1)) {
2267 kmp_uint64 tmp_time = 0;
2268 if (__itt_get_timestamp_ptr)
2269 tmp_time = __itt_get_timestamp();
2270 // Internal fork - report frame begin
2271 master_th->th.th_frame_time = tmp_time;
2272 if (__kmp_forkjoin_frames_mode == 3)
2273 team->t.t_region_time = tmp_time;
2274 } else
2275// only one notification scheme (either "submit" or "forking/joined", not both)
2276#endif /* USE_ITT_NOTIFY */
2277 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2278 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2279 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2280 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2281 }
2282 }
2283#endif /* USE_ITT_BUILD */
2284
2285 /* now go on and do the work */
2286 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2287 KMP_MB();
2288 KF_TRACE(10,
2289 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2290 root, team, master_th, gtid));
2291
2292#if USE_ITT_BUILD
2293 if (__itt_stack_caller_create_ptr) {
2294 // create new stack stitching id before entering fork barrier
2295 if (!enter_teams) {
2296 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL);
2297 team->t.t_stack_id = __kmp_itt_stack_caller_create();
2298 } else if (parent_team->t.t_serialized) {
2299 // keep stack stitching id in the serialized parent_team;
2300 // current team will be used for parallel inside the teams;
2301 // if parent_team is active, then it already keeps stack stitching id
2302 // for the league of teams
2303 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
2304 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
2305 }
2306 }
2307#endif /* USE_ITT_BUILD */
2308
2309 // AC: skip __kmp_internal_fork at teams construct, let only primary
2310 // threads execute
2311 if (ap) {
2312 __kmp_internal_fork(loc, gtid, team);
2313 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2314 "master_th=%p, gtid=%d\n",
2315 root, team, master_th, gtid));
2316 }
2317
2318 if (call_context == fork_context_gnu) {
2319 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2320 return TRUE;
2321 }
2322
2323 /* Invoke microtask for PRIMARY thread */
2324 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2325 team->t.t_id, team->t.t_pkfn));
2326 } // END of timer KMP_fork_call block
2327
2328#if KMP_STATS_ENABLED
2329 // If beginning a teams construct, then change thread state
2330 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2331 if (!ap) {
2332 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2333 }
2334#endif
2335
2336 if (!team->t.t_invoke(gtid)) {
2337 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
2338 }
2339
2340#if KMP_STATS_ENABLED
2341 // If was beginning of a teams construct, then reset thread state
2342 if (!ap) {
2343 KMP_SET_THREAD_STATE(previous_state);
2344 }
2345#endif
2346
2347 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2348 team->t.t_id, team->t.t_pkfn));
2349 KMP_MB(); /* Flush all pending memory write invalidates. */
2350
2351 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2352#if OMPT_SUPPORT
2353 if (ompt_enabled.enabled) {
2354 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2355 }
2356#endif
2357
2358 return TRUE;
2359}
2360
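/* Usage sketch (illustration only): user code typically enters
   __kmp_fork_call through the compiler-generated entry point __kmpc_fork_call
   (kmp_csupport.cpp). The thread-count selection above - num_threads clause
   first, otherwise the nproc ICV, then possibly reduced by
   __kmp_reserve_threads - is visible from ordinary OpenMP code:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       omp_set_num_threads(8);             // sets the nproc ICV
     #pragma omp parallel num_threads(3)   // clause -> master_set_numthreads
       if (omp_get_thread_num() == 0)
         printf("team size = %d\n", omp_get_num_threads()); // at most 3
       return 0;
     }
*/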
2361#if OMPT_SUPPORT
2362static inline void __kmp_join_restore_state(kmp_info_t *thread,
2363 kmp_team_t *team) {
2364 // restore state outside the region
2365 thread->th.ompt_thread_info.state =
2366 ((team->t.t_serialized) ? ompt_state_work_serial
2367 : ompt_state_work_parallel);
2368}
2369
2370static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2371 kmp_team_t *team, ompt_data_t *parallel_data,
2372 int flags, void *codeptr) {
2373 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2374 if (ompt_enabled.ompt_callback_parallel_end) {
2375 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2376 parallel_data, &(task_info->task_data), flags, codeptr);
2377 }
2378
2379 task_info->frame.enter_frame = ompt_data_none;
2380 __kmp_join_restore_state(thread, team);
2381}
2382#endif
2383
2384void __kmp_join_call(ident_t *loc, int gtid
2385#if OMPT_SUPPORT
2386 ,
2387 enum fork_context_e fork_context
2388#endif
2389 ,
2390 int exit_teams) {
2391 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2392 kmp_team_t *team;
2393 kmp_team_t *parent_team;
2394 kmp_info_t *master_th;
2395 kmp_root_t *root;
2396 int master_active;
2397
2398 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2399
2400 /* setup current data */
2401 master_th = __kmp_threads[gtid];
2402 root = master_th->th.th_root;
2403 team = master_th->th.th_team;
2404 parent_team = team->t.t_parent;
2405
2406 master_th->th.th_ident = loc;
2407
2408#if OMPT_SUPPORT
2409 void *team_microtask = (void *)team->t.t_pkfn;
2410 // For GOMP interface with serialized parallel, need the
2411 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2412 // and end-parallel events.
2413 if (ompt_enabled.enabled &&
2414 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2415 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2416 }
2417#endif
2418
2419#if KMP_DEBUG
2420 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2421 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2422 "th_task_team = %p\n",
2423 __kmp_gtid_from_thread(master_th), team,
2424 team->t.t_task_team[master_th->th.th_task_state],
2425 master_th->th.th_task_team));
2426 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2427 team->t.t_task_team[master_th->th.th_task_state]);
2428 }
2429#endif
2430
2431 if (team->t.t_serialized) {
2432 if (master_th->th.th_teams_microtask) {
2433 // We are in teams construct
2434 int level = team->t.t_level;
2435 int tlevel = master_th->th.th_teams_level;
2436 if (level == tlevel) {
2437 // AC: we haven't incremented it earlier at start of teams construct,
2438 // so do it here - at the end of teams construct
2439 team->t.t_level++;
2440 } else if (level == tlevel + 1) {
2441 // AC: we are exiting parallel inside teams, need to increment
2442 // serialization in order to restore it in the next call to
2443 // __kmpc_end_serialized_parallel
2444 team->t.t_serialized++;
2445 }
2446 }
2448
2449#if OMPT_SUPPORT
2450 if (ompt_enabled.enabled) {
2451 if (fork_context == fork_context_gnu) {
2452 __ompt_lw_taskteam_unlink(master_th);
2453 }
2454 __kmp_join_restore_state(master_th, parent_team);
2455 }
2456#endif
2457
2458 return;
2459 }
2460
2461 master_active = team->t.t_master_active;
2462
2463 if (!exit_teams) {
2464 // AC: No barrier for internal teams at exit from teams construct.
2465 // But there is barrier for external team (league).
2466 __kmp_internal_join(loc, gtid, team);
2467#if USE_ITT_BUILD
2468 if (__itt_stack_caller_create_ptr) {
2469 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL);
2470 // destroy the stack stitching id after join barrier
2471 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2472 team->t.t_stack_id = NULL;
2473 }
2474#endif
2475 } else {
2476 master_th->th.th_task_state =
2477 0; // AC: no tasking in teams (out of any parallel)
2478#if USE_ITT_BUILD
2479 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) {
2480 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL);
2481 // destroy the stack stitching id on exit from the teams construct
2482 // if parent_team is active, then the id will be destroyed later on
2483 // by master of the league of teams
2484 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id);
2485 parent_team->t.t_stack_id = NULL;
2486 }
2487#endif
2488 }
2489
2490 KMP_MB();
2491
2492#if OMPT_SUPPORT
2493 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2494 void *codeptr = team->t.ompt_team_info.master_return_address;
2495#endif
2496
2497#if USE_ITT_BUILD
2498 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2499 if (team->t.t_active_level == 1 &&
2500 (!master_th->th.th_teams_microtask || /* not in teams construct */
2501 master_th->th.th_teams_size.nteams == 1)) {
2502 master_th->th.th_ident = loc;
2503 // only one notification scheme (either "submit" or "forking/joined", not
2504 // both)
2505 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2506 __kmp_forkjoin_frames_mode == 3)
2507 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2508 master_th->th.th_frame_time, 0, loc,
2509 master_th->th.th_team_nproc, 1);
2510 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2511 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2512 __kmp_itt_region_joined(gtid);
2513 } // active_level == 1
2514#endif /* USE_ITT_BUILD */
2515
2516#if KMP_AFFINITY_SUPPORTED
2517 if (!exit_teams) {
2518 // Restore master thread's partition.
2519 master_th->th.th_first_place = team->t.t_first_place;
2520 master_th->th.th_last_place = team->t.t_last_place;
2521 }
2522#endif // KMP_AFFINITY_SUPPORTED
2523
2524 if (master_th->th.th_teams_microtask && !exit_teams &&
2525 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2526 team->t.t_level == master_th->th.th_teams_level + 1) {
2527// AC: We need to leave the team structure intact at the end of parallel
2528// inside the teams construct, so that at the next parallel same (hot) team
2529// works, only adjust nesting levels
2530#if OMPT_SUPPORT
2531 ompt_data_t ompt_parallel_data = ompt_data_none;
2532 if (ompt_enabled.enabled) {
2533 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2534 if (ompt_enabled.ompt_callback_implicit_task) {
2535 int ompt_team_size = team->t.t_nproc;
2536 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2537 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2538 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2539 }
2540 task_info->frame.exit_frame = ompt_data_none;
2541 task_info->task_data = ompt_data_none;
2542 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2543 __ompt_lw_taskteam_unlink(master_th);
2544 }
2545#endif
2546 /* Decrement our nested depth level */
2547 team->t.t_level--;
2548 team->t.t_active_level--;
2549 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2550
2551 // Restore number of threads in the team if needed. This code relies on
2552 // the proper adjustment of th_teams_size.nth after the fork in
2553 // __kmp_teams_master on each teams primary thread in the case that
2554 // __kmp_reserve_threads reduced it.
2555 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2556 int old_num = master_th->th.th_team_nproc;
2557 int new_num = master_th->th.th_teams_size.nth;
2558 kmp_info_t **other_threads = team->t.t_threads;
2559 team->t.t_nproc = new_num;
2560 for (int i = 0; i < old_num; ++i) {
2561 other_threads[i]->th.th_team_nproc = new_num;
2562 }
2563 // Adjust states of non-used threads of the team
2564 for (int i = old_num; i < new_num; ++i) {
2565 // Re-initialize thread's barrier data.
2566 KMP_DEBUG_ASSERT(other_threads[i]);
2567 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2568 for (int b = 0; b < bs_last_barrier; ++b) {
2569 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2570 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2571#if USE_DEBUGGER
2572 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2573#endif
2574 }
2575 if (__kmp_tasking_mode != tskm_immediate_exec) {
2576 // Synchronize thread's task state
2577 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2578 }
2579 }
2580 }
2581
2582#if OMPT_SUPPORT
2583 if (ompt_enabled.enabled) {
2584 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2585 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2586 }
2587#endif
2588
2589 return;
2590 }
2591
2592 /* do cleanup and restore the parent team */
2593 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2594 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2595
2596 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2597
2598 /* jc: The following lock has instructions with REL and ACQ semantics,
2599 separating the parallel user code called in this parallel region
2600 from the serial user code called after this function returns. */
2601 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2602
2603 if (!master_th->th.th_teams_microtask ||
2604 team->t.t_level > master_th->th.th_teams_level) {
2605 /* Decrement our nested depth level */
2606 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2607 }
2608 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2609
2610#if OMPT_SUPPORT
2611 if (ompt_enabled.enabled) {
2612 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2613 if (ompt_enabled.ompt_callback_implicit_task) {
2614 int flags = (team_microtask == (void *)__kmp_teams_master)
2615 ? ompt_task_initial
2616 : ompt_task_implicit;
2617 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2618 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2619 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2620 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2621 }
2622 task_info->frame.exit_frame = ompt_data_none;
2623 task_info->task_data = ompt_data_none;
2624 }
2625#endif
2626
2627 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2628 master_th, team));
2629 __kmp_pop_current_task_from_thread(master_th);
2630
2631 master_th->th.th_def_allocator = team->t.t_def_allocator;
2632
2633#if OMPD_SUPPORT
2634 if (ompd_state & OMPD_ENABLE_BP)
2635 ompd_bp_parallel_end();
2636#endif
2637 updateHWFPControl(team);
2638
2639 if (root->r.r_active != master_active)
2640 root->r.r_active = master_active;
2641
2642 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2643 master_th)); // this will free worker threads
2644
2645 /* this race was fun to find. make sure the following is in the critical
2646 region otherwise assertions may fail occasionally since the old team may be
2647 reallocated and the hierarchy appears inconsistent. it is actually safe to
2648 run and won't cause any bugs, but will cause those assertion failures. it's
2649 only one deref&assign so might as well put this in the critical region */
2650 master_th->th.th_team = parent_team;
2651 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2652 master_th->th.th_team_master = parent_team->t.t_threads[0];
2653 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2654
2655 /* restore serialized team, if need be */
2656 if (parent_team->t.t_serialized &&
2657 parent_team != master_th->th.th_serial_team &&
2658 parent_team != root->r.r_root_team) {
2659 __kmp_free_team(root,
2660 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2661 master_th->th.th_serial_team = parent_team;
2662 }
2663
2664 if (__kmp_tasking_mode != tskm_immediate_exec) {
2665 if (master_th->th.th_task_state_top >
2666 0) { // Restore task state from memo stack
2667 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2668 // Remember primary thread's state if we re-use this nested hot team
2669 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2670 master_th->th.th_task_state;
2671 --master_th->th.th_task_state_top; // pop
2672 // Now restore state at this level
2673 master_th->th.th_task_state =
2674 master_th->th
2675 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2676 } else if (team != root->r.r_hot_team) {
2677 // Reset the task state of the primary thread if we are not in the hot team,
2678 // because in that case all the worker threads will be freed and their task
2679 // state will be reset. If the primary's state is not reset as well, the
2680 // task state becomes inconsistent.
2681 master_th->th.th_task_state = 0;
2682 }
2683 // Copy the task team from the parent team to the primary thread
2684 master_th->th.th_task_team =
2685 parent_team->t.t_task_team[master_th->th.th_task_state];
2686 KA_TRACE(20,
2687 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n",
2688 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2689 parent_team));
2690 }
2691
2692 // TODO: GEH - cannot do this assertion because root thread not set up as
2693 // executing
2694 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2695 master_th->th.th_current_task->td_flags.executing = 1;
2696
2697 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2698
2699#if KMP_AFFINITY_SUPPORTED
2700 if (master_th->th.th_team->t.t_level == 0 && __kmp_affinity.flags.reset) {
2701 __kmp_reset_root_init_mask(gtid);
2702 }
2703#endif
2704#if OMPT_SUPPORT
2705 int flags =
2706 OMPT_INVOKER(fork_context) |
2707 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2708 : ompt_parallel_team);
2709 if (ompt_enabled.enabled) {
2710 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2711 codeptr);
2712 }
2713#endif
2714
2715 KMP_MB();
2716 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2717}
2718
2719/* Check whether we should push an internal control record onto the
2720 serial team stack. If so, do it. */
2721void __kmp_save_internal_controls(kmp_info_t *thread) {
2722
2723 if (thread->th.th_team != thread->th.th_serial_team) {
2724 return;
2725 }
2726 if (thread->th.th_team->t.t_serialized > 1) {
2727 int push = 0;
2728
2729 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2730 push = 1;
2731 } else {
2732 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2733 thread->th.th_team->t.t_serialized) {
2734 push = 1;
2735 }
2736 }
2737 if (push) { /* push a record on the serial team's stack */
2738 kmp_internal_control_t *control =
2739 (kmp_internal_control_t *)__kmp_allocate(
2740 sizeof(kmp_internal_control_t));
2741
2742 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2743
2744 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2745
2746 control->next = thread->th.th_team->t.t_control_stack_top;
2747 thread->th.th_team->t.t_control_stack_top = control;
2748 }
2749 }
2750}
2751
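/* Usage sketch (illustration only): the control-stack records pushed above
   keep ICV changes made inside nested *serialized* regions from leaking into
   the enclosing level; the serialized-parallel end path pops and restores the
   record whose serial_nesting_level matches. Assuming both regions below are
   serialized (single-thread teams):

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
     #pragma omp parallel num_threads(1)   // serialized, nesting level 1
       {
         omp_set_num_threads(4);
     #pragma omp parallel num_threads(1)   // serialized, nesting level 2
         omp_set_num_threads(2);           // recorded for level 2 only
         printf("%d\n", omp_get_max_threads()); // 4 again after the inner exit
       }
       return 0;
     }
*/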
2752/* Changes set_nproc */
2753void __kmp_set_num_threads(int new_nth, int gtid) {
2754 kmp_info_t *thread;
2755 kmp_root_t *root;
2756
2757 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2758 KMP_DEBUG_ASSERT(__kmp_init_serial);
2759
2760 if (new_nth < 1)
2761 new_nth = 1;
2762 else if (new_nth > __kmp_max_nth)
2763 new_nth = __kmp_max_nth;
2764
2765 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2766 thread = __kmp_threads[gtid];
2767 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2768 return; // nothing to do
2769
2770 __kmp_save_internal_controls(thread);
2771
2772 set__nproc(thread, new_nth);
2773
2774 // If this omp_set_num_threads() call will cause the hot team size to be
2775 // reduced (in the absence of a num_threads clause), then reduce it now,
2776 // rather than waiting for the next parallel region.
2777 root = thread->th.th_root;
2778 if (__kmp_init_parallel && (!root->r.r_active) &&
2779 (root->r.r_hot_team->t.t_nproc > new_nth)
2780#if KMP_NESTED_HOT_TEAMS
2781 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2782#endif
2783 ) {
2784 kmp_team_t *hot_team = root->r.r_hot_team;
2785 int f;
2786
2787 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2788
2789 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2790 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth);
2791 }
2792 // Release the extra threads we don't need any more.
2793 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2794 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2795 if (__kmp_tasking_mode != tskm_immediate_exec) {
2796 // When decreasing team size, threads no longer in the team should unref
2797 // task team.
2798 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2799 }
2800 __kmp_free_thread(hot_team->t.t_threads[f]);
2801 hot_team->t.t_threads[f] = NULL;
2802 }
2803 hot_team->t.t_nproc = new_nth;
2804#if KMP_NESTED_HOT_TEAMS
2805 if (thread->th.th_hot_teams) {
2806 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2807 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2808 }
2809#endif
2810
2811 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
2812 hot_team->t.b->update_num_threads(new_nth);
2813 __kmp_add_threads_to_team(hot_team, new_nth);
2814 }
2815
2816 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2817
2818 // Update the t_nproc field in the threads that are still active.
2819 for (f = 0; f < new_nth; f++) {
2820 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2821 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2822 }
2823 // Special flag to record that an omp_set_num_threads() call changed the size
2824 hot_team->t.t_size_changed = -1;
2825 }
2826}
2827
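/* Usage sketch (illustration only): the early shrink above is about resource
   usage - extra hot-team workers are released at the omp_set_num_threads()
   call instead of lingering until the next fork:

     #include <omp.h>
     int main(void) {
       omp_set_num_threads(8);
     #pragma omp parallel
       { }                      // hot team with 8 threads created and kept
       omp_set_num_threads(2);  // idle hot team is shrunk right here
     #pragma omp parallel
       { }                      // reuses the now-smaller hot team
       return 0;
     }
*/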
2828/* Changes max_active_levels */
2829void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2830 kmp_info_t *thread;
2831
2832 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2833 "%d = (%d)\n",
2834 gtid, max_active_levels));
2835 KMP_DEBUG_ASSERT(__kmp_init_serial);
2836
2837 // validate max_active_levels
2838 if (max_active_levels < 0) {
2839 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2840 // We ignore this call if the user has specified a negative value.
2841 // The current setting won't be changed. The last valid setting will be
2842 // used. A warning will be issued (if warnings are allowed as controlled by
2843 // the KMP_WARNINGS env var).
2844 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2845 "max_active_levels for thread %d = (%d)\n",
2846 gtid, max_active_levels));
2847 return;
2848 }
2849 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2850 // it's OK, the max_active_levels is within the valid range: [ 0;
2851 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2852 // We allow a zero value. (implementation defined behavior)
2853 } else {
2854 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2855 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2856 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2857 // Current upper limit is MAX_INT. (implementation defined behavior)
2858 // If the input exceeds the upper limit, we correct the input to be the
2859 // upper limit. (implementation defined behavior)
2860 // Actually, the flow should never get here as long as the limit is MAX_INT.
2861 }
2862 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2863 "max_active_levels for thread %d = (%d)\n",
2864 gtid, max_active_levels));
2865
2866 thread = __kmp_threads[gtid];
2867
2868 __kmp_save_internal_controls(thread);
2869
2870 set__max_active_levels(thread, max_active_levels);
2871}
2872
2873/* Gets max_active_levels */
2874int __kmp_get_max_active_levels(int gtid) {
2875 kmp_info_t *thread;
2876
2877 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2878 KMP_DEBUG_ASSERT(__kmp_init_serial);
2879
2880 thread = __kmp_threads[gtid];
2881 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2882 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2883 "curtask_maxaclevel=%d\n",
2884 gtid, thread->th.th_current_task,
2885 thread->th.th_current_task->td_icvs.max_active_levels));
2886 return thread->th.th_current_task->td_icvs.max_active_levels;
2887}
2888
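/* Usage sketch (illustration only) of the validation above, via the standard
   API routines that call into these helpers:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       omp_set_max_active_levels(2);
       printf("%d\n", omp_get_max_active_levels()); // 2
       omp_set_max_active_levels(-1);               // ignored, warning only
       printf("%d\n", omp_get_max_active_levels()); // still 2
       return 0;
     }
*/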
2889// nteams-var per-device ICV
2890void __kmp_set_num_teams(int num_teams) {
2891 if (num_teams > 0)
2892 __kmp_nteams = num_teams;
2893}
2894int __kmp_get_max_teams(void) { return __kmp_nteams; }
2895// teams-thread-limit-var per-device ICV
2896void __kmp_set_teams_thread_limit(int limit) {
2897 if (limit > 0)
2898 __kmp_teams_thread_limit = limit;
2899}
2900int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; }
2901
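/* Usage sketch (illustration only): these per-device ICVs back the OpenMP 5.1
   API routines (assuming the usual entry-point wrappers call through to the
   helpers above):

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       omp_set_num_teams(4);           // -> __kmp_nteams
       omp_set_teams_thread_limit(3);  // -> __kmp_teams_thread_limit
       printf("%d %d\n", omp_get_max_teams(), omp_get_teams_thread_limit());
       return 0;
     }
*/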
2902KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2903KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2904
2905/* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2906void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2907 kmp_info_t *thread;
2908 kmp_sched_t orig_kind;
2909 // kmp_team_t *team;
2910
2911 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2912 gtid, (int)kind, chunk));
2913 KMP_DEBUG_ASSERT(__kmp_init_serial);
2914
2915 // Check if the kind parameter is valid, correct if needed.
2916 // Valid parameters should fit in one of two intervals - standard or extended:
2917 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2918 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2919 orig_kind = kind;
2920 kind = __kmp_sched_without_mods(kind);
2921
2922 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2923 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2924 // TODO: Hint needs attention in case we change the default schedule.
2925 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2926 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2927 __kmp_msg_null);
2928 kind = kmp_sched_default;
2929 chunk = 0; // ignore chunk value in case of bad kind
2930 }
2931
2932 thread = __kmp_threads[gtid];
2933
2934 __kmp_save_internal_controls(thread);
2935
2936 if (kind < kmp_sched_upper_std) {
2937 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2938 // differentiate static chunked vs. unchunked: the chunk should be invalid
2939 // to indicate an unchunked schedule (which is the default)
2940 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2941 } else {
2942 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2943 __kmp_sch_map[kind - kmp_sched_lower - 1];
2944 }
2945 } else {
2946 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2947 // kmp_sched_lower - 2 ];
2948 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2949 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2950 kmp_sched_lower - 2];
2951 }
2952 __kmp_sched_apply_mods_intkind(
2953 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2954 if (kind == kmp_sched_auto || chunk < 1) {
2955 // ignore parameter chunk for schedule auto
2956 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2957 } else {
2958 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2959 }
2960}
2961
2962/* Gets def_sched_var ICV values */
2963void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2964 kmp_info_t *thread;
2965 enum sched_type th_type;
2966
2967 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2968 KMP_DEBUG_ASSERT(__kmp_init_serial);
2969
2970 thread = __kmp_threads[gtid];
2971
2972 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2973 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2974 case kmp_sch_static:
2975 case kmp_sch_static_greedy:
2976 case kmp_sch_static_balanced:
2977 *kind = kmp_sched_static;
2978 __kmp_sched_apply_mods_stdkind(kind, th_type);
2979 *chunk = 0; // chunk was not set, try to show this fact via zero value
2980 return;
2981 case kmp_sch_static_chunked:
2982 *kind = kmp_sched_static;
2983 break;
2984 case kmp_sch_dynamic_chunked:
2985 *kind = kmp_sched_dynamic;
2986 break;
2987 case kmp_sch_guided_chunked:
2988 case kmp_sch_guided_iterative_chunked:
2989 case kmp_sch_guided_analytical_chunked:
2990 *kind = kmp_sched_guided;
2991 break;
2992 case kmp_sch_auto:
2993 *kind = kmp_sched_auto;
2994 break;
2995 case kmp_sch_trapezoidal:
2996 *kind = kmp_sched_trapezoidal;
2997 break;
2998#if KMP_STATIC_STEAL_ENABLED
2999 case kmp_sch_static_steal:
3000 *kind = kmp_sched_static_steal;
3001 break;
3002#endif
3003 default:
3004 KMP_FATAL(UnknownSchedulingType, th_type);
3005 }
3006
3007 __kmp_sched_apply_mods_stdkind(kind, th_type);
3008 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
3009}
3010
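/* Usage sketch (illustration only): __kmp_set_schedule/__kmp_get_schedule sit
   behind omp_set_schedule()/omp_get_schedule(). The chunk normalization above
   is visible from user code - a static schedule requested with chunk < 1 is
   stored unchunked and reads back with a chunk of 0:

     #include <omp.h>
     #include <stdio.h>
     int main(void) {
       omp_sched_t kind;
       int chunk;
       omp_set_schedule(omp_sched_static, 0); // chunk < 1 -> unchunked static
       omp_get_schedule(&kind, &chunk);
       printf("kind=%d chunk=%d\n", (int)kind, chunk); // static kind, chunk 0
       return 0;
     }
*/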
3011int __kmp_get_ancestor_thread_num(int gtid, int level) {
3012
3013 int ii, dd;
3014 kmp_team_t *team;
3015 kmp_info_t *thr;
3016
3017 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
3018 KMP_DEBUG_ASSERT(__kmp_init_serial);
3019
3020 // validate level
3021 if (level == 0)
3022 return 0;
3023 if (level < 0)
3024 return -1;
3025 thr = __kmp_threads[gtid];
3026 team = thr->th.th_team;
3027 ii = team->t.t_level;
3028 if (level > ii)
3029 return -1;
3030
3031 if (thr->th.th_teams_microtask) {
3032 // AC: we are in teams region where multiple nested teams have same level
3033 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3034 if (level <=
3035 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3036 KMP_DEBUG_ASSERT(ii >= tlevel);
3037 // AC: As we need to pass by the teams league, we need to artificially
3038 // increase ii
3039 if (ii == tlevel) {
3040 ii += 2; // three teams have same level
3041 } else {
3042 ii++; // two teams have same level
3043 }
3044 }
3045 }
3046
3047 if (ii == level)
3048 return __kmp_tid_from_gtid(gtid);
3049
3050 dd = team->t.t_serialized;
3051 level++;
3052 while (ii > level) {
3053 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3054 }
3055 if ((team->t.t_serialized) && (!dd)) {
3056 team = team->t.t_parent;
3057 continue;
3058 }
3059 if (ii > level) {
3060 team = team->t.t_parent;
3061 dd = team->t.t_serialized;
3062 ii--;
3063 }
3064 }
3065
3066 return (dd > 1) ? (0) : (team->t.t_master_tid);
3067}
3068
3069int __kmp_get_team_size(int gtid, int level) {
3070
3071 int ii, dd;
3072 kmp_team_t *team;
3073 kmp_info_t *thr;
3074
3075 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
3076 KMP_DEBUG_ASSERT(__kmp_init_serial);
3077
3078 // validate level
3079 if (level == 0)
3080 return 1;
3081 if (level < 0)
3082 return -1;
3083 thr = __kmp_threads[gtid];
3084 team = thr->th.th_team;
3085 ii = team->t.t_level;
3086 if (level > ii)
3087 return -1;
3088
3089 if (thr->th.th_teams_microtask) {
3090 // AC: we are in teams region where multiple nested teams have same level
3091 int tlevel = thr->th.th_teams_level; // the level of the teams construct
3092 if (level <=
3093 tlevel) { // otherwise usual algorithm works (will not touch the teams)
3094 KMP_DEBUG_ASSERT(ii >= tlevel);
3095 // AC: As we need to pass by the teams league, we need to artificially
3096 // increase ii
3097 if (ii == tlevel) {
3098 ii += 2; // three teams have same level
3099 } else {
3100 ii++; // two teams have same level
3101 }
3102 }
3103 }
3104
3105 while (ii > level) {
3106 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
3107 }
3108 if (team->t.t_serialized && (!dd)) {
3109 team = team->t.t_parent;
3110 continue;
3111 }
3112 if (ii > level) {
3113 team = team->t.t_parent;
3114 ii--;
3115 }
3116 }
3117
3118 return team->t.t_nproc;
3119}
3120
3121kmp_r_sched_t __kmp_get_schedule_global() {
3122  // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
3123  // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
3124  // independently, so the updated schedule can be obtained here.
3125
3126 kmp_r_sched_t r_sched;
3127
3128 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
3129 // __kmp_guided. __kmp_sched should keep original value, so that user can set
3130 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
3131 // different roots (even in OMP 2.5)
3132 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
3133 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
3134 if (s == kmp_sch_static) {
3135 // replace STATIC with more detailed schedule (balanced or greedy)
3136 r_sched.r_sched_type = __kmp_static;
3137 } else if (s == kmp_sch_guided_chunked) {
3138 // replace GUIDED with more detailed schedule (iterative or analytical)
3139 r_sched.r_sched_type = __kmp_guided;
3140 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3141 r_sched.r_sched_type = __kmp_sched;
3142 }
3143 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
3144
3145 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
3146 // __kmp_chunk may be wrong here (if it was not ever set)
3147 r_sched.chunk = KMP_DEFAULT_CHUNK;
3148 } else {
3149 r_sched.chunk = __kmp_chunk;
3150 }
3151
3152 return r_sched;
3153}
3154
3155 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3156 at least argc number of *t_argv entries for the requested team. */
3157static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3158
3159 KMP_DEBUG_ASSERT(team);
3160 if (!realloc || argc > team->t.t_max_argc) {
3161
3162 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3163 "current entries=%d\n",
3164 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3165 /* if previously allocated heap space for args, free them */
3166 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3167 __kmp_free((void *)team->t.t_argv);
3168
3169 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3170 /* use unused space in the cache line for arguments */
3171 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3172 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3173 "argv entries\n",
3174 team->t.t_id, team->t.t_max_argc));
3175 team->t.t_argv = &team->t.t_inline_argv[0];
3176 if (__kmp_storage_map) {
3177 __kmp_print_storage_map_gtid(
3178 -1, &team->t.t_inline_argv[0],
3179 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3180 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3181 team->t.t_id);
3182 }
3183 } else {
3184 /* allocate space for arguments in the heap */
3185 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3186 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3187 : 2 * argc;
3188 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3189 "argv entries\n",
3190 team->t.t_id, team->t.t_max_argc));
3191 team->t.t_argv =
3192 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3193 if (__kmp_storage_map) {
3194 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3195 &team->t.t_argv[team->t.t_max_argc],
3196 sizeof(void *) * team->t.t_max_argc,
3197 "team_%d.t_argv", team->t.t_id);
3198 }
3199 }
3200 }
3201}
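/* A minimal sketch (added for illustration only; the helper name below is
   hypothetical and not part of the runtime) of the capacity policy used in
   __kmp_alloc_argv_entries above: small argument lists stay in the team's
   inline array, larger ones get a heap buffer of at least
   KMP_MIN_MALLOC_ARGV_ENTRIES entries, growing to 2 * argc beyond that. */
static inline int __kmp_example_argv_capacity(int argc) {
  if (argc <= KMP_INLINE_ARGV_ENTRIES)
    return KMP_INLINE_ARGV_ENTRIES; // fits in t_inline_argv, no heap allocation
  return (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
             ? KMP_MIN_MALLOC_ARGV_ENTRIES // modest requests get the minimum
             : 2 * argc; // larger requests double to leave room for reuse
}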
3202
3203static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3204 int i;
3205 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3206 team->t.t_threads =
3207 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3208 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3209 sizeof(dispatch_shared_info_t) * num_disp_buff);
3210 team->t.t_dispatch =
3211 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3212 team->t.t_implicit_task_taskdata =
3213 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3214 team->t.t_max_nproc = max_nth;
3215
3216 /* setup dispatch buffers */
3217 for (i = 0; i < num_disp_buff; ++i) {
3218 team->t.t_disp_buffer[i].buffer_index = i;
3219 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3220 }
3221}
3222
3223static void __kmp_free_team_arrays(kmp_team_t *team) {
3224 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3225 int i;
3226 for (i = 0; i < team->t.t_max_nproc; ++i) {
3227 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3228 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3229 team->t.t_dispatch[i].th_disp_buffer = NULL;
3230 }
3231 }
3232#if KMP_USE_HIER_SCHED
3233 __kmp_dispatch_free_hierarchies(team);
3234#endif
3235 __kmp_free(team->t.t_threads);
3236 __kmp_free(team->t.t_disp_buffer);
3237 __kmp_free(team->t.t_dispatch);
3238 __kmp_free(team->t.t_implicit_task_taskdata);
3239 team->t.t_threads = NULL;
3240 team->t.t_disp_buffer = NULL;
3241 team->t.t_dispatch = NULL;
3242 team->t.t_implicit_task_taskdata = 0;
3243}
3244
3245static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3246 kmp_info_t **oldThreads = team->t.t_threads;
3247
3248 __kmp_free(team->t.t_disp_buffer);
3249 __kmp_free(team->t.t_dispatch);
3250 __kmp_free(team->t.t_implicit_task_taskdata);
3251 __kmp_allocate_team_arrays(team, max_nth);
3252
3253 KMP_MEMCPY(team->t.t_threads, oldThreads,
3254 team->t.t_nproc * sizeof(kmp_info_t *));
3255
3256 __kmp_free(oldThreads);
3257}
3258
3259static kmp_internal_control_t __kmp_get_global_icvs(void) {
3260
3261 kmp_r_sched_t r_sched =
3262 __kmp_get_schedule_global(); // get current state of scheduling globals
3263
3264 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3265
3266 kmp_internal_control_t g_icvs = {
3267 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3268 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3269 // adjustment of threads (per thread)
3270 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3271 // whether blocktime is explicitly set
3272 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3273#if KMP_USE_MONITOR
3274 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3275// intervals
3276#endif
3277 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3278 // next parallel region (per thread)
3279 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3280 __kmp_cg_max_nth, // int thread_limit;
3281 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3282 // for max_active_levels
3283 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3284 // {sched,chunk} pair
3285 __kmp_nested_proc_bind.bind_types[0],
3286 __kmp_default_device,
3287 NULL // struct kmp_internal_control *next;
3288 };
3289
3290 return g_icvs;
3291}
3292
3293static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3294
3295 kmp_internal_control_t gx_icvs;
3296 gx_icvs.serial_nesting_level =
3297 0; // probably =team->t.t_serial like in save_inter_controls
3298 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3299 gx_icvs.next = NULL;
3300
3301 return gx_icvs;
3302}
3303
3304static void __kmp_initialize_root(kmp_root_t *root) {
3305 int f;
3306 kmp_team_t *root_team;
3307 kmp_team_t *hot_team;
3308 int hot_team_max_nth;
3309 kmp_r_sched_t r_sched =
3310 __kmp_get_schedule_global(); // get current state of scheduling globals
3311 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3312 KMP_DEBUG_ASSERT(root);
3313 KMP_ASSERT(!root->r.r_begin);
3314
3315 /* setup the root state structure */
3316 __kmp_init_lock(&root->r.r_begin_lock);
3317 root->r.r_begin = FALSE;
3318 root->r.r_active = FALSE;
3319 root->r.r_in_parallel = 0;
3320 root->r.r_blocktime = __kmp_dflt_blocktime;
3321#if KMP_AFFINITY_SUPPORTED
3322 root->r.r_affinity_assigned = FALSE;
3323#endif
3324
3325 /* setup the root team for this task */
3326 /* allocate the root team structure */
3327 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3328
3329 root_team =
3330 __kmp_allocate_team(root,
3331 1, // new_nproc
3332 1, // max_nproc
3333#if OMPT_SUPPORT
3334 ompt_data_none, // root parallel id
3335#endif
3336 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3337 0 // argc
3338 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3339 );
3340#if USE_DEBUGGER
3341 // Non-NULL value should be assigned to make the debugger display the root
3342 // team.
3343 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3344#endif
3345
3346 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3347
3348 root->r.r_root_team = root_team;
3349 root_team->t.t_control_stack_top = NULL;
3350
3351 /* initialize root team */
3352 root_team->t.t_threads[0] = NULL;
3353 root_team->t.t_nproc = 1;
3354 root_team->t.t_serialized = 1;
3355 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3356 root_team->t.t_sched.sched = r_sched.sched;
3357 KA_TRACE(
3358 20,
3359 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3360 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3361
3362 /* setup the hot team for this task */
3363 /* allocate the hot team structure */
3364 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3365
3366 hot_team =
3367 __kmp_allocate_team(root,
3368 1, // new_nproc
3369 __kmp_dflt_team_nth_ub * 2, // max_nproc
3370#if OMPT_SUPPORT
3371 ompt_data_none, // root parallel id
3372#endif
3373 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3374 0 // argc
3375 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown
3376 );
3377 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3378
3379 root->r.r_hot_team = hot_team;
3380 root_team->t.t_control_stack_top = NULL;
3381
3382 /* first-time initialization */
3383 hot_team->t.t_parent = root_team;
3384
3385 /* initialize hot team */
3386 hot_team_max_nth = hot_team->t.t_max_nproc;
3387 for (f = 0; f < hot_team_max_nth; ++f) {
3388 hot_team->t.t_threads[f] = NULL;
3389 }
3390 hot_team->t.t_nproc = 1;
3391 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3392 hot_team->t.t_sched.sched = r_sched.sched;
3393 hot_team->t.t_size_changed = 0;
3394}
3395
3396#ifdef KMP_DEBUG
3397
3398typedef struct kmp_team_list_item {
3399 kmp_team_p const *entry;
3400 struct kmp_team_list_item *next;
3401} kmp_team_list_item_t;
3402typedef kmp_team_list_item_t *kmp_team_list_t;
3403
3404static void __kmp_print_structure_team_accum( // Add team to list of teams.
3405 kmp_team_list_t list, // List of teams.
3406 kmp_team_p const *team // Team to add.
3407) {
3408
3409 // List must terminate with item where both entry and next are NULL.
3410 // Team is added to the list only once.
3411 // List is sorted in ascending order by team id.
3412 // Team id is *not* a key.
3413
3414 kmp_team_list_t l;
3415
3416 KMP_DEBUG_ASSERT(list != NULL);
3417 if (team == NULL) {
3418 return;
3419 }
3420
3421 __kmp_print_structure_team_accum(list, team->t.t_parent);
3422 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3423
3424 // Search list for the team.
3425 l = list;
3426 while (l->next != NULL && l->entry != team) {
3427 l = l->next;
3428 }
3429 if (l->next != NULL) {
3430 return; // Team has been added before, exit.
3431 }
3432
3433 // Team is not found. Search list again for insertion point.
3434 l = list;
3435 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3436 l = l->next;
3437 }
3438
3439 // Insert team.
3440 {
3441 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3442 sizeof(kmp_team_list_item_t));
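    // Copy the current node into the new item, then overwrite the current node
    // with the new team: this inserts before 'l' without needing a back pointer.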
3443 *item = *l;
3444 l->entry = team;
3445 l->next = item;
3446 }
3447}
3448
3449 static void __kmp_print_structure_team(char const *title,
3450                                        kmp_team_p const *team) {
3451
3452 __kmp_printf("%s", title);
3453 if (team != NULL) {
3454 __kmp_printf("%2x %p\n", team->t.t_id, team);
3455 } else {
3456 __kmp_printf(" - (nil)\n");
3457 }
3458}
3459
3460static void __kmp_print_structure_thread(char const *title,
3461 kmp_info_p const *thread) {
3462 __kmp_printf("%s", title);
3463 if (thread != NULL) {
3464 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3465 } else {
3466 __kmp_printf(" - (nil)\n");
3467 }
3468}
3469
3470void __kmp_print_structure(void) {
3471
3472 kmp_team_list_t list;
3473
3474 // Initialize list of teams.
3475 list =
3476 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3477 list->entry = NULL;
3478 list->next = NULL;
3479
3480 __kmp_printf("\n------------------------------\nGlobal Thread "
3481 "Table\n------------------------------\n");
3482 {
3483 int gtid;
3484 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3485 __kmp_printf("%2d", gtid);
3486 if (__kmp_threads != NULL) {
3487 __kmp_printf(" %p", __kmp_threads[gtid]);
3488 }
3489 if (__kmp_root != NULL) {
3490 __kmp_printf(" %p", __kmp_root[gtid]);
3491 }
3492 __kmp_printf("\n");
3493 }
3494 }
3495
3496 // Print out __kmp_threads array.
3497 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3498 "----------\n");
3499 if (__kmp_threads != NULL) {
3500 int gtid;
3501 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3502 kmp_info_t const *thread = __kmp_threads[gtid];
3503 if (thread != NULL) {
3504 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3505 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3506 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3507 __kmp_print_structure_team(" Serial Team: ",
3508 thread->th.th_serial_team);
3509 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3510 __kmp_print_structure_thread(" Primary: ",
3511 thread->th.th_team_master);
3512 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3513 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3514 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3515 __kmp_print_structure_thread(" Next in pool: ",
3516 thread->th.th_next_pool);
3517 __kmp_printf("\n");
3518 __kmp_print_structure_team_accum(list, thread->th.th_team);
3519 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3520 }
3521 }
3522 } else {
3523 __kmp_printf("Threads array is not allocated.\n");
3524 }
3525
3526 // Print out __kmp_root array.
3527 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3528 "--------\n");
3529 if (__kmp_root != NULL) {
3530 int gtid;
3531 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3532 kmp_root_t const *root = __kmp_root[gtid];
3533 if (root != NULL) {
3534 __kmp_printf("GTID %2d %p:\n", gtid, root);
3535 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3536 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3537 __kmp_print_structure_thread(" Uber Thread: ",
3538 root->r.r_uber_thread);
3539 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3540 __kmp_printf(" In Parallel: %2d\n",
3541 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3542 __kmp_printf("\n");
3543 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3544 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3545 }
3546 }
3547 } else {
3548 __kmp_printf("Ubers array is not allocated.\n");
3549 }
3550
3551 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3552 "--------\n");
3553 while (list->next != NULL) {
3554 kmp_team_p const *team = list->entry;
3555 int i;
3556 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3557 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3558 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid);
3559 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3560 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3561 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3562 for (i = 0; i < team->t.t_nproc; ++i) {
3563 __kmp_printf(" Thread %2d: ", i);
3564 __kmp_print_structure_thread("", team->t.t_threads[i]);
3565 }
3566 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3567 __kmp_printf("\n");
3568 list = list->next;
3569 }
3570
3571 // Print out __kmp_thread_pool and __kmp_team_pool.
3572 __kmp_printf("\n------------------------------\nPools\n----------------------"
3573 "--------\n");
3574 __kmp_print_structure_thread("Thread pool: ",
3575 CCAST(kmp_info_t *, __kmp_thread_pool));
3576 __kmp_print_structure_team("Team pool: ",
3577 CCAST(kmp_team_t *, __kmp_team_pool));
3578 __kmp_printf("\n");
3579
3580 // Free team list.
3581 while (list != NULL) {
3582 kmp_team_list_item_t *item = list;
3583 list = list->next;
3584 KMP_INTERNAL_FREE(item);
3585 }
3586}
3587
3588#endif
3589
3590//---------------------------------------------------------------------------
3591// Stuff for per-thread fast random number generator
3592// Table of primes
3593static const unsigned __kmp_primes[] = {
3594 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3595 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3596 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3597 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3598 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3599 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3600 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3601 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3602 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3603 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3604 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3605
3606//---------------------------------------------------------------------------
3607// __kmp_get_random: Get a random number using a linear congruential method.
3608unsigned short __kmp_get_random(kmp_info_t *thread) {
3609 unsigned x = thread->th.th_x;
3610 unsigned short r = (unsigned short)(x >> 16);
3611
3612 thread->th.th_x = x * thread->th.th_a + 1;
3613
3614 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3615 thread->th.th_info.ds.ds_tid, r));
3616
3617 return r;
3618}
3619//--------------------------------------------------------
3620// __kmp_init_random: Initialize a random number generator
3621void __kmp_init_random(kmp_info_t *thread) {
3622 unsigned seed = thread->th.th_info.ds.ds_tid;
3623
3624 thread->th.th_a =
3625 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3626 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3627 KA_TRACE(30,
3628 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3629}
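/* A minimal standalone sketch (illustrative only; the helper name is
   hypothetical and not part of the runtime) of the generator implemented by
   __kmp_get_random/__kmp_init_random above: a 32-bit linear congruential
   update x_{n+1} = a * x_n + 1 (mod 2^32), with the per-thread multiplier 'a'
   picked from __kmp_primes, returning the high 16 bits of the state. */
static inline unsigned short __kmp_example_lcg_next(unsigned *x, unsigned a) {
  unsigned short r = (unsigned short)(*x >> 16); // high half is the result
  *x = *x * a + 1; // unsigned wrap-around provides the mod 2^32 behavior
  return r;
}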
3630
3631#if KMP_OS_WINDOWS
3632/* reclaim array entries for root threads that are already dead, returns number
3633 * reclaimed */
3634static int __kmp_reclaim_dead_roots(void) {
3635 int i, r = 0;
3636
3637 for (i = 0; i < __kmp_threads_capacity; ++i) {
3638 if (KMP_UBER_GTID(i) &&
3639 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3640 !__kmp_root[i]
3641 ->r.r_active) { // AC: reclaim only roots died in non-active state
3642 r += __kmp_unregister_root_other_thread(i);
3643 }
3644 }
3645 return r;
3646}
3647#endif
3648
3649/* This function attempts to create free entries in __kmp_threads and
3650 __kmp_root, and returns the number of free entries generated.
3651
3652 For Windows* OS static library, the first mechanism used is to reclaim array
3653 entries for root threads that are already dead.
3654
3655    On all platforms, expansion is attempted on the arrays __kmp_threads and
3656 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3657 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3658 threadprivate cache array has been created. Synchronization with
3659 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3660
3661 After any dead root reclamation, if the clipping value allows array expansion
3662 to result in the generation of a total of nNeed free slots, the function does
3663 that expansion. If not, nothing is done beyond the possible initial root
3664 thread reclamation.
3665
3666 If any argument is negative, the behavior is undefined. */
3667static int __kmp_expand_threads(int nNeed) {
3668 int added = 0;
3669 int minimumRequiredCapacity;
3670 int newCapacity;
3671 kmp_info_t **newThreads;
3672 kmp_root_t **newRoot;
3673
3674 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3675 // resizing __kmp_threads does not need additional protection if foreign
3676 // threads are present
3677
3678#if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3679 /* only for Windows static library */
3680 /* reclaim array entries for root threads that are already dead */
3681 added = __kmp_reclaim_dead_roots();
3682
3683 if (nNeed) {
3684 nNeed -= added;
3685 if (nNeed < 0)
3686 nNeed = 0;
3687 }
3688#endif
3689 if (nNeed <= 0)
3690 return added;
3691
3692 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3693 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3694 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3695 // > __kmp_max_nth in one of two ways:
3696 //
3697 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3698 // may not be reused by another thread, so we may need to increase
3699 // __kmp_threads_capacity to __kmp_max_nth + 1.
3700 //
3701 // 2) New foreign root(s) are encountered. We always register new foreign
3702 // roots. This may cause a smaller # of threads to be allocated at
3703 // subsequent parallel regions, but the worker threads hang around (and
3704 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3705 //
3706 // Anyway, that is the reason for moving the check to see if
3707 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3708 // instead of having it performed here. -BB
3709
3710 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3711
3712 /* compute expansion headroom to check if we can expand */
3713 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3714 /* possible expansion too small -- give up */
3715 return added;
3716 }
3717 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3718
3719 newCapacity = __kmp_threads_capacity;
3720 do {
3721 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3722 : __kmp_sys_max_nth;
3723 } while (newCapacity < minimumRequiredCapacity);
3724 newThreads = (kmp_info_t **)__kmp_allocate(
3725 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3726 newRoot =
3727 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3728 KMP_MEMCPY(newThreads, __kmp_threads,
3729 __kmp_threads_capacity * sizeof(kmp_info_t *));
3730 KMP_MEMCPY(newRoot, __kmp_root,
3731 __kmp_threads_capacity * sizeof(kmp_root_t *));
3732 // Put old __kmp_threads array on a list. Any ongoing references to the old
3733 // list will be valid. This list is cleaned up at library shutdown.
3734 kmp_old_threads_list_t *node =
3735 (kmp_old_threads_list_t *)__kmp_allocate(sizeof(kmp_old_threads_list_t));
3736 node->threads = __kmp_threads;
3737 node->next = __kmp_old_threads_list;
3738 __kmp_old_threads_list = node;
3739
3740 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3741 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3742 added += newCapacity - __kmp_threads_capacity;
3743 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3744
3745 if (newCapacity > __kmp_tp_capacity) {
3746 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3747 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3748 __kmp_threadprivate_resize_cache(newCapacity);
3749 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3750 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3751 }
3752 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3753 }
3754
3755 return added;
3756}
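/* A minimal sketch (illustrative only; the helper name is hypothetical) of the
   capacity growth performed above: double the current capacity until the
   request is covered, clipping at the system maximum. The real code checks
   beforehand that the request fits under __kmp_sys_max_nth, so the loop always
   terminates. For example, capacity 32 with 40 extra slots needed grows
   32 -> 64 -> 128; with a system maximum of 100 it would stop at 100. */
static inline int __kmp_example_grow_capacity(int capacity, int needed,
                                              int sys_max) {
  int required = capacity + needed;
  do {
    capacity = (capacity <= (sys_max >> 1)) ? (capacity << 1) : sys_max;
  } while (capacity < required);
  return capacity;
}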
3757
3758/* Register the current thread as a root thread and obtain our gtid. We must
3759   have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3760   the thread that calls from __kmp_do_serial_initialize() */
3761int __kmp_register_root(int initial_thread) {
3762 kmp_info_t *root_thread;
3763 kmp_root_t *root;
3764 int gtid;
3765 int capacity;
3766 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3767 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3768 KMP_MB();
3769
3770 /* 2007-03-02:
3771 If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3772 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3773 work as expected -- it may return false (that means there is at least one
3774 empty slot in __kmp_threads array), but it is possible the only free slot
3775 is #0, which is reserved for initial thread and so cannot be used for this
3776     one. The following code works around this bug.
3777
3778     However, the right solution seems to be not reserving slot #0 for the
3779     initial thread because:
3780     (1) there is no magic in slot #0,
3781     (2) we cannot detect the initial thread reliably (the first thread that
3782     does serial initialization may not be a real initial thread).
3783 */
3784 capacity = __kmp_threads_capacity;
3785 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3786 --capacity;
3787 }
3788
3789 // If it is not for initializing the hidden helper team, we need to take
3790 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3791 // in __kmp_threads_capacity.
3792 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
3793 capacity -= __kmp_hidden_helper_threads_num;
3794 }
3795
3796 /* see if there are too many threads */
3797 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3798 if (__kmp_tp_cached) {
3799 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3800 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3801 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3802 } else {
3803 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3804 __kmp_msg_null);
3805 }
3806 }
3807
3808 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3809 // 0: initial thread, also a regular OpenMP thread.
3810 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3811 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3812 // regular OpenMP threads.
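  // For example, if __kmp_hidden_helper_threads_num is 8, gtid 0 is the
  // initial thread, gtids 1-8 hold hidden helper threads, and regular OpenMP
  // root/worker threads are placed at gtid 9 and above.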
3813 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3814 // Find an available thread slot for hidden helper thread. Slots for hidden
3815 // helper threads start from 1 to __kmp_hidden_helper_threads_num.
3816 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL &&
3817 gtid <= __kmp_hidden_helper_threads_num;
3818 gtid++)
3819 ;
3820 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num);
3821 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for "
3822 "hidden helper thread: T#%d\n",
3823 gtid));
3824 } else {
3825 /* find an available thread slot */
3826 // Don't reassign the zero slot since we need that to only be used by
3827 // initial thread. Slots for hidden helper threads should also be skipped.
3828 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3829 gtid = 0;
3830 } else {
3831 for (gtid = __kmp_hidden_helper_threads_num + 1;
3832 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++)
3833 ;
3834 }
3835 KA_TRACE(
3836 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3837 KMP_ASSERT(gtid < __kmp_threads_capacity);
3838 }
3839
3840 /* update global accounting */
3841 __kmp_all_nth++;
3842 TCW_4(__kmp_nth, __kmp_nth + 1);
3843
3844 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3845 // numbers of procs, and method #2 (keyed API call) for higher numbers.
3846 if (__kmp_adjust_gtid_mode) {
3847 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3848 if (TCR_4(__kmp_gtid_mode) != 2) {
3849 TCW_4(__kmp_gtid_mode, 2);
3850 }
3851 } else {
3852 if (TCR_4(__kmp_gtid_mode) != 1) {
3853 TCW_4(__kmp_gtid_mode, 1);
3854 }
3855 }
3856 }
3857
3858#ifdef KMP_ADJUST_BLOCKTIME
3859 /* Adjust blocktime to zero if necessary */
3860 /* Middle initialization might not have occurred yet */
3861 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3862 if (__kmp_nth > __kmp_avail_proc) {
3863 __kmp_zero_bt = TRUE;
3864 }
3865 }
3866#endif /* KMP_ADJUST_BLOCKTIME */
3867
3868 /* setup this new hierarchy */
3869 if (!(root = __kmp_root[gtid])) {
3870 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3871 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3872 }
3873
3874#if KMP_STATS_ENABLED
3875 // Initialize stats as soon as possible (right after gtid assignment).
3876 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3877 __kmp_stats_thread_ptr->startLife();
3878 KMP_SET_THREAD_STATE(SERIAL_REGION);
3879 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3880#endif
3881 __kmp_initialize_root(root);
3882
3883 /* setup new root thread structure */
3884 if (root->r.r_uber_thread) {
3885 root_thread = root->r.r_uber_thread;
3886 } else {
3887 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3888 if (__kmp_storage_map) {
3889 __kmp_print_thread_storage_map(root_thread, gtid);
3890 }
3891 root_thread->th.th_info.ds.ds_gtid = gtid;
3892#if OMPT_SUPPORT
3893 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3894#endif
3895 root_thread->th.th_root = root;
3896 if (__kmp_env_consistency_check) {
3897 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3898 }
3899#if USE_FAST_MEMORY
3900 __kmp_initialize_fast_memory(root_thread);
3901#endif /* USE_FAST_MEMORY */
3902
3903#if KMP_USE_BGET
3904 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3905 __kmp_initialize_bget(root_thread);
3906#endif
3907 __kmp_init_random(root_thread); // Initialize random number generator
3908 }
3909
3910 /* setup the serial team held in reserve by the root thread */
3911 if (!root_thread->th.th_serial_team) {
3912 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3913 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3914 root_thread->th.th_serial_team = __kmp_allocate_team(
3915 root, 1, 1,
3916#if OMPT_SUPPORT
3917 ompt_data_none, // root parallel id
3918#endif
3919 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3920 }
3921 KMP_ASSERT(root_thread->th.th_serial_team);
3922 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3923 root_thread->th.th_serial_team));
3924
3925 /* drop root_thread into place */
3926 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3927
3928 root->r.r_root_team->t.t_threads[0] = root_thread;
3929 root->r.r_hot_team->t.t_threads[0] = root_thread;
3930 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3931 // AC: the team created in reserve, not for execution (it is unused for now).
3932 root_thread->th.th_serial_team->t.t_serialized = 0;
3933 root->r.r_uber_thread = root_thread;
3934
3935 /* initialize the thread, get it ready to go */
3936 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3937 TCW_4(__kmp_init_gtid, TRUE);
3938
3939 /* prepare the primary thread for get_gtid() */
3940 __kmp_gtid_set_specific(gtid);
3941
3942#if USE_ITT_BUILD
3943 __kmp_itt_thread_name(gtid);
3944#endif /* USE_ITT_BUILD */
3945
3946#ifdef KMP_TDATA_GTID
3947 __kmp_gtid = gtid;
3948#endif
3949 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3950 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3951
3952 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3953 "plain=%u\n",
3954 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3955 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3956 KMP_INIT_BARRIER_STATE));
3957 { // Initialize barrier data.
3958 int b;
3959 for (b = 0; b < bs_last_barrier; ++b) {
3960 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3961#if USE_DEBUGGER
3962 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3963#endif
3964 }
3965 }
3966 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3967 KMP_INIT_BARRIER_STATE);
3968
3969#if KMP_AFFINITY_SUPPORTED
3970 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3971 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3972 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3973 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3974#endif /* KMP_AFFINITY_SUPPORTED */
3975 root_thread->th.th_def_allocator = __kmp_def_allocator;
3976 root_thread->th.th_prev_level = 0;
3977 root_thread->th.th_prev_num_threads = 1;
3978
3979 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3980 tmp->cg_root = root_thread;
3981 tmp->cg_thread_limit = __kmp_cg_max_nth;
3982 tmp->cg_nthreads = 1;
3983 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3984 " cg_nthreads init to 1\n",
3985 root_thread, tmp));
3986 tmp->up = NULL;
3987 root_thread->th.th_cg_roots = tmp;
3988
3989 __kmp_root_counter++;
3990
3991#if OMPT_SUPPORT
3992 if (!initial_thread && ompt_enabled.enabled) {
3993
3994 kmp_info_t *root_thread = ompt_get_thread();
3995
3996 ompt_set_thread_state(root_thread, ompt_state_overhead);
3997
3998 if (ompt_enabled.ompt_callback_thread_begin) {
3999 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
4000 ompt_thread_initial, __ompt_get_thread_data_internal());
4001 }
4002 ompt_data_t *task_data;
4003 ompt_data_t *parallel_data;
4004 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4005 NULL);
4006 if (ompt_enabled.ompt_callback_implicit_task) {
4007 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4008 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
4009 }
4010
4011 ompt_set_thread_state(root_thread, ompt_state_work_serial);
4012 }
4013#endif
4014#if OMPD_SUPPORT
4015 if (ompd_state & OMPD_ENABLE_BP)
4016 ompd_bp_thread_begin();
4017#endif
4018
4019 KMP_MB();
4020 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4021
4022 return gtid;
4023}
4024
4025#if KMP_NESTED_HOT_TEAMS
4026static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
4027 const int max_level) {
4028 int i, n, nth;
4029 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
4030 if (!hot_teams || !hot_teams[level].hot_team) {
4031 return 0;
4032 }
4033 KMP_DEBUG_ASSERT(level < max_level);
4034 kmp_team_t *team = hot_teams[level].hot_team;
4035 nth = hot_teams[level].hot_team_nth;
4036 n = nth - 1; // primary thread is not freed
4037 if (level < max_level - 1) {
4038 for (i = 0; i < nth; ++i) {
4039 kmp_info_t *th = team->t.t_threads[i];
4040 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
4041 if (i > 0 && th->th.th_hot_teams) {
4042 __kmp_free(th->th.th_hot_teams);
4043 th->th.th_hot_teams = NULL;
4044 }
4045 }
4046 }
4047 __kmp_free_team(root, team, NULL);
4048 return n;
4049}
4050#endif
4051
4052 // Resets a root thread and clears its root and hot teams.
4053// Returns the number of __kmp_threads entries directly and indirectly freed.
4054static int __kmp_reset_root(int gtid, kmp_root_t *root) {
4055 kmp_team_t *root_team = root->r.r_root_team;
4056 kmp_team_t *hot_team = root->r.r_hot_team;
4057 int n = hot_team->t.t_nproc;
4058 int i;
4059
4060 KMP_DEBUG_ASSERT(!root->r.r_active);
4061
4062 root->r.r_root_team = NULL;
4063 root->r.r_hot_team = NULL;
4064 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
4065 // before call to __kmp_free_team().
4066 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
4067#if KMP_NESTED_HOT_TEAMS
4068 if (__kmp_hot_teams_max_level >
4069 0) { // need to free nested hot teams and their threads if any
4070 for (i = 0; i < hot_team->t.t_nproc; ++i) {
4071 kmp_info_t *th = hot_team->t.t_threads[i];
4072 if (__kmp_hot_teams_max_level > 1) {
4073 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
4074 }
4075 if (th->th.th_hot_teams) {
4076 __kmp_free(th->th.th_hot_teams);
4077 th->th.th_hot_teams = NULL;
4078 }
4079 }
4080 }
4081#endif
4082 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
4083
4084 // Before we can reap the thread, we need to make certain that all other
4085 // threads in the teams that had this root as ancestor have stopped trying to
4086 // steal tasks.
4087 if (__kmp_tasking_mode != tskm_immediate_exec) {
4088 __kmp_wait_to_unref_task_teams();
4089 }
4090
4091#if KMP_OS_WINDOWS
4092 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4093 KA_TRACE(
4094 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
4095 "\n",
4096 (LPVOID) & (root->r.r_uber_thread->th),
4097 root->r.r_uber_thread->th.th_info.ds.ds_thread));
4098 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
4099#endif /* KMP_OS_WINDOWS */
4100
4101#if OMPD_SUPPORT
4102 if (ompd_state & OMPD_ENABLE_BP)
4103 ompd_bp_thread_end();
4104#endif
4105
4106#if OMPT_SUPPORT
4107 ompt_data_t *task_data;
4108 ompt_data_t *parallel_data;
4109 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
4110 NULL);
4111 if (ompt_enabled.ompt_callback_implicit_task) {
4112 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
4113 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
4114 }
4115 if (ompt_enabled.ompt_callback_thread_end) {
4116 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
4117 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
4118 }
4119#endif
4120
4121 TCW_4(__kmp_nth,
4122 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4123 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4124 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4125 " to %d\n",
4126 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4127 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4128 if (i == 1) {
4129 // need to free contention group structure
4130 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4131 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4132 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4133 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4134 root->r.r_uber_thread->th.th_cg_roots = NULL;
4135 }
4136 __kmp_reap_thread(root->r.r_uber_thread, 1);
4137
4138   // We cannot put the root thread into __kmp_thread_pool, so we have to reap
4139   // it instead of freeing it.
4140 root->r.r_uber_thread = NULL;
4141 /* mark root as no longer in use */
4142 root->r.r_begin = FALSE;
4143
4144 return n;
4145}
4146
4147void __kmp_unregister_root_current_thread(int gtid) {
4148 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
4149 /* this lock should be ok, since unregister_root_current_thread is never
4150 called during an abort, only during a normal close. furthermore, if you
4151 have the forkjoin lock, you should never try to get the initz lock */
4152 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
4153 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
4154 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
4155 "exiting T#%d\n",
4156 gtid));
4157 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4158 return;
4159 }
4160 kmp_root_t *root = __kmp_root[gtid];
4161
4162 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4163 KMP_ASSERT(KMP_UBER_GTID(gtid));
4164 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4165 KMP_ASSERT(root->r.r_active == FALSE);
4166
4167 KMP_MB();
4168
4169 kmp_info_t *thread = __kmp_threads[gtid];
4170 kmp_team_t *team = thread->th.th_team;
4171 kmp_task_team_t *task_team = thread->th.th_task_team;
4172
4173 // we need to wait for the proxy tasks before finishing the thread
4174 if (task_team != NULL && (task_team->tt.tt_found_proxy_tasks ||
4175 task_team->tt.tt_hidden_helper_task_encountered)) {
4176#if OMPT_SUPPORT
4177 // the runtime is shutting down so we won't report any events
4178 thread->th.ompt_thread_info.state = ompt_state_undefined;
4179#endif
4180 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
4181 }
4182
4183 __kmp_reset_root(gtid, root);
4184
4185 KMP_MB();
4186 KC_TRACE(10,
4187 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
4188
4189 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
4190}
4191
4192#if KMP_OS_WINDOWS
4193/* __kmp_forkjoin_lock must be already held
4194 Unregisters a root thread that is not the current thread. Returns the number
4195 of __kmp_threads entries freed as a result. */
4196static int __kmp_unregister_root_other_thread(int gtid) {
4197 kmp_root_t *root = __kmp_root[gtid];
4198 int r;
4199
4200 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4201 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4202 KMP_ASSERT(KMP_UBER_GTID(gtid));
4203 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4204 KMP_ASSERT(root->r.r_active == FALSE);
4205
4206 r = __kmp_reset_root(gtid, root);
4207 KC_TRACE(10,
4208 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4209 return r;
4210}
4211#endif
4212
4213#if KMP_DEBUG
4214void __kmp_task_info() {
4215
4216 kmp_int32 gtid = __kmp_entry_gtid();
4217 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4218 kmp_info_t *this_thr = __kmp_threads[gtid];
4219 kmp_team_t *steam = this_thr->th.th_serial_team;
4220 kmp_team_t *team = this_thr->th.th_team;
4221
4222 __kmp_printf(
4223 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4224 "ptask=%p\n",
4225 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4226 team->t.t_implicit_task_taskdata[tid].td_parent);
4227}
4228#endif // KMP_DEBUG
4229
4230/* TODO optimize with one big memclr, take out what isn't needed, split
4231 responsibility to workers as much as possible, and delay initialization of
4232 features as much as possible */
4233static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4234 int tid, int gtid) {
4235 /* this_thr->th.th_info.ds.ds_gtid is setup in
4236 kmp_allocate_thread/create_worker.
4237 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4238 KMP_DEBUG_ASSERT(this_thr != NULL);
4239 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4240 KMP_DEBUG_ASSERT(team);
4241 KMP_DEBUG_ASSERT(team->t.t_threads);
4242 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4243 kmp_info_t *master = team->t.t_threads[0];
4244 KMP_DEBUG_ASSERT(master);
4245 KMP_DEBUG_ASSERT(master->th.th_root);
4246
4247 KMP_MB();
4248
4249 TCW_SYNC_PTR(this_thr->th.th_team, team);
4250
4251 this_thr->th.th_info.ds.ds_tid = tid;
4252 this_thr->th.th_set_nproc = 0;
4253 if (__kmp_tasking_mode != tskm_immediate_exec)
4254 // When tasking is possible, threads are not safe to reap until they are
4255 // done tasking; this will be set when tasking code is exited in wait
4256 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4257 else // no tasking --> always safe to reap
4258 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4259 this_thr->th.th_set_proc_bind = proc_bind_default;
4260#if KMP_AFFINITY_SUPPORTED
4261 this_thr->th.th_new_place = this_thr->th.th_current_place;
4262#endif
4263 this_thr->th.th_root = master->th.th_root;
4264
4265 /* setup the thread's cache of the team structure */
4266 this_thr->th.th_team_nproc = team->t.t_nproc;
4267 this_thr->th.th_team_master = master;
4268 this_thr->th.th_team_serialized = team->t.t_serialized;
4269
4270 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4271
4272 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4273 tid, gtid, this_thr, this_thr->th.th_current_task));
4274
4275 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4276 team, tid, TRUE);
4277
4278 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4279 tid, gtid, this_thr, this_thr->th.th_current_task));
4280 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4281 // __kmp_initialize_team()?
4282
4283 /* TODO no worksharing in speculative threads */
4284 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4285
4286 this_thr->th.th_local.this_construct = 0;
4287
4288 if (!this_thr->th.th_pri_common) {
4289 this_thr->th.th_pri_common =
4290 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4291 if (__kmp_storage_map) {
4292 __kmp_print_storage_map_gtid(
4293 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4294 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4295 }
4296 this_thr->th.th_pri_head = NULL;
4297 }
4298
4299 if (this_thr != master && // Primary thread's CG root is initialized elsewhere
4300 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4301 // Make new thread's CG root same as primary thread's
4302 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4303 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4304 if (tmp) {
4305 // worker changes CG, need to check if old CG should be freed
4306 int i = tmp->cg_nthreads--;
4307 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4308 " on node %p of thread %p to %d\n",
4309 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4310 if (i == 1) {
4311 __kmp_free(tmp); // last thread left CG --> free it
4312 }
4313 }
4314 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4315 // Increment new thread's CG root's counter to add the new thread
4316 this_thr->th.th_cg_roots->cg_nthreads++;
4317 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4318 " node %p of thread %p to %d\n",
4319 this_thr, this_thr->th.th_cg_roots,
4320 this_thr->th.th_cg_roots->cg_root,
4321 this_thr->th.th_cg_roots->cg_nthreads));
4322 this_thr->th.th_current_task->td_icvs.thread_limit =
4323 this_thr->th.th_cg_roots->cg_thread_limit;
4324 }
4325
4326 /* Initialize dynamic dispatch */
4327 {
4328 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4329 // Use team max_nproc since this will never change for the team.
4330 size_t disp_size =
4331 sizeof(dispatch_private_info_t) *
4332 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4333 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4334 team->t.t_max_nproc));
4335 KMP_ASSERT(dispatch);
4336 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4337 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4338
4339 dispatch->th_disp_index = 0;
4340 dispatch->th_doacross_buf_idx = 0;
4341 if (!dispatch->th_disp_buffer) {
4342 dispatch->th_disp_buffer =
4343 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4344
4345 if (__kmp_storage_map) {
4346 __kmp_print_storage_map_gtid(
4347 gtid, &dispatch->th_disp_buffer[0],
4348 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4349 ? 1
4350 : __kmp_dispatch_num_buffers],
4351 disp_size,
4352 "th_%d.th_dispatch.th_disp_buffer "
4353 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4354 gtid, team->t.t_id, gtid);
4355 }
4356 } else {
4357 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4358 }
4359
4360 dispatch->th_dispatch_pr_current = 0;
4361 dispatch->th_dispatch_sh_current = 0;
4362
4363 dispatch->th_deo_fcn = 0; /* ORDERED */
4364 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4365 }
4366
4367 this_thr->th.th_next_pool = NULL;
4368
4369 if (!this_thr->th.th_task_state_memo_stack) {
4370 size_t i;
4371 this_thr->th.th_task_state_memo_stack =
4372 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4373 this_thr->th.th_task_state_top = 0;
4374 this_thr->th.th_task_state_stack_sz = 4;
4375 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4376 ++i) // zero init the stack
4377 this_thr->th.th_task_state_memo_stack[i] = 0;
4378 }
4379
4380 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4381 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4382
4383 KMP_MB();
4384}
4385
4386/* allocate a new thread for the requesting team. this is only called from
4387 within a forkjoin critical section. we will first try to get an available
4388 thread from the thread pool. if none is available, we will fork a new one
4389 assuming we are able to create a new one. this should be assured, as the
4390 caller should check on this first. */
4391kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4392 int new_tid) {
4393 kmp_team_t *serial_team;
4394 kmp_info_t *new_thr;
4395 int new_gtid;
4396
4397 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4398 KMP_DEBUG_ASSERT(root && team);
4399#if !KMP_NESTED_HOT_TEAMS
4400 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4401#endif
4402 KMP_MB();
4403
4404 /* first, try to get one from the thread pool */
4405 if (__kmp_thread_pool) {
4406 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4407 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4408 if (new_thr == __kmp_thread_pool_insert_pt) {
4409 __kmp_thread_pool_insert_pt = NULL;
4410 }
4411 TCW_4(new_thr->th.th_in_pool, FALSE);
4412 __kmp_suspend_initialize_thread(new_thr);
4413 __kmp_lock_suspend_mx(new_thr);
4414 if (new_thr->th.th_active_in_pool == TRUE) {
4415 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4416 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4417 new_thr->th.th_active_in_pool = FALSE;
4418 }
4419 __kmp_unlock_suspend_mx(new_thr);
4420
4421 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4422 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4423 KMP_ASSERT(!new_thr->th.th_team);
4424 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4425
4426 /* setup the thread structure */
4427 __kmp_initialize_info(new_thr, team, new_tid,
4428 new_thr->th.th_info.ds.ds_gtid);
4429 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4430
4431 TCW_4(__kmp_nth, __kmp_nth + 1);
4432
4433 new_thr->th.th_task_state = 0;
4434 new_thr->th.th_task_state_top = 0;
4435 new_thr->th.th_task_state_stack_sz = 4;
4436
4437 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
4438 // Make sure pool thread has transitioned to waiting on own thread struct
4439 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0);
4440 // Thread activated in __kmp_allocate_team when increasing team size
4441 }
4442
4443#ifdef KMP_ADJUST_BLOCKTIME
4444 /* Adjust blocktime back to zero if necessary */
4445 /* Middle initialization might not have occurred yet */
4446 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4447 if (__kmp_nth > __kmp_avail_proc) {
4448 __kmp_zero_bt = TRUE;
4449 }
4450 }
4451#endif /* KMP_ADJUST_BLOCKTIME */
4452
4453#if KMP_DEBUG
4454 // If thread entered pool via __kmp_free_thread, wait_flag should !=
4455 // KMP_BARRIER_PARENT_FLAG.
4456 int b;
4457 kmp_balign_t *balign = new_thr->th.th_bar;
4458 for (b = 0; b < bs_last_barrier; ++b)
4459 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4460#endif
4461
4462 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4463 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4464
4465 KMP_MB();
4466 return new_thr;
4467 }
4468
4469   /* no, we'll fork a new one */
4470 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4471 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4472
4473#if KMP_USE_MONITOR
4474 // If this is the first worker thread the RTL is creating, then also
4475 // launch the monitor thread. We try to do this as early as possible.
4476 if (!TCR_4(__kmp_init_monitor)) {
4477 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4478 if (!TCR_4(__kmp_init_monitor)) {
4479 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4480 TCW_4(__kmp_init_monitor, 1);
4481 __kmp_create_monitor(&__kmp_monitor);
4482 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4483#if KMP_OS_WINDOWS
4484 // AC: wait until monitor has started. This is a fix for CQ232808.
4485 // The reason is that if the library is loaded/unloaded in a loop with
4486       // small (parallel) work in between, then there is a high probability that
4487       // the monitor thread starts after the library shutdown. At shutdown it is
4488 // too late to cope with the problem, because when the primary thread is
4489 // in DllMain (process detach) the monitor has no chances to start (it is
4490 // blocked), and primary thread has no means to inform the monitor that
4491 // the library has gone, because all the memory which the monitor can
4492 // access is going to be released/reset.
4493 while (TCR_4(__kmp_init_monitor) < 2) {
4494 KMP_YIELD(TRUE);
4495 }
4496 KF_TRACE(10, ("after monitor thread has started\n"));
4497#endif
4498 }
4499 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4500 }
4501#endif
4502
4503 KMP_MB();
4504
4505 {
4506 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4507 ? 1
4508 : __kmp_hidden_helper_threads_num + 1;
4509
4510 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4511 ++new_gtid) {
4512 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4513 }
4514
4515 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4516 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4517 }
4518 }
4519
4520 /* allocate space for it. */
4521 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4522
4523 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4524
4525#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4526   // suppress race condition detection on synchronization flags in debug mode;
4527   // this helps to analyze library internals by eliminating false positives
4528 __itt_suppress_mark_range(
4529 __itt_suppress_range, __itt_suppress_threading_errors,
4530 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4531 __itt_suppress_mark_range(
4532 __itt_suppress_range, __itt_suppress_threading_errors,
4533 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4534#if KMP_OS_WINDOWS
4535 __itt_suppress_mark_range(
4536 __itt_suppress_range, __itt_suppress_threading_errors,
4537 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4538#else
4539 __itt_suppress_mark_range(__itt_suppress_range,
4540 __itt_suppress_threading_errors,
4541 &new_thr->th.th_suspend_init_count,
4542 sizeof(new_thr->th.th_suspend_init_count));
4543#endif
4544 // TODO: check if we need to also suppress b_arrived flags
4545 __itt_suppress_mark_range(__itt_suppress_range,
4546 __itt_suppress_threading_errors,
4547 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4548 sizeof(new_thr->th.th_bar[0].bb.b_go));
4549 __itt_suppress_mark_range(__itt_suppress_range,
4550 __itt_suppress_threading_errors,
4551 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4552 sizeof(new_thr->th.th_bar[1].bb.b_go));
4553 __itt_suppress_mark_range(__itt_suppress_range,
4554 __itt_suppress_threading_errors,
4555 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4556 sizeof(new_thr->th.th_bar[2].bb.b_go));
4557#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4558 if (__kmp_storage_map) {
4559 __kmp_print_thread_storage_map(new_thr, new_gtid);
4560 }
4561
4562 // add the reserve serialized team, initialized from the team's primary thread
4563 {
4564 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4565 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4566 new_thr->th.th_serial_team = serial_team =
4567 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4568#if OMPT_SUPPORT
4569 ompt_data_none, // root parallel id
4570#endif
4571 proc_bind_default, &r_icvs,
4572 0 USE_NESTED_HOT_ARG(NULL));
4573 }
4574 KMP_ASSERT(serial_team);
4575 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4576 // execution (it is unused for now).
4577 serial_team->t.t_threads[0] = new_thr;
4578 KF_TRACE(10,
4579 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4580 new_thr));
4581
4582 /* setup the thread structures */
4583 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4584
4585#if USE_FAST_MEMORY
4586 __kmp_initialize_fast_memory(new_thr);
4587#endif /* USE_FAST_MEMORY */
4588
4589#if KMP_USE_BGET
4590 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4591 __kmp_initialize_bget(new_thr);
4592#endif
4593
4594 __kmp_init_random(new_thr); // Initialize random number generator
4595
4596 /* Initialize these only once when thread is grabbed for a team allocation */
4597 KA_TRACE(20,
4598 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4599 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4600
4601 int b;
4602 kmp_balign_t *balign = new_thr->th.th_bar;
4603 for (b = 0; b < bs_last_barrier; ++b) {
4604 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4605 balign[b].bb.team = NULL;
4606 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4607 balign[b].bb.use_oncore_barrier = 0;
4608 }
4609
4610 TCW_PTR(new_thr->th.th_sleep_loc, NULL);
4611 new_thr->th.th_sleep_loc_type = flag_unset;
4612
4613 new_thr->th.th_spin_here = FALSE;
4614 new_thr->th.th_next_waiting = 0;
4615#if KMP_OS_UNIX
4616 new_thr->th.th_blocking = false;
4617#endif
4618
4619#if KMP_AFFINITY_SUPPORTED
4620 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4621 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4622 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4623 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4624#endif
4625 new_thr->th.th_def_allocator = __kmp_def_allocator;
4626 new_thr->th.th_prev_level = 0;
4627 new_thr->th.th_prev_num_threads = 1;
4628
4629 TCW_4(new_thr->th.th_in_pool, FALSE);
4630 new_thr->th.th_active_in_pool = FALSE;
4631 TCW_4(new_thr->th.th_active, TRUE);
4632
4633 /* adjust the global counters */
4634 __kmp_all_nth++;
4635 __kmp_nth++;
4636
4637 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4638 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4639 if (__kmp_adjust_gtid_mode) {
4640 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4641 if (TCR_4(__kmp_gtid_mode) != 2) {
4642 TCW_4(__kmp_gtid_mode, 2);
4643 }
4644 } else {
4645 if (TCR_4(__kmp_gtid_mode) != 1) {
4646 TCW_4(__kmp_gtid_mode, 1);
4647 }
4648 }
4649 }
4650
4651#ifdef KMP_ADJUST_BLOCKTIME
4652 /* Adjust blocktime back to zero if necessary */
4653 /* Middle initialization might not have occurred yet */
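  /* If the user did not set a blocktime and the process is now oversubscribed
     (more OpenMP threads than available processors), force blocktime to zero
     so that idle threads sleep instead of spinning. */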
4654 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4655 if (__kmp_nth > __kmp_avail_proc) {
4656 __kmp_zero_bt = TRUE;
4657 }
4658 }
4659#endif /* KMP_ADJUST_BLOCKTIME */
4660
4661 /* actually fork it and create the new worker thread */
4662 KF_TRACE(
4663 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4664 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4665 KF_TRACE(10,
4666 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4667
4668 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4669 new_gtid));
4670 KMP_MB();
4671 return new_thr;
4672}
4673
/* Reinitialize team for reuse.
   The hot team code calls this routine at every fork barrier, so the EPCC
   barrier tests are extremely sensitive to changes in it, especially writes
   to the team struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
4679static void __kmp_reinitialize_team(kmp_team_t *team,
4680 kmp_internal_control_t *new_icvs,
4681 ident_t *loc) {
4682 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4683 team->t.t_threads[0], team));
4684 KMP_DEBUG_ASSERT(team && new_icvs);
4685 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4686 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4687
4688 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4689 // Copy ICVs to the primary thread's implicit taskdata
4690 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4691 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4692
4693 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4694 team->t.t_threads[0], team));
4695}
4696
4697/* Initialize the team data structure.
4698 This assumes the t_threads and t_max_nproc are already set.
4699 Also, we don't touch the arguments */
4700static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4701 kmp_internal_control_t *new_icvs,
4702 ident_t *loc) {
4703 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4704
4705 /* verify */
4706 KMP_DEBUG_ASSERT(team);
4707 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4708 KMP_DEBUG_ASSERT(team->t.t_threads);
4709 KMP_MB();
4710
4711 team->t.t_master_tid = 0; /* not needed */
4712 /* team->t.t_master_bar; not needed */
4713 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4714 team->t.t_nproc = new_nproc;
4715
4716 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4717 team->t.t_next_pool = NULL;
4718 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4719 * up hot team */
4720
4721 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4722 team->t.t_invoke = NULL; /* not needed */
4723
4724 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4725 team->t.t_sched.sched = new_icvs->sched.sched;
4726
4727#if KMP_ARCH_X86 || KMP_ARCH_X86_64
4728 team->t.t_fp_control_saved = FALSE; /* not needed */
4729 team->t.t_x87_fpu_control_word = 0; /* not needed */
4730 team->t.t_mxcsr = 0; /* not needed */
4731#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4732
4733 team->t.t_construct = 0;
4734
4735 team->t.t_ordered.dt.t_value = 0;
4736 team->t.t_master_active = FALSE;
4737
4738#ifdef KMP_DEBUG
4739 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4740#endif
4741#if KMP_OS_WINDOWS
4742 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4743#endif
4744
4745 team->t.t_control_stack_top = NULL;
4746
4747 __kmp_reinitialize_team(team, new_icvs, loc);
4748
4749 KMP_MB();
4750 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4751}
4752
4753#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
/* Sets the full mask for the thread and stores the previous mask in *old_mask
   (if non-NULL); no changes to internal structures. */
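/* Used, e.g., when growing the hot team in __kmp_allocate_team: workers
   inherit the primary thread's affinity mask at creation time, so the primary
   thread temporarily widens its mask to the full mask while forking them. */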
4755static void
4756__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4757 if (KMP_AFFINITY_CAPABLE()) {
4758 int status;
4759 if (old_mask != NULL) {
4760 status = __kmp_get_system_affinity(old_mask, TRUE);
4761 int error = errno;
4762 if (status != 0) {
4763 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4764 __kmp_msg_null);
4765 }
4766 }
4767 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4768 }
4769}
4770#endif
4771
4772#if KMP_AFFINITY_SUPPORTED
4773
4774// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4775// It calculates the worker + primary thread's partition based upon the parent
4776// thread's partition, and binds each worker to a thread in their partition.
4777// The primary thread's partition should already include its current binding.
4778static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4779 // Do not partition places for the hidden helper team
4780 if (KMP_HIDDEN_HELPER_TEAM(team))
4781 return;
4782 // Copy the primary thread's place partition to the team struct
4783 kmp_info_t *master_th = team->t.t_threads[0];
4784 KMP_DEBUG_ASSERT(master_th != NULL);
4785 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4786 int first_place = master_th->th.th_first_place;
4787 int last_place = master_th->th.th_last_place;
4788 int masters_place = master_th->th.th_current_place;
4789 int num_masks = __kmp_affinity.num_masks;
4790 team->t.t_first_place = first_place;
4791 team->t.t_last_place = last_place;
4792
4793 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4794 "bound to place %d partition = [%d,%d]\n",
4795 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4796 team->t.t_id, masters_place, first_place, last_place));
4797
4798 switch (proc_bind) {
4799
4800 case proc_bind_default:
4801 // Serial teams might have the proc_bind policy set to proc_bind_default.
4802 // Not an issue -- we don't rebind primary thread for any proc_bind policy.
4803 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4804 break;
4805
4806 case proc_bind_primary: {
4807 int f;
4808 int n_th = team->t.t_nproc;
4809 for (f = 1; f < n_th; f++) {
4810 kmp_info_t *th = team->t.t_threads[f];
4811 KMP_DEBUG_ASSERT(th != NULL);
4812 th->th.th_first_place = first_place;
4813 th->th.th_last_place = last_place;
4814 th->th.th_new_place = masters_place;
4815 if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4816 team->t.t_display_affinity != 1) {
4817 team->t.t_display_affinity = 1;
4818 }
4819
4820 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d "
4821 "partition = [%d,%d]\n",
4822 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4823 f, masters_place, first_place, last_place));
4824 }
4825 } break;
4826
4827 case proc_bind_close: {
4828 int f;
4829 int n_th = team->t.t_nproc;
4830 int n_places;
4831 if (first_place <= last_place) {
4832 n_places = last_place - first_place + 1;
4833 } else {
4834 n_places = num_masks - first_place + last_place + 1;
4835 }
4836 if (n_th <= n_places) {
4837 int place = masters_place;
4838 for (f = 1; f < n_th; f++) {
4839 kmp_info_t *th = team->t.t_threads[f];
4840 KMP_DEBUG_ASSERT(th != NULL);
4841
4842 if (place == last_place) {
4843 place = first_place;
4844 } else if (place == (num_masks - 1)) {
4845 place = 0;
4846 } else {
4847 place++;
4848 }
4849 th->th.th_first_place = first_place;
4850 th->th.th_last_place = last_place;
4851 th->th.th_new_place = place;
4852 if (__kmp_display_affinity && place != th->th.th_current_place &&
4853 team->t.t_display_affinity != 1) {
4854 team->t.t_display_affinity = 1;
4855 }
4856
4857 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4858 "partition = [%d,%d]\n",
4859 __kmp_gtid_from_thread(team->t.t_threads[f]),
4860 team->t.t_id, f, place, first_place, last_place));
4861 }
4862 } else {
4863 int S, rem, gap, s_count;
4864 S = n_th / n_places;
4865 s_count = 0;
4866 rem = n_th - (S * n_places);
4867 gap = rem > 0 ? n_places / rem : n_places;
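      // Distribution sketch: each place receives S threads, and while rem > 0
      // every gap-th place absorbs one extra thread. E.g., n_th = 10 threads
      // over n_places = 4 places give S = 2, rem = 2, gap = 2, so the places
      // receive 3, 2, 3, 2 threads respectively.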
4868 int place = masters_place;
4869 int gap_ct = gap;
4870 for (f = 0; f < n_th; f++) {
4871 kmp_info_t *th = team->t.t_threads[f];
4872 KMP_DEBUG_ASSERT(th != NULL);
4873
4874 th->th.th_first_place = first_place;
4875 th->th.th_last_place = last_place;
4876 th->th.th_new_place = place;
4877 if (__kmp_display_affinity && place != th->th.th_current_place &&
4878 team->t.t_display_affinity != 1) {
4879 team->t.t_display_affinity = 1;
4880 }
4881 s_count++;
4882
4883 if ((s_count == S) && rem && (gap_ct == gap)) {
4884 // do nothing, add an extra thread to place on next iteration
4885 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4886 // we added an extra thread to this place; move to next place
4887 if (place == last_place) {
4888 place = first_place;
4889 } else if (place == (num_masks - 1)) {
4890 place = 0;
4891 } else {
4892 place++;
4893 }
4894 s_count = 0;
4895 gap_ct = 1;
4896 rem--;
4897 } else if (s_count == S) { // place full; don't add extra
4898 if (place == last_place) {
4899 place = first_place;
4900 } else if (place == (num_masks - 1)) {
4901 place = 0;
4902 } else {
4903 place++;
4904 }
4905 gap_ct++;
4906 s_count = 0;
4907 }
4908
4909 KA_TRACE(100,
4910 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4911 "partition = [%d,%d]\n",
4912 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4913 th->th.th_new_place, first_place, last_place));
4914 }
4915 KMP_DEBUG_ASSERT(place == masters_place);
4916 }
4917 } break;
4918
4919 case proc_bind_spread: {
4920 int f;
4921 int n_th = team->t.t_nproc;
4922 int n_places;
4923 int thidx;
4924 if (first_place <= last_place) {
4925 n_places = last_place - first_place + 1;
4926 } else {
4927 n_places = num_masks - first_place + last_place + 1;
4928 }
4929 if (n_th <= n_places) {
4930 int place = -1;
4931
4932 if (n_places != num_masks) {
4933 int S = n_places / n_th;
4934 int s_count, rem, gap, gap_ct;
4935
4936 place = masters_place;
4937 rem = n_places - n_th * S;
4938 gap = rem ? n_th / rem : 1;
4939 gap_ct = gap;
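        // Distribution sketch: each thread gets a sub-partition of S
        // consecutive places (and is bound to its first place); while rem > 0,
        // every gap-th thread absorbs one extra place. E.g., n_places = 10 and
        // n_th = 4 give S = 2, rem = 2, gap = 2 and sub-partition sizes of
        // 3, 2, 3, 2 places.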
4940 thidx = n_th;
4941 if (update_master_only == 1)
4942 thidx = 1;
4943 for (f = 0; f < thidx; f++) {
4944 kmp_info_t *th = team->t.t_threads[f];
4945 KMP_DEBUG_ASSERT(th != NULL);
4946
4947 th->th.th_first_place = place;
4948 th->th.th_new_place = place;
4949 if (__kmp_display_affinity && place != th->th.th_current_place &&
4950 team->t.t_display_affinity != 1) {
4951 team->t.t_display_affinity = 1;
4952 }
4953 s_count = 1;
4954 while (s_count < S) {
4955 if (place == last_place) {
4956 place = first_place;
4957 } else if (place == (num_masks - 1)) {
4958 place = 0;
4959 } else {
4960 place++;
4961 }
4962 s_count++;
4963 }
4964 if (rem && (gap_ct == gap)) {
4965 if (place == last_place) {
4966 place = first_place;
4967 } else if (place == (num_masks - 1)) {
4968 place = 0;
4969 } else {
4970 place++;
4971 }
4972 rem--;
4973 gap_ct = 0;
4974 }
4975 th->th.th_last_place = place;
4976 gap_ct++;
4977
4978 if (place == last_place) {
4979 place = first_place;
4980 } else if (place == (num_masks - 1)) {
4981 place = 0;
4982 } else {
4983 place++;
4984 }
4985
4986 KA_TRACE(100,
4987 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4988 "partition = [%d,%d], num_masks: %u\n",
4989 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4990 f, th->th.th_new_place, th->th.th_first_place,
4991 th->th.th_last_place, num_masks));
4992 }
4993 } else {
        /* With a uniform space of available computation places, we can create
           T partitions of roughly P/T places each and put each thread into
           the first place of its partition. */
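        /* Illustrative example: with masters_place = 0, n_places = 8 and
           n_th = 3, spacing = (8 + 1) / 3 = 3.0, giving partitions [0,2],
           [3,5] and [6,7] (the last is clipped to n_places - 1) with the
           threads bound to places 0, 3 and 6. */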
4997 double current = static_cast<double>(masters_place);
4998 double spacing =
4999 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
5000 int first, last;
5001 kmp_info_t *th;
5002
5003 thidx = n_th + 1;
5004 if (update_master_only == 1)
5005 thidx = 1;
5006 for (f = 0; f < thidx; f++) {
5007 first = static_cast<int>(current);
5008 last = static_cast<int>(current + spacing) - 1;
5009 KMP_DEBUG_ASSERT(last >= first);
5010 if (first >= n_places) {
5011 if (masters_place) {
5012 first -= n_places;
5013 last -= n_places;
5014 if (first == (masters_place + 1)) {
5015 KMP_DEBUG_ASSERT(f == n_th);
5016 first--;
5017 }
5018 if (last == masters_place) {
5019 KMP_DEBUG_ASSERT(f == (n_th - 1));
5020 last--;
5021 }
5022 } else {
5023 KMP_DEBUG_ASSERT(f == n_th);
5024 first = 0;
5025 last = 0;
5026 }
5027 }
5028 if (last >= n_places) {
5029 last = (n_places - 1);
5030 }
5031 place = first;
5032 current += spacing;
5033 if (f < n_th) {
5034 KMP_DEBUG_ASSERT(0 <= first);
5035 KMP_DEBUG_ASSERT(n_places > first);
5036 KMP_DEBUG_ASSERT(0 <= last);
5037 KMP_DEBUG_ASSERT(n_places > last);
5038 KMP_DEBUG_ASSERT(last_place >= first_place);
5039 th = team->t.t_threads[f];
5040 KMP_DEBUG_ASSERT(th);
5041 th->th.th_first_place = first;
5042 th->th.th_new_place = place;
5043 th->th.th_last_place = last;
5044 if (__kmp_display_affinity && place != th->th.th_current_place &&
5045 team->t.t_display_affinity != 1) {
5046 team->t.t_display_affinity = 1;
5047 }
5048 KA_TRACE(100,
5049 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5050 "partition = [%d,%d], spacing = %.4f\n",
5051 __kmp_gtid_from_thread(team->t.t_threads[f]),
5052 team->t.t_id, f, th->th.th_new_place,
5053 th->th.th_first_place, th->th.th_last_place, spacing));
5054 }
5055 }
5056 }
5057 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5058 } else {
5059 int S, rem, gap, s_count;
5060 S = n_th / n_places;
5061 s_count = 0;
5062 rem = n_th - (S * n_places);
5063 gap = rem > 0 ? n_places / rem : n_places;
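      // Same distribution scheme as the overloaded proc_bind_close case above
      // (S threads per place, every gap-th place taking one extra while
      // rem > 0), except that each thread's partition is narrowed to its
      // single place.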
5064 int place = masters_place;
5065 int gap_ct = gap;
5066 thidx = n_th;
5067 if (update_master_only == 1)
5068 thidx = 1;
5069 for (f = 0; f < thidx; f++) {
5070 kmp_info_t *th = team->t.t_threads[f];
5071 KMP_DEBUG_ASSERT(th != NULL);
5072
5073 th->th.th_first_place = place;
5074 th->th.th_last_place = place;
5075 th->th.th_new_place = place;
5076 if (__kmp_display_affinity && place != th->th.th_current_place &&
5077 team->t.t_display_affinity != 1) {
5078 team->t.t_display_affinity = 1;
5079 }
5080 s_count++;
5081
5082 if ((s_count == S) && rem && (gap_ct == gap)) {
5083 // do nothing, add an extra thread to place on next iteration
5084 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
5085 // we added an extra thread to this place; move on to next place
5086 if (place == last_place) {
5087 place = first_place;
5088 } else if (place == (num_masks - 1)) {
5089 place = 0;
5090 } else {
5091 place++;
5092 }
5093 s_count = 0;
5094 gap_ct = 1;
5095 rem--;
5096 } else if (s_count == S) { // place is full; don't add extra thread
5097 if (place == last_place) {
5098 place = first_place;
5099 } else if (place == (num_masks - 1)) {
5100 place = 0;
5101 } else {
5102 place++;
5103 }
5104 gap_ct++;
5105 s_count = 0;
5106 }
5107
5108 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
5109 "partition = [%d,%d]\n",
5110 __kmp_gtid_from_thread(team->t.t_threads[f]),
5111 team->t.t_id, f, th->th.th_new_place,
5112 th->th.th_first_place, th->th.th_last_place));
5113 }
5114 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
5115 }
5116 } break;
5117
5118 default:
5119 break;
5120 }
5121
5122 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
5123}
5124
5125#endif // KMP_AFFINITY_SUPPORTED
5126
5127/* allocate a new team data structure to use. take one off of the free pool if
5128 available */
5129kmp_team_t *
5130__kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
5131#if OMPT_SUPPORT
5132 ompt_data_t ompt_parallel_data,
5133#endif
5134 kmp_proc_bind_t new_proc_bind,
5135 kmp_internal_control_t *new_icvs,
5136 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5137 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
5138 int f;
5139 kmp_team_t *team;
5140 int use_hot_team = !root->r.r_active;
5141 int level = 0;
5142 int do_place_partition = 1;
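  // Three allocation paths follow: (1) reuse the hot team when this is the
  // outermost active parallel region (or a nested hot team exists for this
  // level), (2) otherwise take a sufficiently large team from the team pool,
  // (3) else allocate and initialize a fresh team structure.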
5143
5144 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
5145 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
5146 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
5147 KMP_MB();
5148
5149#if KMP_NESTED_HOT_TEAMS
5150 kmp_hot_team_ptr_t *hot_teams;
5151 if (master) {
5152 team = master->th.th_team;
5153 level = team->t.t_active_level;
5154 if (master->th.th_teams_microtask) { // in teams construct?
5155 if (master->th.th_teams_size.nteams > 1 &&
5156 ( // #teams > 1
5157 team->t.t_pkfn ==
5158 (microtask_t)__kmp_teams_master || // inner fork of the teams
5159 master->th.th_teams_level <
5160 team->t.t_level)) { // or nested parallel inside the teams
        ++level; // do not increment if #teams==1 or for the outer fork of the
        // teams; increment otherwise
5163 }
5164 // Do not perform the place partition if inner fork of the teams
5165 // Wait until nested parallel region encountered inside teams construct
5166 if ((master->th.th_teams_size.nteams == 1 &&
5167 master->th.th_teams_level >= team->t.t_level) ||
5168 (team->t.t_pkfn == (microtask_t)__kmp_teams_master))
5169 do_place_partition = 0;
5170 }
5171 hot_teams = master->th.th_hot_teams;
5172 if (level < __kmp_hot_teams_max_level && hot_teams &&
5173 hot_teams[level].hot_team) {
5174 // hot team has already been allocated for given level
5175 use_hot_team = 1;
5176 } else {
5177 use_hot_team = 0;
5178 }
5179 } else {
5180 // check we won't access uninitialized hot_teams, just in case
5181 KMP_DEBUG_ASSERT(new_nproc == 1);
5182 }
5183#endif
5184 // Optimization to use a "hot" team
5185 if (use_hot_team && new_nproc > 1) {
5186 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
5187#if KMP_NESTED_HOT_TEAMS
5188 team = hot_teams[level].hot_team;
5189#else
5190 team = root->r.r_hot_team;
5191#endif
5192#if KMP_DEBUG
5193 if (__kmp_tasking_mode != tskm_immediate_exec) {
5194 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5195 "task_team[1] = %p before reinit\n",
5196 team->t.t_task_team[0], team->t.t_task_team[1]));
5197 }
5198#endif
5199
5200 if (team->t.t_nproc != new_nproc &&
5201 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5202 // Distributed barrier may need a resize
5203 int old_nthr = team->t.t_nproc;
5204 __kmp_resize_dist_barrier(team, old_nthr, new_nproc);
5205 }
5206
5207 // If not doing the place partition, then reset the team's proc bind
5208 // to indicate that partitioning of all threads still needs to take place
5209 if (do_place_partition == 0)
5210 team->t.t_proc_bind = proc_bind_default;
5211 // Has the number of threads changed?
5212 /* Let's assume the most common case is that the number of threads is
5213 unchanged, and put that case first. */
5214 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
5215 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
5216 // This case can mean that omp_set_num_threads() was called and the hot
5217 // team size was already reduced, so we check the special flag
5218 if (team->t.t_size_changed == -1) {
5219 team->t.t_size_changed = 1;
5220 } else {
5221 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
5222 }
5223
5224 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5225 kmp_r_sched_t new_sched = new_icvs->sched;
5226 // set primary thread's schedule as new run-time schedule
5227 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
5228
5229 __kmp_reinitialize_team(team, new_icvs,
5230 root->r.r_uber_thread->th.th_ident);
5231
5232 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
5233 team->t.t_threads[0], team));
5234 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5235
5236#if KMP_AFFINITY_SUPPORTED
5237 if ((team->t.t_size_changed == 0) &&
5238 (team->t.t_proc_bind == new_proc_bind)) {
5239 if (new_proc_bind == proc_bind_spread) {
5240 if (do_place_partition) {
5241 // add flag to update only master for spread
5242 __kmp_partition_places(team, 1);
5243 }
5244 }
5245 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5246 "proc_bind = %d, partition = [%d,%d]\n",
5247 team->t.t_id, new_proc_bind, team->t.t_first_place,
5248 team->t.t_last_place));
5249 } else {
5250 if (do_place_partition) {
5251 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5252 __kmp_partition_places(team);
5253 }
5254 }
5255#else
5256 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5257#endif /* KMP_AFFINITY_SUPPORTED */
5258 } else if (team->t.t_nproc > new_nproc) {
5259 KA_TRACE(20,
5260 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5261 new_nproc));
5262
5263 team->t.t_size_changed = 1;
5264 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5265 // Barrier size already reduced earlier in this function
5266 // Activate team threads via th_used_in_team
5267 __kmp_add_threads_to_team(team, new_nproc);
5268 }
5269#if KMP_NESTED_HOT_TEAMS
5270 if (__kmp_hot_teams_mode == 0) {
        // AC: the saved number of threads should correspond to the team's
        // value in this mode; it can be bigger in mode 1, when the hot team
        // keeps threads in reserve
5273 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5274 hot_teams[level].hot_team_nth = new_nproc;
5275#endif // KMP_NESTED_HOT_TEAMS
5276 /* release the extra threads we don't need any more */
5277 for (f = new_nproc; f < team->t.t_nproc; f++) {
5278 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5279 if (__kmp_tasking_mode != tskm_immediate_exec) {
5280 // When decreasing team size, threads no longer in the team should
5281 // unref task team.
5282 team->t.t_threads[f]->th.th_task_team = NULL;
5283 }
5284 __kmp_free_thread(team->t.t_threads[f]);
5285 team->t.t_threads[f] = NULL;
5286 }
5287#if KMP_NESTED_HOT_TEAMS
5288 } // (__kmp_hot_teams_mode == 0)
5289 else {
5290 // When keeping extra threads in team, switch threads to wait on own
5291 // b_go flag
5292 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5293 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5294 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5295 for (int b = 0; b < bs_last_barrier; ++b) {
5296 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5297 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5298 }
5299 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5300 }
5301 }
5302 }
5303#endif // KMP_NESTED_HOT_TEAMS
5304 team->t.t_nproc = new_nproc;
5305 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5306 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5307 __kmp_reinitialize_team(team, new_icvs,
5308 root->r.r_uber_thread->th.th_ident);
5309
5310 // Update remaining threads
5311 for (f = 0; f < new_nproc; ++f) {
5312 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5313 }
5314
5315 // restore the current task state of the primary thread: should be the
5316 // implicit task
5317 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5318 team->t.t_threads[0], team));
5319
5320 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5321
5322#ifdef KMP_DEBUG
5323 for (f = 0; f < team->t.t_nproc; f++) {
5324 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5325 team->t.t_threads[f]->th.th_team_nproc ==
5326 team->t.t_nproc);
5327 }
5328#endif
5329
5330 if (do_place_partition) {
5331 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5332#if KMP_AFFINITY_SUPPORTED
5333 __kmp_partition_places(team);
5334#endif
5335 }
5336 } else { // team->t.t_nproc < new_nproc
5337#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5338 kmp_affin_mask_t *old_mask;
5339 if (KMP_AFFINITY_CAPABLE()) {
5340 KMP_CPU_ALLOC(old_mask);
5341 }
5342#endif
5343
5344 KA_TRACE(20,
5345 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5346 new_nproc));
5347 int old_nproc = team->t.t_nproc; // save old value and use to update only
5348 team->t.t_size_changed = 1;
5349
5350#if KMP_NESTED_HOT_TEAMS
5351 int avail_threads = hot_teams[level].hot_team_nth;
5352 if (new_nproc < avail_threads)
5353 avail_threads = new_nproc;
5354 kmp_info_t **other_threads = team->t.t_threads;
5355 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5356 // Adjust barrier data of reserved threads (if any) of the team
5357 // Other data will be set in __kmp_initialize_info() below.
5358 int b;
5359 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5360 for (b = 0; b < bs_last_barrier; ++b) {
5361 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5362 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5363#if USE_DEBUGGER
5364 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5365#endif
5366 }
5367 }
5368 if (hot_teams[level].hot_team_nth >= new_nproc) {
5369 // we have all needed threads in reserve, no need to allocate any
        // this is only possible in mode 1; we cannot have reserved threads in
        // mode 0
5371 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5372 team->t.t_nproc = new_nproc; // just get reserved threads involved
5373 } else {
5374 // We may have some threads in reserve, but not enough;
5375 // get reserved threads involved if any.
5376 team->t.t_nproc = hot_teams[level].hot_team_nth;
5377 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5378#endif // KMP_NESTED_HOT_TEAMS
5379 if (team->t.t_max_nproc < new_nproc) {
5380 /* reallocate larger arrays */
5381 __kmp_reallocate_team_arrays(team, new_nproc);
5382 __kmp_reinitialize_team(team, new_icvs, NULL);
5383 }
5384
5385#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5386 /* Temporarily set full mask for primary thread before creation of
5387 workers. The reason is that workers inherit the affinity from the
           primary thread, so if a lot of workers are created on a single
           core quickly, they don't get a chance to set their own affinity for
5390 a long time. */
5391 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5392#endif
5393
5394 /* allocate new threads for the hot team */
5395 for (f = team->t.t_nproc; f < new_nproc; f++) {
5396 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5397 KMP_DEBUG_ASSERT(new_worker);
5398 team->t.t_threads[f] = new_worker;
5399
5400 KA_TRACE(20,
5401 ("__kmp_allocate_team: team %d init T#%d arrived: "
5402 "join=%llu, plain=%llu\n",
5403 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5404 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5405 team->t.t_bar[bs_plain_barrier].b_arrived));
5406
5407 { // Initialize barrier data for new threads.
5408 int b;
5409 kmp_balign_t *balign = new_worker->th.th_bar;
5410 for (b = 0; b < bs_last_barrier; ++b) {
5411 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5412 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5413 KMP_BARRIER_PARENT_FLAG);
5414#if USE_DEBUGGER
5415 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5416#endif
5417 }
5418 }
5419 }
5420
5421#if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5422 if (KMP_AFFINITY_CAPABLE()) {
5423 /* Restore initial primary thread's affinity mask */
5424 __kmp_set_system_affinity(old_mask, TRUE);
5425 KMP_CPU_FREE(old_mask);
5426 }
5427#endif
5428#if KMP_NESTED_HOT_TEAMS
5429 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5430#endif // KMP_NESTED_HOT_TEAMS
5431 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5432 // Barrier size already increased earlier in this function
5433 // Activate team threads via th_used_in_team
5434 __kmp_add_threads_to_team(team, new_nproc);
5435 }
    /* make sure everyone is synchronized */
5437 // new threads below
5438 __kmp_initialize_team(team, new_nproc, new_icvs,
5439 root->r.r_uber_thread->th.th_ident);
5440
5441 /* reinitialize the threads */
5442 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5443 for (f = 0; f < team->t.t_nproc; ++f)
5444 __kmp_initialize_info(team->t.t_threads[f], team, f,
5445 __kmp_gtid_from_tid(f, team));
5446
      // set th_task_state for new threads in the hot team using the last
      // pre-existing thread's state
5448 kmp_uint8 old_state = team->t.t_threads[old_nproc - 1]->th.th_task_state;
5449 for (f = old_nproc; f < team->t.t_nproc; ++f)
5450 team->t.t_threads[f]->th.th_task_state = old_state;
5451
5452#ifdef KMP_DEBUG
5453 for (f = 0; f < team->t.t_nproc; ++f) {
5454 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5455 team->t.t_threads[f]->th.th_team_nproc ==
5456 team->t.t_nproc);
5457 }
5458#endif
5459
5460 if (do_place_partition) {
5461 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5462#if KMP_AFFINITY_SUPPORTED
5463 __kmp_partition_places(team);
5464#endif
5465 }
5466 } // Check changes in number of threads
5467
5468 kmp_info_t *master = team->t.t_threads[0];
5469 if (master->th.th_teams_microtask) {
5470 for (f = 1; f < new_nproc; ++f) {
5471 // propagate teams construct specific info to workers
5472 kmp_info_t *thr = team->t.t_threads[f];
5473 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5474 thr->th.th_teams_level = master->th.th_teams_level;
5475 thr->th.th_teams_size = master->th.th_teams_size;
5476 }
5477 }
5478#if KMP_NESTED_HOT_TEAMS
5479 if (level) {
5480 // Sync barrier state for nested hot teams, not needed for outermost hot
5481 // team.
5482 for (f = 1; f < new_nproc; ++f) {
5483 kmp_info_t *thr = team->t.t_threads[f];
5484 int b;
5485 kmp_balign_t *balign = thr->th.th_bar;
5486 for (b = 0; b < bs_last_barrier; ++b) {
5487 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5488 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5489#if USE_DEBUGGER
5490 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5491#endif
5492 }
5493 }
5494 }
5495#endif // KMP_NESTED_HOT_TEAMS
5496
5497 /* reallocate space for arguments if necessary */
5498 __kmp_alloc_argv_entries(argc, team, TRUE);
5499 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5500 // The hot team re-uses the previous task team,
5501 // if untouched during the previous release->gather phase.
5502
5503 KF_TRACE(10, (" hot_team = %p\n", team));
5504
5505#if KMP_DEBUG
5506 if (__kmp_tasking_mode != tskm_immediate_exec) {
5507 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5508 "task_team[1] = %p after reinit\n",
5509 team->t.t_task_team[0], team->t.t_task_team[1]));
5510 }
5511#endif
5512
5513#if OMPT_SUPPORT
5514 __ompt_team_assign_id(team, ompt_parallel_data);
5515#endif
5516
5517 KMP_MB();
5518
5519 return team;
5520 }
5521
5522 /* next, let's try to take one from the team pool */
5523 KMP_MB();
5524 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5525 /* TODO: consider resizing undersized teams instead of reaping them, now
5526 that we have a resizing mechanism */
5527 if (team->t.t_max_nproc >= max_nproc) {
5528 /* take this team from the team pool */
5529 __kmp_team_pool = team->t.t_next_pool;
5530
5531 if (max_nproc > 1 &&
5532 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5533 if (!team->t.b) { // Allocate barrier structure
5534 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5535 }
5536 }
5537
5538 /* setup the team for fresh use */
5539 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5540
5541 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5542 "task_team[1] %p to NULL\n",
5543 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5544 team->t.t_task_team[0] = NULL;
5545 team->t.t_task_team[1] = NULL;
5546
5547 /* reallocate space for arguments if necessary */
5548 __kmp_alloc_argv_entries(argc, team, TRUE);
5549 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5550
5551 KA_TRACE(
5552 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5553 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5554 { // Initialize barrier data.
5555 int b;
5556 for (b = 0; b < bs_last_barrier; ++b) {
5557 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5558#if USE_DEBUGGER
5559 team->t.t_bar[b].b_master_arrived = 0;
5560 team->t.t_bar[b].b_team_arrived = 0;
5561#endif
5562 }
5563 }
5564
5565 team->t.t_proc_bind = new_proc_bind;
5566
5567 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5568 team->t.t_id));
5569
5570#if OMPT_SUPPORT
5571 __ompt_team_assign_id(team, ompt_parallel_data);
5572#endif
5573
5574 KMP_MB();
5575
5576 return team;
5577 }
5578
5579 /* reap team if it is too small, then loop back and check the next one */
    // not sure if this is wise, but it will be redone during the hot-teams
    // rewrite.
5582 /* TODO: Use technique to find the right size hot-team, don't reap them */
5583 team = __kmp_reap_team(team);
5584 __kmp_team_pool = team;
5585 }
5586
5587 /* nothing available in the pool, no matter, make a new team! */
5588 KMP_MB();
5589 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5590
5591 /* and set it up */
5592 team->t.t_max_nproc = max_nproc;
5593 if (max_nproc > 1 &&
5594 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5595 // Allocate barrier structure
5596 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub);
5597 }
5598
5599 /* NOTE well, for some reason allocating one big buffer and dividing it up
     seems to really hurt performance a lot on the P4, so let's not use this */
5601 __kmp_allocate_team_arrays(team, max_nproc);
5602
5603 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5604 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5605
5606 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5607 "%p to NULL\n",
5608 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5609 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5610 // memory, no need to duplicate
5611 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5612 // memory, no need to duplicate
5613
5614 if (__kmp_storage_map) {
5615 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5616 }
5617
5618 /* allocate space for arguments */
5619 __kmp_alloc_argv_entries(argc, team, FALSE);
5620 team->t.t_argc = argc;
5621
5622 KA_TRACE(20,
5623 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5624 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5625 { // Initialize barrier data.
5626 int b;
5627 for (b = 0; b < bs_last_barrier; ++b) {
5628 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5629#if USE_DEBUGGER
5630 team->t.t_bar[b].b_master_arrived = 0;
5631 team->t.t_bar[b].b_team_arrived = 0;
5632#endif
5633 }
5634 }
5635
5636 team->t.t_proc_bind = new_proc_bind;
5637
5638#if OMPT_SUPPORT
5639 __ompt_team_assign_id(team, ompt_parallel_data);
5640 team->t.ompt_serialized_team_info = NULL;
5641#endif
5642
5643 KMP_MB();
5644
5645 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5646 team->t.t_id));
5647
5648 return team;
5649}
5650
5651/* TODO implement hot-teams at all levels */
5652/* TODO implement lazy thread release on demand (disband request) */
5653
5654/* free the team. return it to the team pool. release all the threads
5655 * associated with it */
5656void __kmp_free_team(kmp_root_t *root,
5657 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5658 int f;
5659 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5660 team->t.t_id));
5661
5662 /* verify state */
5663 KMP_DEBUG_ASSERT(root);
5664 KMP_DEBUG_ASSERT(team);
5665 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5666 KMP_DEBUG_ASSERT(team->t.t_threads);
5667
5668 int use_hot_team = team == root->r.r_hot_team;
5669#if KMP_NESTED_HOT_TEAMS
5670 int level;
5671 if (master) {
5672 level = team->t.t_active_level - 1;
5673 if (master->th.th_teams_microtask) { // in teams construct?
5674 if (master->th.th_teams_size.nteams > 1) {
5675 ++level; // level was not increased in teams construct for
5676 // team_of_masters
5677 }
5678 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5679 master->th.th_teams_level == team->t.t_level) {
5680 ++level; // level was not increased in teams construct for
5681 // team_of_workers before the parallel
5682 } // team->t.t_level will be increased inside parallel
5683 }
5684#if KMP_DEBUG
5685 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams;
5686#endif
5687 if (level < __kmp_hot_teams_max_level) {
5688 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5689 use_hot_team = 1;
5690 }
5691 }
5692#endif // KMP_NESTED_HOT_TEAMS
5693
5694 /* team is done working */
5695 TCW_SYNC_PTR(team->t.t_pkfn,
5696 NULL); // Important for Debugging Support Library.
5697#if KMP_OS_WINDOWS
5698 team->t.t_copyin_counter = 0; // init counter for possible reuse
5699#endif
5700 // Do not reset pointer to parent team to NULL for hot teams.
5701
5702 /* if we are non-hot team, release our threads */
5703 if (!use_hot_team) {
5704 if (__kmp_tasking_mode != tskm_immediate_exec) {
5705 // Wait for threads to reach reapable state
5706 for (f = 1; f < team->t.t_nproc; ++f) {
5707 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5708 kmp_info_t *th = team->t.t_threads[f];
5709 volatile kmp_uint32 *state = &th->th.th_reap_state;
5710 while (*state != KMP_SAFE_TO_REAP) {
5711#if KMP_OS_WINDOWS
5712 // On Windows a thread can be killed at any time, check this
5713 DWORD ecode;
5714 if (!__kmp_is_thread_alive(th, &ecode)) {
5715 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5716 break;
5717 }
5718#endif
5719 // first check if thread is sleeping
5720 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5721 if (fl.is_sleeping())
5722 fl.resume(__kmp_gtid_from_thread(th));
5723 KMP_CPU_PAUSE();
5724 }
5725 }
5726
5727 // Delete task teams
5728 int tt_idx;
5729 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5730 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5731 if (task_team != NULL) {
5732 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5733 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5734 team->t.t_threads[f]->th.th_task_team = NULL;
5735 }
5736 KA_TRACE(
5737 20,
5738 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5739 __kmp_get_gtid(), task_team, team->t.t_id));
5740#if KMP_NESTED_HOT_TEAMS
5741 __kmp_free_task_team(master, task_team);
5742#endif
5743 team->t.t_task_team[tt_idx] = NULL;
5744 }
5745 }
5746 }
5747
5748 // Reset pointer to parent team only for non-hot teams.
5749 team->t.t_parent = NULL;
5750 team->t.t_level = 0;
5751 team->t.t_active_level = 0;
5752
5753 /* free the worker threads */
5754 for (f = 1; f < team->t.t_nproc; ++f) {
5755 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5756 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5757 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team),
5758 1, 2);
5759 }
5760 __kmp_free_thread(team->t.t_threads[f]);
5761 }
5762
5763 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5764 if (team->t.b) {
5765 // wake up thread at old location
5766 team->t.b->go_release();
5767 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5768 for (f = 1; f < team->t.t_nproc; ++f) {
5769 if (team->t.b->sleep[f].sleep) {
5770 __kmp_atomic_resume_64(
5771 team->t.t_threads[f]->th.th_info.ds.ds_gtid,
5772 (kmp_atomic_flag_64<> *)NULL);
5773 }
5774 }
5775 }
5776 // Wait for threads to be removed from team
5777 for (int f = 1; f < team->t.t_nproc; ++f) {
5778 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0)
5779 KMP_CPU_PAUSE();
5780 }
5781 }
5782 }
5783
5784 for (f = 1; f < team->t.t_nproc; ++f) {
5785 team->t.t_threads[f] = NULL;
5786 }
5787
5788 if (team->t.t_max_nproc > 1 &&
5789 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
5790 distributedBarrier::deallocate(team->t.b);
5791 team->t.b = NULL;
5792 }
5793 /* put the team back in the team pool */
5794 /* TODO limit size of team pool, call reap_team if pool too large */
5795 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5796 __kmp_team_pool = (volatile kmp_team_t *)team;
5797 } else { // Check if team was created for primary threads in teams construct
5798 // See if first worker is a CG root
5799 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5800 team->t.t_threads[1]->th.th_cg_roots);
5801 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5802 // Clean up the CG root nodes on workers so that this team can be re-used
5803 for (f = 1; f < team->t.t_nproc; ++f) {
5804 kmp_info_t *thr = team->t.t_threads[f];
5805 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5806 thr->th.th_cg_roots->cg_root == thr);
5807 // Pop current CG root off list
5808 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5809 thr->th.th_cg_roots = tmp->up;
5810 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5811 " up to node %p. cg_nthreads was %d\n",
5812 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5813 int i = tmp->cg_nthreads--;
5814 if (i == 1) {
5815 __kmp_free(tmp); // free CG if we are the last thread in it
5816 }
5817 // Restore current task's thread_limit from CG root
5818 if (thr->th.th_cg_roots)
5819 thr->th.th_current_task->td_icvs.thread_limit =
5820 thr->th.th_cg_roots->cg_thread_limit;
5821 }
5822 }
5823 }
5824
5825 KMP_MB();
5826}
5827
5828/* reap the team. destroy it, reclaim all its resources and free its memory */
5829kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5830 kmp_team_t *next_pool = team->t.t_next_pool;
5831
5832 KMP_DEBUG_ASSERT(team);
5833 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5834 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5835 KMP_DEBUG_ASSERT(team->t.t_threads);
5836 KMP_DEBUG_ASSERT(team->t.t_argv);
5837
5838 /* TODO clean the threads that are a part of this? */
5839
5840 /* free stuff */
5841 __kmp_free_team_arrays(team);
5842 if (team->t.t_argv != &team->t.t_inline_argv[0])
5843 __kmp_free((void *)team->t.t_argv);
5844 __kmp_free(team);
5845
5846 KMP_MB();
5847 return next_pool;
5848}
5849
5850// Free the thread. Don't reap it, just place it on the pool of available
5851// threads.
5852//
5853// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5854// binding for the affinity mechanism to be useful.
5855//
5856// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5857// However, we want to avoid a potential performance problem by always
5858// scanning through the list to find the correct point at which to insert
5859// the thread (potential N**2 behavior). To do this we keep track of the
5860// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5861// With single-level parallelism, threads will always be added to the tail
5862// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5863// parallelism, all bets are off and we may need to scan through the entire
5864// free list.
5865//
5866// This change also has a potentially large performance benefit, for some
5867// applications. Previously, as threads were freed from the hot team, they
5868// would be placed back on the free list in inverse order. If the hot team
// grew back to its original size, then the freed thread would be placed
5870// back on the hot team in reverse order. This could cause bad cache
5871// locality problems on programs where the size of the hot team regularly
5872// grew and shrunk.
5873//
5874// Now, for single-level parallelism, the OMP tid is always == gtid.
5875void __kmp_free_thread(kmp_info_t *this_th) {
5876 int gtid;
5877 kmp_info_t **scan;
5878
5879 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5880 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5881
5882 KMP_DEBUG_ASSERT(this_th);
5883
  // When moving a thread to the pool, switch it to wait on its own b_go flag
  // and set an uninitialized (NULL) team.
5886 int b;
5887 kmp_balign_t *balign = this_th->th.th_bar;
5888 for (b = 0; b < bs_last_barrier; ++b) {
5889 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5890 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5891 balign[b].bb.team = NULL;
5892 balign[b].bb.leaf_kids = 0;
5893 }
5894 this_th->th.th_task_state = 0;
5895 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5896
5897 /* put thread back on the free pool */
5898 TCW_PTR(this_th->th.th_team, NULL);
5899 TCW_PTR(this_th->th.th_root, NULL);
5900 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5901
5902 while (this_th->th.th_cg_roots) {
5903 this_th->th.th_cg_roots->cg_nthreads--;
5904 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5905 " %p of thread %p to %d\n",
5906 this_th, this_th->th.th_cg_roots,
5907 this_th->th.th_cg_roots->cg_root,
5908 this_th->th.th_cg_roots->cg_nthreads));
5909 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5910 if (tmp->cg_root == this_th) { // Thread is a cg_root
5911 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5912 KA_TRACE(
5913 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5914 this_th->th.th_cg_roots = tmp->up;
5915 __kmp_free(tmp);
5916 } else { // Worker thread
5917 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5918 __kmp_free(tmp);
5919 }
5920 this_th->th.th_cg_roots = NULL;
5921 break;
5922 }
5923 }
5924
  /* If the implicit task assigned to this thread can be used by other threads,
   * multiple threads can share the data and try to free the task at
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when the hot team is disabled, but it can occur
   * even when the hot team is enabled. */
5930 __kmp_free_implicit_task(this_th);
5931 this_th->th.th_current_task = NULL;
5932
5933 // If the __kmp_thread_pool_insert_pt is already past the new insert
5934 // point, then we need to re-scan the entire list.
5935 gtid = this_th->th.th_info.ds.ds_gtid;
5936 if (__kmp_thread_pool_insert_pt != NULL) {
5937 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5938 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5939 __kmp_thread_pool_insert_pt = NULL;
5940 }
5941 }
5942
5943 // Scan down the list to find the place to insert the thread.
5944 // scan is the address of a link in the list, possibly the address of
5945 // __kmp_thread_pool itself.
5946 //
5947 // In the absence of nested parallelism, the for loop will have 0 iterations.
5948 if (__kmp_thread_pool_insert_pt != NULL) {
5949 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5950 } else {
5951 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5952 }
5953 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5954 scan = &((*scan)->th.th_next_pool))
5955 ;
5956
5957 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5958 // to its address.
5959 TCW_PTR(this_th->th.th_next_pool, *scan);
5960 __kmp_thread_pool_insert_pt = *scan = this_th;
5961 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5962 (this_th->th.th_info.ds.ds_gtid <
5963 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5964 TCW_4(this_th->th.th_in_pool, TRUE);
5965 __kmp_suspend_initialize_thread(this_th);
5966 __kmp_lock_suspend_mx(this_th);
5967 if (this_th->th.th_active == TRUE) {
5968 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5969 this_th->th.th_active_in_pool = TRUE;
5970 }
5971#if KMP_DEBUG
5972 else {
5973 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5974 }
5975#endif
5976 __kmp_unlock_suspend_mx(this_th);
5977
5978 TCW_4(__kmp_nth, __kmp_nth - 1);
5979
5980#ifdef KMP_ADJUST_BLOCKTIME
5981 /* Adjust blocktime back to user setting or default if necessary */
5982 /* Middle initialization might never have occurred */
5983 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5984 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5985 if (__kmp_nth <= __kmp_avail_proc) {
5986 __kmp_zero_bt = FALSE;
5987 }
5988 }
5989#endif /* KMP_ADJUST_BLOCKTIME */
5990
5991 KMP_MB();
5992}
5993
5994/* ------------------------------------------------------------------------ */
5995
5996void *__kmp_launch_thread(kmp_info_t *this_thr) {
5997#if OMP_PROFILING_SUPPORT
5998 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE");
5999 // TODO: add a configuration option for time granularity
6000 if (ProfileTraceFile)
6001 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget");
6002#endif
6003
6004 int gtid = this_thr->th.th_info.ds.ds_gtid;
6005 /* void *stack_data;*/
6006 kmp_team_t **volatile pteam;
6007
6008 KMP_MB();
6009 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
6010
6011 if (__kmp_env_consistency_check) {
6012 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
6013 }
6014
6015#if OMPD_SUPPORT
6016 if (ompd_state & OMPD_ENABLE_BP)
6017 ompd_bp_thread_begin();
6018#endif
6019
6020#if OMPT_SUPPORT
6021 ompt_data_t *thread_data = nullptr;
6022 if (ompt_enabled.enabled) {
6023 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
6024 *thread_data = ompt_data_none;
6025
6026 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6027 this_thr->th.ompt_thread_info.wait_id = 0;
6028 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
6029 this_thr->th.ompt_thread_info.parallel_flags = 0;
6030 if (ompt_enabled.ompt_callback_thread_begin) {
6031 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
6032 ompt_thread_worker, thread_data);
6033 }
6034 this_thr->th.ompt_thread_info.state = ompt_state_idle;
6035 }
6036#endif
6037
6038 /* This is the place where threads wait for work */
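  /* Worker lifecycle: block in the fork barrier until released by the primary
     thread, invoke the team's microtask via t_invoke, pass the join barrier,
     and loop until library shutdown (__kmp_global.g.g_done). */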
6039 while (!TCR_4(__kmp_global.g.g_done)) {
6040 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
6041 KMP_MB();
6042
6043 /* wait for work to do */
6044 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
6045
6046 /* No tid yet since not part of a team */
6047 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
6048
6049#if OMPT_SUPPORT
6050 if (ompt_enabled.enabled) {
6051 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6052 }
6053#endif
6054
6055 pteam = &this_thr->th.th_team;
6056
6057 /* have we been allocated? */
6058 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
6059 /* we were just woken up, so run our new task */
6060 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
6061 int rc;
6062 KA_TRACE(20,
6063 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6064 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6065 (*pteam)->t.t_pkfn));
6066
6067 updateHWFPControl(*pteam);
6068
6069#if OMPT_SUPPORT
6070 if (ompt_enabled.enabled) {
6071 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
6072 }
6073#endif
6074
6075 rc = (*pteam)->t.t_invoke(gtid);
6076 KMP_ASSERT(rc);
6077
6078 KMP_MB();
6079 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6080 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
6081 (*pteam)->t.t_pkfn));
6082 }
6083#if OMPT_SUPPORT
6084 if (ompt_enabled.enabled) {
6085 /* no frame set while outside task */
6086 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
6087
6088 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
6089 }
6090#endif
6091 /* join barrier after parallel region */
6092 __kmp_join_barrier(gtid);
6093 }
6094 }
6095 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
6096
6097#if OMPD_SUPPORT
6098 if (ompd_state & OMPD_ENABLE_BP)
6099 ompd_bp_thread_end();
6100#endif
6101
6102#if OMPT_SUPPORT
6103 if (ompt_enabled.ompt_callback_thread_end) {
6104 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
6105 }
6106#endif
6107
6108 this_thr->th.th_task_team = NULL;
6109 /* run the destructors for the threadprivate data for this thread */
6110 __kmp_common_destroy_gtid(gtid);
6111
6112 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
6113 KMP_MB();
6114
6115#if OMP_PROFILING_SUPPORT
6116 llvm::timeTraceProfilerFinishThread();
6117#endif
6118 return this_thr;
6119}
6120
6121/* ------------------------------------------------------------------------ */
6122
6123void __kmp_internal_end_dest(void *specific_gtid) {
6124 // Make sure no significant bits are lost
6125 int gtid;
6126 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, &gtid);
6127
6128 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6130 * this is because 0 is reserved for the nothing-stored case */
6131
6132 __kmp_internal_end_thread(gtid);
6133}
6134
6135#if KMP_OS_UNIX && KMP_DYNAMIC_LIB
6136
6137__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
6138 __kmp_internal_end_atexit();
6139}
6140
6141#endif
6142
6143/* [Windows] josh: when the atexit handler is called, there may still be more
6144 than one thread alive */
6145void __kmp_internal_end_atexit(void) {
6146 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
6147 /* [Windows]
6148 josh: ideally, we want to completely shutdown the library in this atexit
6149 handler, but stat code that depends on thread specific data for gtid fails
6150 because that data becomes unavailable at some point during the shutdown, so
6151 we call __kmp_internal_end_thread instead. We should eventually remove the
6152 dependency on __kmp_get_specific_gtid in the stat code and use
6153 __kmp_internal_end_library to cleanly shutdown the library.
6154
6155 // TODO: Can some of this comment about GVS be removed?
6156 I suspect that the offending stat code is executed when the calling thread
6157 tries to clean up a dead root thread's data structures, resulting in GVS
6158 code trying to close the GVS structures for that thread, but since the stat
6159 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
   the calling thread is cleaning up itself instead of another thread, it gets
   confused. This happens because allowing a thread to unregister and clean up
6162 another thread is a recent modification for addressing an issue.
6163 Based on the current design (20050722), a thread may end up
6164 trying to unregister another thread only if thread death does not trigger
6165 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
6166 thread specific data destructor function to detect thread death. For
6167 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
6168 is nothing. Thus, the workaround is applicable only for Windows static
6169 stat library. */
6170 __kmp_internal_end_library(-1);
6171#if KMP_OS_WINDOWS
6172 __kmp_close_console();
6173#endif
6174}
6175
6176static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
6177 // It is assumed __kmp_forkjoin_lock is acquired.
6178
6179 int gtid;
6180
6181 KMP_DEBUG_ASSERT(thread != NULL);
6182
6183 gtid = thread->th.th_info.ds.ds_gtid;
6184
6185 if (!is_root) {
6186 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
6187 /* Assume the threads are at the fork barrier here */
6188 KA_TRACE(
6189 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
6190 gtid));
6191 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
6192 while (
6193 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3))
6194 KMP_CPU_PAUSE();
6195 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL);
6196 } else {
6197 /* Need release fence here to prevent seg faults for tree forkjoin
6198 barrier (GEH) */
6199 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
6200 thread);
6201 __kmp_release_64(&flag);
6202 }
6203 }
6204
6205 // Terminate OS thread.
6206 __kmp_reap_worker(thread);
6207
6208 // The thread was killed asynchronously. If it was actively
6209 // spinning in the thread pool, decrement the global count.
6210 //
6211 // There is a small timing hole here - if the worker thread was just waking
  // up after sleeping in the pool, had reset its th_active_in_pool flag but
6213 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
6214 // the global counter might not get updated.
6215 //
6216 // Currently, this can only happen as the library is unloaded,
6217 // so there are no harmful side effects.
6218 if (thread->th.th_active_in_pool) {
6219 thread->th.th_active_in_pool = FALSE;
6220 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
6221 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
6222 }
6223 }
6224
6225 __kmp_free_implicit_task(thread);
6226
6227// Free the fast memory for tasking
6228#if USE_FAST_MEMORY
6229 __kmp_free_fast_memory(thread);
6230#endif /* USE_FAST_MEMORY */
6231
6232 __kmp_suspend_uninitialize_thread(thread);
6233
6234 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
6235 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6236
6237 --__kmp_all_nth;
6238 // __kmp_nth was decremented when the thread was added to the pool.
6239
6240#ifdef KMP_ADJUST_BLOCKTIME
6241 /* Adjust blocktime back to user setting or default if necessary */
6242 /* Middle initialization might never have occurred */
6243 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
6244 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
6245 if (__kmp_nth <= __kmp_avail_proc) {
6246 __kmp_zero_bt = FALSE;
6247 }
6248 }
6249#endif /* KMP_ADJUST_BLOCKTIME */
6250
6251 /* free the memory being used */
6252 if (__kmp_env_consistency_check) {
6253 if (thread->th.th_cons) {
6254 __kmp_free_cons_stack(thread->th.th_cons);
6255 thread->th.th_cons = NULL;
6256 }
6257 }
6258
6259 if (thread->th.th_pri_common != NULL) {
6260 __kmp_free(thread->th.th_pri_common);
6261 thread->th.th_pri_common = NULL;
6262 }
6263
6264 if (thread->th.th_task_state_memo_stack != NULL) {
6265 __kmp_free(thread->th.th_task_state_memo_stack);
6266 thread->th.th_task_state_memo_stack = NULL;
6267 }
6268
6269#if KMP_USE_BGET
6270 if (thread->th.th_local.bget_data != NULL) {
6271 __kmp_finalize_bget(thread);
6272 }
6273#endif
6274
6275#if KMP_AFFINITY_SUPPORTED
6276 if (thread->th.th_affin_mask != NULL) {
6277 KMP_CPU_FREE(thread->th.th_affin_mask);
6278 thread->th.th_affin_mask = NULL;
6279 }
6280#endif /* KMP_AFFINITY_SUPPORTED */
6281
6282#if KMP_USE_HIER_SCHED
6283 if (thread->th.th_hier_bar_data != NULL) {
6284 __kmp_free(thread->th.th_hier_bar_data);
6285 thread->th.th_hier_bar_data = NULL;
6286 }
6287#endif
6288
6289 __kmp_reap_team(thread->th.th_serial_team);
6290 thread->th.th_serial_team = NULL;
6291 __kmp_free(thread);
6292
6293 KMP_MB();
6294
6295} // __kmp_reap_thread
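// Illustrative sketch (not compiled): the "release the waiting thread, then
// reap it" ordering used above, reduced to standard C++ primitives. The names
// example_go, example_worker and example_reap are assumptions for illustration;
// the runtime uses its own flags (th_used_in_team, b_go) and __kmp_reap_worker.
#if 0
#include <atomic>
#include <thread>

static std::atomic<int> example_go{0};

static void example_worker(void) {
  // Spin until released, analogous to a worker parked at the fork barrier.
  while (example_go.load(std::memory_order_acquire) == 0)
    std::this_thread::yield();
}

static void example_reap(std::thread &worker) {
  // Release first so the worker can leave its wait loop...
  example_go.store(1, std::memory_order_release);
  // ...then wait for the OS thread to terminate, as __kmp_reap_worker() does.
  worker.join();
}
#endif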
6296
6297static void __kmp_itthash_clean(kmp_info_t *th) {
6298#if USE_ITT_NOTIFY
6299 if (__kmp_itt_region_domains.count > 0) {
6300 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6301 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i];
6302 while (bucket) {
6303 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6304 __kmp_thread_free(th, bucket);
6305 bucket = next;
6306 }
6307 }
6308 }
6309 if (__kmp_itt_barrier_domains.count > 0) {
6310 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) {
6311 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i];
6312 while (bucket) {
6313 kmp_itthash_entry_t *next = bucket->next_in_bucket;
6314 __kmp_thread_free(th, bucket);
6315 bucket = next;
6316 }
6317 }
6318 }
6319#endif
6320}
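// Illustrative sketch (not compiled): the list-walking pattern used by the two
// bucket loops above -- read the successor link before freeing the current
// node. The type example_node and function example_free_chain are assumptions
// for illustration only.
#if 0
#include <cstdlib>

struct example_node {
  example_node *next;
};

static void example_free_chain(example_node *head) {
  while (head) {
    example_node *next = head->next; // save the link before the node is freed
    std::free(head);
    head = next;
  }
}
#endif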
6321
6322static void __kmp_internal_end(void) {
6323 int i;
6324
6325 /* First, unregister the library */
6326 __kmp_unregister_library();
6327
6328#if KMP_OS_WINDOWS
6329 /* In Win static library, we can't tell when a root actually dies, so we
6330 reclaim the data structures for any root threads that have died but not
6331 unregistered themselves, in order to shut down cleanly.
6332 In Win dynamic library we also can't tell when a thread dies. */
6333 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
6334// dead roots
6335#endif
6336
6337 for (i = 0; i < __kmp_threads_capacity; i++)
6338 if (__kmp_root[i])
6339 if (__kmp_root[i]->r.r_active)
6340 break;
6341 KMP_MB(); /* Flush all pending memory write invalidates. */
6342 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6343
6344 if (i < __kmp_threads_capacity) {
6345#if KMP_USE_MONITOR
6346 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6347 KMP_MB(); /* Flush all pending memory write invalidates. */
6348
6349 // Need to check that monitor was initialized before reaping it. If we are
6350 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6351 // __kmp_monitor will appear to contain valid data, but it is only valid in
6352 // the parent process, not the child.
6353 // New behavior (201008): instead of keying off of the flag
6354 // __kmp_init_parallel, the monitor thread creation is keyed off
6355 // of the new flag __kmp_init_monitor.
6356 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6357 if (TCR_4(__kmp_init_monitor)) {
6358 __kmp_reap_monitor(&__kmp_monitor);
6359 TCW_4(__kmp_init_monitor, 0);
6360 }
6361 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6362 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6363#endif // KMP_USE_MONITOR
6364 } else {
6365/* TODO move this to cleanup code */
6366#ifdef KMP_DEBUG
6367 /* make sure that everything has properly ended */
6368 for (i = 0; i < __kmp_threads_capacity; i++) {
6369 if (__kmp_root[i]) {
6370 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6371 // there can be uber threads alive here
6372 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6373 }
6374 }
6375#endif
6376
6377 KMP_MB();
6378
6379 // Reap the worker threads.
6380 // This is valid for now, but be careful if threads are reaped sooner.
6381 while (__kmp_thread_pool != NULL) { // Loop through all the threads in the pool.
6382 // Get the next thread from the pool.
6383 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6384 __kmp_thread_pool = thread->th.th_next_pool;
6385 // Reap it.
6386 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6387 thread->th.th_next_pool = NULL;
6388 thread->th.th_in_pool = FALSE;
6389 __kmp_reap_thread(thread, 0);
6390 }
6391 __kmp_thread_pool_insert_pt = NULL;
6392
6393 // Reap teams.
6394 while (__kmp_team_pool != NULL) { // Loop through all the teams in the pool.
6395 // Get the next team from the pool.
6396 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6397 __kmp_team_pool = team->t.t_next_pool;
6398 // Reap it.
6399 team->t.t_next_pool = NULL;
6400 __kmp_reap_team(team);
6401 }
6402
6403 __kmp_reap_task_teams();
6404
6405#if KMP_OS_UNIX
6406 // Threads that are not reaped should not access any resources since they
6407 // are going to be deallocated soon, so the shutdown sequence should wait
6408 // until all threads either exit the final spin-waiting loop or begin
6409 // sleeping after the given blocktime.
6410 for (i = 0; i < __kmp_threads_capacity; i++) {
6411 kmp_info_t *thr = __kmp_threads[i];
6412 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6413 KMP_CPU_PAUSE();
6414 }
6415#endif
6416
6417 for (i = 0; i < __kmp_threads_capacity; ++i) {
6418 // TBD: Add some checking...
6419 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6420 }
6421
6422 /* Make sure all threadprivate destructors get run by joining with all
6423 worker threads before resetting this flag */
6424 TCW_SYNC_4(__kmp_init_common, FALSE);
6425
6426 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6427 KMP_MB();
6428
6429#if KMP_USE_MONITOR
6430 // See note above: One of the possible fixes for CQ138434 / CQ140126
6431 //
6432 // FIXME: push both code fragments down and CSE them?
6433 // push them into __kmp_cleanup() ?
6434 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6435 if (TCR_4(__kmp_init_monitor)) {
6436 __kmp_reap_monitor(&__kmp_monitor);
6437 TCW_4(__kmp_init_monitor, 0);
6438 }
6439 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6440 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6441#endif
6442 } /* else !__kmp_global.t_active */
6443 TCW_4(__kmp_init_gtid, FALSE);
6444 KMP_MB(); /* Flush all pending memory write invalidates. */
6445
6446 __kmp_cleanup();
6447#if OMPT_SUPPORT
6448 ompt_fini();
6449#endif
6450}
6451
6452void __kmp_internal_end_library(int gtid_req) {
6453 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6454 /* this shouldn't be a race condition because __kmp_internal_end() is the
6455 only place to clear __kmp_serial_init */
6456 /* we'll check this later too, after we get the lock */
6457 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6458 // redundant, because the next check will work in any case.
6459 if (__kmp_global.g.g_abort) {
6460 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6461 /* TODO abort? */
6462 return;
6463 }
6464 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6465 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6466 return;
6467 }
6468
6469 // If hidden helper team has been initialized, we need to deinit it
6470 if (TCR_4(__kmp_init_hidden_helper) &&
6471 !TCR_4(__kmp_hidden_helper_team_done)) {
6472 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6473 // First release the main thread to let it continue its work
6474 __kmp_hidden_helper_main_thread_release();
6475 // Wait until the hidden helper team has been destroyed
6476 __kmp_hidden_helper_threads_deinitz_wait();
6477 }
6478
6479 KMP_MB(); /* Flush all pending memory write invalidates. */
6480 /* find out who we are and what we should do */
6481 {
6482 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6483 KA_TRACE(
6484 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6485 if (gtid == KMP_GTID_SHUTDOWN) {
6486 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6487 "already shutdown\n"));
6488 return;
6489 } else if (gtid == KMP_GTID_MONITOR) {
6490 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6491 "registered, or system shutdown\n"));
6492 return;
6493 } else if (gtid == KMP_GTID_DNE) {
6494 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6495 "shutdown\n"));
6496 /* we don't know who we are, but we may still shutdown the library */
6497 } else if (KMP_UBER_GTID(gtid)) {
6498 /* unregister ourselves as an uber thread. gtid is no longer valid */
6499 if (__kmp_root[gtid]->r.r_active) {
6500 __kmp_global.g.g_abort = -1;
6501 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6502 __kmp_unregister_library();
6503 KA_TRACE(10,
6504 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6505 gtid));
6506 return;
6507 } else {
6508 __kmp_itthash_clean(__kmp_threads[gtid]);
6509 KA_TRACE(
6510 10,
6511 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6512 __kmp_unregister_root_current_thread(gtid);
6513 }
6514 } else {
6515/* worker threads may call this function through the atexit handler, if they
6516 * call exit() */
6517/* For now, skip the usual subsequent processing and just dump the debug buffer.
6518 TODO: do a thorough shutdown instead */
6519#ifdef DUMP_DEBUG_ON_EXIT
6520 if (__kmp_debug_buf)
6521 __kmp_dump_debug_buffer();
6522#endif
6523 // The unregister-library call was added here when we switched to shared
6524 // memory on Linux; without it, stale files accumulate in /dev/shm.
6525 // Clean up the shared memory file before exiting.
6526 __kmp_unregister_library();
6527 return;
6528 }
6529 }
6530 /* synchronize the termination process */
6531 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6532
6533 /* have we already finished */
6534 if (__kmp_global.g.g_abort) {
6535 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6536 /* TODO abort? */
6537 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6538 return;
6539 }
6540 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6541 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6542 return;
6543 }
6544
6545 /* We need this lock to enforce mutex between this reading of
6546 __kmp_threads_capacity and the writing by __kmp_register_root.
6547 Alternatively, we can use a counter of roots that is atomically updated by
6548 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6549 __kmp_internal_end_*. */
6550 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6551
6552 /* now we can safely conduct the actual termination */
6553 __kmp_internal_end();
6554
6555 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6556 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6557
6558 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6559
6560#ifdef DUMP_DEBUG_ON_EXIT
6561 if (__kmp_debug_buf)
6562 __kmp_dump_debug_buffer();
6563#endif
6564
6565#if KMP_OS_WINDOWS
6566 __kmp_close_console();
6567#endif
6568
6569 __kmp_fini_allocator();
6570
6571} // __kmp_internal_end_library
6572
6573void __kmp_internal_end_thread(int gtid_req) {
6574 int i;
6575
6576 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6577 /* this shouldn't be a race condition because __kmp_internal_end() is the
6578 * only place to clear __kmp_serial_init */
6579 /* we'll check this later too, after we get the lock */
6580 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6581 // redundant, because the next check will work in any case.
6582 if (__kmp_global.g.g_abort) {
6583 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6584 /* TODO abort? */
6585 return;
6586 }
6587 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6588 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6589 return;
6590 }
6591
6592 // If hidden helper team has been initialized, we need to deinit it
6593 if (TCR_4(__kmp_init_hidden_helper) &&
6594 !TCR_4(__kmp_hidden_helper_team_done)) {
6595 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE);
6596 // First release the main thread to let it continue its work
6597 __kmp_hidden_helper_main_thread_release();
6598 // Wait until the hidden helper team has been destroyed
6599 __kmp_hidden_helper_threads_deinitz_wait();
6600 }
6601
6602 KMP_MB(); /* Flush all pending memory write invalidates. */
6603
6604 /* find out who we are and what we should do */
6605 {
6606 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6607 KA_TRACE(10,
6608 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6609 if (gtid == KMP_GTID_SHUTDOWN) {
6610 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6611 "already shutdown\n"));
6612 return;
6613 } else if (gtid == KMP_GTID_MONITOR) {
6614 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6615 "registered, or system shutdown\n"));
6616 return;
6617 } else if (gtid == KMP_GTID_DNE) {
6618 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6619 "shutdown\n"));
6620 return;
6621 /* we don't know who we are */
6622 } else if (KMP_UBER_GTID(gtid)) {
6623 /* unregister ourselves as an uber thread. gtid is no longer valid */
6624 if (__kmp_root[gtid]->r.r_active) {
6625 __kmp_global.g.g_abort = -1;
6626 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6627 KA_TRACE(10,
6628 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6629 gtid));
6630 return;
6631 } else {
6632 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6633 gtid));
6634 __kmp_unregister_root_current_thread(gtid);
6635 }
6636 } else {
6637 /* just a worker thread, let's leave */
6638 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6639
6640 if (gtid >= 0) {
6641 __kmp_threads[gtid]->th.th_task_team = NULL;
6642 }
6643
6644 KA_TRACE(10,
6645 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6646 gtid));
6647 return;
6648 }
6649 }
6650#if KMP_DYNAMIC_LIB
6651 if (__kmp_pause_status != kmp_hard_paused)
6652 // AC: let's not shut down the dynamic library at the exit of an uber thread;
6653 // it is better to shut down later in the library destructor.
6654 {
6655 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6656 return;
6657 }
6658#endif
6659 /* synchronize the termination process */
6660 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6661
6662 /* have we already finished */
6663 if (__kmp_global.g.g_abort) {
6664 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6665 /* TODO abort? */
6666 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6667 return;
6668 }
6669 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6670 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6671 return;
6672 }
6673
6674 /* We need this lock to enforce mutex between this reading of
6675 __kmp_threads_capacity and the writing by __kmp_register_root.
6676 Alternatively, we can use a counter of roots that is atomically updated by
6677 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6678 __kmp_internal_end_*. */
6679
6680 /* should we finish the run-time? are all siblings done? */
6681 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6682
6683 for (i = 0; i < __kmp_threads_capacity; ++i) {
6684 if (KMP_UBER_GTID(i)) {
6685 KA_TRACE(
6686 10,
6687 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6688 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6689 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6690 return;
6691 }
6692 }
6693
6694 /* now we can safely conduct the actual termination */
6695
6696 __kmp_internal_end();
6697
6698 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6699 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6700
6701 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6702
6703#ifdef DUMP_DEBUG_ON_EXIT
6704 if (__kmp_debug_buf)
6705 __kmp_dump_debug_buffer();
6706#endif
6707} // __kmp_internal_end_thread
6708
6709// -----------------------------------------------------------------------------
6710// Library registration stuff.
6711
6712static long __kmp_registration_flag = 0;
6713// Random value used to indicate library initialization.
6714static char *__kmp_registration_str = NULL;
6715// Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6716
6717static inline char *__kmp_reg_status_name() {
6718/* On RHEL 3u5, if linked statically, getpid() returns different values in
6719 each thread. If registration and unregistration happen in different threads
6720 (omp_misc_other_root_exit.cpp test case), the registered_lib_env
6721 env var cannot be found, because the name will contain a different pid. */
6722// macOS* complains about the name being too long with the additional getuid()
6723#if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6724 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6725 (int)getuid());
6726#else
6727 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6728#endif
6729} // __kmp_reg_status_name
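// Illustrative sketch (not compiled): what the generated name looks like. On a
// Linux dynamic build with pid 12345 and uid 1000 the function above returns
// "__KMP_REGISTERED_LIB_12345_1000"; elsewhere it returns
// "__KMP_REGISTERED_LIB_12345". The helper below is an assumption for
// illustration only.
#if 0
#include <cstddef>
#include <cstdio>
#include <unistd.h>

static void example_reg_status_name(char *buf, std::size_t len) {
  std::snprintf(buf, len, "__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
                (int)getuid());
}
#endif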
6730
6731#if defined(KMP_USE_SHM)
6732// If /dev/shm is not accessible, we will create a temporary file under /tmp.
6733char *temp_reg_status_file_name = nullptr;
6734#endif
6735
6736void __kmp_register_library_startup(void) {
6737
6738 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6739 int done = 0;
6740 union {
6741 double dtime;
6742 long ltime;
6743 } time;
6744#if KMP_ARCH_X86 || KMP_ARCH_X86_64
6745 __kmp_initialize_system_tick();
6746#endif
6747 __kmp_read_system_time(&time.dtime);
6748 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6749 __kmp_registration_str =
6750 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6751 __kmp_registration_flag, KMP_LIBRARY_FILE);
6752
6753 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6754 __kmp_registration_str));
6755
6756 while (!done) {
6757
6758 char *value = NULL; // Actual value of the environment variable.
6759
6760#if defined(KMP_USE_SHM)
6761 char *shm_name = __kmp_str_format("/%s", name);
6762 int shm_preexist = 0;
6763 char *data1;
6764 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6765 if ((fd1 == -1) && (errno == EEXIST)) {
6766 // file didn't open because it already exists.
6767 // try opening existing file
6768 fd1 = shm_open(shm_name, O_RDWR, 0666);
6769 if (fd1 == -1) { // file didn't open
6770 // error out here
6771 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6772 __kmp_msg_null);
6773 } else {
6774 // able to open existing file
6775 shm_preexist = 1;
6776 }
6777 } else if (fd1 == -1) {
6778 // SHM didn't open; it failed for a reason other than 'already exists'. Try to
6779 // create a temp file under /tmp.
6780 // TODO: /tmp might not always be the temporary directory. For now we will
6781 // not consider TMPDIR. If /tmp is not accessible, we simply error out.
6782 char *temp_file_name = __kmp_str_format("/tmp/%sXXXXXX", name);
6783 fd1 = mkstemp(temp_file_name);
6784 if (fd1 == -1) {
6785 // error out here.
6786 __kmp_fatal(KMP_MSG(FunctionError, "Can't open TEMP"), KMP_ERR(errno),
6787 __kmp_msg_null);
6788 }
6789 temp_reg_status_file_name = temp_file_name;
6790 }
6791 if (shm_preexist == 0) {
6792 // we created the SHM; now set its size
6793 if (ftruncate(fd1, SHM_SIZE) == -1) {
6794 // error occurred while setting the size;
6795 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6796 KMP_ERR(errno), __kmp_msg_null);
6797 }
6798 }
6799 data1 =
6800 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6801 if (data1 == MAP_FAILED) {
6802 // failed to map shared memory
6803 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6804 __kmp_msg_null);
6805 }
6806 if (shm_preexist == 0) { // set data to SHM, set value
6807 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6808 }
6809 // Read value from either what we just wrote or existing file.
6810 value = __kmp_str_format("%s", data1); // read value from SHM
6811 munmap(data1, SHM_SIZE);
6812 close(fd1);
6813#else // Windows and unix with static library
6814 // Set the environment variable, but do not overwrite it if it already exists.
6815 __kmp_env_set(name, __kmp_registration_str, 0);
6816 // read value to see if it got set
6817 value = __kmp_env_get(name);
6818#endif
6819
6820 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6821 done = 1; // Ok, environment variable set successfully, exit the loop.
6822 } else {
6823 // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6824 // Check whether it is alive or dead.
6825 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6826 char *tail = value;
6827 char *flag_addr_str = NULL;
6828 char *flag_val_str = NULL;
6829 char const *file_name = NULL;
6830 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6831 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6832 file_name = tail;
6833 if (tail != NULL) {
6834 unsigned long *flag_addr = 0;
6835 unsigned long flag_val = 0;
6836 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
6837 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6838 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6839 // First, check whether environment-encoded address is mapped into
6840 // addr space.
6841 // If so, dereference it to see if it still has the right value.
6842 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6843 neighbor = 1;
6844 } else {
6845 // If not, then we know the other copy of the library is no longer
6846 // running.
6847 neighbor = 2;
6848 }
6849 }
6850 }
6851 switch (neighbor) {
6852 case 0: // Cannot parse environment variable -- neighbor status unknown.
6853 // Assume it is the incompatible format of a future version of the
6854 // library. Assume the other library is alive.
6855 // WARN( ... ); // TODO: Issue a warning.
6856 file_name = "unknown library";
6857 KMP_FALLTHROUGH();
6858 // Attention! Falling through to the next case. That's intentional.
6859 case 1: { // Neighbor is alive.
6860 // Check it is allowed.
6861 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6862 if (!__kmp_str_match_true(duplicate_ok)) {
6863 // That's not allowed. Issue fatal error.
6864 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6865 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6866 }
6867 KMP_INTERNAL_FREE(duplicate_ok);
6868 __kmp_duplicate_library_ok = 1;
6869 done = 1; // Exit the loop.
6870 } break;
6871 case 2: { // Neighbor is dead.
6872
6873#if defined(KMP_USE_SHM)
6874 // close shared memory.
6875 shm_unlink(shm_name); // this removes file in /dev/shm
6876#else
6877 // Clear the variable and try to register library again.
6878 __kmp_env_unset(name);
6879#endif
6880 } break;
6881 default: {
6882 KMP_DEBUG_ASSERT(0);
6883 } break;
6884 }
6885 }
6886 KMP_INTERNAL_FREE((void *)value);
6887#if defined(KMP_USE_SHM)
6888 KMP_INTERNAL_FREE((void *)shm_name);
6889#endif
6890 } // while
6891 KMP_INTERNAL_FREE((void *)name);
6892
6893} // func __kmp_register_library_startup
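// Illustrative sketch (not compiled): the create-exclusive-or-read-back
// handshake used above, reduced to its essentials. A return value of 0 means
// "we registered", 1 means "another copy had already registered and its string
// is in found", and -1 means failure. All names and the 1024-byte size are
// assumptions for illustration; the runtime uses SHM_SIZE and its own error
// reporting.
#if 0
#include <cerrno>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

static int example_register(const char *shm_name, const char *my_value,
                            char *found, std::size_t found_len) {
  int preexist = 0;
  int fd = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
  if (fd == -1 && errno == EEXIST) {
    fd = shm_open(shm_name, O_RDWR, 0666); // somebody else created it first
    preexist = 1;
  }
  if (fd == -1)
    return -1;
  if (!preexist && ftruncate(fd, 1024) == -1) {
    close(fd);
    return -1;
  }
  char *data = (char *)mmap(0, 1024, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if (data == MAP_FAILED) {
    close(fd);
    return -1;
  }
  if (!preexist) // we own the segment: publish our value
    std::strncpy(data, my_value, 1023);
  std::strncpy(found, data, found_len - 1); // read back whoever got there first
  found[found_len - 1] = '\0';
  munmap(data, 1024);
  close(fd);
  return preexist;
}
#endif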
6894
6895void __kmp_unregister_library(void) {
6896
6897 char *name = __kmp_reg_status_name();
6898 char *value = NULL;
6899
6900#if defined(KMP_USE_SHM)
6901 bool use_shm = true;
6902 char *shm_name = __kmp_str_format("/%s", name);
6903 int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6904 if (fd1 == -1) {
6905 // File did not open. Try the temporary file.
6906 use_shm = false;
6907 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6908 fd1 = open(temp_reg_status_file_name, O_RDONLY);
6909 if (fd1 == -1) {
6910 // give up now.
6911 return;
6912 }
6913 }
6914 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6915 if (data1 != MAP_FAILED) {
6916 value = __kmp_str_format("%s", data1); // read value from SHM
6917 munmap(data1, SHM_SIZE);
6918 }
6919 close(fd1);
6920#else
6921 value = __kmp_env_get(name);
6922#endif
6923
6924 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6925 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6926 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6927// Ok, this is our variable. Delete it.
6928#if defined(KMP_USE_SHM)
6929 if (use_shm) {
6930 shm_unlink(shm_name); // this removes file in /dev/shm
6931 } else {
6932 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6933 unlink(temp_reg_status_file_name); // this removes the temp file
6934 }
6935#else
6936 __kmp_env_unset(name);
6937#endif
6938 }
6939
6940#if defined(KMP_USE_SHM)
6941 KMP_INTERNAL_FREE(shm_name);
6942 if (!use_shm) {
6943 KMP_DEBUG_ASSERT(temp_reg_status_file_name);
6944 KMP_INTERNAL_FREE(temp_reg_status_file_name);
6945 }
6946#endif
6947
6948 KMP_INTERNAL_FREE(__kmp_registration_str);
6949 KMP_INTERNAL_FREE(value);
6950 KMP_INTERNAL_FREE(name);
6951
6952 __kmp_registration_flag = 0;
6953 __kmp_registration_str = NULL;
6954
6955} // __kmp_unregister_library
6956
6957// End of Library registration stuff.
6958// -----------------------------------------------------------------------------
6959
6960#if KMP_MIC_SUPPORTED
6961
6962static void __kmp_check_mic_type() {
6963 kmp_cpuid_t cpuid_state = {0};
6964 kmp_cpuid_t *cs_p = &cpuid_state;
6965 __kmp_x86_cpuid(1, 0, cs_p);
6966 // We don't support mic1 at the moment
6967 if ((cs_p->eax & 0xff0) == 0xB10) {
6968 __kmp_mic_type = mic2;
6969 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6970 __kmp_mic_type = mic3;
6971 } else {
6972 __kmp_mic_type = non_mic;
6973 }
6974}
6975
6976#endif /* KMP_MIC_SUPPORTED */
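// Illustrative sketch (not compiled): reading the same CPUID signature with the
// compiler-provided <cpuid.h> helpers instead of __kmp_x86_cpuid. The function
// name example_cpu_signature is an assumption for illustration; only x86
// compilers that ship <cpuid.h> (GCC, Clang, ICC/ICX) provide __get_cpuid.
#if 0
#include <cpuid.h>
#include <cstdio>

static void example_cpu_signature(void) {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx))
    std::printf("leaf 1 EAX = 0x%x, masked family/model = 0x%x\n", eax,
                eax & 0xf0ff0);
}
#endif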
6977
6978#if KMP_HAVE_UMWAIT
6979static void __kmp_user_level_mwait_init() {
6980 struct kmp_cpuid buf;
6981 __kmp_x86_cpuid(7, 0, &buf);
6982 __kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
6983 __kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
6984 __kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
6985 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6986 __kmp_umwait_enabled));
6987}
6988#elif KMP_HAVE_MWAIT
6989#ifndef AT_INTELPHIUSERMWAIT
6990// Spurious, non-existent value that should always fail to return anything.
6991// Will be replaced with the correct value once it is known.
6992#define AT_INTELPHIUSERMWAIT 10000
6993#endif
6994// getauxval() function is available in RHEL7 and SLES12. If a system with an
6995// earlier OS is used to build the RTL, we'll use the following internal
6996// function when the entry is not found.
6997unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6998unsigned long getauxval(unsigned long) { return 0; }
6999
7000static void __kmp_user_level_mwait_init() {
7001 // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are available,
7002 // use them to determine whether user-level mwait is enabled. Otherwise, forcibly
7003 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
7004 // KMP_USER_LEVEL_MWAIT was set to TRUE.
7005 if (__kmp_mic_type == mic3) {
7006 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
7007 if ((res & 0x1) || __kmp_user_level_mwait) {
7008 __kmp_mwait_enabled = TRUE;
7009 if (__kmp_user_level_mwait) {
7010 KMP_INFORM(EnvMwaitWarn);
7011 }
7012 } else {
7013 __kmp_mwait_enabled = FALSE;
7014 }
7015 }
7016 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
7017 "__kmp_mwait_enabled = %d\n",
7018 __kmp_mic_type, __kmp_mwait_enabled));
7019}
7020#endif /* KMP_HAVE_UMWAIT */
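// Illustrative sketch (not compiled): the WAITPKG feature test performed by the
// UMWAIT path above is CPUID.(EAX=7,ECX=0):ECX[5]. The helper below restates it
// with <cpuid.h> (GCC 7+/Clang); the name example_has_waitpkg is an assumption
// for illustration.
#if 0
#include <cpuid.h>

static bool example_has_waitpkg(void) {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return false;
  return ((ecx >> 5) & 1) != 0; // bit 5 of ECX: waitpkg (umwait/tpause)
}
#endif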
7021
7022static void __kmp_do_serial_initialize(void) {
7023 int i, gtid;
7024 size_t size;
7025
7026 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
7027
7028 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
7029 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
7030 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
7031 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
7032 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
7033
7034#if OMPT_SUPPORT
7035 ompt_pre_init();
7036#endif
7037#if OMPD_SUPPORT
7038 __kmp_env_dump();
7039 ompd_init();
7040#endif
7041
7042 __kmp_validate_locks();
7043
7044 /* Initialize internal memory allocator */
7045 __kmp_init_allocator();
7046
7047 /* Register the library startup via an environment variable or via mapped
7048 shared memory file and check to see whether another copy of the library is
7049 already registered. Since a forked child process is often terminated, we
7050 postpone the registration until middle initialization in the child. */
7051 if (__kmp_need_register_serial)
7052 __kmp_register_library_startup();
7053
7054 /* TODO reinitialization of library */
7055 if (TCR_4(__kmp_global.g.g_done)) {
7056 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
7057 }
7058
7059 __kmp_global.g.g_abort = 0;
7060 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7061
7062/* initialize the locks */
7063#if KMP_USE_ADAPTIVE_LOCKS
7064#if KMP_DEBUG_ADAPTIVE_LOCKS
7065 __kmp_init_speculative_stats();
7066#endif
7067#endif
7068#if KMP_STATS_ENABLED
7069 __kmp_stats_init();
7070#endif
7071 __kmp_init_lock(&__kmp_global_lock);
7072 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
7073 __kmp_init_lock(&__kmp_debug_lock);
7074 __kmp_init_atomic_lock(&__kmp_atomic_lock);
7075 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
7076 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
7077 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
7078 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
7079 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
7080 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
7081 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
7082 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
7083 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
7084 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
7085 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
7086 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
7087 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
7088 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
7089#if KMP_USE_MONITOR
7090 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
7091#endif
7092 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
7093
7094 /* conduct initialization and initial setup of configuration */
7095
7096 __kmp_runtime_initialize();
7097
7098#if KMP_MIC_SUPPORTED
7099 __kmp_check_mic_type();
7100#endif
7101
7102// Some global variable initialization moved here from kmp_env_initialize()
7103#ifdef KMP_DEBUG
7104 kmp_diag = 0;
7105#endif
7106 __kmp_abort_delay = 0;
7107
7108 // From __kmp_init_dflt_team_nth()
7109 /* assume the entire machine will be used */
7110 __kmp_dflt_team_nth_ub = __kmp_xproc;
7111 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
7112 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7113 }
7114 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
7115 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7116 }
7117 __kmp_max_nth = __kmp_sys_max_nth;
7118 __kmp_cg_max_nth = __kmp_sys_max_nth;
7119 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
7120 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
7121 __kmp_teams_max_nth = __kmp_sys_max_nth;
7122 }
7123
7124 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
7125 // part
7126 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7127#if KMP_USE_MONITOR
7128 __kmp_monitor_wakeups =
7129 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7130 __kmp_bt_intervals =
7131 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
7132#endif
7133 // From "KMP_LIBRARY" part of __kmp_env_initialize()
7134 __kmp_library = library_throughput;
7135 // From KMP_SCHEDULE initialization
7136 __kmp_static = kmp_sch_static_balanced;
7137// AC: do not use analytical here, because it is non-monotonic
7138//__kmp_guided = kmp_sch_guided_iterative_chunked;
7139//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
7140// need to repeat assignment
7141// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
7142// bit control and barrier method control parts
7143#if KMP_FAST_REDUCTION_BARRIER
7144#define kmp_reduction_barrier_gather_bb ((int)1)
7145#define kmp_reduction_barrier_release_bb ((int)1)
7146#define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt
7147#define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt
7148#endif // KMP_FAST_REDUCTION_BARRIER
7149 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
7150 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
7151 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
7152 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
7153 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
7154#if KMP_FAST_REDUCTION_BARRIER
7155 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only
7156 // (lin_64): hyper,1
7157 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
7158 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
7159 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
7160 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
7161 }
7162#endif // KMP_FAST_REDUCTION_BARRIER
7163 }
7164#if KMP_FAST_REDUCTION_BARRIER
7165#undef kmp_reduction_barrier_release_pat
7166#undef kmp_reduction_barrier_gather_pat
7167#undef kmp_reduction_barrier_release_bb
7168#undef kmp_reduction_barrier_gather_bb
7169#endif // KMP_FAST_REDUCTION_BARRIER
7170#if KMP_MIC_SUPPORTED
7171 if (__kmp_mic_type == mic2) { // KNC
7172 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7173 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
7174 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
7175 1; // forkjoin release
7176 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7177 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
7178 }
7179#if KMP_FAST_REDUCTION_BARRIER
7180 if (__kmp_mic_type == mic2) { // KNC
7181 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7182 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
7183 }
7184#endif // KMP_FAST_REDUCTION_BARRIER
7185#endif // KMP_MIC_SUPPORTED
7186
7187// From KMP_CHECKS initialization
7188#ifdef KMP_DEBUG
7189 __kmp_env_checks = TRUE; /* development versions have the extra checks */
7190#else
7191 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7192#endif
7193
7194 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7195 __kmp_foreign_tp = TRUE;
7196
7197 __kmp_global.g.g_dynamic = FALSE;
7198 __kmp_global.g.g_dynamic_mode = dynamic_default;
7199
7200 __kmp_init_nesting_mode();
7201
7202 __kmp_env_initialize(NULL);
7203
7204#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
7205 __kmp_user_level_mwait_init();
7206#endif
7207// Print all messages in message catalog for testing purposes.
7208#ifdef KMP_DEBUG
7209 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
7210 if (__kmp_str_match_true(val)) {
7211 kmp_str_buf_t buffer;
7212 __kmp_str_buf_init(&buffer);
7213 __kmp_i18n_dump_catalog(&buffer);
7214 __kmp_printf("%s", buffer.str);
7215 __kmp_str_buf_free(&buffer);
7216 }
7217 __kmp_env_free(&val);
7218#endif
7219
7220 __kmp_threads_capacity =
7221 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
7222 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7223 __kmp_tp_capacity = __kmp_default_tp_capacity(
7224 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7225
7226 // If the library is shut down properly, both pools must be NULL. Just in
7227 // case, set them to NULL -- some memory may leak, but subsequent code will
7228 // work even if pools are not freed.
7229 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
7230 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
7231 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
7232 __kmp_thread_pool = NULL;
7233 __kmp_thread_pool_insert_pt = NULL;
7234 __kmp_team_pool = NULL;
7235
7236 /* Allocate all of the variable sized records */
7237 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
7238 * expandable */
7239 /* Since allocation is cache-aligned, just add extra padding at the end */
7240 size =
7241 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
7242 CACHE_LINE;
7243 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
7244 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
7245 sizeof(kmp_info_t *) * __kmp_threads_capacity);
7246
7247 /* init thread counts */
7248 KMP_DEBUG_ASSERT(__kmp_all_nth ==
7249 0); // Asserts fail if the library is reinitializing and
7250 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
7251 __kmp_all_nth = 0;
7252 __kmp_nth = 0;
7253
7254 /* setup the uber master thread and hierarchy */
7255 gtid = __kmp_register_root(TRUE);
7256 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
7257 KMP_ASSERT(KMP_UBER_GTID(gtid));
7258 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
7259
7260 KMP_MB(); /* Flush all pending memory write invalidates. */
7261
7262 __kmp_common_initialize();
7263
7264#if KMP_OS_UNIX
7265 /* invoke the child fork handler */
7266 __kmp_register_atfork();
7267#endif
7268
7269#if !KMP_DYNAMIC_LIB || \
7270 ((KMP_COMPILER_ICC || KMP_COMPILER_ICX) && KMP_OS_DARWIN)
7271 {
7272 /* Invoke the exit handler when the program finishes, only for static
7273 library and macOS* dynamic. For other dynamic libraries, we already
7274 have _fini and DllMain. */
7275 int rc = atexit(__kmp_internal_end_atexit);
7276 if (rc != 0) {
7277 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
7278 __kmp_msg_null);
7279 }
7280 }
7281#endif
7282
7283#if KMP_HANDLE_SIGNALS
7284#if KMP_OS_UNIX
7285 /* NOTE: make sure that this is called before the user installs their own
7286 signal handlers so that the user handlers are called first. This way they
7287 can return false, not call our handler, avoid terminating the library, and
7288 continue execution where they left off. */
7289 __kmp_install_signals(FALSE);
7290#endif /* KMP_OS_UNIX */
7291#if KMP_OS_WINDOWS
7292 __kmp_install_signals(TRUE);
7293#endif /* KMP_OS_WINDOWS */
7294#endif
7295
7296 /* we have finished the serial initialization */
7297 __kmp_init_counter++;
7298
7299 __kmp_init_serial = TRUE;
7300
7301 if (__kmp_settings) {
7302 __kmp_env_print();
7303 }
7304
7305 if (__kmp_display_env || __kmp_display_env_verbose) {
7306 __kmp_env_print_2();
7307 }
7308
7309#if OMPT_SUPPORT
7310 ompt_post_init();
7311#endif
7312
7313 KMP_MB();
7314
7315 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
7316}
7317
7318void __kmp_serial_initialize(void) {
7319 if (__kmp_init_serial) {
7320 return;
7321 }
7322 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7323 if (__kmp_init_serial) {
7324 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7325 return;
7326 }
7327 __kmp_do_serial_initialize();
7328 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7329}
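// Illustrative sketch (not compiled): the check / lock / re-check shape used by
// __kmp_serial_initialize() and the other *_initialize() wrappers below, written
// with standard C++ primitives. All names here are assumptions for illustration;
// the runtime uses its bootstrap locks and TCR_4/TCW_SYNC_4 rather than
// std::atomic.
#if 0
#include <atomic>
#include <mutex>

static std::atomic<bool> example_initialized{false};
static std::mutex example_initz_lock;

static void example_do_initialize(void) { /* expensive one-time setup */ }

static void example_initialize(void) {
  if (example_initialized.load(std::memory_order_acquire))
    return; // fast path: already done, no lock taken
  std::lock_guard<std::mutex> guard(example_initz_lock);
  if (example_initialized.load(std::memory_order_relaxed))
    return; // somebody else initialized while we waited for the lock
  example_do_initialize();
  example_initialized.store(true, std::memory_order_release);
}
#endif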
7330
7331static void __kmp_do_middle_initialize(void) {
7332 int i, j;
7333 int prev_dflt_team_nth;
7334
7335 if (!__kmp_init_serial) {
7336 __kmp_do_serial_initialize();
7337 }
7338
7339 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
7340
7341 if (UNLIKELY(!__kmp_need_register_serial)) {
7342 // We are in a forked child process. The registration was skipped during
7343 // serial initialization in __kmp_atfork_child handler. Do it here.
7344 __kmp_register_library_startup();
7345 }
7346
7347 // Save the previous value for the __kmp_dflt_team_nth so that
7348 // we can avoid some reinitialization if it hasn't changed.
7349 prev_dflt_team_nth = __kmp_dflt_team_nth;
7350
7351#if KMP_AFFINITY_SUPPORTED
7352 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7353 // number of cores on the machine.
7354 __kmp_affinity_initialize(__kmp_affinity);
7355
7356#endif /* KMP_AFFINITY_SUPPORTED */
7357
7358 KMP_ASSERT(__kmp_xproc > 0);
7359 if (__kmp_avail_proc == 0) {
7360 __kmp_avail_proc = __kmp_xproc;
7361 }
7362
7363 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
7364 // correct them now
7365 j = 0;
7366 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
7367 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
7368 __kmp_avail_proc;
7369 j++;
7370 }
7371
7372 if (__kmp_dflt_team_nth == 0) {
7373#ifdef KMP_DFLT_NTH_CORES
7374 // Default #threads = #cores
7375 __kmp_dflt_team_nth = __kmp_ncores;
7376 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7377 "__kmp_ncores (%d)\n",
7378 __kmp_dflt_team_nth));
7379#else
7380 // Default #threads = #available OS procs
7381 __kmp_dflt_team_nth = __kmp_avail_proc;
7382 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
7383 "__kmp_avail_proc(%d)\n",
7384 __kmp_dflt_team_nth));
7385#endif /* KMP_DFLT_NTH_CORES */
7386 }
7387
7388 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
7389 __kmp_dflt_team_nth = KMP_MIN_NTH;
7390 }
7391 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
7392 __kmp_dflt_team_nth = __kmp_sys_max_nth;
7393 }
7394
7395 if (__kmp_nesting_mode > 0)
7396 __kmp_set_nesting_mode_threads();
7397
7398 // There's no harm in continuing if the following check fails,
7399 // but it indicates an error in the previous logic.
7400 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
7401
7402 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
7403 // Run through the __kmp_threads array and set the num threads icv for each
7404 // root thread that is currently registered with the RTL (which has not
7405 // already explicitly set its nthreads-var with a call to
7406 // omp_set_num_threads()).
7407 for (i = 0; i < __kmp_threads_capacity; i++) {
7408 kmp_info_t *thread = __kmp_threads[i];
7409 if (thread == NULL)
7410 continue;
7411 if (thread->th.th_current_task->td_icvs.nproc != 0)
7412 continue;
7413
7414 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7415 }
7416 }
7417 KA_TRACE(
7418 20,
7419 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7420 __kmp_dflt_team_nth));
7421
7422#ifdef KMP_ADJUST_BLOCKTIME
7423 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
7424 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7425 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7426 if (__kmp_nth > __kmp_avail_proc) {
7427 __kmp_zero_bt = TRUE;
7428 }
7429 }
7430#endif /* KMP_ADJUST_BLOCKTIME */
7431
7432 /* we have finished middle initialization */
7433 TCW_SYNC_4(__kmp_init_middle, TRUE);
7434
7435 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7436}
7437
7438void __kmp_middle_initialize(void) {
7439 if (__kmp_init_middle) {
7440 return;
7441 }
7442 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7443 if (__kmp_init_middle) {
7444 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7445 return;
7446 }
7447 __kmp_do_middle_initialize();
7448 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7449}
7450
7451void __kmp_parallel_initialize(void) {
7452 int gtid = __kmp_entry_gtid(); // this might be a new root
7453
7454 /* synchronize parallel initialization (for sibling) */
7455 if (TCR_4(__kmp_init_parallel))
7456 return;
7457 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7458 if (TCR_4(__kmp_init_parallel)) {
7459 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7460 return;
7461 }
7462
7463 /* TODO reinitialization after we have already shut down */
7464 if (TCR_4(__kmp_global.g.g_done)) {
7465 KA_TRACE(
7466 10,
7467 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7468 __kmp_infinite_loop();
7469 }
7470
7471 /* jc: The lock __kmp_initz_lock is already held, so calling
7472 __kmp_serial_initialize would cause a deadlock. So we call
7473 __kmp_do_serial_initialize directly. */
7474 if (!__kmp_init_middle) {
7475 __kmp_do_middle_initialize();
7476 }
7477 __kmp_assign_root_init_mask();
7478 __kmp_resume_if_hard_paused();
7479
7480 /* begin initialization */
7481 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7482 KMP_ASSERT(KMP_UBER_GTID(gtid));
7483
7484#if KMP_ARCH_X86 || KMP_ARCH_X86_64
7485 // Save the FP control regs.
7486 // Worker threads will set theirs to these values at thread startup.
7487 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7488 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7489 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7490#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7491
7492#if KMP_OS_UNIX
7493#if KMP_HANDLE_SIGNALS
7494 /* must be after __kmp_serial_initialize */
7495 __kmp_install_signals(TRUE);
7496#endif
7497#endif
7498
7499 __kmp_suspend_initialize();
7500
7501#if defined(USE_LOAD_BALANCE)
7502 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7503 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7504 }
7505#else
7506 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7507 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7508 }
7509#endif
7510
7511 if (__kmp_version) {
7512 __kmp_print_version_2();
7513 }
7514
7515 /* we have finished parallel initialization */
7516 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7517
7518 KMP_MB();
7519 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7520
7521 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7522}
7523
7524void __kmp_hidden_helper_initialize() {
7525 if (TCR_4(__kmp_init_hidden_helper))
7526 return;
7527
7528 // __kmp_parallel_initialize is required before we initialize hidden helper
7529 if (!TCR_4(__kmp_init_parallel))
7530 __kmp_parallel_initialize();
7531
7532 // Double check. Note that this double check should not be placed before
7533 // __kmp_parallel_initialize as it will cause a deadlock.
7534 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7535 if (TCR_4(__kmp_init_hidden_helper)) {
7536 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7537 return;
7538 }
7539
7540#if KMP_AFFINITY_SUPPORTED
7541 // Initialize hidden helper affinity settings.
7542 // The above __kmp_parallel_initialize() will initialize
7543 // regular affinity (and topology) if not already done.
7544 if (!__kmp_hh_affinity.flags.initialized)
7545 __kmp_affinity_initialize(__kmp_hh_affinity);
7546#endif
7547
7548 // Set the count of hidden helper tasks to be executed to zero
7549 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);
7550
7551 // Set the global variable indicating that we're initializing hidden helper
7552 // team/threads
7553 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);
7554
7555 // Platform independent initialization
7556 __kmp_do_initialize_hidden_helper_threads();
7557
7558 // Wait here for the finish of initialization of hidden helper teams
7559 __kmp_hidden_helper_threads_initz_wait();
7560
7561 // We have finished hidden helper initialization
7562 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);
7563
7564 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7565}
7566
7567/* ------------------------------------------------------------------------ */
7568
7569void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7570 kmp_team_t *team) {
7571 kmp_disp_t *dispatch;
7572
7573 KMP_MB();
7574
7575 /* none of the threads have encountered any constructs, yet. */
7576 this_thr->th.th_local.this_construct = 0;
7577#if KMP_CACHE_MANAGE
7578 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7579#endif /* KMP_CACHE_MANAGE */
7580 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7581 KMP_DEBUG_ASSERT(dispatch);
7582 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7583 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7584 // this_thr->th.th_info.ds.ds_tid ] );
7585
7586 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7587 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7588 if (__kmp_env_consistency_check)
7589 __kmp_push_parallel(gtid, team->t.t_ident);
7590
7591 KMP_MB(); /* Flush all pending memory write invalidates. */
7592}
7593
7594void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7595 kmp_team_t *team) {
7596 if (__kmp_env_consistency_check)
7597 __kmp_pop_parallel(gtid, team->t.t_ident);
7598
7599 __kmp_finish_implicit_task(this_thr);
7600}
7601
7602int __kmp_invoke_task_func(int gtid) {
7603 int rc;
7604 int tid = __kmp_tid_from_gtid(gtid);
7605 kmp_info_t *this_thr = __kmp_threads[gtid];
7606 kmp_team_t *team = this_thr->th.th_team;
7607
7608 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7609#if USE_ITT_BUILD
7610 if (__itt_stack_caller_create_ptr) {
7611 // inform ittnotify about entering user's code
7612 if (team->t.t_stack_id != NULL) {
7613 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id);
7614 } else {
7615 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7616 __kmp_itt_stack_callee_enter(
7617 (__itt_caller)team->t.t_parent->t.t_stack_id);
7618 }
7619 }
7620#endif /* USE_ITT_BUILD */
7621#if INCLUDE_SSC_MARKS
7622 SSC_MARK_INVOKING();
7623#endif
7624
7625#if OMPT_SUPPORT
7626 void *dummy;
7627 void **exit_frame_p;
7628 ompt_data_t *my_task_data;
7629 ompt_data_t *my_parallel_data;
7630 int ompt_team_size;
7631
7632 if (ompt_enabled.enabled) {
7633 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid]
7634 .ompt_task_info.frame.exit_frame.ptr);
7635 } else {
7636 exit_frame_p = &dummy;
7637 }
7638
7639 my_task_data =
7640 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7641 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7642 if (ompt_enabled.ompt_callback_implicit_task) {
7643 ompt_team_size = team->t.t_nproc;
7644 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7645 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7646 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7647 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7648 }
7649#endif
7650
7651#if KMP_STATS_ENABLED
7652 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7653 if (previous_state == stats_state_e::TEAMS_REGION) {
7654 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7655 } else {
7656 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7657 }
7658 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7659#endif
7660
7661 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7662 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7663#if OMPT_SUPPORT
7664 ,
7665 exit_frame_p
7666#endif
7667 );
7668#if OMPT_SUPPORT
7669 *exit_frame_p = NULL;
7670 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7671#endif
7672
7673#if KMP_STATS_ENABLED
7674 if (previous_state == stats_state_e::TEAMS_REGION) {
7675 KMP_SET_THREAD_STATE(previous_state);
7676 }
7677 KMP_POP_PARTITIONED_TIMER();
7678#endif
7679
7680#if USE_ITT_BUILD
7681 if (__itt_stack_caller_create_ptr) {
7682 // inform ittnotify about leaving user's code
7683 if (team->t.t_stack_id != NULL) {
7684 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id);
7685 } else {
7686 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL);
7687 __kmp_itt_stack_callee_leave(
7688 (__itt_caller)team->t.t_parent->t.t_stack_id);
7689 }
7690 }
7691#endif /* USE_ITT_BUILD */
7692 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7693
7694 return rc;
7695}
7696
7697void __kmp_teams_master(int gtid) {
7698 // This routine is called by all primary threads in the teams construct
7699 kmp_info_t *thr = __kmp_threads[gtid];
7700 kmp_team_t *team = thr->th.th_team;
7701 ident_t *loc = team->t.t_ident;
7702 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7703 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7704 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7705 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7706 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7707
7708 // This thread is a new CG root. Set up the proper variables.
7709 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7710 tmp->cg_root = thr; // Make thr the CG root
7711 // Init to thread limit stored when league primary threads were forked
7712 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7713 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7714 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7715 " cg_nthreads to 1\n",
7716 thr, tmp));
7717 tmp->up = thr->th.th_cg_roots;
7718 thr->th.th_cg_roots = tmp;
7719
7720// Launch the league of teams now, but do not let the workers execute
7721// (they hang on the fork barrier until the next parallel region)
7722#if INCLUDE_SSC_MARKS
7723 SSC_MARK_FORKING();
7724#endif
7725 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7726 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7727 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7728#if INCLUDE_SSC_MARKS
7729 SSC_MARK_JOINING();
7730#endif
7731 // If the team size was reduced from the limit, set it to the new size
7732 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7733 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7734 // AC: the last parameter "1" eliminates the join barrier, which won't work
7735 // because worker threads are in a fork barrier waiting for more parallel regions
7736 __kmp_join_call(loc, gtid
7737#if OMPT_SUPPORT
7738 ,
7739 fork_context_intel
7740#endif
7741 ,
7742 1);
7743}
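// Illustrative sketch (not compiled): user code that reaches __kmp_teams_master()
// above. Each initial thread of the league executes the teams body as the
// "wrapped" microtask and then forks the inner parallel region with the size
// pushed via th_set_nproc. The clause values 4 and 2 are arbitrary examples.
#if 0
#include <cstdio>
#include <omp.h>

int main(void) {
#pragma omp teams num_teams(4) thread_limit(2)
#pragma omp parallel
  std::printf("team %d of %d, thread %d of %d\n", omp_get_team_num(),
              omp_get_num_teams(), omp_get_thread_num(), omp_get_num_threads());
  return 0;
}
#endif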
7744
7745int __kmp_invoke_teams_master(int gtid) {
7746 kmp_info_t *this_thr = __kmp_threads[gtid];
7747 kmp_team_t *team = this_thr->th.th_team;
7748#if KMP_DEBUG
7749 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7750 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7751 (void *)__kmp_teams_master);
7752#endif
7753 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7754#if OMPT_SUPPORT
7755 int tid = __kmp_tid_from_gtid(gtid);
7756 ompt_data_t *task_data =
7757 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7758 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7759 if (ompt_enabled.ompt_callback_implicit_task) {
7760 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7761 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7762 ompt_task_initial);
7763 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7764 }
7765#endif
7766 __kmp_teams_master(gtid);
7767#if OMPT_SUPPORT
7768 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7769#endif
7770 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7771 return 1;
7772}
7773
7774/* This sets the requested number of threads for the next parallel region
7775 encountered by this team. Since this should be enclosed in the fork/join
7776 critical section, it should avoid race conditions with asymmetrical nested
7777 parallelism. */
7778
7779void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7780 kmp_info_t *thr = __kmp_threads[gtid];
7781
7782 if (num_threads > 0)
7783 thr->th.th_set_nproc = num_threads;
7784}
7785
7786static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
7787 int num_threads) {
7788 KMP_DEBUG_ASSERT(thr);
7789 // Remember the number of threads for inner parallel regions
7790 if (!TCR_4(__kmp_init_middle))
7791 __kmp_middle_initialize(); // get internal globals calculated
7792 __kmp_assign_root_init_mask();
7793 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7794 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
7795
7796 if (num_threads == 0) {
7797 if (__kmp_teams_thread_limit > 0) {
7798 num_threads = __kmp_teams_thread_limit;
7799 } else {
7800 num_threads = __kmp_avail_proc / num_teams;
7801 }
7802 // adjust num_threads without a warning as it is not a user setting
7803 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7804 // no thread_limit clause specified - do not change thread-limit-var ICV
7805 if (num_threads > __kmp_dflt_team_nth) {
7806 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7807 }
7808 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7809 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7810 } // prevent the team size from exceeding thread-limit-var
7811 if (num_teams * num_threads > __kmp_teams_max_nth) {
7812 num_threads = __kmp_teams_max_nth / num_teams;
7813 }
7814 if (num_threads == 0) {
7815 num_threads = 1;
7816 }
7817 } else {
7818 if (num_threads < 0) {
7819 __kmp_msg(kmp_ms_warning, KMP_MSG(CantFormThrTeam, num_threads, 1),
7820 __kmp_msg_null);
7821 num_threads = 1;
7822 }
7823 // This thread will be the primary thread of the league's primary threads.
7824 // Store the new thread limit; the old limit is saved in the th_cg_roots list.
7825 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7826 // num_threads = min(num_threads, nthreads-var)
7827 if (num_threads > __kmp_dflt_team_nth) {
7828 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7829 }
7830 if (num_teams * num_threads > __kmp_teams_max_nth) {
7831 int new_threads = __kmp_teams_max_nth / num_teams;
7832 if (new_threads == 0) {
7833 new_threads = 1;
7834 }
7835 if (new_threads != num_threads) {
7836 if (!__kmp_reserve_warn) { // user asked for too many threads
7837 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7838 __kmp_msg(kmp_ms_warning,
7839 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7840 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7841 }
7842 }
7843 num_threads = new_threads;
7844 }
7845 }
7846 thr->th.th_teams_size.nth = num_threads;
7847}
7848
7849/* this sets the requested number of teams for the teams region and/or
7850 the number of threads for the next parallel region encountered */
7851void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7852 int num_threads) {
7853 kmp_info_t *thr = __kmp_threads[gtid];
7854 if (num_teams < 0) {
7855 // OpenMP specification requires requested values to be positive,
7856 // but people can send us any value, so we'd better check
7857 __kmp_msg(kmp_ms_warning, KMP_MSG(NumTeamsNotPositive, num_teams, 1),
7858 __kmp_msg_null);
7859 num_teams = 1;
7860 }
7861 if (num_teams == 0) {
7862 if (__kmp_nteams > 0) {
7863 num_teams = __kmp_nteams;
7864 } else {
7865 num_teams = 1; // default number of teams is 1.
7866 }
7867 }
7868 if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7869 if (!__kmp_reserve_warn) {
7870 __kmp_reserve_warn = 1;
7871 __kmp_msg(kmp_ms_warning,
7872 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7873 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7874 }
7875 num_teams = __kmp_teams_max_nth;
7876 }
7877 // Set number of teams (number of threads in the outer "parallel" of the
7878 // teams)
7879 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7880
7881 __kmp_push_thread_limit(thr, num_teams, num_threads);
7882}
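/* Illustrative example (hedged sketch): a host teams construct with num_teams
   and thread_limit clauses is expected to reach __kmp_push_num_teams through
   the __kmpc_push_num_teams entry point before the teams fork.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       // Requests 4 teams with at most 8 threads each; the runtime may clip
       // these further against KMP_TEAMS_THREAD_LIMIT and __kmp_teams_max_nth.
       #pragma omp teams num_teams(4) thread_limit(8)
       printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
       return 0;
     }
*/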
7883
7884/* This sets the requested number of teams for the teams region and/or
7885 the number of threads for the next parallel region encountered */
7886void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7887 int num_teams_ub, int num_threads) {
7888 kmp_info_t *thr = __kmp_threads[gtid];
7889 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7890 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7891 KMP_DEBUG_ASSERT(num_threads >= 0);
7892
7893 if (num_teams_lb > num_teams_ub) {
7894 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7895 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7896 }
7897
7898 int num_teams = 1; // default number of teams is 1.
7899
7900 if (num_teams_lb == 0 && num_teams_ub > 0)
7901 num_teams_lb = num_teams_ub;
7902
7903 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7904 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7905 if (num_teams > __kmp_teams_max_nth) {
7906 if (!__kmp_reserve_warn) {
7907 __kmp_reserve_warn = 1;
7908 __kmp_msg(kmp_ms_warning,
7909 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7910 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7911 }
7912 num_teams = __kmp_teams_max_nth;
7913 }
7914 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
7915 num_teams = num_teams_ub;
7916 } else { // num_teams_lb <= num_teams <= num_teams_ub
7917 if (num_threads <= 0) {
7918 if (num_teams_ub > __kmp_teams_max_nth) {
7919 num_teams = num_teams_lb;
7920 } else {
7921 num_teams = num_teams_ub;
7922 }
7923 } else {
7924 num_teams = (num_threads > __kmp_teams_max_nth)
7925 ? num_teams
7926 : __kmp_teams_max_nth / num_threads;
7927 if (num_teams < num_teams_lb) {
7928 num_teams = num_teams_lb;
7929 } else if (num_teams > num_teams_ub) {
7930 num_teams = num_teams_ub;
7931 }
7932 }
7933 }
7934 // Set number of teams (number of threads in the outer "parallel" of the
7935 // teams)
7936 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7937
7938 __kmp_push_thread_limit(thr, num_teams, num_threads);
7939}
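/* Illustrative example for the OpenMP 5.1 lower:upper form of num_teams,
   which this routine implements. Assumption: the compiler lowers the clause
   through a __kmpc_push_num_teams_51 entry point that forwards here.

     // Ask for between 2 and 8 teams; the runtime picks a value in that range
     // subject to __kmp_teams_max_nth and the thread_limit computation above.
     #pragma omp teams num_teams(2 : 8) thread_limit(4)
     {
       // ... teams work ...
     }
*/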
7940
7941// Set the proc_bind var to use in the following parallel region.
7942void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7943 kmp_info_t *thr = __kmp_threads[gtid];
7944 thr->th.th_set_proc_bind = proc_bind;
7945}
7946
7947/* Launch the worker threads into the microtask. */
7948
7949void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7950 kmp_info_t *this_thr = __kmp_threads[gtid];
7951
7952#ifdef KMP_DEBUG
7953 int f;
7954#endif /* KMP_DEBUG */
7955
7956 KMP_DEBUG_ASSERT(team);
7957 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7958 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7959 KMP_MB(); /* Flush all pending memory write invalidates. */
7960
7961 team->t.t_construct = 0; /* no single directives seen yet */
7962 team->t.t_ordered.dt.t_value =
7963 0; /* thread 0 enters the ordered section first */
7964
7965 /* Reset the identifiers on the dispatch buffer */
7966 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7967 if (team->t.t_max_nproc > 1) {
7968 int i;
7969 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7970 team->t.t_disp_buffer[i].buffer_index = i;
7971 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7972 }
7973 } else {
7974 team->t.t_disp_buffer[0].buffer_index = 0;
7975 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7976 }
7977
7978 KMP_MB(); /* Flush all pending memory write invalidates. */
7979 KMP_ASSERT(this_thr->th.th_team == team);
7980
7981#ifdef KMP_DEBUG
7982 for (f = 0; f < team->t.t_nproc; f++) {
7983 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7984 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7985 }
7986#endif /* KMP_DEBUG */
7987
7988 /* release the worker threads so they may begin working */
7989 __kmp_fork_barrier(gtid, 0);
7990}
7991
7992void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7993 kmp_info_t *this_thr = __kmp_threads[gtid];
7994
7995 KMP_DEBUG_ASSERT(team);
7996 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7997 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7998 KMP_MB(); /* Flush all pending memory write invalidates. */
7999
8000 /* Join barrier after fork */
8001
8002#ifdef KMP_DEBUG
8003 if (__kmp_threads[gtid] &&
8004 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
8005 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
8006 __kmp_threads[gtid]);
8007 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
8008 "team->t.t_nproc=%d\n",
8009 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
8010 team->t.t_nproc);
8011 __kmp_print_structure();
8012 }
8013 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
8014 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
8015#endif /* KMP_DEBUG */
8016
8017 __kmp_join_barrier(gtid); /* wait for everyone */
8018#if OMPT_SUPPORT
8019 if (ompt_enabled.enabled &&
8020 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
8021 int ds_tid = this_thr->th.th_info.ds.ds_tid;
8022 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
8023 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
8024#if OMPT_OPTIONAL
8025 void *codeptr = NULL;
8026 if (KMP_MASTER_TID(ds_tid) &&
8027 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
8028 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
8029 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
8030
8031 if (ompt_enabled.ompt_callback_sync_region_wait) {
8032 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
8033 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8034 codeptr);
8035 }
8036 if (ompt_enabled.ompt_callback_sync_region) {
8037 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
8038 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
8039 codeptr);
8040 }
8041#endif
8042 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
8043 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
8044 ompt_scope_end, NULL, task_data, 0, ds_tid,
8045 ompt_task_implicit); // TODO: Can this be ompt_task_initial?
8046 }
8047 }
8048#endif
8049
8050 KMP_MB(); /* Flush all pending memory write invalidates. */
8051 KMP_ASSERT(this_thr->th.th_team == team);
8052}
8053
8054/* ------------------------------------------------------------------------ */
8055
8056#ifdef USE_LOAD_BALANCE
8057
8058// Return the number of worker threads actively spinning in the hot team
8059// if we are at the outermost level of parallelism. Otherwise, return 0.
8060static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
8061 int i;
8062 int retval;
8063 kmp_team_t *hot_team;
8064
8065 if (root->r.r_active) {
8066 return 0;
8067 }
8068 hot_team = root->r.r_hot_team;
8069 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
8070 return hot_team->t.t_nproc - 1; // Don't count primary thread
8071 }
8072
8073 // Skip the primary thread - it is accounted for elsewhere.
8074 retval = 0;
8075 for (i = 1; i < hot_team->t.t_nproc; i++) {
8076 if (hot_team->t.t_threads[i]->th.th_active) {
8077 retval++;
8078 }
8079 }
8080 return retval;
8081}
8082
8083// Perform an automatic adjustment to the number of
8084// threads used by the next parallel region.
8085static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
8086 int retval;
8087 int pool_active;
8088 int hot_team_active;
8089 int team_curr_active;
8090 int system_active;
8091
8092 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
8093 set_nproc));
8094 KMP_DEBUG_ASSERT(root);
8095 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
8096 ->th.th_current_task->td_icvs.dynamic == TRUE);
8097 KMP_DEBUG_ASSERT(set_nproc > 1);
8098
8099 if (set_nproc == 1) {
8100 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
8101 return 1;
8102 }
8103
8104 // Threads that are active in the thread pool, active in the hot team for this
8105 // particular root (if we are at the outer par level), and the currently
8106 // executing thread (to become the primary thread) are available to add to the
8107 // new team, but are currently contributing to the system load, and must be
8108 // accounted for.
8109 pool_active = __kmp_thread_pool_active_nth;
8110 hot_team_active = __kmp_active_hot_team_nproc(root);
8111 team_curr_active = pool_active + hot_team_active + 1;
8112
8113 // Check the system load.
8114 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
8115 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
8116 "hot team active = %d\n",
8117 system_active, pool_active, hot_team_active));
8118
8119 if (system_active < 0) {
8120 // There was an error reading the necessary info from /proc, so use the
8121 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
8122 // = dynamic_thread_limit, we shouldn't wind up getting back here.
8123 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8124 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
8125
8126 // Make this call behave like the thread limit algorithm.
8127 retval = __kmp_avail_proc - __kmp_nth +
8128 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
8129 if (retval > set_nproc) {
8130 retval = set_nproc;
8131 }
8132 if (retval < KMP_MIN_NTH) {
8133 retval = KMP_MIN_NTH;
8134 }
8135
8136 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
8137 retval));
8138 return retval;
8139 }
8140
8141 // There is a slight delay in the load balance algorithm in detecting new
8142 // running procs. The real system load at this instant should be at least as
8143 // large as the number of active OMP threads that are available to add to the team.
8144 if (system_active < team_curr_active) {
8145 system_active = team_curr_active;
8146 }
8147 retval = __kmp_avail_proc - system_active + team_curr_active;
8148 if (retval > set_nproc) {
8149 retval = set_nproc;
8150 }
8151 if (retval < KMP_MIN_NTH) {
8152 retval = KMP_MIN_NTH;
8153 }
8154
8155 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
8156 return retval;
8157} // __kmp_load_balance_nproc()
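/* Worked example of the formula above (all numbers hypothetical):
     __kmp_avail_proc = 8, pool_active = 1, hot_team_active = 2
       => team_curr_active = 1 + 2 + 1 = 4
     __kmp_get_load_balance() reports system_active = 9
       => retval = 8 - 9 + 4 = 3
   With set_nproc = 8 the cap does not apply, so (assuming KMP_MIN_NTH <= 3)
   the next parallel region would be formed with 3 threads. */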
8158
8159#endif /* USE_LOAD_BALANCE */
8160
8161/* ------------------------------------------------------------------------ */
8162
8163/* NOTE: this is called with the __kmp_init_lock held */
8164void __kmp_cleanup(void) {
8165 int f;
8166
8167 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
8168
8169 if (TCR_4(__kmp_init_parallel)) {
8170#if KMP_HANDLE_SIGNALS
8171 __kmp_remove_signals();
8172#endif
8173 TCW_4(__kmp_init_parallel, FALSE);
8174 }
8175
8176 if (TCR_4(__kmp_init_middle)) {
8177#if KMP_AFFINITY_SUPPORTED
8178 __kmp_affinity_uninitialize();
8179#endif /* KMP_AFFINITY_SUPPORTED */
8180 __kmp_cleanup_hierarchy();
8181 TCW_4(__kmp_init_middle, FALSE);
8182 }
8183
8184 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
8185
8186 if (__kmp_init_serial) {
8187 __kmp_runtime_destroy();
8188 __kmp_init_serial = FALSE;
8189 }
8190
8191 __kmp_cleanup_threadprivate_caches();
8192
8193 for (f = 0; f < __kmp_threads_capacity; f++) {
8194 if (__kmp_root[f] != NULL) {
8195 __kmp_free(__kmp_root[f]);
8196 __kmp_root[f] = NULL;
8197 }
8198 }
8199 __kmp_free(__kmp_threads);
8200 // __kmp_threads and __kmp_root were allocated at once, as a single block, so
8201 // there is no need to free __kmp_root separately.
8202 __kmp_threads = NULL;
8203 __kmp_root = NULL;
8204 __kmp_threads_capacity = 0;
8205
8206 // Free old __kmp_threads arrays if they exist.
8207 kmp_old_threads_list_t *ptr = __kmp_old_threads_list;
8208 while (ptr) {
8209 kmp_old_threads_list_t *next = ptr->next;
8210 __kmp_free(ptr->threads);
8211 __kmp_free(ptr);
8212 ptr = next;
8213 }
8214
8215#if KMP_USE_DYNAMIC_LOCK
8216 __kmp_cleanup_indirect_user_locks();
8217#else
8218 __kmp_cleanup_user_locks();
8219#endif
8220#if OMPD_SUPPORT
8221 if (ompd_state) {
8222 __kmp_free(ompd_env_block);
8223 ompd_env_block = NULL;
8224 ompd_env_block_size = 0;
8225 }
8226#endif
8227
8228#if KMP_AFFINITY_SUPPORTED
8229 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
8230 __kmp_cpuinfo_file = NULL;
8231#endif /* KMP_AFFINITY_SUPPORTED */
8232
8233#if KMP_USE_ADAPTIVE_LOCKS
8234#if KMP_DEBUG_ADAPTIVE_LOCKS
8235 __kmp_print_speculative_stats();
8236#endif
8237#endif
8238 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
8239 __kmp_nested_nth.nth = NULL;
8240 __kmp_nested_nth.size = 0;
8241 __kmp_nested_nth.used = 0;
8242 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
8243 __kmp_nested_proc_bind.bind_types = NULL;
8244 __kmp_nested_proc_bind.size = 0;
8245 __kmp_nested_proc_bind.used = 0;
8246 if (__kmp_affinity_format) {
8247 KMP_INTERNAL_FREE(__kmp_affinity_format);
8248 __kmp_affinity_format = NULL;
8249 }
8250
8251 __kmp_i18n_catclose();
8252
8253#if KMP_USE_HIER_SCHED
8254 __kmp_hier_scheds.deallocate();
8255#endif
8256
8257#if KMP_STATS_ENABLED
8258 __kmp_stats_fini();
8259#endif
8260
8261 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
8262}
8263
8264/* ------------------------------------------------------------------------ */
8265
8266int __kmp_ignore_mppbeg(void) {
8267 char *env;
8268
8269 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
8270 if (__kmp_str_match_false(env))
8271 return FALSE;
8272 }
8273 // By default __kmpc_begin() is a no-op.
8274 return TRUE;
8275}
8276
8277int __kmp_ignore_mppend(void) {
8278 char *env;
8279
8280 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
8281 if (__kmp_str_match_false(env))
8282 return FALSE;
8283 }
8284 // By default __kmpc_end() is a no-op.
8285 return TRUE;
8286}
8287
8288void __kmp_internal_begin(void) {
8289 int gtid;
8290 kmp_root_t *root;
8291
8292 /* This is a very important step as it registers new sibling threads
8293 and assigns these new uber threads a new gtid */
8294 gtid = __kmp_entry_gtid();
8295 root = __kmp_threads[gtid]->th.th_root;
8296 KMP_ASSERT(KMP_UBER_GTID(gtid));
8297
8298 if (root->r.r_begin)
8299 return;
8300 __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
8301 if (root->r.r_begin) {
8302 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8303 return;
8304 }
8305
8306 root->r.r_begin = TRUE;
8307
8308 __kmp_release_lock(&root->r.r_begin_lock, gtid);
8309}
8310
8311/* ------------------------------------------------------------------------ */
8312
8313void __kmp_user_set_library(enum library_type arg) {
8314 int gtid;
8315 kmp_root_t *root;
8316 kmp_info_t *thread;
8317
8318 /* first, make sure we are initialized so we can get our gtid */
8319
8320 gtid = __kmp_entry_gtid();
8321 thread = __kmp_threads[gtid];
8322
8323 root = thread->th.th_root;
8324
8325 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
8326 library_serial));
8327 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
8328 thread */
8329 KMP_WARNING(SetLibraryIncorrectCall);
8330 return;
8331 }
8332
8333 switch (arg) {
8334 case library_serial:
8335 thread->th.th_set_nproc = 0;
8336 set__nproc(thread, 1);
8337 break;
8338 case library_turnaround:
8339 thread->th.th_set_nproc = 0;
8340 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8341 : __kmp_dflt_team_nth_ub);
8342 break;
8343 case library_throughput:
8344 thread->th.th_set_nproc = 0;
8345 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
8346 : __kmp_dflt_team_nth_ub);
8347 break;
8348 default:
8349 KMP_FATAL(UnknownLibraryType, arg);
8350 }
8351
8352 __kmp_aux_set_library(arg);
8353}
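/* Illustrative example (assumption: the kmp_set_library_* extension entry
   points forward to __kmp_user_set_library). The same effect is available
   through the KMP_LIBRARY environment variable (serial|turnaround|throughput).

     #include <omp.h>

     int main(void) {
       kmp_set_library_throughput(); // roughly KMP_LIBRARY=throughput at run time
       #pragma omp parallel
       {
         // ... work ...
       }
       return 0;
     }
*/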
8354
8355void __kmp_aux_set_stacksize(size_t arg) {
8356 if (!__kmp_init_serial)
8357 __kmp_serial_initialize();
8358
8359#if KMP_OS_DARWIN
8360 if (arg & (0x1000 - 1)) {
8361 arg &= ~(0x1000 - 1);
8362 if (arg + 0x1000) /* check for overflow if we round up */
8363 arg += 0x1000;
8364 }
8365#endif
8366 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
8367
8368 /* only change the default stacksize before the first parallel region */
8369 if (!TCR_4(__kmp_init_parallel)) {
8370 size_t value = arg; /* argument is in bytes */
8371
8372 if (value < __kmp_sys_min_stksize)
8373 value = __kmp_sys_min_stksize;
8374 else if (value > KMP_MAX_STKSIZE)
8375 value = KMP_MAX_STKSIZE;
8376
8377 __kmp_stksize = value;
8378
8379 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8380 }
8381
8382 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
8383}
8384
8385/* set the behaviour of the runtime library */
8386/* TODO this can cause some odd behaviour with sibling parallelism... */
8387void __kmp_aux_set_library(enum library_type arg) {
8388 __kmp_library = arg;
8389
8390 switch (__kmp_library) {
8391 case library_serial: {
8392 KMP_INFORM(LibraryIsSerial);
8393 } break;
8394 case library_turnaround:
8395 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
8396 __kmp_use_yield = 2; // only yield when oversubscribed
8397 break;
8398 case library_throughput:
8399 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
8400 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
8401 break;
8402 default:
8403 KMP_FATAL(UnknownLibraryType, arg);
8404 }
8405}
8406
8407/* Getting team information common for all team API */
8408// Returns NULL if not in teams construct
8409static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
8410 kmp_info_t *thr = __kmp_entry_thread();
8411 teams_serialized = 0;
8412 if (thr->th.th_teams_microtask) {
8413 kmp_team_t *team = thr->th.th_team;
8414 int tlevel = thr->th.th_teams_level; // the level of the teams construct
8415 int ii = team->t.t_level;
8416 teams_serialized = team->t.t_serialized;
8417 int level = tlevel + 1;
8418 KMP_DEBUG_ASSERT(ii >= tlevel);
8419 while (ii > level) {
8420 for (teams_serialized = team->t.t_serialized;
8421 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
8422 }
8423 if (team->t.t_serialized && (!teams_serialized)) {
8424 team = team->t.t_parent;
8425 continue;
8426 }
8427 if (ii > level) {
8428 team = team->t.t_parent;
8429 ii--;
8430 }
8431 }
8432 return team;
8433 }
8434 return NULL;
8435}
8436
8437int __kmp_aux_get_team_num() {
8438 int serialized;
8439 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8440 if (team) {
8441 if (serialized > 1) {
8442 return 0; // teams region is serialized ( 1 team of 1 thread ).
8443 } else {
8444 return team->t.t_master_tid;
8445 }
8446 }
8447 return 0;
8448}
8449
8450int __kmp_aux_get_num_teams() {
8451 int serialized;
8452 kmp_team_t *team = __kmp_aux_get_team_info(serialized);
8453 if (team) {
8454 if (serialized > 1) {
8455 return 1;
8456 } else {
8457 return team->t.t_parent->t.t_nproc;
8458 }
8459 }
8460 return 1;
8461}
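/* Illustrative example (assumption: omp_get_team_num() and omp_get_num_teams()
   reach the two helpers above through the public entry points):

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       // Inside the teams region this prints team numbers 0..1 out of 2;
       // outside any teams construct the helpers return 0 and 1 respectively.
       #pragma omp teams num_teams(2)
       printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
       return 0;
     }
*/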
8462
8463/* ------------------------------------------------------------------------ */
8464
8465/*
8466 * Affinity Format Parser
8467 *
8468 * Field is in form of: %[[[0].]size]type
8469 * % and type are required (%% means print a literal '%')
8470 * type is either single char or long name surrounded by {},
8471 * e.g., N or {num_threads}
8472 * 0 => leading zeros
8473 * . => right justified when size is specified
8474 * by default output is left justified
8475 * size is the *minimum* field length
8476 * All other characters are printed as is
8477 *
8478 * Available field types (these must agree with __kmp_affinity_format_table):
8479 * t {team_num} - omp_get_team_num()
8480 * T {num_teams} - omp_get_num_teams()
8481 * L {nesting_level} - omp_get_level()
8482 * n {thread_num} - omp_get_thread_num()
8483 * N {num_threads} - omp_get_num_threads()
8484 * a {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1)
8485 * H {host} - name of host machine
8486 * P {process_id} - process id (integer)
8487 * i {native_thread_id} - native thread identifier (integer)
8488 * A {thread_affinity} - comma separated list of integers or integer ranges
8489 * (values of affinity mask)
8490 * Implementation-specific field types can be added; if a type is unknown,
8491 * "undefined" is printed */
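/* Illustrative format strings and possible expansions (host names, pids, and
   affinity masks below are hypothetical):

     "host=%H pid=%P tid=%n"        -> "host=node01 pid=4242 tid=3"
     "thread %0.4n binds to %A"     -> "thread 0003 binds to 0-3,8"
     "%{team_num} of %{num_teams}"  -> "0 of 1"
     "100%% done"                   -> "100% done"
*/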
8492
8493// Structure holding the short name, long name, and corresponding data type
8494// for snprintf. A table of these entries represents all of the valid keyword
8495// field types.
8496typedef struct kmp_affinity_format_field_t {
8497 char short_name; // from spec e.g., L -> thread level
8498 const char *long_name; // from spec thread_level -> thread level
8499 char field_format; // data type for snprintf (typically 'd' or 's'
8500 // for integer or string)
8501} kmp_affinity_format_field_t;
8502
8503static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
8504#if KMP_AFFINITY_SUPPORTED
8505 {'A', "thread_affinity", 's'},
8506#endif
8507 {'t', "team_num", 'd'},
8508 {'T', "num_teams", 'd'},
8509 {'L', "nesting_level", 'd'},
8510 {'n', "thread_num", 'd'},
8511 {'N', "num_threads", 'd'},
8512 {'a', "ancestor_tnum", 'd'},
8513 {'H', "host", 's'},
8514 {'P', "process_id", 'd'},
8515 {'i', "native_thread_id", 'd'}};
8516
8517// Return the number of characters it takes to hold the field
8518static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
8519 const char **ptr,
8520 kmp_str_buf_t *field_buffer) {
8521 int rc, format_index, field_value;
8522 const char *width_left, *width_right;
8523 bool pad_zeros, right_justify, parse_long_name, found_valid_name;
8524 static const int FORMAT_SIZE = 20;
8525 char format[FORMAT_SIZE] = {0};
8526 char absolute_short_name = 0;
8527
8528 KMP_DEBUG_ASSERT(gtid >= 0);
8529 KMP_DEBUG_ASSERT(th);
8530 KMP_DEBUG_ASSERT(**ptr == '%');
8531 KMP_DEBUG_ASSERT(field_buffer);
8532
8533 __kmp_str_buf_clear(field_buffer);
8534
8535 // Skip the initial %
8536 (*ptr)++;
8537
8538 // Check for %% first
8539 if (**ptr == '%') {
8540 __kmp_str_buf_cat(field_buffer, "%", 1);
8541 (*ptr)++; // skip over the second %
8542 return 1;
8543 }
8544
8545 // Parse field modifiers if they are present
8546 pad_zeros = false;
8547 if (**ptr == '0') {
8548 pad_zeros = true;
8549 (*ptr)++; // skip over 0
8550 }
8551 right_justify = false;
8552 if (**ptr == '.') {
8553 right_justify = true;
8554 (*ptr)++; // skip over .
8555 }
8556 // Parse width of field: [width_left, width_right)
8557 width_left = width_right = NULL;
8558 if (**ptr >= '0' && **ptr <= '9') {
8559 width_left = *ptr;
8560 SKIP_DIGITS(*ptr);
8561 width_right = *ptr;
8562 }
8563
8564 // Create the format for KMP_SNPRINTF based on flags parsed above
8565 format_index = 0;
8566 format[format_index++] = '%';
8567 if (!right_justify)
8568 format[format_index++] = '-';
8569 if (pad_zeros)
8570 format[format_index++] = '0';
8571 if (width_left && width_right) {
8572 int i = 0;
8573 // Only allow 8 digit number widths.
8574 // This also prevents overflowing format variable
8575 while (i < 8 && width_left < width_right) {
8576 format[format_index++] = *width_left;
8577 width_left++;
8578 i++;
8579 }
8580 }
8581
8582 // Parse a name (long or short)
8583 // Canonicalize the name into absolute_short_name
8584 found_valid_name = false;
8585 parse_long_name = (**ptr == '{');
8586 if (parse_long_name)
8587 (*ptr)++; // skip initial left brace
8588 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
8589 sizeof(__kmp_affinity_format_table[0]);
8590 ++i) {
8591 char short_name = __kmp_affinity_format_table[i].short_name;
8592 const char *long_name = __kmp_affinity_format_table[i].long_name;
8593 char field_format = __kmp_affinity_format_table[i].field_format;
8594 if (parse_long_name) {
8595 size_t length = KMP_STRLEN(long_name);
8596 if (strncmp(*ptr, long_name, length) == 0) {
8597 found_valid_name = true;
8598 (*ptr) += length; // skip the long name
8599 }
8600 } else if (**ptr == short_name) {
8601 found_valid_name = true;
8602 (*ptr)++; // skip the short name
8603 }
8604 if (found_valid_name) {
8605 format[format_index++] = field_format;
8606 format[format_index++] = '\0';
8607 absolute_short_name = short_name;
8608 break;
8609 }
8610 }
8611 if (parse_long_name) {
8612 if (**ptr != '}') {
8613 absolute_short_name = 0;
8614 } else {
8615 (*ptr)++; // skip over the right brace
8616 }
8617 }
8618
8619 // Attempt to fill the buffer with the requested
8620 // value using snprintf within __kmp_str_buf_print()
8621 switch (absolute_short_name) {
8622 case 't':
8623 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
8624 break;
8625 case 'T':
8626 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
8627 break;
8628 case 'L':
8629 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
8630 break;
8631 case 'n':
8632 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
8633 break;
8634 case 'H': {
8635 static const int BUFFER_SIZE = 256;
8636 char buf[BUFFER_SIZE];
8637 __kmp_expand_host_name(buf, BUFFER_SIZE);
8638 rc = __kmp_str_buf_print(field_buffer, format, buf);
8639 } break;
8640 case 'P':
8641 rc = __kmp_str_buf_print(field_buffer, format, getpid());
8642 break;
8643 case 'i':
8644 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
8645 break;
8646 case 'N':
8647 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
8648 break;
8649 case 'a':
8650 field_value =
8651 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
8652 rc = __kmp_str_buf_print(field_buffer, format, field_value);
8653 break;
8654#if KMP_AFFINITY_SUPPORTED
8655 case 'A': {
8656 kmp_str_buf_t buf;
8657 __kmp_str_buf_init(&buf);
8658 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
8659 rc = __kmp_str_buf_print(field_buffer, format, buf.str);
8660 __kmp_str_buf_free(&buf);
8661 } break;
8662#endif
8663 default:
8664 // According to the spec, if an implementation does not have info for the
8665 // field type, then "undefined" is printed
8666 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
8667 // Skip the field
8668 if (parse_long_name) {
8669 SKIP_TOKEN(*ptr);
8670 if (**ptr == '}')
8671 (*ptr)++;
8672 } else {
8673 (*ptr)++;
8674 }
8675 }
8676
8677 KMP_ASSERT(format_index <= FORMAT_SIZE);
8678 return rc;
8679}
8680
8681/*
8682 * Return the number of characters needed to hold the affinity string
8683 * (not including the terminating null byte).
8684 * The resulting string is printed to buffer, which the caller can then
8685 * handle afterwards.
8686 */
8687size_t __kmp_aux_capture_affinity(int gtid, const char *format,
8688 kmp_str_buf_t *buffer) {
8689 const char *parse_ptr;
8690 size_t retval;
8691 const kmp_info_t *th;
8692 kmp_str_buf_t field;
8693
8694 KMP_DEBUG_ASSERT(buffer);
8695 KMP_DEBUG_ASSERT(gtid >= 0);
8696
8697 __kmp_str_buf_init(&field);
8698 __kmp_str_buf_clear(buffer);
8699
8700 th = __kmp_threads[gtid];
8701 retval = 0;
8702
8703 // If format is NULL or zero-length string, then we use
8704 // affinity-format-var ICV
8705 parse_ptr = format;
8706 if (parse_ptr == NULL || *parse_ptr == '\0') {
8707 parse_ptr = __kmp_affinity_format;
8708 }
8709 KMP_DEBUG_ASSERT(parse_ptr);
8710
8711 while (*parse_ptr != '\0') {
8712 // Parse a field
8713 if (*parse_ptr == '%') {
8714 // Put field in the buffer
8715 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
8716 __kmp_str_buf_catbuf(buffer, &field);
8717 retval += rc;
8718 } else {
8719 // Put literal character in buffer
8720 __kmp_str_buf_cat(buffer, parse_ptr, 1);
8721 retval++;
8722 parse_ptr++;
8723 }
8724 }
8725 __kmp_str_buf_free(&field);
8726 return retval;
8727}
8728
8729// Displays the affinity string to stdout
8730void __kmp_aux_display_affinity(int gtid, const char *format) {
8731 kmp_str_buf_t buf;
8732 __kmp_str_buf_init(&buf);
8733 __kmp_aux_capture_affinity(gtid, format, &buf);
8734 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
8735 __kmp_str_buf_free(&buf);
8736}
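/* Illustrative user-level use of the OpenMP 5.0 affinity display API, which is
   expected to reach __kmp_aux_capture_affinity / __kmp_aux_display_affinity
   through the public entry points (a hedged sketch, not the exact call chain):

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       omp_set_affinity_format("host=%H tid=%0.2n affinity={%A}");
       #pragma omp parallel num_threads(2)
       {
         char buf[128];
         omp_display_affinity(NULL); // NULL => use affinity-format-var
         omp_capture_affinity(buf, sizeof(buf), "%{thread_num}");
         printf("captured: %s\n", buf);
       }
       return 0;
     }
*/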
8737
8738/* ------------------------------------------------------------------------ */
8739
8740void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
8741 int blocktime = arg; /* argument is in milliseconds */
8742#if KMP_USE_MONITOR
8743 int bt_intervals;
8744#endif
8745 kmp_int8 bt_set;
8746
8747 __kmp_save_internal_controls(thread);
8748
8749 /* Normalize and set blocktime for the teams */
8750 if (blocktime < KMP_MIN_BLOCKTIME)
8751 blocktime = KMP_MIN_BLOCKTIME;
8752 else if (blocktime > KMP_MAX_BLOCKTIME)
8753 blocktime = KMP_MAX_BLOCKTIME;
8754
8755 set__blocktime_team(thread->th.th_team, tid, blocktime);
8756 set__blocktime_team(thread->th.th_serial_team, 0, blocktime);
8757
8758#if KMP_USE_MONITOR
8759 /* Calculate and set blocktime intervals for the teams */
8760 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8761
8762 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
8763 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
8764#endif
8765
8766 /* Record that blocktime has been explicitly set, i.e. bt_set = TRUE */
8767 bt_set = TRUE;
8768
8769 set__bt_set_team(thread->th.th_team, tid, bt_set);
8770 set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
8771#if KMP_USE_MONITOR
8772 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
8773 "bt_intervals=%d, monitor_updates=%d\n",
8774 __kmp_gtid_from_tid(tid, thread->th.th_team),
8775 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
8776 __kmp_monitor_wakeups));
8777#else
8778 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
8779 __kmp_gtid_from_tid(tid, thread->th.th_team),
8780 thread->th.th_team->t.t_id, tid, blocktime));
8781#endif
8782}
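/* Illustrative example: blocktime can be set through the KMP_BLOCKTIME
   environment variable (e.g. KMP_BLOCKTIME=0 or "infinite") or at run time via
   the kmp_set_blocktime() extension, which is expected to end up here for the
   calling thread.

     #include <omp.h>

     int main(void) {
       kmp_set_blocktime(0); // idle workers go to sleep immediately
       #pragma omp parallel
       {
         // ... work ...
       }
       return 0;
     }
*/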
8783
8784void __kmp_aux_set_defaults(char const *str, size_t len) {
8785 if (!__kmp_init_serial) {
8786 __kmp_serial_initialize();
8787 }
8788 __kmp_env_initialize(str);
8789
8790 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
8791 __kmp_env_print();
8792 }
8793} // __kmp_aux_set_defaults
8794
8795/* ------------------------------------------------------------------------ */
8796/* internal fast reduction routines */
8797
8798PACKED_REDUCTION_METHOD_T
8799__kmp_determine_reduction_method(
8800 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8801 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8802 kmp_critical_name *lck) {
8803
8804 // Default reduction method: critical construct ( lck != NULL, like in current
8805 // PAROPT )
8806 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8807 // can be selected by RTL
8808 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8809 // can be selected by RTL
8810 // Finally, it's up to OpenMP RTL to make a decision on which method to select
8811 // among generated by PAROPT.
8812
8813 PACKED_REDUCTION_METHOD_T retval;
8814
8815 int team_size;
8816
8817 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8818 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8819
8820#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8821 (loc && \
8822 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)))
8823#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8824
8825 retval = critical_reduce_block;
8826
8827 // another way of getting the team size (with 1 dynamic dereference) is slower
8828 team_size = __kmp_get_team_num_threads(global_tid);
8829 if (team_size == 1) {
8830
8831 retval = empty_reduce_block;
8832
8833 } else {
8834
8835 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8836
8837#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8838 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
8839
8840#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8841 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8842
8843 int teamsize_cutoff = 4;
8844
8845#if KMP_MIC_SUPPORTED
8846 if (__kmp_mic_type != non_mic) {
8847 teamsize_cutoff = 8;
8848 }
8849#endif
8850 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8851 if (tree_available) {
8852 if (team_size <= teamsize_cutoff) {
8853 if (atomic_available) {
8854 retval = atomic_reduce_block;
8855 }
8856 } else {
8857 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8858 }
8859 } else if (atomic_available) {
8860 retval = atomic_reduce_block;
8861 }
8862#else
8863#error "Unknown or unsupported OS"
8864#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8865 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8866
8867#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8868
8869#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8870
8871 // basic tuning
8872
8873 if (atomic_available) {
8874 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
8875 retval = atomic_reduce_block;
8876 }
8877 } // otherwise: use critical section
8878
8879#elif KMP_OS_DARWIN
8880
8881 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8882 if (atomic_available && (num_vars <= 3)) {
8883 retval = atomic_reduce_block;
8884 } else if (tree_available) {
8885 if ((reduce_size > (9 * sizeof(kmp_real64))) &&
8886 (reduce_size < (2000 * sizeof(kmp_real64)))) {
8887 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8888 }
8889 } // otherwise: use critical section
8890
8891#else
8892#error "Unknown or unsupported OS"
8893#endif
8894
8895#else
8896#error "Unknown or unsupported architecture"
8897#endif
8898 }
8899
8900 // KMP_FORCE_REDUCTION
8901
8902 // If the team is serialized (team_size == 1), ignore the forced reduction
8903 // method and stay with the unsynchronized method (empty_reduce_block)
8904 if (__kmp_force_reduction_method != reduction_method_not_defined &&
8905 team_size != 1) {
8906
8907 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
8908
8909 int atomic_available, tree_available;
8910
8911 switch ((forced_retval = __kmp_force_reduction_method)) {
8912 case critical_reduce_block:
8913 KMP_ASSERT(lck); // lck should be != 0
8914 break;
8915
8916 case atomic_reduce_block:
8917 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8918 if (!atomic_available) {
8919 KMP_WARNING(RedMethodNotSupported, "atomic");
8920 forced_retval = critical_reduce_block;
8921 }
8922 break;
8923
8924 case tree_reduce_block:
8925 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8926 if (!tree_available) {
8927 KMP_WARNING(RedMethodNotSupported, "tree");
8928 forced_retval = critical_reduce_block;
8929 } else {
8930#if KMP_FAST_REDUCTION_BARRIER
8931 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8932#endif
8933 }
8934 break;
8935
8936 default:
8937 KMP_ASSERT(0); // "unsupported method specified"
8938 }
8939
8940 retval = forced_retval;
8941 }
8942
8943 KA_TRACE(10, ("reduction method selected=%08x\n", retval));
8944
8945#undef FAST_REDUCTION_TREE_METHOD_GENERATED
8946#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8947
8948 return (retval);
8949}
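/* Illustrative example (hedged): for user code such as

     double sum = 0.0;
     #pragma omp parallel for reduction(+ : sum)
     for (int i = 0; i < n; ++i)
       sum += a[i];

   the compiler is expected to emit __kmpc_reduce_nowait()/__kmpc_reduce() with
   non-NULL reduce_data and reduce_func (so the tree method is available) and
   with KMP_IDENT_ATOMIC_REDUCE set in loc->flags when an atomic version was
   generated, letting this routine choose critical, atomic, or tree reduction
   as tuned above. The KMP_FORCE_REDUCTION setting (critical|atomic|tree)
   overrides the choice for non-serialized teams via
   __kmp_force_reduction_method. */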
8950// this function is for testing set/get/determine reduce method
8951kmp_int32 __kmp_get_reduce_method(void) {
8952 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
8953}
8954
8955// Soft pause sets up threads to ignore blocktime and just go to sleep.
8956// Spin-wait code checks __kmp_pause_status and reacts accordingly.
8957void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
8958
8959// Hard pause shuts down the runtime completely. Resume happens naturally when
8960// OpenMP is used subsequently.
8961void __kmp_hard_pause() {
8962 __kmp_pause_status = kmp_hard_paused;
8963 __kmp_internal_end_thread(-1);
8964}
8965
8966// Soft resume sets __kmp_pause_status, and wakes up all threads.
8967void __kmp_resume_if_soft_paused() {
8968 if (__kmp_pause_status == kmp_soft_paused) {
8969 __kmp_pause_status = kmp_not_paused;
8970
8971 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
8972 kmp_info_t *thread = __kmp_threads[gtid];
8973 if (thread) { // Wake it if sleeping
8974 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
8975 thread);
8976 if (fl.is_sleeping())
8977 fl.resume(gtid);
8978 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
8979 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
8980 } else { // thread holds the lock and may sleep soon
8981 do { // until either the thread sleeps, or we can get the lock
8982 if (fl.is_sleeping()) {
8983 fl.resume(gtid);
8984 break;
8985 } else if (__kmp_try_suspend_mx(thread)) {
8986 __kmp_unlock_suspend_mx(thread);
8987 break;
8988 }
8989 } while (1);
8990 }
8991 }
8992 }
8993 }
8994}
8995
8996// This function is called via __kmpc_pause_resource. Returns 0 if successful.
8997// TODO: add warning messages
8998int __kmp_pause_resource(kmp_pause_status_t level) {
8999 if (level == kmp_not_paused) { // requesting resume
9000 if (__kmp_pause_status == kmp_not_paused) {
9001 // error message about runtime not being paused, so can't resume
9002 return 1;
9003 } else {
9004 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
9005 __kmp_pause_status == kmp_hard_paused);
9006 __kmp_pause_status = kmp_not_paused;
9007 return 0;
9008 }
9009 } else if (level == kmp_soft_paused) { // requesting soft pause
9010 if (__kmp_pause_status != kmp_not_paused) {
9011 // error message about already being paused
9012 return 1;
9013 } else {
9014 __kmp_soft_pause();
9015 return 0;
9016 }
9017 } else if (level == kmp_hard_paused) { // requesting hard pause
9018 if (__kmp_pause_status != kmp_not_paused) {
9019 // error message about already being paused
9020 return 1;
9021 } else {
9022 __kmp_hard_pause();
9023 return 0;
9024 }
9025 } else {
9026 // error message about invalid level
9027 return 1;
9028 }
9029}
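/* Illustrative example (assumption: the OpenMP 5.0 omp_pause_resource_all API
   routes to __kmp_pause_resource through __kmpc_pause_resource):

     #include <omp.h>

     int main(void) {
       #pragma omp parallel
       {
         // ... first phase ...
       }
       // Release runtime resources between phases; returns 0 on success.
       omp_pause_resource_all(omp_pause_soft);
       // A later parallel region resumes the soft-paused runtime naturally.
       #pragma omp parallel
       {
         // ... second phase ...
       }
       return 0;
     }
*/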
9030
9031void __kmp_omp_display_env(int verbose) {
9032 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
9033 if (__kmp_init_serial == 0)
9034 __kmp_do_serial_initialize();
9035 __kmp_display_env_impl(!verbose, verbose);
9036 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
9037}
9038
9039// The team size is changing, so distributed barrier must be modified
9040void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
9041 int new_nthreads) {
9042 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] ==
9043 bp_dist_bar);
9044 kmp_info_t **other_threads = team->t.t_threads;
9045
9046 // We want all the workers to stop waiting on the barrier while we adjust the
9047 // size of the team.
9048 for (int f = 1; f < old_nthreads; ++f) {
9049 KMP_DEBUG_ASSERT(other_threads[f] != NULL);
9050 // Ignore threads that are already inactive or not present in the team
9051 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) {
9052 // teams construct causes thread_limit to get passed in, and some of
9053 // those could be inactive; just ignore them
9054 continue;
9055 }
9056 // If the thread is still transitioning to the in_use state, wait for it
9057 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) {
9058 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3)
9059 KMP_CPU_PAUSE();
9060 }
9061 // The thread should be in_use now
9062 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1);
9063 // Transition to unused state
9064 team->t.t_threads[f]->th.th_used_in_team.store(2);
9065 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2);
9066 }
9067 // Release all the workers
9068 team->t.b->go_release();
9069
9070 KMP_MFENCE();
9071
9072 // Workers should see transition status 2 and move to 0; but may need to be
9073 // woken up first
9074 int count = old_nthreads - 1;
9075 while (count > 0) {
9076 count = old_nthreads - 1;
9077 for (int f = 1; f < old_nthreads; ++f) {
9078 if (other_threads[f]->th.th_used_in_team.load() != 0) {
9079 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers
9080 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST(
9081 void *, other_threads[f]->th.th_sleep_loc);
9082 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag);
9083 }
9084 } else {
9085 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0);
9086 count--;
9087 }
9088 }
9089 }
9090 // Now update the barrier size
9091 team->t.b->update_num_threads(new_nthreads);
9092 team->t.b->go_reset();
9093}
9094
9095void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) {
9096 // Add the threads back to the team
9097 KMP_DEBUG_ASSERT(team);
9098 // Threads were paused and pointed at th_used_in_team temporarily during a
9099 // resize of the team. We're going to set th_used_in_team to 3 to indicate to
9100 // the thread that it should transition itself back into the team. Then, if
9101 // blocktime isn't infinite, the thread could be sleeping, so we send a resume
9102 // to wake it up.
9103 for (int f = 1; f < new_nthreads; ++f) {
9104 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
9105 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0,
9106 3);
9107 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads
9108 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid,
9109 (kmp_flag_32<false, false> *)NULL);
9110 }
9111 }
9112 // The threads should be transitioning to the team; when they are done, they
9113 // should have set th_used_in_team to 1. This loop forces the primary thread to
9114 // wait until all threads have moved into the team and are waiting in the barrier.
9115 int count = new_nthreads - 1;
9116 while (count > 0) {
9117 count = new_nthreads - 1;
9118 for (int f = 1; f < new_nthreads; ++f) {
9119 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) {
9120 count--;
9121 }
9122 }
9123 }
9124}
9125
9126// Globals and functions for hidden helper task
9127kmp_info_t **__kmp_hidden_helper_threads;
9128kmp_info_t *__kmp_hidden_helper_main_thread;
9129std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
9130#if KMP_OS_LINUX
9131kmp_int32 __kmp_hidden_helper_threads_num = 8;
9132kmp_int32 __kmp_enable_hidden_helper = TRUE;
9133#else
9134kmp_int32 __kmp_hidden_helper_threads_num = 0;
9135kmp_int32 __kmp_enable_hidden_helper = FALSE;
9136#endif
9137
9138namespace {
9139std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;
9140
9141void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
9142 // This is an explicit synchronization of all hidden helper threads, in case
9143 // a regular thread pushes a hidden helper task to a hidden helper thread
9144 // that has not yet been awakened since the threads were released by the
9145 // main thread after creating the team.
9146 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
9147 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
9148 __kmp_hidden_helper_threads_num)
9149 ;
9150
9151 // If main thread, then wait for signal
9152 if (__kmpc_master(nullptr, *gtid)) {
9153 // First, unset the initial state and release the initial thread
9154 TCW_4(__kmp_init_hidden_helper_threads, FALSE);
9155 __kmp_hidden_helper_initz_release();
9156 __kmp_hidden_helper_main_thread_wait();
9157 // Now wake up all worker threads
9158 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
9159 __kmp_hidden_helper_worker_thread_signal();
9160 }
9161 }
9162}
9163} // namespace
9164
9165void __kmp_hidden_helper_threads_initz_routine() {
9166 // Create a new root for hidden helper team/threads
9167 const int gtid = __kmp_register_root(TRUE);
9168 __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
9169 __kmp_hidden_helper_threads = &__kmp_threads[gtid];
9170 __kmp_hidden_helper_main_thread->th.th_set_nproc =
9171 __kmp_hidden_helper_threads_num;
9172
9173 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);
9174
9175 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);
9176
9177 // Set the initialization flag to FALSE
9178 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);
9179
9180 __kmp_hidden_helper_threads_deinitz_release();
9181}
9182
9183/* Nesting Mode:
9184 Set via KMP_NESTING_MODE, which takes an integer.
9185 Note: we skip duplicate topology levels, and skip levels with only
9186 one entity.
9187 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
9188 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
9189 in the topology, and initializes the number of threads at each of those
9190 levels to the number of entities at each level, respectively, below the
9191 entity at the parent level.
9192 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
9193 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
9194 the user to turn nesting on explicitly. This is an even more experimental
9195 option to this experimental feature, and may change or go away in the
9196 future.
9197*/
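/* Illustrative example (hypothetical machine): with 2 sockets x 8 cores x
   2 HW threads and KMP_NESTING_MODE=1, __kmp_set_nesting_mode_threads() below
   would derive three nesting levels with 2, 8, and 2 threads respectively
   (levels with a single entity are skipped), and nesting is enabled so that

     #pragma omp parallel        // level 0: 2 threads, one per socket
     {
       #pragma omp parallel      // level 1: 8 threads, one per core
       {
         #pragma omp parallel    // level 2: 2 threads, one per HW thread
         {
           // ... work ...
         }
       }
     }

   nests as indicated by default. */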
9198
9199// Allocate space to store nesting levels
9200void __kmp_init_nesting_mode() {
9201 int levels = KMP_HW_LAST;
9202 __kmp_nesting_mode_nlevels = levels;
9203 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
9204 for (int i = 0; i < levels; ++i)
9205 __kmp_nesting_nth_level[i] = 0;
9206 if (__kmp_nested_nth.size < levels) {
9207 __kmp_nested_nth.nth =
9208 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
9209 __kmp_nested_nth.size = levels;
9210 }
9211}
9212
9213// Set # threads for top levels of nesting; must be called after topology set
9214void __kmp_set_nesting_mode_threads() {
9215 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];
9216
9217 if (__kmp_nesting_mode == 1)
9218 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
9219 else if (__kmp_nesting_mode > 1)
9220 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9221
9222 if (__kmp_topology) { // use topology info
9223 int loc, hw_level;
9224 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
9225 loc < __kmp_nesting_mode_nlevels;
9226 loc++, hw_level++) {
9227 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
9228 if (__kmp_nesting_nth_level[loc] == 1)
9229 loc--;
9230 }
9231 // Make sure all cores are used
9232 if (__kmp_nesting_mode > 1 && loc > 1) {
9233 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
9234 int num_cores = __kmp_topology->get_count(core_level);
9235 int upper_levels = 1;
9236 for (int level = 0; level < loc - 1; ++level)
9237 upper_levels *= __kmp_nesting_nth_level[level];
9238 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
9239 __kmp_nesting_nth_level[loc - 1] =
9240 num_cores / __kmp_nesting_nth_level[loc - 2];
9241 }
9242 __kmp_nesting_mode_nlevels = loc;
9243 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9244 } else { // no topology info available; provide a reasonable guesstimation
9245 if (__kmp_avail_proc >= 4) {
9246 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
9247 __kmp_nesting_nth_level[1] = 2;
9248 __kmp_nesting_mode_nlevels = 2;
9249 } else {
9250 __kmp_nesting_nth_level[0] = __kmp_avail_proc;
9251 __kmp_nesting_mode_nlevels = 1;
9252 }
9253 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
9254 }
9255 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
9256 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
9257 }
9258 set__nproc(thread, __kmp_nesting_nth_level[0]);
9259 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
9260 __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
9261 if (get__max_active_levels(thread) > 1) {
9262 // if max levels was set, set nesting mode levels to same
9263 __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
9264 }
9265 if (__kmp_nesting_mode == 1) // turn on nesting for this case only
9266 set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
9267}
9268
9269// Empty symbols to export (see exports_so.txt) when feature is disabled
9270extern "C" {
9271#if !KMP_STATS_ENABLED
9272void __kmp_reset_stats() {}
9273#endif
9274#if !USE_DEBUGGER
9275int __kmp_omp_debug_struct_info = FALSE;
9276int __kmp_debugging = FALSE;
9277#endif
9278#if !USE_ITT_BUILD || !USE_ITT_NOTIFY
9279void __kmp_itt_fini_ittlib() {}
9280void __kmp_itt_init_ittlib() {}
9281#endif
9282}
9283
9284// end of file