13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
20 #include "kmp_affinity.h"
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
27 #if KMP_MIC && USE_NGO_STORES
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
void __kmp_print_structure(void);
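// Distributed barrier: compute how n threads are split into "go" flags and
// groups of flags, using socket/core topology when it is available.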
void distributedBarrier::computeVarsForN(size_t n) {
50 int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51 int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52 int ncores_per_socket =
53 __kmp_topology->calculate_ratio(core_level, socket_level);
54 nsockets = __kmp_topology->get_count(socket_level);
58 if (ncores_per_socket <= 0)
59 ncores_per_socket = 1;
61 threads_per_go = ncores_per_socket >> 1;
62 if (!fix_threads_per_go) {
64 if (threads_per_go > 4) {
65 if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66 threads_per_go = threads_per_go >> 1;
68 if (threads_per_go > 4 && nsockets == 1)
69 threads_per_go = threads_per_go >> 1;
72 if (threads_per_go == 0)
  fix_threads_per_go = true;
75 num_gos = n / threads_per_go;
76 if (n % threads_per_go)
78 if (nsockets == 1 || num_gos == 1)
81 num_groups = num_gos / nsockets;
82 if (num_gos % nsockets)
87 gos_per_group = num_gos / num_groups;
88 if (num_gos % num_groups)
90 threads_per_group = threads_per_go * gos_per_group;
92 num_gos = n / threads_per_go;
93 if (n % threads_per_go)
98 num_groups = num_gos / 2;
102 gos_per_group = num_gos / num_groups;
103 if (num_gos % num_groups)
105 threads_per_group = threads_per_go * gos_per_group;
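// Pick an initial number of go flags for n threads: the smallest count that
// keeps per-flag contention at or below IDEAL_CONTENTION, capped at MAX_GOS.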
void distributedBarrier::computeGo(size_t n) {
111 for (num_gos = 1;; num_gos++)
112 if (IDEAL_CONTENTION * num_gos >= n)
114 threads_per_go = n / num_gos;
117 while (num_gos > MAX_GOS) {
119 num_gos = n / threads_per_go;
120 if (n % threads_per_go)
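// Grow the per-thread flag/go/iter/sleep arrays so the barrier can hold nthr
// threads; capacity is doubled to limit reallocations.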
void distributedBarrier::resize(size_t nthr) {
129 KMP_DEBUG_ASSERT(nthr > max_threads);
132 max_threads = nthr * 2;
  for (int i = 0; i < MAX_ITERS; ++i) {
    if (flags[i])
      flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
                                                 max_threads * sizeof(flags_s));
    else
      flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
  }
  if (go)
    go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
  else
    go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
  if (iter)
    iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
  else
    iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
  if (sleep)
    sleep = (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
  else
    sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
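// Release all threads by advancing every go flag to the value expected for the
// next barrier iteration.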
kmp_uint64 distributedBarrier::go_release() {
  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
  for (size_t j = 0; j < num_gos; j++) {
    go[j].go.store(next_go);
  }
  return next_go;
}
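// Reset the barrier to its initial state: every thread still needs to arrive
// in every iteration slot.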
void distributedBarrier::go_reset() {
  for (size_t j = 0; j < max_threads; ++j) {
    for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
      flags[i][j].stillNeed = 1;
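// (Re)initialize the barrier for nthr threads: resize storage if needed,
// recompute the go/group layout, and allocate the team ICV block on first use.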
void distributedBarrier::init(size_t nthr) {
184 size_t old_max = max_threads;
185 if (nthr > max_threads) {
    for (size_t i = 0; i < max_threads; i++) {
      for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
        flags[j][i].stillNeed = 1;
      sleep[i].sleep = false;
200 computeVarsForN(nthr);
  if (team_icvs == NULL)
    team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
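// Free all storage owned by a distributedBarrier instance.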
void distributedBarrier::deallocate(distributedBarrier *db) {
  for (int i = 0; i < MAX_ITERS; ++i) {
    KMP_INTERNAL_FREE(db->flags[i]);
211 KMP_INTERNAL_FREE(db->flags[i]);
215 KMP_INTERNAL_FREE(db->go);
219 KMP_INTERNAL_FREE(db->iter);
223 KMP_INTERNAL_FREE(db->sleep);
227 __kmp_free(db->team_icvs);
228 db->team_icvs = NULL;
230 KMP_ALIGNED_FREE(db);
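// Wake sleeping workers in the team range [start, stop) with stride inc so
// they can observe the newly released go flags.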
void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
                               size_t start, size_t stop, size_t inc,
                               size_t tid) {
238 KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
239 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
242 kmp_info_t **other_threads = team->t.t_threads;
  for (size_t thr = start; thr < stop; thr += inc) {
244 KMP_DEBUG_ASSERT(other_threads[thr]);
245 int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
247 __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
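// Distributed barrier gather: group leaders wait for the stillNeed flags of
// the workers in their group, then the primary thread waits for all group
// leaders; reduction data is folded in along the way and tasks are executed
// while waiting.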
static void __kmp_dist_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
254 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
256 distributedBarrier *b;
257 kmp_info_t **other_threads;
258 kmp_uint64 my_current_iter, my_next_iter;
262 team = this_thr->th.th_team;
263 nproc = this_thr->th.th_team_nproc;
264 other_threads = team->t.t_threads;
266 my_current_iter = b->iter[tid].iter;
267 my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
268 group_leader = ((tid % b->threads_per_group) == 0);
271 (
"__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
272 gtid, team->t.t_id, tid, bt));
274 #if USE_ITT_BUILD && USE_ITT_NOTIFY
276 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
277 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
278 __itt_get_timestamp();
284 size_t group_start = tid + 1;
285 size_t group_end = tid + b->threads_per_group;
286 size_t threads_pending = 0;
288 if (group_end > nproc)
      for (size_t thr = group_start; thr < group_end; thr++) {
295 threads_pending += b->flags[my_current_iter][thr].stillNeed;
298 if (__kmp_tasking_mode != tskm_immediate_exec) {
299 kmp_task_team_t *task_team = this_thr->th.th_task_team;
300 if (task_team != NULL) {
301 if (TCR_SYNC_4(task_team->tt.tt_active)) {
302 if (KMP_TASKING_ENABLED(task_team)) {
303 int tasks_completed = FALSE;
304 __kmp_atomic_execute_tasks_64(
305 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
306 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
308 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
311 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
314 if (TCR_4(__kmp_global.g.g_done)) {
315 if (__kmp_global.g.g_abort)
316 __kmp_abort_thread();
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);
325 OMPT_REDUCTION_DECL(this_thr, gtid);
326 OMPT_REDUCTION_BEGIN;
      for (size_t thr = group_start; thr < group_end; thr++) {
329 (*reduce)(this_thr->th.th_local.reduce_data,
330 other_threads[thr]->th.th_local.reduce_data);
336 b->flags[my_next_iter][tid].stillNeed = 1;
339 b->flags[my_current_iter][tid].stillNeed = 0;
        for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
344 threads_pending += b->flags[my_current_iter][thr].stillNeed;
347 if (__kmp_tasking_mode != tskm_immediate_exec) {
348 kmp_task_team_t *task_team = this_thr->th.th_task_team;
349 if (task_team != NULL) {
350 if (TCR_SYNC_4(task_team->tt.tt_active)) {
351 if (KMP_TASKING_ENABLED(task_team)) {
352 int tasks_completed = FALSE;
353 __kmp_atomic_execute_tasks_64(
354 this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
355 &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
357 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
360 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
363 if (TCR_4(__kmp_global.g.g_done)) {
364 if (__kmp_global.g.g_abort)
365 __kmp_abort_thread();
      } else if (__kmp_tasking_mode != tskm_immediate_exec &&
                 this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
        this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
      }
    } while (threads_pending > 0);
374 if (KMP_MASTER_TID(tid)) {
375 OMPT_REDUCTION_DECL(this_thr, gtid);
376 OMPT_REDUCTION_BEGIN;
      for (size_t thr = b->threads_per_group; thr < nproc;
           thr += b->threads_per_group) {
379 (*reduce)(this_thr->th.th_local.reduce_data,
380 other_threads[thr]->th.th_local.reduce_data);
387 b->flags[my_next_iter][tid].stillNeed = 1;
390 b->flags[my_current_iter][tid].stillNeed = 0;
396 (
"__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
397 gtid, team->t.t_id, tid, bt));
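// Distributed barrier release: the primary thread stores the next go value to
// one go flag per group, group leaders fan it out within their group, sleeping
// workers are woken when blocktime is finite, and ICVs are pushed when
// requested.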
static void __kmp_dist_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
403 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
405 distributedBarrier *b;
406 kmp_bstate_t *thr_bar;
407 kmp_uint64 my_current_iter, next_go;
411 KA_TRACE(20, (
"__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
414 thr_bar = &this_thr->th.th_bar[bt].bb;
416 if (!KMP_MASTER_TID(tid)) {
419 if (this_thr->th.th_used_in_team.load() != 1 &&
420 this_thr->th.th_used_in_team.load() != 3) {
425 kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
426 if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
428 this_thr->th.th_used_in_team.load() == 0) {
      my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
431 #if USE_ITT_BUILD && USE_ITT_NOTIFY
432 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
435 __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
437 __kmp_itt_task_starting(itt_sync_obj);
439 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
442 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
443 if (itt_sync_obj != NULL)
445 __kmp_itt_task_finished(itt_sync_obj);
448 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
451 if (this_thr->th.th_used_in_team.load() != 1 &&
452 this_thr->th.th_used_in_team.load() != 3)
454 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
463 tid = __kmp_tid_from_gtid(gtid);
464 team = this_thr->th.th_team;
465 KMP_DEBUG_ASSERT(tid >= 0);
466 KMP_DEBUG_ASSERT(team);
468 my_current_iter = b->iter[tid].iter;
469 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
470 my_go_index = tid / b->threads_per_go;
471 if (this_thr->th.th_used_in_team.load() == 3) {
472 (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3,
476 if (b->go[my_go_index].go.load() != next_go) {
      kmp_atomic_flag_64<false, true> my_flag(
          &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
      my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
      KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
                       b->iter[tid].iter == 0);
      KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
486 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
493 if (this_thr->th.th_used_in_team.load() == 1)
497 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
500 group_leader = ((tid % b->threads_per_group) == 0);
      for (size_t go_idx = my_go_index + 1;
           go_idx < my_go_index + b->gos_per_group; go_idx++) {
505 b->go[go_idx].go.store(next_go);
511 #if KMP_BARRIER_ICV_PUSH
512 if (propagate_icvs) {
513 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
515 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
516 (kmp_internal_control_t *)team->t.b->team_icvs);
517 copy_icvs(&thr_bar->th_fixed_icvs,
518 &team->t.t_implicit_task_taskdata[tid].td_icvs);
521 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
524 size_t nproc = this_thr->th.th_team_nproc;
525 size_t group_end = tid + b->threads_per_group;
526 if (nproc < group_end)
528 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
531 team = this_thr->th.th_team;
533 my_current_iter = b->iter[tid].iter;
534 next_go = my_current_iter + distributedBarrier::MAX_ITERS;
535 #if KMP_BARRIER_ICV_PUSH
536 if (propagate_icvs) {
538 copy_icvs(&thr_bar->th_fixed_icvs,
539 &team->t.t_implicit_task_taskdata[tid].td_icvs);
    for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
544 b->go[go_idx].go.store(next_go);
547 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
549 size_t nproc = this_thr->th.th_team_nproc;
550 __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
551 b->threads_per_group, tid);
    for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
556 b->go[go_idx].go.store(next_go);
562 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
564 size_t nproc = this_thr->th.th_team_nproc;
565 size_t group_end = tid + b->threads_per_group;
566 if (nproc < group_end)
568 __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
572 KMP_ASSERT(my_current_iter == b->iter[tid].iter);
573 b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
576 20, (
"__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
577 gtid, team->t.t_id, tid, bt));
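// Linear barrier gather: each worker bumps its own b_arrived flag; the primary
// thread waits on every worker's flag in turn, folding in reduction data.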
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
585 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
586 kmp_team_t *team = this_thr->th.th_team;
587 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
588 kmp_info_t **other_threads = team->t.t_threads;
592 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
593 gtid, team->t.t_id, tid, bt));
594 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
596 #if USE_ITT_BUILD && USE_ITT_NOTIFY
598 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
599 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
600 __itt_get_timestamp();
605 if (!KMP_MASTER_TID(tid)) {
607 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
608 "arrived(%p): %llu => %llu\n",
609 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
610 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
611 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
616 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
619 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
620 int nproc = this_thr->th.th_team_nproc;
623 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
626 for (i = 1; i < nproc; ++i) {
630 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
632 KA_TRACE(20, (
"__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
633 "arrived(%p) == %llu\n",
634 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
636 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
640 kmp_flag_64<true, false> flag(
641 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
642 if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
645 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
647 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
649 #if USE_ITT_BUILD && USE_ITT_NOTIFY
652 if (__kmp_forkjoin_frames_mode == 2) {
653 this_thr->th.th_bar_min_time = KMP_MIN(
654 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
659 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
660 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
662 OMPT_REDUCTION_DECL(this_thr, gtid);
663 OMPT_REDUCTION_BEGIN;
664 (*reduce)(this_thr->th.th_local.reduce_data,
665 other_threads[i]->th.th_local.reduce_data);
670 team_bar->b_arrived = new_state;
671 KA_TRACE(20, (
"__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
672 "arrived(%p) = %llu\n",
673 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
678 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
679 gtid, team->t.t_id, tid, bt));
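// Linear barrier release: the primary thread copies ICVs to the workers when
// requested and bumps each worker's b_go flag; workers just wait on their own
// b_go.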
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
687 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
688 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
691 if (KMP_MASTER_TID(tid)) {
693 kmp_uint32 nproc = this_thr->th.th_team_nproc;
694 kmp_info_t **other_threads;
696 team = __kmp_threads[gtid]->th.th_team;
697 KMP_DEBUG_ASSERT(team != NULL);
698 other_threads = team->t.t_threads;
700 KA_TRACE(20, (
"__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
702 gtid, team->t.t_id, tid, bt));
705 #if KMP_BARRIER_ICV_PUSH
707 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
708 if (propagate_icvs) {
709 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
710 for (i = 1; i < nproc; ++i) {
711 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
713 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
714 &team->t.t_implicit_task_taskdata[0].td_icvs);
722 for (i = 1; i < nproc; ++i) {
726 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
730 (
"__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
731 "go(%p): %u => %u\n",
732 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
733 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
734 other_threads[i]->th.th_bar[bt].bb.b_go,
735 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
736 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
742 KA_TRACE(20, (
"__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
743 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
745 kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
746 if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
749 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
750 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
752 #if USE_ITT_BUILD && USE_ITT_NOTIFY
753 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
756 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
758 __kmp_itt_task_starting(itt_sync_obj);
760 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
763 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
764 if (itt_sync_obj != NULL)
766 __kmp_itt_task_finished(itt_sync_obj);
770 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
774 tid = __kmp_tid_from_gtid(gtid);
775 team = __kmp_threads[gtid]->th.th_team;
777 KMP_DEBUG_ASSERT(team != NULL);
778 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
780 (
"__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
781 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
786 (
"__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
787 gtid, team->t.t_id, tid, bt));
static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}
static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}
static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
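// Tree barrier gather: each thread waits for its children (per the gather
// branch factor), folds in their reduction data, then reports to its parent by
// bumping b_arrived.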
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
823 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
824 kmp_team_t *team = this_thr->th.th_team;
825 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
826 kmp_info_t **other_threads = team->t.t_threads;
827 kmp_uint32 nproc = this_thr->th.th_team_nproc;
828 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
829 kmp_uint32 branch_factor = 1 << branch_bits;
831 kmp_uint32 child_tid;
832 kmp_uint64 new_state = 0;
835 20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
836 gtid, team->t.t_id, tid, bt));
837 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
839 #if USE_ITT_BUILD && USE_ITT_NOTIFY
841 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
842 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
843 __itt_get_timestamp();
848 child_tid = (tid << branch_bits) + 1;
849 if (child_tid < nproc) {
851 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
854 kmp_info_t *child_thr = other_threads[child_tid];
855 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
858 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
860 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
863 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
864 "arrived(%p) == %llu\n",
865 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
866 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
868 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
869 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
870 #if USE_ITT_BUILD && USE_ITT_NOTIFY
873 if (__kmp_forkjoin_frames_mode == 2) {
874 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
875 child_thr->th.th_bar_min_time);
880 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
881 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
882 team->t.t_id, child_tid));
883 OMPT_REDUCTION_DECL(this_thr, gtid);
884 OMPT_REDUCTION_BEGIN;
885 (*reduce)(this_thr->th.th_local.reduce_data,
886 child_thr->th.th_local.reduce_data);
    } while (child <= branch_factor && child_tid < nproc);
894 if (!KMP_MASTER_TID(tid)) {
895 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
898 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
899 "arrived(%p): %llu => %llu\n",
900 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
901 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
902 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
908 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
913 team->t.t_bar[bt].b_arrived = new_state;
915 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
916 KA_TRACE(20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
917 "arrived(%p) = %llu\n",
918 gtid, team->t.t_id, tid, team->t.t_id,
919 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
922 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
923 gtid, team->t.t_id, tid, bt));
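// Tree barrier release: a thread waits on its own b_go, then releases its
// children, pushing ICVs down the tree when requested.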
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
929 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
931 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
933 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
934 kmp_uint32 branch_factor = 1 << branch_bits;
936 kmp_uint32 child_tid;
941 KA_TRACE(20, (
"__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
942 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
944 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
945 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
946 #if USE_ITT_BUILD && USE_ITT_NOTIFY
947 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
950 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
952 __kmp_itt_task_starting(itt_sync_obj);
954 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
957 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
958 if (itt_sync_obj != NULL)
960 __kmp_itt_task_finished(itt_sync_obj);
964 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
968 team = __kmp_threads[gtid]->th.th_team;
969 KMP_DEBUG_ASSERT(team != NULL);
970 tid = __kmp_tid_from_gtid(gtid);
972 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
974 (
"__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
975 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
978 team = __kmp_threads[gtid]->th.th_team;
979 KMP_DEBUG_ASSERT(team != NULL);
980 KA_TRACE(20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
982 gtid, team->t.t_id, tid, bt));
984 nproc = this_thr->th.th_team_nproc;
985 child_tid = (tid << branch_bits) + 1;
987 if (child_tid < nproc) {
988 kmp_info_t **other_threads = team->t.t_threads;
992 kmp_info_t *child_thr = other_threads[child_tid];
993 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
996 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
998 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
1001 #if KMP_BARRIER_ICV_PUSH
1003 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1004 if (propagate_icvs) {
1005 __kmp_init_implicit_task(team->t.t_ident,
1006 team->t.t_threads[child_tid], team,
1008 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
1009 &team->t.t_implicit_task_taskdata[0].td_icvs);
1014 (
"__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1015 "go(%p): %u => %u\n",
1016 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1017 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1018 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1020 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
    } while (child <= branch_factor && child_tid < nproc);
1027 20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1028 gtid, team->t.t_id, tid, bt));
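// Hypercube-embedded tree gather: at each level a thread either reports to its
// parent for that level or collects its children, completing in a logarithmic
// number of rounds.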
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1035 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1036 kmp_team_t *team = this_thr->th.th_team;
1037 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1038 kmp_info_t **other_threads = team->t.t_threads;
1039 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1040 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1041 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1042 kmp_uint32 branch_factor = 1 << branch_bits;
1048 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1049 gtid, team->t.t_id, tid, bt));
1050 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1052 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1054 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1055 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1056 __itt_get_timestamp();
1061 kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1062 for (level = 0, offset = 1; offset < num_threads;
1063 level += branch_bits, offset <<= branch_bits) {
1065 kmp_uint32 child_tid;
1067 if (((tid >> level) & (branch_factor - 1)) != 0) {
1068 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1072 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1073 "arrived(%p): %llu => %llu\n",
1074 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1075 team->t.t_id, parent_tid, &thr_bar->b_arrived,
1077 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1082 p_flag.set_waiter(other_threads[parent_tid]);
1088 if (new_state == KMP_BARRIER_UNUSED_STATE)
1089 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1090 for (child = 1, child_tid = tid + (1 << level);
1091 child < branch_factor && child_tid < num_threads;
1092 child++, child_tid += (1 << level)) {
1093 kmp_info_t *child_thr = other_threads[child_tid];
1094 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1095 #if KMP_CACHE_MANAGE
1096 kmp_uint32 next_child_tid = child_tid + (1 << level);
1098 if (child + 1 < branch_factor && next_child_tid < num_threads)
1100 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1103 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1104 "arrived(%p) == %llu\n",
1105 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1106 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1108 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1109 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1111 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1114 if (__kmp_forkjoin_frames_mode == 2) {
1115 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1116 child_thr->th.th_bar_min_time);
1121 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1122 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1123 team->t.t_id, child_tid));
1124 OMPT_REDUCTION_DECL(this_thr, gtid);
1125 OMPT_REDUCTION_BEGIN;
1126 (*reduce)(this_thr->th.th_local.reduce_data,
1127 child_thr->th.th_local.reduce_data);
1133 if (KMP_MASTER_TID(tid)) {
1135 if (new_state == KMP_BARRIER_UNUSED_STATE)
1136 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1138 team->t.t_bar[bt].b_arrived = new_state;
1139 KA_TRACE(20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1140 "arrived(%p) = %llu\n",
1141 gtid, team->t.t_id, tid, team->t.t_id,
1142 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1145 20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1146 gtid, team->t.t_id, tid, bt));
1150 #define KMP_REVERSE_HYPER_BAR
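// Hypercube-embedded tree release; with KMP_REVERSE_HYPER_BAR defined, the
// levels are walked in the reverse of the gather order when releasing children.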
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1154 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1156 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1157 kmp_info_t **other_threads;
1158 kmp_uint32 num_threads;
1159 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1160 kmp_uint32 branch_factor = 1 << branch_bits;
1162 kmp_uint32 child_tid;
1170 if (KMP_MASTER_TID(tid)) {
1171 team = __kmp_threads[gtid]->th.th_team;
1172 KMP_DEBUG_ASSERT(team != NULL);
1173 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1174 "barrier type %d\n",
1175 gtid, team->t.t_id, tid, bt));
1176 #if KMP_BARRIER_ICV_PUSH
1177 if (propagate_icvs) {
1178 copy_icvs(&thr_bar->th_fixed_icvs,
1179 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1183 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1184 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1186 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1187 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1188 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1189 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1191 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1193 __kmp_itt_task_starting(itt_sync_obj);
1195 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1198 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1199 if (itt_sync_obj != NULL)
1201 __kmp_itt_task_finished(itt_sync_obj);
1205 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1209 team = __kmp_threads[gtid]->th.th_team;
1210 KMP_DEBUG_ASSERT(team != NULL);
1211 tid = __kmp_tid_from_gtid(gtid);
1213 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1215 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1216 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1219 num_threads = this_thr->th.th_team_nproc;
1220 other_threads = team->t.t_threads;
1222 #ifdef KMP_REVERSE_HYPER_BAR
1224 for (level = 0, offset = 1;
1225 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1226 level += branch_bits, offset <<= branch_bits)
1230 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1231 level -= branch_bits, offset >>= branch_bits)
1234 for (level = 0, offset = 1; offset < num_threads;
1235 level += branch_bits, offset <<= branch_bits)
1238 #ifdef KMP_REVERSE_HYPER_BAR
1241 child = num_threads >> ((level == 0) ? level : level - 1);
1242 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1243 child_tid = tid + (child << level);
1244 child >= 1; child--, child_tid -= (1 << level))
1246 if (((tid >> level) & (branch_factor - 1)) != 0)
1251 for (child = 1, child_tid = tid + (1 << level);
1252 child < branch_factor && child_tid < num_threads;
1253 child++, child_tid += (1 << level))
1256 if (child_tid >= num_threads)
1259 kmp_info_t *child_thr = other_threads[child_tid];
1260 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1261 #if KMP_CACHE_MANAGE
1262 kmp_uint32 next_child_tid = child_tid - (1 << level);
1264 #ifdef KMP_REVERSE_HYPER_BAR
1265 if (child - 1 >= 1 && next_child_tid < num_threads)
1267 if (child + 1 < branch_factor && next_child_tid < num_threads)
1270 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1273 #if KMP_BARRIER_ICV_PUSH
1275 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1280 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1281 "go(%p): %u => %u\n",
1282 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1283 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1284 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1286 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1291 #if KMP_BARRIER_ICV_PUSH
1292 if (propagate_icvs &&
1293 !KMP_MASTER_TID(tid)) {
1294 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1296 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1297 &thr_bar->th_fixed_icvs);
1302 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1303 gtid, team->t.t_id, tid, bt));
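// Set up (or refresh) a thread's hierarchical barrier state: its level, parent,
// offset, and leaf-kid bookkeeping; the return value lets the release path
// detect that the team or thread layout changed.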
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
1321 bool uninitialized = thr_bar->team == NULL;
1322 bool team_changed = team != thr_bar->team;
1323 bool team_sz_changed = nproc != thr_bar->nproc;
1324 bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;
1327 if (uninitialized || team_sz_changed) {
1328 __kmp_get_hierarchy(nproc, thr_bar);
1331 if (uninitialized || team_sz_changed || tid_changed) {
1332 thr_bar->my_level = thr_bar->depth - 1;
1333 thr_bar->parent_tid = -1;
1334 if (!KMP_MASTER_TID(tid)) {
1337 while (d < thr_bar->depth) {
1340 if (d == thr_bar->depth - 2) {
1341 thr_bar->parent_tid = 0;
1342 thr_bar->my_level = d;
      } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1347 thr_bar->parent_tid = tid - rem;
1348 thr_bar->my_level = d;
1354 __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1355 (thr_bar->skip_per_level[thr_bar->my_level])),
1356 &(thr_bar->offset));
1357 thr_bar->old_tid = tid;
1358 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1359 thr_bar->team = team;
1360 thr_bar->parent_bar =
1361 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1363 if (uninitialized || team_changed || tid_changed) {
1364 thr_bar->team = team;
1365 thr_bar->parent_bar =
1366 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1369 if (uninitialized || team_sz_changed || tid_changed) {
1370 thr_bar->nproc = nproc;
1371 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1372 if (thr_bar->my_level == 0)
1373 thr_bar->leaf_kids = 0;
1374 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1375 __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1376 thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
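// Hierarchical (topology-aware) gather: when the on-core barrier is usable,
// leaf children are collected through byte flags packed into the parent's
// b_arrived word; otherwise each level is gathered with ordinary 64-bit flags.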
static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1386 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1387 kmp_team_t *team = this_thr->th.th_team;
1388 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1389 kmp_uint32 nproc = this_thr->th.th_team_nproc;
1390 kmp_info_t **other_threads = team->t.t_threads;
1391 kmp_uint64 new_state = 0;
1393 int level = team->t.t_level;
  if (other_threads[0]->th.th_teams_microtask)
1396 if (this_thr->th.th_teams_size.nteams > 1)
1399 thr_bar->use_oncore_barrier = 1;
1401 thr_bar->use_oncore_barrier = 0;
1403 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1404 "barrier type %d\n",
1405 gtid, team->t.t_id, tid, bt));
1406 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1408 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1410 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1411 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1415 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1418 if (thr_bar->my_level) {
1419 kmp_int32 child_tid;
1421 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1422 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1423 thr_bar->use_oncore_barrier) {
1424 if (thr_bar->leaf_kids) {
1426 kmp_uint64 leaf_state =
1428 ? thr_bar->b_arrived | thr_bar->leaf_state
1429 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1430 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1432 gtid, team->t.t_id, tid));
1433 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1434 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1436 OMPT_REDUCTION_DECL(this_thr, gtid);
1437 OMPT_REDUCTION_BEGIN;
1438 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1440 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1442 gtid, team->t.t_id, tid,
1443 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1445 (*reduce)(this_thr->th.th_local.reduce_data,
1446 other_threads[child_tid]->th.th_local.reduce_data);
1451 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1454 for (kmp_uint32 d = 1; d < thr_bar->my_level;
1456 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1457 skip = thr_bar->skip_per_level[d];
1460 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1461 kmp_info_t *child_thr = other_threads[child_tid];
1462 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1463 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1465 "arrived(%p) == %llu\n",
1466 gtid, team->t.t_id, tid,
1467 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1468 child_tid, &child_bar->b_arrived, new_state));
1469 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1470 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1472 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1474 gtid, team->t.t_id, tid,
1475 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1477 (*reduce)(this_thr->th.th_local.reduce_data,
1478 child_thr->th.th_local.reduce_data);
1483 for (kmp_uint32 d = 0; d < thr_bar->my_level;
1485 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1486 skip = thr_bar->skip_per_level[d];
1489 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1490 kmp_info_t *child_thr = other_threads[child_tid];
1491 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1492 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1494 "arrived(%p) == %llu\n",
1495 gtid, team->t.t_id, tid,
1496 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1497 child_tid, &child_bar->b_arrived, new_state));
1498 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1499 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1501 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1503 gtid, team->t.t_id, tid,
1504 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1506 (*reduce)(this_thr->th.th_local.reduce_data,
1507 child_thr->th.th_local.reduce_data);
1515 if (!KMP_MASTER_TID(tid)) {
1516 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1517 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1518 gtid, team->t.t_id, tid,
1519 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1520 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1521 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1525 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1526 !thr_bar->use_oncore_barrier) {
1528 kmp_flag_64<> flag(&thr_bar->b_arrived,
1529 other_threads[thr_bar->parent_tid]);
1533 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1534 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1535 thr_bar->offset + 1);
1536 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1540 team->t.t_bar[bt].b_arrived = new_state;
1541 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1542 "arrived(%p) = %llu\n",
1543 gtid, team->t.t_id, tid, team->t.t_id,
1544 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1547 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1548 "barrier type %d\n",
1549 gtid, team->t.t_id, tid, bt));
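// Hierarchical release: a worker waits either on its own b_go or on its byte of
// the parent's flag (on-core case), then propagates the release and ICVs down
// its subtree level by level.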
static void __kmp_hierarchical_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1555 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1557 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  bool team_change = false;
1561 if (KMP_MASTER_TID(tid)) {
1562 team = __kmp_threads[gtid]->th.th_team;
1563 KMP_DEBUG_ASSERT(team != NULL);
1564 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1565 "entered barrier type %d\n",
1566 gtid, team->t.t_id, tid, bt));
1569 if (!thr_bar->use_oncore_barrier ||
1570 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1571 thr_bar->team == NULL) {
1573 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1574 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1575 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
      TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1581 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1582 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1583 thr_bar->offset + 1, bt,
1584 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1585 flag.wait(this_thr, TRUE);
      if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) {
        TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
      } else {
        (RCAST(volatile char *,
               &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1595 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1597 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1600 team = __kmp_threads[gtid]->th.th_team;
1601 KMP_DEBUG_ASSERT(team != NULL);
1602 tid = __kmp_tid_from_gtid(gtid);
1606 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1607 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1611 nproc = this_thr->th.th_team_nproc;
1612 int level = team->t.t_level;
  if (team->t.t_threads[0]->th.th_teams_microtask) {
1615 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1616 this_thr->th.th_teams_level == level)
1618 if (this_thr->th.th_teams_size.nteams > 1)
1622 thr_bar->use_oncore_barrier = 1;
1624 thr_bar->use_oncore_barrier = 0;
1628 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1629 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1630 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1636 #if KMP_BARRIER_ICV_PUSH
1637 if (propagate_icvs) {
1638 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1642 copy_icvs(&thr_bar->th_fixed_icvs,
1643 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1644 }
else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1645 thr_bar->use_oncore_barrier) {
1646 if (!thr_bar->my_level)
1649 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1650 &thr_bar->parent_bar->th_fixed_icvs);
1653 if (thr_bar->my_level)
1655 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1657 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1658 &thr_bar->parent_bar->th_fixed_icvs);
1664 if (thr_bar->my_level) {
1665 kmp_int32 child_tid;
1667 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1668 thr_bar->use_oncore_barrier) {
1669 if (KMP_MASTER_TID(tid)) {
1672 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1675 ngo_load(&thr_bar->th_fixed_icvs);
      for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
           child_tid += thr_bar->skip_per_level[1]) {
1680 kmp_bstate_t *child_bar =
1681 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1682 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1683 "releasing T#%d(%d:%d)"
1684 " go(%p): %u => %u\n",
1685 gtid, team->t.t_id, tid,
1686 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1687 child_tid, &child_bar->b_go, child_bar->b_go,
1688 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1691 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
      TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1698 if (thr_bar->leaf_kids) {
1701 old_leaf_kids < thr_bar->leaf_kids) {
1702 if (old_leaf_kids) {
1703 thr_bar->b_go |= old_leaf_state;
1706 last = tid + thr_bar->skip_per_level[1];
1709 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1711 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1715 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1716 " T#%d(%d:%d) go(%p): %u => %u\n",
1717 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1718 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1719 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1721 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1726 thr_bar->b_go |= thr_bar->leaf_state;
    for (int d = thr_bar->my_level - 1; d >= 0; --d) {
      last = tid + thr_bar->skip_per_level[d + 1];
      kmp_uint32 skip = thr_bar->skip_per_level[d];
1736 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1737 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1738 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1739 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1740 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1741 gtid, team->t.t_id, tid,
1742 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1743 child_tid, &child_bar->b_go, child_bar->b_go,
1744 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1746 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1751 #if KMP_BARRIER_ICV_PUSH
1752 if (propagate_icvs && !KMP_MASTER_TID(tid))
1754 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1755 &thr_bar->th_fixed_icvs);
1758 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1759 "barrier type %d\n",
1760 gtid, team->t.t_id, tid, bt));
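// Helper type for __kmp_barrier_template: carries a "barrier was cancelled"
// bit when cancellation support is compiled in, and collapses to a constant
// false otherwise.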
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};
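// Internal barrier driver shared by __kmp_barrier and the GOMP cancellation
// entry point: runs the gather phase with the configured pattern, lets the
// primary thread handle tasking and cancellation, and runs the release phase
// unless this is a split barrier. When cancellable, the return value reports
// whether a cancellation request was observed.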
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
1796 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1797 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1798 int tid = __kmp_tid_from_gtid(gtid);
1799 kmp_info_t *this_thr = __kmp_threads[gtid];
1800 kmp_team_t *team = this_thr->th.th_team;
1802 is_cancellable<cancellable> cancelled;
1803 #if OMPT_SUPPORT && OMPT_OPTIONAL
1804 ompt_data_t *my_task_data;
1805 ompt_data_t *my_parallel_data;
1806 void *return_address;
1807 ompt_sync_region_t barrier_kind;
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1814 if (ompt_enabled.enabled) {
1816 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1817 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1818 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1819 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1820 if (ompt_enabled.ompt_callback_sync_region) {
1821 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1822 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1825 if (ompt_enabled.ompt_callback_sync_region_wait) {
1826 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1827 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1834 auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
1835 switch (barrier_kind) {
1836 case ompt_sync_region_barrier_explicit:
1837 ompt_thr_info->state = ompt_state_wait_barrier_explicit;
1839 case ompt_sync_region_barrier_implicit_workshare:
1840 ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
1842 case ompt_sync_region_barrier_implicit_parallel:
1843 ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
1845 case ompt_sync_region_barrier_teams:
1846 ompt_thr_info->state = ompt_state_wait_barrier_teams;
1848 case ompt_sync_region_barrier_implementation:
1851 ompt_thr_info->state = ompt_state_wait_barrier_implementation;
1856 if (!team->t.t_serialized) {
1859 void *itt_sync_obj = NULL;
1861 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1862 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1865 if (__kmp_tasking_mode == tskm_extra_barrier) {
1866 __kmp_tasking_barrier(team, this_thr, gtid);
1868 (
"__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1869 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1876 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1878 this_thr->th.th_team_bt_intervals =
1879 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1880 this_thr->th.th_team_bt_set =
1881 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1883 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1888 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1889 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1893 if (KMP_MASTER_TID(tid)) {
1894 team->t.t_bar[bt].b_master_arrived += 1;
1896 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1899 if (reduce != NULL) {
1901 this_thr->th.th_local.reduce_data = reduce_data;
1904 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1905 __kmp_task_team_setup(this_thr, team);
1908 cancelled = __kmp_linear_barrier_gather_cancellable(
1909 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1911 switch (__kmp_barrier_gather_pattern[bt]) {
1913 __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1914 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1917 case bp_hyper_bar: {
1918 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1919 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1922 case bp_hierarchical_bar: {
1923 __kmp_hierarchical_barrier_gather(
1924 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1928 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1929 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1933 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1934 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1941 if (KMP_MASTER_TID(tid)) {
1943 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1944 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1949 team->t.t_bar[bt].b_team_arrived += 1;
1952 if (__kmp_omp_cancellation) {
1953 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1955 if (cancel_request == cancel_loop ||
1956 cancel_request == cancel_sections) {
1957 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1965 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1966 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1968 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1970 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1971 __kmp_forkjoin_frames_mode &&
1972 (this_thr->th.th_teams_microtask == NULL ||
1973 this_thr->th.th_teams_size.nteams == 1) &&
1974 team->t.t_active_level == 1) {
1975 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1976 kmp_uint64 cur_time = __itt_get_timestamp();
1977 kmp_info_t **other_threads = team->t.t_threads;
1978 int nproc = this_thr->th.th_team_nproc;
1980 switch (__kmp_forkjoin_frames_mode) {
1982 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1984 this_thr->th.th_frame_time = cur_time;
1988 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1992 if (__itt_metadata_add_ptr) {
1994 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1997 this_thr->th.th_bar_arrive_time = 0;
1998 for (i = 1; i < nproc; ++i) {
1999 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2000 other_threads[i]->th.th_bar_arrive_time = 0;
2002 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2004 (kmp_uint64)(reduce != NULL));
2006 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2008 this_thr->th.th_frame_time = cur_time;
2016 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2017 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2020 if ((status == 1 || !is_split) && !cancelled) {
2022 cancelled = __kmp_linear_barrier_release_cancellable(
2023 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2025 switch (__kmp_barrier_release_pattern[bt]) {
2027 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2028 __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2029 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2032 case bp_hyper_bar: {
2033 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2034 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2035 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2038 case bp_hierarchical_bar: {
2039 __kmp_hierarchical_barrier_release(
2040 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2044 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2045 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2046 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2050 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2051 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2055 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2056 __kmp_task_team_sync(this_thr, team);
2064 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2065 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2069 if (__kmp_tasking_mode != tskm_immediate_exec) {
2070 if (this_thr->th.th_task_team != NULL) {
2072 void *itt_sync_obj = NULL;
2073 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2074 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2075 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2080 this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2081 this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2083 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2084 __kmp_task_team_setup(this_thr, team);
2087 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2088 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));
2098 if (ompt_enabled.enabled) {
2100 if (ompt_enabled.ompt_callback_sync_region_wait) {
2101 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2102 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2105 if (ompt_enabled.ompt_callback_sync_region) {
2106 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2107 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2111 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  return (int)cancelled;
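// Public entry point: a plain (non-cancellable) barrier of the requested type,
// optionally performing a reduction.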
int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
                  size_t reduce_size, void *reduce_data,
                  void (*reduce)(void *, void *)) {
  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
                                  reduce);
}
2128 #if defined(KMP_GOMP_COMPAT)
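// GOMP compatibility: run a cancellable plain barrier; on cancellation the
// primary thread rolls its arrived count back so the barrier can be retried.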
int __kmp_barrier_gomp_cancel(int gtid) {
2131 if (__kmp_omp_cancellation) {
2132 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2135 int tid = __kmp_tid_from_gtid(gtid);
2136 kmp_info_t *this_thr = __kmp_threads[gtid];
2137 if (KMP_MASTER_TID(tid)) {
2141 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2142 KMP_BARRIER_STATE_BUMP;
2147 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
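// Finish the release phase of a split barrier started earlier with
// is_split=TRUE; only the primary thread performs the release.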
void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2153 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2154 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2155 KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2156 int tid = __kmp_tid_from_gtid(gtid);
2157 kmp_info_t *this_thr = __kmp_threads[gtid];
2158 kmp_team_t *team = this_thr->th.th_team;
2160 if (!team->t.t_serialized) {
2161 if (KMP_MASTER_GTID(gtid)) {
2162 switch (__kmp_barrier_release_pattern[bt]) {
2164 __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2165 FALSE USE_ITT_BUILD_ARG(NULL));
2168 case bp_hyper_bar: {
2169 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2170 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2171 FALSE USE_ITT_BUILD_ARG(NULL));
2174 case bp_hierarchical_bar: {
2175 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2176 FALSE USE_ITT_BUILD_ARG(NULL));
2180 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2181 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2182 FALSE USE_ITT_BUILD_ARG(NULL));
2186 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2187 FALSE USE_ITT_BUILD_ARG(NULL));
2190 if (__kmp_tasking_mode != tskm_immediate_exec) {
2191 __kmp_task_team_sync(this_thr, team);
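
// Barrier executed at the end of a parallel region: every thread of the team
// gathers here; the primary thread additionally drains the task team and,
// when ITT is enabled, reports frame/imbalance data before the team is
// disbanded.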
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);

  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  int tid;
  void *itt_sync_obj = NULL;
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    // Get the object created at fork_barrier.
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
  int nproc = this_thr->th.th_team_nproc;
#endif

  // Get current info.
  team = this_thr->th.th_team;
  KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
  int team_id = team->t.t_id;
  kmp_info_t *master_thread = this_thr->th.th_team_master;
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);

  // Verify state.
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));
  if (ompt_enabled.enabled) {
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
      sync_kind = ompt_sync_region_barrier_teams;
      ompt_state = ompt_state_wait_barrier_teams;
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
    this_thr->th.ompt_thread_info.state = ompt_state;
  }
  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
  }
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
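
  // Gather at the join barrier using the pattern configured for the
  // fork/join barrier type.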
  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hyper_bar: {
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Flag the workers as waiting for the next parallel region and wake any
    // sleepers so they switch their timers to idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(team_thread);
    }
#endif
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end.
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside a single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with the primary thread's wait time.
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          this_thr->th.th_bar_arrive_time = 0;
          for (int i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  }
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }

  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
}
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
  void *itt_sync_obj = NULL;

  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                (team != NULL) ? team->t.t_id : -1, tid));
  // th_team pointer is only valid for the primary thread here.
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create an ITT barrier object for the fork/join barrier.
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

    KMP_DEBUG_ASSERT(team);
    kmp_info_t **other_threads = team->t.t_threads;

    // Verify that the workers are still in the initial fork-barrier state.
    for (int i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }

    if (__kmp_tasking_mode != tskm_immediate_exec)
      __kmp_task_team_setup(this_thr, team);
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }
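
  // Release the workers using the configured release pattern; the TRUE
  // argument asks the release routine to propagate the ICVs to the workers
  // where the pattern supports it.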
  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(NULL));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }
  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
  if (ompt_enabled.enabled &&
      (ompt_state == ompt_state_wait_barrier_teams ||
       ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
      sync_kind = ompt_sync_region_barrier_teams;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit);
    }
  }
  // Early exit for reaping threads releasing the fork/join barrier.
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  tid = __kmp_tid_from_gtid(gtid);
#if KMP_BARRIER_ICV_PULL
  // Workers pull the fixed ICVs from the primary thread's fork/join barrier
  // struct into their own implicit task.
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) {
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }
#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings.
    if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_bind_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display-affinity functionality.
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV.
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get the correct barrier object; workers report "acquired" here.
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
    }
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}
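
// Prepares the ICV copy for a (re)formed team. Depending on the barrier ICV
// scheme this either stashes the ICVs where workers can pull them at the fork
// barrier, relies on the push done during release, or copies them linearly
// into each worker's implicit task here.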
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

#if KMP_BARRIER_ICV_PULL
  // Copy the ICVs into th_fixed_icvs on the primary thread's fork/join
  // barrier struct, where all workers can access them and make their own
  // copies after the barrier.
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // threads array is allocated by now
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be pushed to the workers when the fork barrier is released.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads; this takes O(nthreads).
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // threads array is allocated by now
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread.
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));