14 #include "kmp_wait_release.h"
17 #include "kmp_stats.h"
19 #include "ompt-specific.h"
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
27 #include "tsan_annotations.h"
29 #if KMP_MIC && USE_NGO_STORES
31 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
32 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
36 #define ngo_load(src) ((void)0)
37 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
38 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
39 #define ngo_sync() ((void)0)
42 void __kmp_print_structure(
void);
// Gather phase of the linear (centralized) barrier.
// Workers: bump their own b_arrived flag (releasing the master, which may be
// sleeping on it). Master (tid 0): spin-waits for each worker's b_arrived in
// turn and, if a (*reduce) callback is supplied, folds each worker's
// th_local.reduce_data into its own before publishing the team's new
// b_arrived state.
// Template parameter `cancellable` selects the cancellation-aware wait path
// (wait_cancellable_nosleep); the return value reports whether the barrier
// was cancelled.
// NOTE(review): this chunk is a lossy extraction — interleaved lines
// (closing braces, KA_TRACE openings) are missing; code tokens below are
// left exactly as found.
47 template <
bool cancellable = false>
48 static bool __kmp_linear_barrier_gather_template(
49 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
50 void (*reduce)(
void *,
void *) USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
51 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52 kmp_team_t *team = this_thr->th.th_team;
53 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54 kmp_info_t **other_threads = team->t.t_threads;
58 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59 gtid, team->t.t_id, tid, bt));
60 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// Record arrival timestamp for ITT fork/join frame reporting (modes 2 and 3).
64 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66 __itt_get_timestamp();
// Worker path: signal arrival to the master (tid 0) via b_arrived.
71 if (!KMP_MASTER_TID(tid)) {
73 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
74 "arrived(%p): %llu => %llu\n",
75 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
82 ANNOTATE_BARRIER_BEGIN(this_thr);
// The flag names the master as waiter so it can be woken if sleeping.
83 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
// Master path: wait for every worker 1..nproc-1 to arrive.
86 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87 int nproc = this_thr->th.th_team_nproc;
90 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
93 for (i = 1; i < nproc; ++i) {
// Prefetch the next worker's arrived flag to hide cache-miss latency.
97 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
99 KA_TRACE(20, (
"__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
100 "arrived(%p) == %llu\n",
101 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
103 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
106 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
// Cancellable instantiation: bail out early if the barrier was cancelled.
109 bool cancelled = flag.wait_cancellable_nosleep(
110 this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
114 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
116 ANNOTATE_BARRIER_END(other_threads[i]);
117 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// Track the earliest arrival time across the team for frame reporting.
120 if (__kmp_forkjoin_frames_mode == 2) {
121 this_thr->th.th_bar_min_time = KMP_MIN(
122 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
127 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
128 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
// Fold worker i's partial reduction into the master's reduce_data.
130 ANNOTATE_REDUCE_AFTER(reduce);
131 (*reduce)(this_thr->th.th_local.reduce_data,
132 other_threads[i]->th.th_local.reduce_data);
133 ANNOTATE_REDUCE_BEFORE(reduce);
134 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
// All workers arrived: publish the bumped team-wide arrived state.
138 team_bar->b_arrived = new_state;
139 KA_TRACE(20, (
"__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
140 "arrived(%p) = %llu\n",
141 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
146 (
"__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
147 gtid, team->t.t_id, tid, bt));
// Release phase of the linear (centralized) barrier.
// Master: optionally pushes ICVs to all implicit tasks (using NGO stores on
// KMP_MIC), then bumps each worker's b_go flag to release it. Workers: wait
// on their own b_go, then reset it to KMP_INIT_BARRIER_STATE; tid/team are
// re-read after the wait because the thread may have been re-assigned while
// sleeping.
// Template parameter `cancellable` selects the cancellation-aware wait; the
// return value reports whether the barrier was cancelled.
// NOTE(review): lossy extraction — interleaved lines are missing; tokens
// left exactly as found.
151 template <
bool cancellable = false>
152 static bool __kmp_linear_barrier_release_template(
153 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
154 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
155 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
156 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
// Master path: release each worker in turn.
159 if (KMP_MASTER_TID(tid)) {
161 kmp_uint32 nproc = this_thr->th.th_team_nproc;
162 kmp_info_t **other_threads;
164 team = __kmp_threads[gtid]->th.th_team;
165 KMP_DEBUG_ASSERT(team != NULL);
166 other_threads = team->t.t_threads;
168 KA_TRACE(20, (
"__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170 gtid, team->t.t_id, tid, bt));
173 #if KMP_BARRIER_ICV_PUSH
// Copy master's ICVs into every worker's implicit task before release,
// so workers observe up-to-date ICVs when they resume.
175 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
176 if (propagate_icvs) {
177 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
178 for (i = 1; i < nproc; ++i) {
179 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
182 &team->t.t_implicit_task_taskdata[0].td_icvs);
187 #endif // KMP_BARRIER_ICV_PUSH
// Bump every worker's b_go to wake it.
190 for (i = 1; i < nproc; ++i) {
194 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
198 (
"__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
199 "go(%p): %u => %u\n",
200 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
201 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
202 other_threads[i]->th.th_bar[bt].bb.b_go,
203 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
204 ANNOTATE_BARRIER_BEGIN(other_threads[i]);
205 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
// Worker path: wait for own b_go to reach KMP_BARRIER_STATE_BUMP.
211 KA_TRACE(20, (
"__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
212 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
213 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
215 bool cancelled = flag.wait_cancellable_nosleep(
216 this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
221 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
223 ANNOTATE_BARRIER_END(this_thr);
224 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// In sequential-task mode, re-acquire the ITT barrier object and mark the
// task transition; bail out if the library is shutting down.
225 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
228 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
230 __kmp_itt_task_starting(itt_sync_obj);
232 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
235 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
236 if (itt_sync_obj != NULL)
238 __kmp_itt_task_finished(itt_sync_obj);
// Early exit at the fork/join barrier if the runtime is shutting down.
242 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
// Re-read tid/team: the thread may have been moved while sleeping.
246 tid = __kmp_tid_from_gtid(gtid);
247 team = __kmp_threads[gtid]->th.th_team;
249 KMP_DEBUG_ASSERT(team != NULL);
// Reset own go-flag for the next barrier episode.
250 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
252 (
"__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
253 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
258 (
"__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
259 gtid, team->t.t_id, tid, bt));
263 static void __kmp_linear_barrier_gather(
264 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
265 void (*reduce)(
void *,
void *) USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
266 __kmp_linear_barrier_gather_template<false>(
267 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
270 static bool __kmp_linear_barrier_gather_cancellable(
271 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
272 void (*reduce)(
void *,
void *) USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
273 return __kmp_linear_barrier_gather_template<true>(
274 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
277 static void __kmp_linear_barrier_release(
278 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
279 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
280 __kmp_linear_barrier_release_template<false>(
281 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
284 static bool __kmp_linear_barrier_release_cancellable(
285 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
286 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
287 return __kmp_linear_barrier_release_template<true>(
288 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
// Gather phase of the tree barrier. Threads form a tree with branching
// factor 2^branch_bits (per-barrier-type, from
// __kmp_barrier_gather_branch_bits). Each internal node waits for its
// children's b_arrived flags (folding reduce_data if a reduction callback is
// given), then signals its own parent; the root (master) publishes the
// team's new arrived state.
// NOTE(review): lossy extraction — interleaved lines are missing; tokens
// left exactly as found.
293 __kmp_tree_barrier_gather(
enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
294 int tid,
void (*reduce)(
void *,
void *)
295 USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
296 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
297 kmp_team_t *team = this_thr->th.th_team;
298 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
299 kmp_info_t **other_threads = team->t.t_threads;
300 kmp_uint32 nproc = this_thr->th.th_team_nproc;
301 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
302 kmp_uint32 branch_factor = 1 << branch_bits;
304 kmp_uint32 child_tid;
305 kmp_uint64 new_state;
308 20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
309 gtid, team->t.t_id, tid, bt));
310 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
312 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// Record arrival timestamp for ITT fork/join frame reporting.
314 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
315 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
316 __itt_get_timestamp();
// First child of tid in the tree layout; only wait if it exists.
321 child_tid = (tid << branch_bits) + 1;
322 if (child_tid < nproc) {
324 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
327 kmp_info_t *child_thr = other_threads[child_tid];
328 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
// Prefetch the next child's flag while waiting on the current one.
331 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
333 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
336 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
337 "arrived(%p) == %llu\n",
338 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
339 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
// Block until this child reaches the new arrived state.
341 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
342 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
343 ANNOTATE_BARRIER_END(child_thr);
344 #if USE_ITT_BUILD && USE_ITT_NOTIFY
347 if (__kmp_forkjoin_frames_mode == 2) {
348 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
349 child_thr->th.th_bar_min_time);
354 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
355 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
356 team->t.t_id, child_tid));
// Fold the child's partial reduction into this node's reduce_data.
357 ANNOTATE_REDUCE_AFTER(reduce);
358 (*reduce)(this_thr->th.th_local.reduce_data,
359 child_thr->th.th_local.reduce_data);
360 ANNOTATE_REDUCE_BEFORE(reduce);
361 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
365 }
while (child <= branch_factor && child_tid < nproc);
// Non-root: signal own arrival to the parent node.
368 if (!KMP_MASTER_TID(tid)) {
369 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
372 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
373 "arrived(%p): %llu => %llu\n",
374 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
375 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
376 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
382 ANNOTATE_BARRIER_BEGIN(this_thr);
383 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
// Root: publish the team-wide arrived state (bump directly when the team
// has a single thread and new_state was never computed).
388 team->t.t_bar[bt].b_arrived = new_state;
390 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
391 KA_TRACE(20, (
"__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
392 "arrived(%p) = %llu\n",
393 gtid, team->t.t_id, tid, team->t.t_id,
394 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
397 (
"__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
398 gtid, team->t.t_id, tid, bt));
// Release phase of the tree barrier. Non-masters first wait on their own
// b_go flag, then each node (master included) releases its children by
// bumping their b_go flags, optionally pushing ICVs into each child's
// implicit task first. Branching factor comes from
// __kmp_barrier_release_branch_bits.
// NOTE(review): lossy extraction — interleaved lines are missing; tokens
// left exactly as found.
401 static void __kmp_tree_barrier_release(
402 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
403 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
404 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
406 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
408 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
409 kmp_uint32 branch_factor = 1 << branch_bits;
411 kmp_uint32 child_tid;
// Non-master: wait until the parent bumps our b_go flag.
416 KA_TRACE(20, (
"__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
417 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
419 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
420 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
421 ANNOTATE_BARRIER_END(this_thr);
422 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// Sequential-task ITT bookkeeping; bail out early on runtime shutdown.
423 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
426 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
428 __kmp_itt_task_starting(itt_sync_obj);
430 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
433 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
434 if (itt_sync_obj != NULL)
436 __kmp_itt_task_finished(itt_sync_obj);
440 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
// Re-read team/tid after the wait (thread may have been re-assigned).
444 team = __kmp_threads[gtid]->th.th_team;
445 KMP_DEBUG_ASSERT(team != NULL);
446 tid = __kmp_tid_from_gtid(gtid);
// Reset own go-flag for the next barrier episode.
448 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
450 (
"__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
451 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
// Master path: obtain team directly (no wait needed).
454 team = __kmp_threads[gtid]->th.th_team;
455 KMP_DEBUG_ASSERT(team != NULL);
456 KA_TRACE(20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
458 gtid, team->t.t_id, tid, bt));
460 nproc = this_thr->th.th_team_nproc;
// Release own subtree, if any children exist.
461 child_tid = (tid << branch_bits) + 1;
463 if (child_tid < nproc) {
464 kmp_info_t **other_threads = team->t.t_threads;
468 kmp_info_t *child_thr = other_threads[child_tid];
469 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
// Prefetch next child's go-flag while releasing the current one.
472 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
474 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
477 #if KMP_BARRIER_ICV_PUSH
// Push master ICVs into the child's implicit task before waking it.
479 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
480 if (propagate_icvs) {
481 __kmp_init_implicit_task(team->t.t_ident,
482 team->t.t_threads[child_tid], team,
484 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
485 &team->t.t_implicit_task_taskdata[0].td_icvs);
488 #endif // KMP_BARRIER_ICV_PUSH
490 (
"__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
491 "go(%p): %u => %u\n",
492 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
493 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
494 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
// Bump the child's go-flag, waking it if necessary.
496 ANNOTATE_BARRIER_BEGIN(child_thr);
497 kmp_flag_64 flag(&child_bar->b_go, child_thr);
501 }
while (child <= branch_factor && child_tid < nproc);
504 20, (
"__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
505 gtid, team->t.t_id, tid, bt));
// Gather phase of the hypercube-embedded tree barrier. The barrier walks
// levels of a hypercube (step 2^level, branch factor 2^branch_bits per
// level): at each level a thread either signals its parent and stops, or
// waits for its children at that level (folding reduce_data if a reduction
// is requested) and continues to the next level. The master finally
// publishes the team's new arrived state.
// NOTE(review): lossy extraction — interleaved lines are missing; tokens
// left exactly as found.
510 __kmp_hyper_barrier_gather(
enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
511 int tid,
void (*reduce)(
void *,
void *)
512 USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
513 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
514 kmp_team_t *team = this_thr->th.th_team;
515 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
516 kmp_info_t **other_threads = team->t.t_threads;
// Sentinel: new_state is computed lazily the first time it is needed.
517 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
518 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
519 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
520 kmp_uint32 branch_factor = 1 << branch_bits;
526 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
527 gtid, team->t.t_id, tid, bt));
528 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
530 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// Record arrival timestamp for ITT fork/join frame reporting.
532 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
533 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
534 __itt_get_timestamp();
// Flag on own b_arrived; the parent is attached as waiter when found.
539 kmp_flag_64 p_flag(&thr_bar->b_arrived);
540 for (level = 0, offset = 1; offset < num_threads;
541 level += branch_bits, offset <<= branch_bits) {
543 kmp_uint32 child_tid;
// Non-zero low bits at this level => this thread is a child here:
// signal the parent and stop climbing.
545 if (((tid >> level) & (branch_factor - 1)) != 0) {
546 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
549 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
550 "arrived(%p): %llu => %llu\n",
551 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
552 team->t.t_id, parent_tid, &thr_bar->b_arrived,
554 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
559 ANNOTATE_BARRIER_BEGIN(this_thr);
560 p_flag.set_waiter(other_threads[parent_tid]);
// Parent at this level: compute target state once, then wait on each
// child at stride 2^level.
566 if (new_state == KMP_BARRIER_UNUSED_STATE)
567 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
568 for (child = 1, child_tid = tid + (1 << level);
569 child < branch_factor && child_tid < num_threads;
570 child++, child_tid += (1 << level)) {
571 kmp_info_t *child_thr = other_threads[child_tid];
572 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
574 kmp_uint32 next_child_tid = child_tid + (1 << level);
// Prefetch next child's flag while waiting on the current one.
576 if (child + 1 < branch_factor && next_child_tid < num_threads)
578 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
581 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
582 "arrived(%p) == %llu\n",
583 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
584 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
586 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
587 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
588 ANNOTATE_BARRIER_END(child_thr);
589 #if USE_ITT_BUILD && USE_ITT_NOTIFY
592 if (__kmp_forkjoin_frames_mode == 2) {
593 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
594 child_thr->th.th_bar_min_time);
599 (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
600 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
601 team->t.t_id, child_tid));
// Fold the child's partial reduction into this node's reduce_data.
602 ANNOTATE_REDUCE_AFTER(reduce);
603 (*reduce)(this_thr->th.th_local.reduce_data,
604 child_thr->th.th_local.reduce_data);
605 ANNOTATE_REDUCE_BEFORE(reduce);
606 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
// Master: publish team-wide arrived state (bump in place when no child
// wait ever computed new_state).
611 if (KMP_MASTER_TID(tid)) {
613 if (new_state == KMP_BARRIER_UNUSED_STATE)
614 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
616 team->t.t_bar[bt].b_arrived = new_state;
617 KA_TRACE(20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
618 "arrived(%p) = %llu\n",
619 gtid, team->t.t_id, tid, team->t.t_id,
620 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
623 20, (
"__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
624 gtid, team->t.t_id, tid, bt));
// When defined, the release walks hypercube levels in the reverse order of
// the gather, which spreads wake-ups more evenly.
628 #define KMP_REVERSE_HYPER_BAR
// Release phase of the hypercube-embedded tree barrier. Non-masters wait on
// their own b_go (or, for the oncore-style switch, on a byte of the parent's
// flag), then each node releases its children level by level — in reverse
// level order when KMP_REVERSE_HYPER_BAR is defined. ICVs are propagated via
// the thread-barrier's th_fixed_icvs staging copy.
// NOTE(review): lossy extraction — interleaved lines are missing; tokens
// left exactly as found.
629 static void __kmp_hyper_barrier_release(
630 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
631 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
632 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
634 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
635 kmp_info_t **other_threads;
636 kmp_uint32 num_threads;
637 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
638 kmp_uint32 branch_factor = 1 << branch_bits;
640 kmp_uint32 child_tid;
// Master path: stage own ICVs for children to copy during release.
648 if (KMP_MASTER_TID(tid)) {
649 team = __kmp_threads[gtid]->th.th_team;
650 KMP_DEBUG_ASSERT(team != NULL);
651 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
653 gtid, team->t.t_id, tid, bt));
654 #if KMP_BARRIER_ICV_PUSH
655 if (propagate_icvs) {
656 copy_icvs(&thr_bar->th_fixed_icvs,
657 &team->t.t_implicit_task_taskdata[tid].td_icvs);
// Non-master: wait for own b_go to be bumped by the parent.
661 KA_TRACE(20, (
"__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
662 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
664 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
665 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
666 ANNOTATE_BARRIER_END(this_thr);
667 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// Sequential-task ITT bookkeeping; bail out early on runtime shutdown.
668 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
670 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
672 __kmp_itt_task_starting(itt_sync_obj);
674 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
677 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
678 if (itt_sync_obj != NULL)
680 __kmp_itt_task_finished(itt_sync_obj);
684 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
// Re-read team/tid after the wait; reset own go-flag.
688 team = __kmp_threads[gtid]->th.th_team;
689 KMP_DEBUG_ASSERT(team != NULL);
690 tid = __kmp_tid_from_gtid(gtid);
692 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
694 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
695 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
698 num_threads = this_thr->th.th_team_nproc;
699 other_threads = team->t.t_threads;
701 #ifdef KMP_REVERSE_HYPER_BAR
// Find the highest level where this thread is a parent, then iterate the
// levels back down so children are released in reverse gather order.
703 for (level = 0, offset = 1;
704 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
705 level += branch_bits, offset <<= branch_bits)
709 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
710 level -= branch_bits, offset >>= branch_bits)
713 for (level = 0, offset = 1; offset < num_threads;
714 level += branch_bits, offset <<= branch_bits)
717 #ifdef KMP_REVERSE_HYPER_BAR
// Reverse order: start from the highest-numbered child at this level and
// count down; stop climbing once this thread is itself a child here.
720 child = num_threads >> ((level == 0) ? level : level - 1);
721 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
722 child_tid = tid + (child << level);
723 child >= 1; child--, child_tid -= (1 << level))
725 if (((tid >> level) & (branch_factor - 1)) != 0)
730 for (child = 1, child_tid = tid + (1 << level);
731 child < branch_factor && child_tid < num_threads;
732 child++, child_tid += (1 << level))
733 #endif // KMP_REVERSE_HYPER_BAR
735 if (child_tid >= num_threads)
738 kmp_info_t *child_thr = other_threads[child_tid];
739 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
741 kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch the next child's go-flag (direction depends on iteration order).
743 #ifdef KMP_REVERSE_HYPER_BAR
744 if (child - 1 >= 1 && next_child_tid < num_threads)
746 if (child + 1 < branch_factor && next_child_tid < num_threads)
747 #endif // KMP_REVERSE_HYPER_BAR
749 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
752 #if KMP_BARRIER_ICV_PUSH
// Propagate the staged ICVs down to the child's staging copy.
754 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
755 #endif // KMP_BARRIER_ICV_PUSH
759 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
760 "go(%p): %u => %u\n",
761 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
762 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
763 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
// Bump the child's go-flag, waking it if necessary.
765 ANNOTATE_BARRIER_BEGIN(child_thr);
766 kmp_flag_64 flag(&child_bar->b_go, child_thr);
771 #if KMP_BARRIER_ICV_PUSH
// Non-masters adopt the ICVs that were staged into th_fixed_icvs.
772 if (propagate_icvs &&
773 !KMP_MASTER_TID(tid)) {
774 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
776 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
777 &thr_bar->th_fixed_icvs);
782 (
"__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
783 gtid, team->t.t_id, tid, bt));
// (Re)initializes a thread's hierarchical-barrier state (kmp_bstate_t) when
// the thread is first used, or when its team, team size, or tid changed
// since the last barrier. Computes the thread's level in the machine
// hierarchy, its parent tid, its byte offset within the parent's packed
// 8-slot flag word, and its number of leaf children with the matching
// leaf_state bit mask.
// NOTE(review): lossy extraction — interleaved lines are missing; tokens
// left exactly as found.
796 static bool __kmp_init_hierarchical_barrier_thread(
enum barrier_type bt,
797 kmp_bstate_t *thr_bar,
798 kmp_uint32 nproc,
int gtid,
799 int tid, kmp_team_t *team) {
// Detect what changed since the last barrier episode.
801 bool uninitialized = thr_bar->team == NULL;
802 bool team_changed = team != thr_bar->team;
803 bool team_sz_changed = nproc != thr_bar->nproc;
804 bool tid_changed = tid != thr_bar->old_tid;
807 if (uninitialized || team_sz_changed) {
808 __kmp_get_hierarchy(nproc, thr_bar);
// Recompute this thread's level and parent within the hierarchy.
811 if (uninitialized || team_sz_changed || tid_changed) {
812 thr_bar->my_level = thr_bar->depth - 1;
// Default: master is the parent (parent_tid -1 means "root's parent").
813 thr_bar->parent_tid = -1;
817 while (d < thr_bar->depth) {
820 if (d == thr_bar->depth - 2) {
821 thr_bar->parent_tid = 0;
822 thr_bar->my_level = d;
824 }
else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
// Parent is the nearest lower tid aligned to this level's stride.
827 thr_bar->parent_tid = tid - rem;
828 thr_bar->my_level = d;
// Byte offset of this thread's slot in the parent's packed flag word.
834 thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
835 thr_bar->old_tid = tid;
836 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
837 thr_bar->team = team;
838 thr_bar->parent_bar =
839 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
841 if (uninitialized || team_changed || tid_changed) {
842 thr_bar->team = team;
843 thr_bar->parent_bar =
844 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
// Recompute leaf-children count and the packed leaf_state bit mask.
847 if (uninitialized || team_sz_changed || tid_changed) {
848 thr_bar->nproc = nproc;
849 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
850 if (thr_bar->my_level == 0)
851 thr_bar->leaf_kids = 0;
// Clamp leaf_kids so child tids stay within the team.
852 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
853 thr_bar->leaf_kids = nproc - tid - 1;
854 thr_bar->leaf_state = 0;
// One byte per leaf child, filled from the high end of the 64-bit word.
855 for (
int i = 0; i < thr_bar->leaf_kids; ++i)
856 ((
char *)&(thr_bar->leaf_state))[7 - i] = 1;
// Gather phase of the hierarchical (machine-topology-aware) barrier.
// Leaf children are gathered in one shot through the packed leaf_state byte
// mask in the parent's own b_arrived word when the "oncore" fast path is
// enabled (infinite blocktime); higher levels and the fallback path wait on
// each child's b_arrived flag individually, folding reduce_data when a
// reduction callback is supplied. Non-masters then signal their parent —
// either via their own b_arrived flag or by flipping their byte in the
// parent's b_arrived (oncore path).
// NOTE(review): lossy extraction — interleaved lines are missing; tokens
// left exactly as found.
861 static void __kmp_hierarchical_barrier_gather(
862 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
863 void (*reduce)(
void *,
void *) USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
864 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
865 kmp_team_t *team = this_thr->th.th_team;
866 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
867 kmp_uint32 nproc = this_thr->th.th_team_nproc;
868 kmp_info_t **other_threads = team->t.t_threads;
869 kmp_uint64 new_state;
// Decide whether the oncore fast path applies (disabled for teams
// constructs spanning multiple teams).
871 int level = team->t.t_level;
874 ->th.th_teams_microtask)
875 if (this_thr->th.th_teams_size.nteams > 1)
879 thr_bar->use_oncore_barrier = 1;
881 thr_bar->use_oncore_barrier = 0;
883 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
885 gtid, team->t.t_id, tid, bt));
886 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
888 #if USE_ITT_BUILD && USE_ITT_NOTIFY
890 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
891 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
// Refresh hierarchy info if team/size/tid changed since last barrier.
895 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
898 if (thr_bar->my_level) {
901 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
// Oncore fast path: leaf kids report by setting bytes of this thread's
// own b_arrived word, so all leaves are awaited with a single flag wait.
902 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
903 thr_bar->use_oncore_barrier) {
904 if (thr_bar->leaf_kids) {
906 kmp_uint64 leaf_state =
908 ? thr_bar->b_arrived | thr_bar->leaf_state
909 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
910 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
912 gtid, team->t.t_id, tid));
913 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
914 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
// Fold every leaf child's partial reduction in one pass.
916 ANNOTATE_REDUCE_AFTER(reduce);
917 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
919 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
921 gtid, team->t.t_id, tid,
922 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
924 ANNOTATE_BARRIER_END(other_threads[child_tid]);
925 (*reduce)(this_thr->th.th_local.reduce_data,
926 other_threads[child_tid]->th.th_local.reduce_data);
928 ANNOTATE_REDUCE_BEFORE(reduce);
929 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
// Atomically clear the leaf-state bytes for the next episode.
932 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
// Non-leaf levels: wait for each child's own b_arrived flag.
935 for (kmp_uint32 d = 1; d < thr_bar->my_level;
937 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
938 skip = thr_bar->skip_per_level[d];
941 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
942 kmp_info_t *child_thr = other_threads[child_tid];
943 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
944 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
946 "arrived(%p) == %llu\n",
947 gtid, team->t.t_id, tid,
948 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
949 child_tid, &child_bar->b_arrived, new_state));
950 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
951 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
952 ANNOTATE_BARRIER_END(child_thr);
954 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
956 gtid, team->t.t_id, tid,
957 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
959 ANNOTATE_REDUCE_AFTER(reduce);
960 (*reduce)(this_thr->th.th_local.reduce_data,
961 child_thr->th.th_local.reduce_data);
962 ANNOTATE_REDUCE_BEFORE(reduce);
963 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
// Fallback (no oncore): wait on flags at every level, including leaves.
968 for (kmp_uint32 d = 0; d < thr_bar->my_level;
970 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
971 skip = thr_bar->skip_per_level[d];
974 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
975 kmp_info_t *child_thr = other_threads[child_tid];
976 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
977 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
979 "arrived(%p) == %llu\n",
980 gtid, team->t.t_id, tid,
981 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
982 child_tid, &child_bar->b_arrived, new_state));
983 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
984 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
985 ANNOTATE_BARRIER_END(child_thr);
987 KA_TRACE(100, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
989 gtid, team->t.t_id, tid,
990 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
992 ANNOTATE_REDUCE_AFTER(reduce);
993 (*reduce)(this_thr->th.th_local.reduce_data,
994 child_thr->th.th_local.reduce_data);
995 ANNOTATE_REDUCE_BEFORE(reduce);
996 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
// Non-master: signal own arrival to the parent.
1004 if (!KMP_MASTER_TID(tid)) {
1005 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1006 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1007 gtid, team->t.t_id, tid,
1008 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1009 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1010 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
// Internal nodes and the non-oncore path signal via their own flag...
1014 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1015 !thr_bar->use_oncore_barrier) {
1017 ANNOTATE_BARRIER_BEGIN(this_thr);
1018 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
// ...leaves on the oncore path flip their byte in the parent's word.
1022 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1023 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1024 flag.set_waiter(other_threads[thr_bar->parent_tid]);
// Master: publish team-wide arrived state.
1028 team->t.t_bar[bt].b_arrived = new_state;
1029 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1030 "arrived(%p) = %llu\n",
1031 gtid, team->t.t_id, tid, team->t.t_id,
1032 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1035 KA_TRACE(20, (
"__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1036 "barrier type %d\n",
1037 gtid, team->t.t_id, tid, bt));
// Release phase of the hierarchical barrier: the master wakes its children
// level by level via their b_go flags; a worker first waits on its own (or,
// on the oncore path, its parent's) go flag before releasing its subtree.
// NOTE(review): the embedded line numbering jumps (1043 -> 1045 -> 1047 ...),
// so this extract is missing interior statements; code is reproduced verbatim.
1040 static void __kmp_hierarchical_barrier_release(
1041 enum barrier_type bt, kmp_info_t *this_thr,
int gtid,
int tid,
1042 int propagate_icvs USE_ITT_BUILD_ARG(
void *itt_sync_obj)) {
1043 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1045 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1047 bool team_change =
false;
// Master already owns a valid team pointer; workers must wait to be released
// before re-reading th_team (it may change across the barrier).
1049 if (KMP_MASTER_TID(tid)) {
1050 team = __kmp_threads[gtid]->th.th_team;
1051 KMP_DEBUG_ASSERT(team != NULL);
1052 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1053 "entered barrier type %d\n",
1054 gtid, team->t.t_id, tid, bt));
// Worker path: choose between waiting on the thread's own b_go flag and the
// oncore optimization (waiting on a byte inside the parent's b_go word).
1057 if (!thr_bar->use_oncore_barrier ||
1058 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1059 thr_bar->team == NULL) {
1061 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1062 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1063 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1064 ANNOTATE_BARRIER_END(this_thr);
// Reset own go flag for the next barrier.
1065 TCW_8(thr_bar->b_go,
1066 KMP_INIT_BARRIER_STATE);
1070 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1071 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1072 thr_bar->offset, bt,
1073 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1074 flag.wait(this_thr, TRUE);
1075 if (thr_bar->wait_flag ==
1076 KMP_BARRIER_SWITCHING) {
1077 TCW_8(thr_bar->b_go,
1078 KMP_INIT_BARRIER_STATE);
// Clear this thread's byte within the parent's go word.
1080 (RCAST(
volatile char *,
1081 &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1084 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
// Early exit on runtime shutdown at the fork/join barrier.
1086 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
// The team may have changed while the worker slept; re-read it.
1089 team = __kmp_threads[gtid]->th.th_team;
1090 KMP_DEBUG_ASSERT(team != NULL);
1091 tid = __kmp_tid_from_gtid(gtid);
1095 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1096 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1100 nproc = this_thr->th.th_team_nproc;
1101 int level = team->t.t_level;
// Teams-construct special case: decide whether the oncore barrier is usable.
1103 if (team->t.t_threads[0]
1104 ->th.th_teams_microtask) {
1105 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1106 this_thr->th.th_teams_level == level)
1108 if (this_thr->th.th_teams_size.nteams > 1)
1113 thr_bar->use_oncore_barrier = 1;
1115 thr_bar->use_oncore_barrier = 0;
// If the hierarchy changed, (re)initialize this thread's barrier state.
1119 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1120 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1121 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1127 #if KMP_BARRIER_ICV_PUSH
// Propagate ICVs down the tree: master copies from the implicit task, others
// pull from their parent's fixed-ICV block.
1128 if (propagate_icvs) {
1129 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1133 copy_icvs(&thr_bar->th_fixed_icvs,
1134 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1135 }
else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1136 thr_bar->use_oncore_barrier) {
1137 if (!thr_bar->my_level)
1140 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1141 &thr_bar->parent_bar->th_fixed_icvs);
1144 if (thr_bar->my_level)
1146 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1148 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1149 &thr_bar->parent_bar->th_fixed_icvs);
1152 #endif // KMP_BARRIER_ICV_PUSH
// Non-leaf threads now release their children.
1155 if (thr_bar->my_level) {
1156 kmp_int32 child_tid;
1158 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1159 thr_bar->use_oncore_barrier) {
1160 if (KMP_MASTER_TID(tid)) {
1163 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
// ngo_* use non-temporal stores on KMP_MIC builds (see macros at file top).
1166 ngo_load(&thr_bar->th_fixed_icvs);
// skip_per_level[1] strides over direct children one level down.
1169 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (
int)nproc;
1170 child_tid += thr_bar->skip_per_level[1]) {
1171 kmp_bstate_t *child_bar =
1172 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1173 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1174 "releasing T#%d(%d:%d)"
1175 " go(%p): %u => %u\n",
1176 gtid, team->t.t_id, tid,
1177 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1178 child_tid, &child_bar->b_go, child_bar->b_go,
1179 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1182 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1186 TCW_8(thr_bar->b_go,
1187 KMP_INIT_BARRIER_STATE);
// Release leaf children; if the leaf set grew, wake old and new leaves
// separately (old via leaf_state bits, new via individual flags).
1189 if (thr_bar->leaf_kids) {
1192 old_leaf_kids < thr_bar->leaf_kids) {
1193 if (old_leaf_kids) {
1194 thr_bar->b_go |= old_leaf_state;
1197 last = tid + thr_bar->skip_per_level[1];
1200 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1202 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1203 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1206 (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1207 " T#%d(%d:%d) go(%p): %u => %u\n",
1208 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1209 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1210 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1212 ANNOTATE_BARRIER_BEGIN(child_thr);
1213 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1218 thr_bar->b_go |= thr_bar->leaf_state;
// Non-oncore path: release children top-down, one hierarchy level at a time.
1222 for (
int d = thr_bar->my_level - 1; d >= 0;
1224 last = tid + thr_bar->skip_per_level[d + 1];
1225 kmp_uint32 skip = thr_bar->skip_per_level[d];
1228 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1229 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1230 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1231 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1232 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1233 gtid, team->t.t_id, tid,
1234 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1235 child_tid, &child_bar->b_go, child_bar->b_go,
1236 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1238 ANNOTATE_BARRIER_BEGIN(child_thr);
1239 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1244 #if KMP_BARRIER_ICV_PUSH
// Non-master threads finally copy the pushed ICVs into their implicit task.
1245 if (propagate_icvs && !KMP_MASTER_TID(tid))
1247 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1248 &thr_bar->th_fixed_icvs);
1249 #endif // KMP_BARRIER_ICV_PUSH
1251 KA_TRACE(20, (
"__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1252 "barrier type %d\n",
1253 gtid, team->t.t_id, tid, bt));
// Helper for cancellable barriers: carries a "was the barrier cancelled?"
// bit when the cancellable instantiation is used, and collapses to a
// zero-cost, always-false stub otherwise.  This extract had dropped the
// 'value' member, the operator= bodies, and the closing braces; the
// definitions below are restored to their complete, compilable form.
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value; // true iff a cancellation request was observed at the barrier
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  // Assignment is a no-op: in the non-cancellable instantiation the flag
  // can never become true, so conversions always yield false.
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
};
// Internal barrier entry point: runs the gather phase, lets the master do
// tasking/cancellation bookkeeping, then (for non-split barriers) the release
// phase.  Returns nonzero when the cancellable instantiation observed a
// cancellation request.
// NOTE(review): the embedded line numbering jumps (1293 -> 1295, 1384 -> 1386,
// ...), so this extract is missing interior statements (including 'status'
// handling); code is reproduced verbatim.
1285 template <
bool cancellable = false>
1286 static int __kmp_barrier_template(
enum barrier_type bt,
int gtid,
int is_split,
1287 size_t reduce_size,
void *reduce_data,
1288 void (*reduce)(
void *,
void *)) {
1289 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1290 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1291 int tid = __kmp_tid_from_gtid(gtid);
1292 kmp_info_t *this_thr = __kmp_threads[gtid];
1293 kmp_team_t *team = this_thr->th.th_team;
1295 is_cancellable<cancellable> cancelled;
1296 #if OMPT_SUPPORT && OMPT_OPTIONAL
1297 ompt_data_t *my_task_data;
1298 ompt_data_t *my_parallel_data;
1299 void *return_address;
1302 KA_TRACE(15, (
"__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1303 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1305 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
// OMPT: fire sync_region begin callbacks before entering the wait.
1307 if (ompt_enabled.enabled) {
1309 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1310 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1311 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1312 if (ompt_enabled.ompt_callback_sync_region) {
1313 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1314 ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1315 my_task_data, return_address);
1317 if (ompt_enabled.ompt_callback_sync_region_wait) {
1318 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1319 ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1320 my_task_data, return_address);
1326 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
// A serialized team has only one thread: no real synchronization needed.
1330 if (!team->t.t_serialized) {
1333 void *itt_sync_obj = NULL;
1335 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1336 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1339 if (__kmp_tasking_mode == tskm_extra_barrier) {
1340 __kmp_tasking_barrier(team, this_thr, gtid);
1342 (
"__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1343 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
// Cache blocktime ICVs locally so spin-wait code need not chase pointers.
1350 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1352 this_thr->th.th_team_bt_intervals =
1353 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1354 this_thr->th.th_team_bt_set =
1355 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1357 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1362 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1363 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1367 if (KMP_MASTER_TID(tid)) {
1368 team->t.t_bar[bt].b_master_arrived += 1;
1370 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1373 if (reduce != NULL) {
1375 this_thr->th.th_local.reduce_data = reduce_data;
1378 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1380 __kmp_task_team_setup(this_thr, team, 0);
// Gather phase: cancellable builds use the linear gather; otherwise dispatch
// on the configured gather pattern.
1383 cancelled = __kmp_linear_barrier_gather_cancellable(
1384 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1386 switch (__kmp_barrier_gather_pattern[bt]) {
1387 case bp_hyper_bar: {
1389 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1390 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1391 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1394 case bp_hierarchical_bar: {
1395 __kmp_hierarchical_barrier_gather(
1396 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1401 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1402 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1403 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1407 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1408 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
// Master-only: drain the task team, then handle pending loop/sections
// cancellation requests while all threads are parked here.
1415 if (KMP_MASTER_TID(tid)) {
1417 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1418 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1423 team->t.t_bar[bt].b_team_arrived += 1;
1427 if (__kmp_omp_cancellation) {
1428 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1430 if (cancel_request == cancel_loop ||
1431 cancel_request == cancel_sections) {
1432 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1441 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1442 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1444 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// ITT frame/imbalance reporting for outermost, non-teams parallel regions.
1446 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1447 __kmp_forkjoin_frames_mode &&
1449 this_thr->th.th_teams_microtask == NULL &&
1451 team->t.t_active_level == 1) {
1452 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1453 kmp_uint64 cur_time = __itt_get_timestamp();
1454 kmp_info_t **other_threads = team->t.t_threads;
1455 int nproc = this_thr->th.th_team_nproc;
1457 switch (__kmp_forkjoin_frames_mode) {
1459 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1461 this_thr->th.th_frame_time = cur_time;
1465 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1469 if (__itt_metadata_add_ptr) {
1471 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1474 this_thr->th.th_bar_arrive_time = 0;
1475 for (i = 1; i < nproc; ++i) {
1476 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1477 other_threads[i]->th.th_bar_arrive_time = 0;
1479 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1481 (kmp_uint64)(reduce != NULL));
1483 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1485 this_thr->th.th_frame_time = cur_time;
1493 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1494 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
// Release phase: only for full (non-split) barriers, and never after a
// cancellation was observed.  'status' is set in lines missing from this
// extract — presumably by the master after the gather; confirm upstream.
1497 if ((status == 1 || !is_split) && !cancelled) {
1499 cancelled = __kmp_linear_barrier_release_cancellable(
1500 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1502 switch (__kmp_barrier_release_pattern[bt]) {
1503 case bp_hyper_bar: {
1504 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1505 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1506 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1509 case bp_hierarchical_bar: {
1510 __kmp_hierarchical_barrier_release(
1511 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1515 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1516 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1517 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1521 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1522 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1526 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1527 __kmp_task_team_sync(this_thr, team);
1535 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1536 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
// Serialized-team path: still flush the task team if tasking is active.
1540 if (__kmp_tasking_mode != tskm_immediate_exec) {
1542 if (this_thr->th.th_task_team != NULL) {
1544 void *itt_sync_obj = NULL;
1545 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1546 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1547 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1551 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1553 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1554 __kmp_task_team_setup(this_thr, team, 0);
1557 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1558 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1564 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1565 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1569 KA_TRACE(15, (
"__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1570 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1571 __kmp_tid_from_gtid(gtid), status));
// OMPT: mirror-image sync_region end callbacks.
1574 if (ompt_enabled.enabled) {
1576 if (ompt_enabled.ompt_callback_sync_region_wait) {
1577 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1578 ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1579 my_task_data, return_address);
1581 if (ompt_enabled.ompt_callback_sync_region) {
1582 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1583 ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1584 my_task_data, return_address);
1587 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1590 ANNOTATE_BARRIER_END(&team->t.t_bar);
1593 return (
int)cancelled;
1598 int __kmp_barrier(
enum barrier_type bt,
int gtid,
int is_split,
1599 size_t reduce_size,
void *reduce_data,
1600 void (*reduce)(
void *,
void *)) {
1601 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
// GOMP-compatibility cancellation barrier: runs the cancellable barrier
// template and, when cancelled, lets workers revert their private b_arrived
// counter.  Falls back to a plain barrier when cancellation is disabled.
// NOTE(review): the embedded numbering jumps (1609 -> 1612, 1614 -> 1618,
// 1619 -> 1624), so return statements and the trailing #endif are missing
// from this extract; code is reproduced verbatim.
1605 #if defined(KMP_GOMP_COMPAT)
1607 int __kmp_barrier_gomp_cancel(
int gtid) {
1608 if (__kmp_omp_cancellation) {
1609 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1612 int tid = __kmp_tid_from_gtid(gtid);
1613 kmp_info_t *this_thr = __kmp_threads[gtid];
1614 if (KMP_MASTER_TID(tid)) {
// Worker undoes its arrival bump so the aborted barrier can be retried.
1618 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1619 KMP_BARRIER_STATE_BUMP;
1624 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
// Completes a split barrier: the master runs only the release phase (the
// gather already happened in the split's first half), dispatching on the
// configured release pattern, then resynchronizes the task team.
// NOTE(review): closing braces / break statements fall in lines missing from
// this extract (numbering jumps 1643 -> 1646 etc.); code reproduced verbatim.
1629 void __kmp_end_split_barrier(
enum barrier_type bt,
int gtid) {
1630 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1631 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1632 int tid = __kmp_tid_from_gtid(gtid);
1633 kmp_info_t *this_thr = __kmp_threads[gtid];
1634 kmp_team_t *team = this_thr->th.th_team;
1636 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
// Serialized teams have nothing to release.
1637 if (!team->t.t_serialized) {
1638 if (KMP_MASTER_GTID(gtid)) {
1639 switch (__kmp_barrier_release_pattern[bt]) {
1640 case bp_hyper_bar: {
1641 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1642 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1643 FALSE USE_ITT_BUILD_ARG(NULL));
1646 case bp_hierarchical_bar: {
1647 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1648 FALSE USE_ITT_BUILD_ARG(NULL));
1652 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1653 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1654 FALSE USE_ITT_BUILD_ARG(NULL));
// Default/fallback: linear release.
1658 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1659 FALSE USE_ITT_BUILD_ARG(NULL));
1662 if (__kmp_tasking_mode != tskm_immediate_exec) {
1663 __kmp_task_team_sync(this_thr, team);
1667 ANNOTATE_BARRIER_END(&team->t.t_bar);
// Join barrier at the end of a parallel region: every team thread gathers
// here (gather phase only); the master additionally drains the task team and
// submits ITT frame/imbalance data.  The matching release happens in the
// fork barrier of the next region.
// NOTE(review): the embedded numbering jumps (1676 -> 1682, 1694 -> 1695...),
// so declarations of team/nproc/tid/team_id and several closing braces are in
// lines missing from this extract; code reproduced verbatim.
1670 void __kmp_join_barrier(
int gtid) {
1671 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1672 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1673 kmp_info_t *this_thr = __kmp_threads[gtid];
1676 kmp_info_t *master_thread;
1682 void *itt_sync_obj = NULL;
1684 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1686 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1692 team = this_thr->th.th_team;
1693 nproc = this_thr->th.th_team_nproc;
1694 KMP_DEBUG_ASSERT((
int)nproc == team->t.t_nproc);
1695 tid = __kmp_tid_from_gtid(gtid);
1697 team_id = team->t.t_id;
// Sanity-check team consistency; dump runtime structure on mismatch.
1699 master_thread = this_thr->th.th_team_master;
1701 if (master_thread != team->t.t_threads[0]) {
1702 __kmp_print_structure();
1705 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1709 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1710 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1711 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1712 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1713 KA_TRACE(10, (
"__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1714 gtid, team_id, tid));
1716 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
// OMPT: sync_region begin; workers also stash their task data for use after
// the team is dissolved.
1718 if (ompt_enabled.enabled) {
1720 ompt_data_t *my_task_data;
1721 ompt_data_t *my_parallel_data;
1722 void *codeptr = NULL;
1723 int ds_tid = this_thr->th.th_info.ds.ds_tid;
1724 if (KMP_MASTER_TID(ds_tid) &&
1725 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1726 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1727 codeptr = team->t.ompt_team_info.master_return_address;
1728 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1729 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1730 if (ompt_enabled.ompt_callback_sync_region) {
1731 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1732 ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1733 my_task_data, codeptr);
1735 if (ompt_enabled.ompt_callback_sync_region_wait) {
1736 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1737 ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1738 my_task_data, codeptr);
1740 if (!KMP_MASTER_TID(ds_tid))
1741 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1743 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1747 if (__kmp_tasking_mode == tskm_extra_barrier) {
1748 __kmp_tasking_barrier(team, this_thr, gtid);
1749 KA_TRACE(10, (
"__kmp_join_barrier: T#%d(%d:%d) past taking barrier\n", gtid,
1753 if (__kmp_tasking_mode != tskm_immediate_exec) {
1754 KA_TRACE(20, (
"__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1755 "%p, th_task_team = %p\n",
1756 __kmp_gtid_from_thread(this_thr), team_id,
1757 team->t.t_task_team[this_thr->th.th_task_state],
1758 this_thr->th.th_task_team));
1759 KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1760 team->t.t_task_team[this_thr->th.th_task_state]);
// Cache blocktime ICVs locally for the spin-wait code.
1769 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1771 this_thr->th.th_team_bt_intervals =
1772 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1773 this_thr->th.th_team_bt_set =
1774 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1776 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1781 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1782 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
// Gather phase, dispatched on the fork/join gather pattern; no reduction.
1785 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1786 case bp_hyper_bar: {
1787 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1788 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1789 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1792 case bp_hierarchical_bar: {
1793 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1794 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1798 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1799 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1800 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1804 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1805 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
// Master-only bookkeeping once all threads have arrived.
1813 if (KMP_MASTER_TID(tid)) {
1814 if (__kmp_tasking_mode != tskm_immediate_exec) {
1815 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1818 if (__kmp_display_affinity) {
1819 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1822 #if KMP_STATS_ENABLED
// Mark parked workers idle and wake sleepers so stats stay accurate.
1826 for (
int i = 0; i < team->t.t_nproc; ++i) {
1827 kmp_info_t *team_thread = team->t.t_threads[i];
1828 if (team_thread == this_thr)
1830 team_thread->th.th_stats->setIdleFlag();
1831 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1832 team_thread->th.th_sleep_loc != NULL)
1833 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1834 team_thread->th.th_sleep_loc);
1838 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1839 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
// ITT frame/imbalance reporting for the outermost, non-teams region.
1844 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1845 __kmp_forkjoin_frames_mode &&
1847 this_thr->th.th_teams_microtask == NULL &&
1849 team->t.t_active_level == 1) {
1850 kmp_uint64 cur_time = __itt_get_timestamp();
1851 ident_t *loc = team->t.t_ident;
1852 kmp_info_t **other_threads = team->t.t_threads;
1853 int nproc = this_thr->th.th_team_nproc;
1855 switch (__kmp_forkjoin_frames_mode) {
1857 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1861 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1865 if (__itt_metadata_add_ptr) {
1867 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1870 this_thr->th.th_bar_arrive_time = 0;
1871 for (i = 1; i < nproc; ++i) {
1872 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1873 other_threads[i]->th.th_bar_arrive_time = 0;
1875 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1876 cur_time, delta, 0);
1878 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1880 this_thr->th.th_frame_time = cur_time;
1888 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1889 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1894 if (KMP_MASTER_TID(tid)) {
1897 (
"__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1898 gtid, team_id, tid, nproc));
1905 (
"__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1907 ANNOTATE_BARRIER_END(&team->t.t_bar);
// Fork barrier at the start of a parallel region: the master sets up the
// task team and sanity-checks workers; workers wait in the release phase to
// be woken, then pull ICVs, resync task teams, and apply affinity.
// NOTE(review): the embedded numbering jumps (1916 -> 1918, 1932 -> 1937...),
// so several statements and closing braces are missing from this extract;
// code reproduced verbatim.
1912 void __kmp_fork_barrier(
int gtid,
int tid) {
1913 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1914 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1915 kmp_info_t *this_thr = __kmp_threads[gtid];
// Only the master has a valid team pointer at entry; workers get theirs
// after they are released.
1916 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1918 void *itt_sync_obj = NULL;
1921 ANNOTATE_BARRIER_END(&team->t.t_bar);
1923 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1924 (team != NULL) ? team->t.t_id : -1, tid));
1927 if (KMP_MASTER_TID(tid)) {
1928 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1929 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1931 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1932 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1937 kmp_info_t **other_threads = team->t.t_threads;
// Debug-verify each worker's go flag is quiescent and team pointers agree.
1943 for (i = 1; i < team->t.t_nproc; ++i) {
1945 (
"__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1947 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1948 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1949 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1951 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1952 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1953 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1957 if (__kmp_tasking_mode != tskm_immediate_exec) {
1959 __kmp_task_team_setup(this_thr, team, 0);
// Cache blocktime ICVs locally for the spin-wait code.
1968 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1970 this_thr->th.th_team_bt_intervals =
1971 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1972 this_thr->th.th_team_bt_set =
1973 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1975 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
// Release phase, dispatched on the fork/join release pattern.
1980 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1981 case bp_hyper_bar: {
1982 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1983 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1984 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1987 case bp_hierarchical_bar: {
1988 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1989 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1993 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1994 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1995 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1999 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2000 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
// OMPT: close the implicit-barrier sync region the worker was parked in.
2005 if (ompt_enabled.enabled &&
2006 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2007 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2008 ompt_data_t *task_data = (team)
2009 ? OMPT_CUR_TASK_DATA(this_thr)
2010 : &(this_thr->th.ompt_thread_info.task_data);
2011 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2013 void *codeptr = NULL;
2014 if (KMP_MASTER_TID(ds_tid) &&
2015 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2016 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2017 codeptr = team->t.ompt_team_info.master_return_address;
2018 if (ompt_enabled.ompt_callback_sync_region_wait) {
2019 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2020 ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
2022 if (ompt_enabled.ompt_callback_sync_region) {
2023 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2024 ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
2027 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2028 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2029 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit);
// Runtime shutdown: leave early, finishing ITT objects if needed.
2035 if (TCR_4(__kmp_global.g.g_done)) {
2036 this_thr->th.th_task_team = NULL;
2038 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2039 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2040 if (!KMP_MASTER_TID(tid)) {
2041 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2043 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2047 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d is leaving early\n", gtid));
// Workers now (re)read their team and tid for the new region.
2055 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2056 KMP_DEBUG_ASSERT(team != NULL);
2057 tid = __kmp_tid_from_gtid(gtid);
2059 #if KMP_BARRIER_ICV_PULL
// ICV pull model: workers copy ICVs from the master's fork/join bstate.
2067 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2068 if (!KMP_MASTER_TID(tid)) {
2072 (
"__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2073 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2075 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2076 &team->t.t_threads[0]
2077 ->th.th_bar[bs_forkjoin_barrier]
2081 #endif // KMP_BARRIER_ICV_PULL
2083 if (__kmp_tasking_mode != tskm_immediate_exec) {
2084 __kmp_task_team_sync(this_thr, team);
2087 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
// Apply the region's proc-bind policy to this thread's placement.
2088 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2089 if (proc_bind == proc_bind_intel) {
2091 #if KMP_AFFINITY_SUPPORTED
2093 if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2094 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2096 #endif // KMP_AFFINITY_SUPPORTED
2097 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2098 }
else if (proc_bind != proc_bind_false) {
2099 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2100 KA_TRACE(100, (
"__kmp_fork_barrier: T#%d already in correct place %d\n",
2101 __kmp_gtid_from_thread(this_thr),
2102 this_thr->th.th_current_place));
2104 __kmp_affinity_set_place(gtid);
// OMP_DISPLAY_AFFINITY support: print affinity once per change.
2110 if (__kmp_display_affinity) {
2111 if (team->t.t_display_affinity
2112 #
if KMP_AFFINITY_SUPPORTED
2113 || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2117 __kmp_aux_display_affinity(gtid, NULL);
2118 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2119 this_thr->th.th_prev_level = team->t.t_level;
2122 if (!KMP_MASTER_TID(tid))
2123 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2126 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2127 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2128 if (!KMP_MASTER_TID(tid)) {
2130 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2131 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2135 ANNOTATE_BARRIER_END(&team->t.t_bar);
2136 KA_TRACE(10, (
"__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2137 team->t.t_id, tid));
// Prepares ICV propagation for a (re)formed team of new_nproc threads:
// under the PULL model the master's fork/join bstate gets the ICVs; under
// PUSH nothing is copied here; otherwise ICVs are stored linearly into every
// worker's implicit task (ngo_store_icvs uses non-temporal stores on MIC).
// NOTE(review): the embedded numbering jumps (2145 -> 2150, 2157 has no
// enclosing copy_icvs visible), so some statements are missing from this
// extract; code reproduced verbatim.
2140 void __kmp_setup_icv_copy(kmp_team_t *team,
int new_nproc,
2141 kmp_internal_control_t *new_icvs,
ident_t *loc) {
2142 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2144 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2145 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2150 #if KMP_BARRIER_ICV_PULL
2154 KMP_DEBUG_ASSERT(team->t.t_threads[0]);
2157 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2159 KF_TRACE(10, (
"__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2160 team->t.t_threads[0], team));
2161 #elif KMP_BARRIER_ICV_PUSH
2164 KF_TRACE(10, (
"__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2165 team->t.t_threads[0], team));
// Fallback: copy ICVs to each worker's implicit task directly.
2170 KMP_DEBUG_ASSERT(team->t.t_threads[0]);
2172 for (
int f = 1; f < new_nproc; ++f) {
2174 KF_TRACE(10, (
"__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2175 f, team->t.t_threads[f], team));
2176 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2177 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2178 KF_TRACE(10, (
"__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2179 f, team->t.t_threads[f], team));
2182 #endif // KMP_BARRIER_ICV_PULL