// LLVM OpenMP* Runtime Library -- kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 // for distributed barrier
20 #include "kmp_affinity.h"
21 
#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#if KMP_MIC && USE_NGO_STORES
// ICV copying
// On MIC (KNC), cache-line copies of ICVs/go flags use the 512-bit
// "no-read, non-globally-ordered" store intrinsic; because those stores are
// not globally ordered, ngo_sync() issues a locked RMW on the stack, which
// acts as a full memory fence before dependent loads elsewhere.
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
#else
// Non-MIC fallback: plain structure/line copies; ordinary store ordering is
// sufficient, so ngo_load/ngo_sync are no-ops.
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 // Distributed barrier
44 
45 // Compute how many threads to have polling each cache-line.
46 // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
47 void distributedBarrier::computeVarsForN(size_t n) {
48  int nsockets = 1;
49  if (__kmp_topology) {
50  int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52  int ncores_per_socket =
53  __kmp_topology->calculate_ratio(core_level, socket_level);
54  nsockets = __kmp_topology->get_count(socket_level);
55 
56  if (nsockets <= 0)
57  nsockets = 1;
58  if (ncores_per_socket <= 0)
59  ncores_per_socket = 1;
60 
61  threads_per_go = ncores_per_socket >> 1;
62  if (!fix_threads_per_go) {
63  // Minimize num_gos
64  if (threads_per_go > 4) {
65  if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66  threads_per_go = threads_per_go >> 1;
67  }
68  if (threads_per_go > 4 && nsockets == 1)
69  threads_per_go = threads_per_go >> 1;
70  }
71  }
72  if (threads_per_go == 0)
73  threads_per_go = 1;
74  fix_threads_per_go = true;
75  num_gos = n / threads_per_go;
76  if (n % threads_per_go)
77  num_gos++;
78  if (nsockets == 1 || num_gos == 1)
79  num_groups = 1;
80  else {
81  num_groups = num_gos / nsockets;
82  if (num_gos % nsockets)
83  num_groups++;
84  }
85  if (num_groups <= 0)
86  num_groups = 1;
87  gos_per_group = num_gos / num_groups;
88  if (num_gos % num_groups)
89  gos_per_group++;
90  threads_per_group = threads_per_go * gos_per_group;
91  } else {
92  num_gos = n / threads_per_go;
93  if (n % threads_per_go)
94  num_gos++;
95  if (num_gos == 1)
96  num_groups = 1;
97  else {
98  num_groups = num_gos / 2;
99  if (num_gos % 2)
100  num_groups++;
101  }
102  gos_per_group = num_gos / num_groups;
103  if (num_gos % num_groups)
104  gos_per_group++;
105  threads_per_group = threads_per_go * gos_per_group;
106  }
107 }
108 
109 void distributedBarrier::computeGo(size_t n) {
110  // Minimize num_gos
111  for (num_gos = 1;; num_gos++)
112  if (IDEAL_CONTENTION * num_gos >= n)
113  break;
114  threads_per_go = n / num_gos;
115  if (n % num_gos)
116  threads_per_go++;
117  while (num_gos > MAX_GOS) {
118  threads_per_go++;
119  num_gos = n / threads_per_go;
120  if (n % threads_per_go)
121  num_gos++;
122  }
123  computeVarsForN(n);
124 }
125 
126 // This function is to resize the barrier arrays when the new number of threads
127 // exceeds max_threads, which is the current size of all the arrays
128 void distributedBarrier::resize(size_t nthr) {
129  KMP_DEBUG_ASSERT(nthr > max_threads);
130 
131  // expand to requested size * 2
132  max_threads = nthr * 2;
133 
134  // allocate arrays to new max threads
135  for (int i = 0; i < MAX_ITERS; ++i) {
136  if (flags[i])
137  flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138  max_threads * sizeof(flags_s));
139  else
140  flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141  }
142 
143  if (go)
144  go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145  else
146  go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147 
148  if (iter)
149  iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150  else
151  iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152 
153  if (sleep)
154  sleep =
155  (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156  else
157  sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158 }
159 
160 // This function is to set all the go flags that threads might be waiting
161 // on, and when blocktime is not infinite, it should be followed by a wake-up
162 // call to each thread
163 kmp_uint64 distributedBarrier::go_release() {
164  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165  for (size_t j = 0; j < num_gos; j++) {
166  go[j].go.store(next_go);
167  }
168  return next_go;
169 }
170 
171 void distributedBarrier::go_reset() {
172  for (size_t j = 0; j < max_threads; ++j) {
173  for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174  flags[i][j].stillNeed = 1;
175  }
176  go[j].go.store(0);
177  iter[j].iter = 0;
178  }
179 }
180 
181 // This function inits/re-inits the distributed barrier for a particular number
182 // of threads. If a resize of arrays is needed, it calls the resize function.
183 void distributedBarrier::init(size_t nthr) {
184  size_t old_max = max_threads;
185  if (nthr > max_threads) { // need more space in arrays
186  resize(nthr);
187  }
188 
189  for (size_t i = 0; i < max_threads; i++) {
190  for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191  flags[j][i].stillNeed = 1;
192  }
193  go[i].go.store(0);
194  iter[i].iter = 0;
195  if (i >= old_max)
196  sleep[i].sleep = false;
197  }
198 
199  // Recalculate num_gos, etc. based on new nthr
200  computeVarsForN(nthr);
201 
202  num_threads = nthr;
203 
204  if (team_icvs == NULL)
205  team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206 }
207 
208 void distributedBarrier::deallocate(distributedBarrier *db) {
209  for (int i = 0; i < MAX_ITERS; ++i) {
210  if (db->flags[i])
211  KMP_INTERNAL_FREE(db->flags[i]);
212  db->flags[i] = NULL;
213  }
214  if (db->go) {
215  KMP_INTERNAL_FREE(db->go);
216  db->go = NULL;
217  }
218  if (db->iter) {
219  KMP_INTERNAL_FREE(db->iter);
220  db->iter = NULL;
221  }
222  if (db->sleep) {
223  KMP_INTERNAL_FREE(db->sleep);
224  db->sleep = NULL;
225  }
226  if (db->team_icvs) {
227  __kmp_free(db->team_icvs);
228  db->team_icvs = NULL;
229  }
230  KMP_ALIGNED_FREE(db);
231 }
232 
233 // This function is used only when KMP_BLOCKTIME is not infinite.
234 // static
235 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
236  size_t start, size_t stop, size_t inc,
237  size_t tid) {
238  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
239  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
240  return;
241 
242  kmp_info_t **other_threads = team->t.t_threads;
243  for (size_t thr = start; thr < stop; thr += inc) {
244  KMP_DEBUG_ASSERT(other_threads[thr]);
245  int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
246  // Wake up worker regardless of if it appears to be sleeping or not
247  __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
248  }
249 }
250 
251 static void __kmp_dist_barrier_gather(
252  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
253  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
254  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
255  kmp_team_t *team;
256  distributedBarrier *b;
257  kmp_info_t **other_threads;
258  kmp_uint64 my_current_iter, my_next_iter;
259  kmp_uint32 nproc;
260  bool group_leader;
261 
262  team = this_thr->th.th_team;
263  nproc = this_thr->th.th_team_nproc;
264  other_threads = team->t.t_threads;
265  b = team->t.b;
266  my_current_iter = b->iter[tid].iter;
267  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
268  group_leader = ((tid % b->threads_per_group) == 0);
269 
270  KA_TRACE(20,
271  ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
272  gtid, team->t.t_id, tid, bt));
273 
274 #if USE_ITT_BUILD && USE_ITT_NOTIFY
275  // Barrier imbalance - save arrive time to the thread
276  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
277  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
278  __itt_get_timestamp();
279  }
280 #endif
281 
282  if (group_leader) {
283  // Start from the thread after the group leader
284  size_t group_start = tid + 1;
285  size_t group_end = tid + b->threads_per_group;
286  size_t threads_pending = 0;
287 
288  if (group_end > nproc)
289  group_end = nproc;
290  do { // wait for threads in my group
291  threads_pending = 0;
292  // Check all the flags every time to avoid branch misspredict
293  for (size_t thr = group_start; thr < group_end; thr++) {
294  // Each thread uses a different cache line
295  threads_pending += b->flags[my_current_iter][thr].stillNeed;
296  }
297  // Execute tasks here
298  if (__kmp_tasking_mode != tskm_immediate_exec) {
299  kmp_task_team_t *task_team = this_thr->th.th_task_team;
300  if (task_team != NULL) {
301  if (TCR_SYNC_4(task_team->tt.tt_active)) {
302  if (KMP_TASKING_ENABLED(task_team)) {
303  int tasks_completed = FALSE;
304  __kmp_atomic_execute_tasks_64(
305  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
306  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
307  } else
308  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
309  }
310  } else {
311  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
312  } // if
313  }
314  if (TCR_4(__kmp_global.g.g_done)) {
315  if (__kmp_global.g.g_abort)
316  __kmp_abort_thread();
317  break;
318  } else if (__kmp_tasking_mode != tskm_immediate_exec &&
319  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
320  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
321  }
322  } while (threads_pending > 0);
323 
324  if (reduce) { // Perform reduction if needed
325  OMPT_REDUCTION_DECL(this_thr, gtid);
326  OMPT_REDUCTION_BEGIN;
327  // Group leader reduces all threads in group
328  for (size_t thr = group_start; thr < group_end; thr++) {
329  (*reduce)(this_thr->th.th_local.reduce_data,
330  other_threads[thr]->th.th_local.reduce_data);
331  }
332  OMPT_REDUCTION_END;
333  }
334 
335  // Set flag for next iteration
336  b->flags[my_next_iter][tid].stillNeed = 1;
337  // Each thread uses a different cache line; resets stillNeed to 0 to
338  // indicate it has reached the barrier
339  b->flags[my_current_iter][tid].stillNeed = 0;
340 
341  do { // wait for all group leaders
342  threads_pending = 0;
343  for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
344  threads_pending += b->flags[my_current_iter][thr].stillNeed;
345  }
346  // Execute tasks here
347  if (__kmp_tasking_mode != tskm_immediate_exec) {
348  kmp_task_team_t *task_team = this_thr->th.th_task_team;
349  if (task_team != NULL) {
350  if (TCR_SYNC_4(task_team->tt.tt_active)) {
351  if (KMP_TASKING_ENABLED(task_team)) {
352  int tasks_completed = FALSE;
353  __kmp_atomic_execute_tasks_64(
354  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
355  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
356  } else
357  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
358  }
359  } else {
360  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
361  } // if
362  }
363  if (TCR_4(__kmp_global.g.g_done)) {
364  if (__kmp_global.g.g_abort)
365  __kmp_abort_thread();
366  break;
367  } else if (__kmp_tasking_mode != tskm_immediate_exec &&
368  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
369  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
370  }
371  } while (threads_pending > 0);
372 
373  if (reduce) { // Perform reduction if needed
374  if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
375  OMPT_REDUCTION_DECL(this_thr, gtid);
376  OMPT_REDUCTION_BEGIN;
377  for (size_t thr = b->threads_per_group; thr < nproc;
378  thr += b->threads_per_group) {
379  (*reduce)(this_thr->th.th_local.reduce_data,
380  other_threads[thr]->th.th_local.reduce_data);
381  }
382  OMPT_REDUCTION_END;
383  }
384  }
385  } else {
386  // Set flag for next iteration
387  b->flags[my_next_iter][tid].stillNeed = 1;
388  // Each thread uses a different cache line; resets stillNeed to 0 to
389  // indicate it has reached the barrier
390  b->flags[my_current_iter][tid].stillNeed = 0;
391  }
392 
393  KMP_MFENCE();
394 
395  KA_TRACE(20,
396  ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
397  gtid, team->t.t_id, tid, bt));
398 }
399 
// Distributed barrier release phase. The primary thread publishes a new go
// value to the group leaders' go flags; each group leader then fans it out to
// the go flags of its own group. Workers first verify (via th_used_in_team)
// that they really belong to a team before waiting on a go flag, since a
// thread can reach this barrier while being resized out of, or into, a team.
static void __kmp_dist_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
  kmp_team_t *team;
  distributedBarrier *b;
  kmp_bstate_t *thr_bar;
  kmp_uint64 my_current_iter, next_go;
  size_t my_go_index;
  bool group_leader;

  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
                gtid, tid, bt));

  thr_bar = &this_thr->th.th_bar[bt].bb;

  if (!KMP_MASTER_TID(tid)) {
    // workers and non-master group leaders need to check their presence in team
    do {
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) {
        // Thread is not in use in a team. Wait on location in tid's thread
        // struct. The 0 value tells anyone looking that this thread is spinning
        // or sleeping until this location becomes 3 again; 3 is the transition
        // state to get to 1 which is waiting on go and being in the team
        kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
        if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
                                        0) ||
            this_thr->th.th_used_in_team.load() == 0) {
          my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
          // In fork barrier where we could not get the object reliably
          itt_sync_obj =
              __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
          // Cancel wait on previous parallel region...
          __kmp_itt_task_starting(itt_sync_obj);

          if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;

          itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
          if (itt_sync_obj != NULL)
            // Call prepare as early as possible for "new" barrier
            __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
          return;
      }
      if (this_thr->th.th_used_in_team.load() != 1 &&
          this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
        continue;
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      // At this point, the thread thinks it is in use in a team, or in
      // transition to be used in a team, but it might have reached this barrier
      // before it was marked unused by the team. Unused threads are awoken and
      // shifted to wait on local thread struct elsewhere. It also might reach
      // this point by being picked up for use by a different team. Either way,
      // we need to update the tid.
      tid = __kmp_tid_from_gtid(gtid);
      team = this_thr->th.th_team;
      KMP_DEBUG_ASSERT(tid >= 0);
      KMP_DEBUG_ASSERT(team);
      b = team->t.b;
      my_current_iter = b->iter[tid].iter;
      next_go = my_current_iter + distributedBarrier::MAX_ITERS;
      my_go_index = tid / b->threads_per_go;
      if (this_thr->th.th_used_in_team.load() == 3) {
        // Transition 3 -> 1: now fully in the team and waiting on go.
        (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3,
                                          1);
      }
      // Check if go flag is set
      if (b->go[my_go_index].go.load() != next_go) {
        // Wait on go flag on team
        kmp_atomic_flag_64<false, true> my_flag(
            &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
        my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
        KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
                         b->iter[tid].iter == 0);
        KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
      }

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;
      // At this point, the thread's go location was set. This means the primary
      // thread is safely in the barrier, and so this thread's data is
      // up-to-date, but we should check again that this thread is really in
      // use in the team, as it could have been woken up for the purpose of
      // changing team size, or reaping threads at shutdown.
      if (this_thr->th.th_used_in_team.load() == 1)
        break;
    } while (1);

    if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return;

    group_leader = ((tid % b->threads_per_group) == 0);
    if (group_leader) {
      // Tell all the threads in my group they can go!
      for (size_t go_idx = my_go_index + 1;
           go_idx < my_go_index + b->gos_per_group; go_idx++) {
        b->go[go_idx].go.store(next_go);
      }
      // Fence added so that workers can see changes to go. sfence inadequate.
      KMP_MFENCE();
    }

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // copy ICVs to final dest
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                (kmp_internal_control_t *)team->t.b->team_icvs);
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
      // This thread is now awake and participating in the barrier;
      // wake up the other threads in the group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  } else { // Primary thread
    team = this_thr->th.th_team;
    b = team->t.b;
    my_current_iter = b->iter[tid].iter;
    next_go = my_current_iter + distributedBarrier::MAX_ITERS;
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
      // primary thread has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
    // Tell all the group leaders they can go!
    for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
      b->go[go_idx].go.store(next_go);
    }

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake-up the group leaders
      size_t nproc = this_thr->th.th_team_nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
                                b->threads_per_group, tid);
    }

    // Tell all the threads in my group they can go!
    for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
      b->go[go_idx].go.store(next_go);
    }

    // Fence added so that workers can see changes to go. sfence inadequate.
    KMP_MFENCE();

    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      // Wake-up the other threads in my group
      size_t nproc = this_thr->th.th_team_nproc;
      size_t group_end = tid + b->threads_per_group;
      if (nproc < group_end)
        group_end = nproc;
      __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
    }
  }
  // Update to next iteration
  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;

  KA_TRACE(
      20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
579 
// Linear Barrier
// Gather phase: every worker signals its own b_arrived flag; the primary
// thread waits on each worker's flag in turn, folding in reductions as it
// goes. Returns true only in the cancellable instantiation, when the wait
// detected cancellation.
template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;

  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // We now perform a linear reduction to signal that all of the threads have
  // arrived.
  if (!KMP_MASTER_TID(tid)) {
    KA_TRACE(20,
             ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
              team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    // Mark arrival to primary thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the primary thread at any
       time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
    flag.release();
  } else {
    kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
    int nproc = this_thr->th.th_team_nproc;
    int i;
    // Don't have to worry about sleep bit here or atomic since team setting
    kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

    // Collect all the worker team member threads.
    for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (i + 1 < nproc)
        KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                    "arrived(%p) == %llu\n",
                    gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                    team->t.t_id, i,
                    &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

      // Wait for worker thread to arrive
      if (cancellable) {
        kmp_flag_64<true, false> flag(
            &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
        if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
          return true;
      } else {
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
                           new_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and the other thread
      // time to the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(
            this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  other_threads[i]->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
    // Don't have to worry about sleep bit here or atomic since team setting
    team_bar->b_arrived = new_state;
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
                  new_state));
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
682 
// Release phase of the linear barrier: the primary thread (optionally after
// pushing ICVs to every worker) releases each worker's b_go flag in turn;
// workers wait on their own b_go. Returns true only in the cancellable
// instantiation, when the worker's wait detected cancellation.
template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_team_t *team;

  if (KMP_MASTER_TID(tid)) {
    unsigned int i;
    kmp_uint32 nproc = this_thr->th.th_team_nproc;
    kmp_info_t **other_threads;

    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Push the primary thread's ICVs to every worker's implicit task
          // (ngo_* are MIC streaming stores or plain copies; see macros above).
          ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
          for (i = 1; i < nproc; ++i) {
            __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
                                     team, i, FALSE);
            ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                           &team->t.t_implicit_task_taskdata[0].td_icvs);
          }
          ngo_sync();
        }
      }
#endif // KMP_BARRIER_ICV_PUSH

      // Now, release all of the worker threads
      for (i = 1; i < nproc; ++i) {
#if KMP_CACHE_MANAGE
        // Prefetch next thread's go flag
        if (i + 1 < nproc)
          KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
        KA_TRACE(
            20,
            ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
             team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go,
             other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
        kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
                           other_threads[i]);
        flag.release();
      }
    }
  } else { // Wait for the PRIMARY thread to release us
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    if (cancellable) {
      kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
        return true;
    } else {
      kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
      flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    }
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
      // disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return false;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
      return false;
// The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
    tid = __kmp_tid_from_gtid(gtid);
    team = __kmp_threads[gtid]->th.th_team;
#endif
    KMP_DEBUG_ASSERT(team != NULL);
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  KA_TRACE(
      20,
      ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  return false;
}
790 
// Non-cancellable instantiation of the linear gather algorithm.
static void __kmp_linear_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_gather_template<false>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}
797 
// Cancellable instantiation of the linear gather; returns true if the wait
// was interrupted by cancellation.
static bool __kmp_linear_barrier_gather_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_gather_template<true>(
      bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
}
804 
// Non-cancellable instantiation of the linear release algorithm.
static void __kmp_linear_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  __kmp_linear_barrier_release_template<false>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
811 
// Cancellable instantiation of the linear release; returns true if the wait
// was interrupted by cancellation.
static bool __kmp_linear_barrier_release_cancellable(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  return __kmp_linear_barrier_release_template<true>(
      bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
}
818 
819 // Tree barrier
// Gather phase of the tree barrier.
// Threads form a branch_factor-ary tree: the children of thread `tid` are
// tids (tid << branch_bits) + 1 .. + branch_factor, clipped to nproc. Each
// parent spins on every child's b_arrived flag until it reaches this epoch's
// value (new_state), folding in reduction data via `reduce` when supplied.
// A worker then advances its own b_arrived to signal its parent; the primary
// thread (tid 0) instead publishes the new epoch value to the team's
// t_bar[bt].b_arrived. `itt_sync_obj` is only used for ITT instrumentation.
static void __kmp_tree_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  // Epoch target for children's b_arrived; stays 0 when this thread is a
  // leaf (only computed below if we have at least one child).
  kmp_uint64 new_state = 0;

  KA_TRACE(
      20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  // Perform tree gather to wait until all threads have arrived; reduce any
  // required data as we go
  child_tid = (tid << branch_bits) + 1; // first child in the gather tree
  if (child_tid < nproc) {
    // Parent threads wait for all their children to arrive
    new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    child = 1;
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's arrived count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
      flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        // Fold the child's contribution into our local reduction buffer;
        // safe now that the child has checked in for this epoch.
        KA_TRACE(100,
                 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }

  if (!KMP_MASTER_TID(tid)) { // Worker threads
    kmp_int32 parent_tid = (tid - 1) >> branch_bits; // inverse of child calc

    KA_TRACE(20,
             ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
              "arrived(%p): %llu => %llu\n",
              gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
              team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
              thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

    // Mark arrival to parent thread
    /* After performing this write, a worker thread may not assume that the team
       is valid any more - it could be deallocated by the primary thread at any
       time. */
    kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
    flag.release();
  } else {
    // Need to update the team arrived pointer if we are the primary thread
    if (nproc > 1) // New value was already computed above
      team->t.t_bar[bt].b_arrived = new_state;
    else // single-thread team: new_state was never computed, bump directly
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(20,
           ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
            gtid, team->t.t_id, tid, bt));
}
925 
// Release phase of the tree barrier.
// The primary thread starts the cascade; every parent releases its children
// (tids (tid << branch_bits) + 1 .. + branch_factor) by advancing each
// child's b_go flag. Workers first wait on their own b_go, then reset it and
// release their own subtree. If `propagate_icvs` is set, the parent pushes
// the primary's ICVs into each child's implicit task before releasing it.
// NOTE: on the fork barrier a worker may not yet belong to a valid team, so
// `team` and `tid` are (re-)loaded only after the b_go wait completes.
static void __kmp_tree_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;

  // Perform a tree release for all of the threads that have been gathered
  if (!KMP_MASTER_TID(
          tid)) { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably (or
      // ITTNOTIFY is disabled)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      // Early exit for reaping threads releasing forkjoin barrier
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid); // tid may have changed across the wait

    // Reset our own go flag so the next release epoch starts from a known
    // state before we fan out to our children.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
              team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  } else {
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
  }
  nproc = this_thr->th.th_team_nproc;
  child_tid = (tid << branch_bits) + 1; // first child in the release tree

  if (child_tid < nproc) {
    kmp_info_t **other_threads = team->t.t_threads;
    child = 1;
    // Parent threads release all their children
    do {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      // Prefetch next thread's go count
      if (child + 1 <= branch_factor && child_tid + 1 < nproc)
        KMP_CACHE_PREFETCH(
            &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
      {
        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
        if (propagate_icvs) {
          // Copy ICVs from the primary's implicit task (index 0) into the
          // child's implicit task before the child is allowed to run.
          __kmp_init_implicit_task(team->t.t_ident,
                                   team->t.t_threads[child_tid], team,
                                   child_tid, FALSE);
          copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                    &team->t.t_implicit_task_taskdata[0].td_icvs);
        }
      }
#endif // KMP_BARRIER_ICV_PUSH
      KA_TRACE(20,
               ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                "go(%p): %u => %u\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                child_bar->b_go + KMP_BARRIER_STATE_BUMP));
      // Release child from barrier
      kmp_flag_64<> flag(&child_bar->b_go, child_thr);
      flag.release();
      child++;
      child_tid++;
    } while (child <= branch_factor && child_tid < nproc);
  }
  KA_TRACE(
      20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
1030 
1031 // Hyper Barrier
// Gather phase of the hypercube (hyper) barrier.
// Threads synchronize along hypercube dimensions: at each `level` (in steps
// of branch_bits), a thread whose low bits at that level are nonzero signals
// its parent (tid with those bits cleared) and stops; otherwise it waits for
// up to branch_factor-1 children spaced (1 << level) apart, folding in
// reduction data via `reduce` if supplied. The primary thread ends by
// publishing the new epoch value to team->t.t_bar[bt].b_arrived.
static void __kmp_hyper_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads = team->t.t_threads;
  // Sentinel: replaced by this epoch's target on first use; if it survives to
  // the end, this thread had no children to wait for.
  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 offset;
  kmp_uint32 level;

  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
        __itt_get_timestamp();
  }
#endif
  /* Perform a hypercube-embedded tree gather to wait until all of the threads
     have arrived, and reduce any required data as we go. */
  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits) {
    kmp_uint32 child;
    kmp_uint32 child_tid;

    if (((tid >> level) & (branch_factor - 1)) != 0) {
      // We are a child at this level: parent is our tid with the low
      // (level + branch_bits) bits cleared.
      kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);

      KMP_MB(); // Synchronize parent and child threads.
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                "arrived(%p): %llu => %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
                team->t.t_id, parent_tid, &thr_bar->b_arrived,
                thr_bar->b_arrived,
                thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
      // Mark arrival to parent thread
      /* After performing this write (in the last iteration of the enclosing for
         loop), a worker thread may not assume that the team is valid any more
         - it could be deallocated by the primary thread at any time. */
      p_flag.set_waiter(other_threads[parent_tid]);
      p_flag.release();
      break; // done: we only signal one parent, then leave the gather
    }

    // Parent threads wait for children to arrive
    if (new_state == KMP_BARRIER_UNUSED_STATE)
      new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level)) {
      kmp_info_t *child_thr = other_threads[child_tid];
      kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
      kmp_uint32 next_child_tid = child_tid + (1 << level);
      // Prefetch next thread's arrived count
      if (child + 1 < branch_factor && next_child_tid < num_threads)
        KMP_CACHE_PREFETCH(
            &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
      KA_TRACE(20,
               ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                "arrived(%p) == %llu\n",
                gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
      // Wait for child to arrive
      kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
      c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      KMP_MB(); // Synchronize parent and child threads.
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier imbalance - write min of the thread time and a child time to
      // the thread.
      if (__kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                               child_thr->th.th_bar_min_time);
      }
#endif
      if (reduce) {
        KA_TRACE(100,
                 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
        OMPT_REDUCTION_DECL(this_thr, gtid);
        OMPT_REDUCTION_BEGIN;
        (*reduce)(this_thr->th.th_local.reduce_data,
                  child_thr->th.th_local.reduce_data);
        OMPT_REDUCTION_END;
      }
    }
  }

  if (KMP_MASTER_TID(tid)) {
    // Need to update the team arrived pointer if we are the primary thread
    if (new_state == KMP_BARRIER_UNUSED_STATE) // no children: bump directly
      team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
    else
      team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  KA_TRACE(
      20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
           gtid, team->t.t_id, tid, bt));
}
1148 
1149 // The reverse versions seem to beat the forward versions overall
1150 #define KMP_REVERSE_HYPER_BAR
// Release phase of the hypercube (hyper) barrier.
// Workers wait on their own b_go, reset it, then every thread releases its
// hypercube children by advancing each child's b_go. With
// KMP_REVERSE_HYPER_BAR (the default, defined just above) children are
// released highest-level/highest-tid first — the reverse of the gather order;
// otherwise the forward order is used. ICV propagation (when requested)
// flows through th_fixed_icvs: primary seeds it, each parent pushes it to
// its children, and each worker finally copies it into its implicit task.
static void __kmp_hyper_barrier_release(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
  kmp_team_t *team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_info_t **other_threads;
  kmp_uint32 num_threads;
  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
  kmp_uint32 branch_factor = 1 << branch_bits;
  kmp_uint32 child;
  kmp_uint32 child_tid;
  kmp_uint32 offset;
  kmp_uint32 level;

  /* Perform a hypercube-embedded tree release for all of the threads that have
     been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
     are released in the reverse order of the corresponding gather, otherwise
     threads are released in the same order. */
  if (KMP_MASTER_TID(tid)) { // primary thread
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
                  "barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) { // primary already has ICVs in final destination; copy
      copy_icvs(&thr_bar->th_fixed_icvs,
                &team->t.t_implicit_task_taskdata[tid].td_icvs);
    }
#endif
  } else { // Handle fork barrier workers who aren't part of a team yet
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
                  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
    // Wait for parent thread to release us
    kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
    flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
      // In fork barrier where we could not get the object reliably
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
      // Cancel wait on previous parallel region...
      __kmp_itt_task_starting(itt_sync_obj);

      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      if (itt_sync_obj != NULL)
        // Call prepare as early as possible for "new" barrier
        __kmp_itt_task_finished(itt_sync_obj);
    } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
      // Early exit for reaping threads releasing forkjoin barrier
      if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
        return;

    // The worker thread may now assume that the team is valid.
    team = __kmp_threads[gtid]->th.th_team;
    KMP_DEBUG_ASSERT(team != NULL);
    tid = __kmp_tid_from_gtid(gtid); // tid may have changed across the wait

    // Reset our go flag before fanning out to our own children.
    TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
    KA_TRACE(20,
             ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
              gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
    KMP_MB(); // Flush all pending memory write invalidates.
  }
  num_threads = this_thr->th.th_team_nproc;
  other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
  // Count up to correct level for parent
  for (level = 0, offset = 1;
       offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
       level += branch_bits, offset <<= branch_bits)
    ;

  // Now go down from there
  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
       level -= branch_bits, offset >>= branch_bits)
#else
  // Go down the tree, level by level
  for (level = 0, offset = 1; offset < num_threads;
       level += branch_bits, offset <<= branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
  {
#ifdef KMP_REVERSE_HYPER_BAR
    /* Now go in reverse order through the children, highest to lowest.
       Initial setting of child is conservative here. */
    child = num_threads >> ((level == 0) ? level : level - 1);
    for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
        child_tid = tid + (child << level);
         child >= 1; child--, child_tid -= (1 << level))
#else
    if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go lower than this, since this is the level parent would be
      // notified
      break;
    // Iterate through children on this level of the tree
    for (child = 1, child_tid = tid + (1 << level);
         child < branch_factor && child_tid < num_threads;
         child++, child_tid += (1 << level))
#endif // KMP_REVERSE_HYPER_BAR
    {
      if (child_tid >= num_threads)
        continue; // Child doesn't exist so keep going
      else {
        kmp_info_t *child_thr = other_threads[child_tid];
        kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
        kmp_uint32 next_child_tid = child_tid - (1 << level);
// Prefetch next thread's go count
#ifdef KMP_REVERSE_HYPER_BAR
        if (child - 1 >= 1 && next_child_tid < num_threads)
#else
        if (child + 1 < branch_factor && next_child_tid < num_threads)
#endif // KMP_REVERSE_HYPER_BAR
          KMP_CACHE_PREFETCH(
              &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) // push my fixed ICVs to my child
          copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

        KA_TRACE(
            20,
            ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
             "go(%p): %u => %u\n",
             gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
             team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
             child_bar->b_go + KMP_BARRIER_STATE_BUMP));
        // Release child from barrier
        kmp_flag_64<> flag(&child_bar->b_go, child_thr);
        flag.release();
      }
    }
  }
#if KMP_BARRIER_ICV_PUSH
  if (propagate_icvs &&
      !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
                             FALSE);
    copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
              &thr_bar->th_fixed_icvs);
  }
#endif
  KA_TRACE(
      20,
      ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
       gtid, team->t.t_id, tid, bt));
}
1305 
1306 // Hierarchical Barrier
1307 
1308 // Initialize thread barrier data
1309 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1310  Performs the minimum amount of initialization required based on how the team
1311  has changed. Returns true if leaf children will require both on-core and
1312  traditional wake-up mechanisms. For example, if the team size increases,
1313  threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on
   their local b_go. */
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
                                                   kmp_bstate_t *thr_bar,
                                                   kmp_uint32 nproc, int gtid,
                                                   int tid, kmp_team_t *team) {
  // Checks to determine if (re-)initialization is needed
  bool uninitialized = thr_bar->team == NULL;
  bool team_changed = team != thr_bar->team;
  bool team_sz_changed = nproc != thr_bar->nproc;
  bool tid_changed = tid != thr_bar->old_tid;
  bool retval = false;

  if (uninitialized || team_sz_changed) {
    // (Re)build depth/skip_per_level/base_leaf_kids for this team size.
    __kmp_get_hierarchy(nproc, thr_bar);
  }

  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
    thr_bar->parent_tid = -1; // default for primary thread
    if (!KMP_MASTER_TID(tid)) {
      // if not primary thread, find parent thread in hierarchy
      kmp_uint32 d = 0;
      while (d < thr_bar->depth) { // find parent based on level of thread in
        // hierarchy, and note level
        kmp_uint32 rem;
        if (d == thr_bar->depth - 2) { // reached level right below the primary
          thr_bar->parent_tid = 0;
          thr_bar->my_level = d;
          break;
        } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
          // TODO: can we make the above op faster?
          // thread is not a subtree root at next level, so this is max
          thr_bar->parent_tid = tid - rem;
          thr_bar->my_level = d;
          break;
        }
        ++d;
      }
    }
    // offset = 7 - (our child index under the parent); later used to select
    // which byte of the parent's 64-bit flag word this thread owns in the
    // oncore barrier (see kmp_flag_oncore uses of thr_bar->offset below).
    __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
                            (thr_bar->skip_per_level[thr_bar->my_level])),
                       &(thr_bar->offset));
    thr_bar->old_tid = tid;
    thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
  }
  if (uninitialized || team_changed || tid_changed) {
    // Intentionally repeats the team/parent_bar writes above so they also
    // happen when only the team identity changed; retval==true tells the
    // caller a change occurred (see the function header comment above).
    thr_bar->team = team;
    thr_bar->parent_bar =
        &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    retval = true;
  }
  if (uninitialized || team_sz_changed || tid_changed) {
    thr_bar->nproc = nproc;
    thr_bar->leaf_kids = thr_bar->base_leaf_kids;
    if (thr_bar->my_level == 0) // leaves have no children
      thr_bar->leaf_kids = 0;
    // Clip leaf_kids when this thread's leaf children would run past nproc.
    if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
      __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
    // One nonzero byte per leaf child, packed from the high end of the
    // 64-bit word; the gather code ORs this mask into b_arrived as the wait
    // target for leaf check-ins.
    thr_bar->leaf_state = 0;
    for (int i = 0; i < thr_bar->leaf_kids; ++i)
      ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
  }
  return retval;
}
1382 
// Gather phase of the hierarchical (machine-topology-aware) barrier.
// Non-leaf threads first collect leaf children — either via byte flags set
// directly in this thread's b_arrived ("oncore" path, used only when
// blocktime is infinite and the barrier is not nested) or via each child's
// own b_arrived — then collect higher-level subtree roots using
// skip_per_level strides. Reduction data is folded in via `reduce` when
// supplied. Workers then signal their parent (again via either their own
// b_arrived or a byte of the parent's flag word); the primary publishes the
// new epoch to team->t.t_bar[bt].b_arrived.
static void __kmp_hierarchical_barrier_gather(
    enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
    void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
  kmp_team_t *team = this_thr->th.th_team;
  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
  kmp_uint32 nproc = this_thr->th.th_team_nproc;
  kmp_info_t **other_threads = team->t.t_threads;
  kmp_uint64 new_state = 0; // epoch target; computed below only for non-leaves

  int level = team->t.t_level;
  if (other_threads[0]
          ->th.th_teams_microtask) // are we inside the teams construct?
    if (this_thr->th.th_teams_size.nteams > 1)
      ++level; // level was not increased in teams construct for team_of_masters
  if (level == 1)
    thr_bar->use_oncore_barrier = 1;
  else
    thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  // Barrier imbalance - save arrive time to the thread
  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
    this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
  }
#endif

  // Refresh per-thread hierarchy data (parent, level, leaf_kids, ...) in case
  // the team or its size changed since the last barrier.
  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
                                               team);

  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
    kmp_int32 child_tid;
    new_state =
        (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
        thr_bar->use_oncore_barrier) {
      if (thr_bar->leaf_kids) {
        // First, wait for leaf children to check-in on my b_arrived flag
        // Target = current b_arrived with every leaf's byte flag set
        // (leaf_state has one byte per leaf child).
        kmp_uint64 leaf_state =
            KMP_MASTER_TID(tid)
                ? thr_bar->b_arrived | thr_bar->leaf_state
                : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
                      "for leaf kids\n",
                      gtid, team->t.t_id, tid));
        kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
        flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        if (reduce) {
          OMPT_REDUCTION_DECL(this_thr, gtid);
          OMPT_REDUCTION_BEGIN;
          // Leaf children are the contiguous tids tid+1 .. tid+leaf_kids.
          for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
               ++child_tid) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      other_threads[child_tid]->th.th_local.reduce_data);
          }
          OMPT_REDUCTION_END;
        }
        // clear leaf_state bits (atomically, so a late-arriving writer to
        // another byte of b_arrived is not lost)
        KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
      }
      // Next, wait for higher level children on each child's b_arrived flag
      for (kmp_uint32 d = 1; d < thr_bar->my_level;
           ++d) { // gather lowest level threads first, but skip 0
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    } else { // Blocktime is not infinite
      // No oncore optimization: every level (including leaves at d==0) is
      // gathered through each child's own b_arrived flag.
      for (kmp_uint32 d = 0; d < thr_bar->my_level;
           ++d) { // Gather lowest level threads first
        kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
                   skip = thr_bar->skip_per_level[d];
        if (last > nproc)
          last = nproc;
        for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
          kmp_info_t *child_thr = other_threads[child_tid];
          kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
          KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
                        "T#%d(%d:%d) "
                        "arrived(%p) == %llu\n",
                        gtid, team->t.t_id, tid,
                        __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                        child_tid, &child_bar->b_arrived, new_state));
          kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
          flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          if (reduce) {
            KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
                           "T#%d(%d:%d)\n",
                           gtid, team->t.t_id, tid,
                           __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                           child_tid));
            (*reduce)(this_thr->th.th_local.reduce_data,
                      child_thr->th.th_local.reduce_data);
          }
        }
      }
    }
  }
  // All subordinates are gathered; now release parent if not primary thread

  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
                  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
                  gtid, team->t.t_id, tid,
                  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
                  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
                  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
    /* Mark arrival to parent: After performing this write, a worker thread may
       not assume that the team is valid any more - it could be deallocated by
       the primary thread at any time. */
    if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
        !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
      // flag; release it
      kmp_flag_64<> flag(&thr_bar->b_arrived,
                         other_threads[thr_bar->parent_tid]);
      flag.release();
    } else {
      // Leaf does special release on "offset" bits of parent's b_arrived flag
      // (byte thr_bar->offset + 1 — assigned in
      // __kmp_init_hierarchical_barrier_thread); our own b_arrived is just
      // advanced to the new epoch first.
      thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
      kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
                           thr_bar->offset + 1);
      flag.set_waiter(other_threads[thr_bar->parent_tid]);
      flag.release();
    }
  } else { // Primary thread needs to update the team's b_arrived value
    team->t.t_bar[bt].b_arrived = new_state;
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
                  "arrived(%p) = %llu\n",
                  gtid, team->t.t_id, tid, team->t.t_id,
                  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
  }
  // Is the team access below unsafe or just technically invalid?
  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
                "barrier type %d\n",
                gtid, team->t.t_id, tid, bt));
}
1551 
1552 static void __kmp_hierarchical_barrier_release(
1553  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1554  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1555  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1556  kmp_team_t *team;
1557  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1558  kmp_uint32 nproc;
1559  bool team_change = false; // indicates on-core barrier shouldn't be used
1560 
1561  if (KMP_MASTER_TID(tid)) {
1562  team = __kmp_threads[gtid]->th.th_team;
1563  KMP_DEBUG_ASSERT(team != NULL);
1564  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1565  "entered barrier type %d\n",
1566  gtid, team->t.t_id, tid, bt));
1567  } else { // Worker threads
1568  // Wait for parent thread to release me
1569  if (!thr_bar->use_oncore_barrier ||
1570  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1571  thr_bar->team == NULL) {
1572  // Use traditional method of waiting on my own b_go flag
1573  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1574  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1575  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1576  TCW_8(thr_bar->b_go,
1577  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1578  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1579  // infinite, not nested
1580  // Wait on my "offset" bits on parent's b_go flag
1581  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1582  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1583  thr_bar->offset + 1, bt,
1584  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1585  flag.wait(this_thr, TRUE);
1586  if (thr_bar->wait_flag ==
1587  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1588  TCW_8(thr_bar->b_go,
1589  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1590  } else { // Reset my bits on parent's b_go flag
1591  (RCAST(volatile char *,
1592  &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1593  }
1594  }
1595  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1596  // Early exit for reaping threads releasing forkjoin barrier
1597  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1598  return;
1599  // The worker thread may now assume that the team is valid.
1600  team = __kmp_threads[gtid]->th.th_team;
1601  KMP_DEBUG_ASSERT(team != NULL);
1602  tid = __kmp_tid_from_gtid(gtid);
1603 
1604  KA_TRACE(
1605  20,
1606  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1607  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1608  KMP_MB(); // Flush all pending memory write invalidates.
1609  }
1610 
1611  nproc = this_thr->th.th_team_nproc;
1612  int level = team->t.t_level;
1613  if (team->t.t_threads[0]
1614  ->th.th_teams_microtask) { // are we inside the teams construct?
1615  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1616  this_thr->th.th_teams_level == level)
1617  ++level; // level was not increased in teams construct for team_of_workers
1618  if (this_thr->th.th_teams_size.nteams > 1)
1619  ++level; // level was not increased in teams construct for team_of_masters
1620  }
1621  if (level == 1)
1622  thr_bar->use_oncore_barrier = 1;
1623  else
1624  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1625 
1626  // If the team size has increased, we still communicate with old leaves via
1627  // oncore barrier.
1628  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1629  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1630  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1631  tid, team);
1632  // But if the entire team changes, we won't use oncore barrier at all
1633  if (team_change)
1634  old_leaf_kids = 0;
1635 
1636 #if KMP_BARRIER_ICV_PUSH
1637  if (propagate_icvs) {
1638  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1639  FALSE);
1640  if (KMP_MASTER_TID(
1641  tid)) { // primary already has copy in final destination; copy
1642  copy_icvs(&thr_bar->th_fixed_icvs,
1643  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1644  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1645  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1646  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1647  // leaves (on-core children) pull parent's fixed ICVs directly to local
1648  // ICV store
1649  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1650  &thr_bar->parent_bar->th_fixed_icvs);
1651  // non-leaves will get ICVs piggybacked with b_go via NGO store
1652  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1653  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1654  // access
1655  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1656  else // leaves copy parent's fixed ICVs directly to local ICV store
1657  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1658  &thr_bar->parent_bar->th_fixed_icvs);
1659  }
1660  }
1661 #endif // KMP_BARRIER_ICV_PUSH
1662 
1663  // Now, release my children
1664  if (thr_bar->my_level) { // not a leaf
1665  kmp_int32 child_tid;
1666  kmp_uint32 last;
1667  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1668  thr_bar->use_oncore_barrier) {
1669  if (KMP_MASTER_TID(tid)) { // do a flat release
1670  // Set local b_go to bump children via NGO store of the cache line
1671  // containing IVCs and b_go.
1672  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1673  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1674  // the cache line
1675  ngo_load(&thr_bar->th_fixed_icvs);
1676  // This loops over all the threads skipping only the leaf nodes in the
1677  // hierarchy
1678  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1679  child_tid += thr_bar->skip_per_level[1]) {
1680  kmp_bstate_t *child_bar =
1681  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1682  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1683  "releasing T#%d(%d:%d)"
1684  " go(%p): %u => %u\n",
1685  gtid, team->t.t_id, tid,
1686  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1687  child_tid, &child_bar->b_go, child_bar->b_go,
1688  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1689  // Use ngo store (if available) to both store ICVs and release child
1690  // via child's b_go
1691  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1692  }
1693  ngo_sync();
1694  }
1695  TCW_8(thr_bar->b_go,
1696  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1697  // Now, release leaf children
1698  if (thr_bar->leaf_kids) { // if there are any
1699  // We test team_change on the off-chance that the level 1 team changed.
1700  if (team_change ||
1701  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1702  if (old_leaf_kids) { // release old leaf kids
1703  thr_bar->b_go |= old_leaf_state;
1704  }
1705  // Release new leaf kids
1706  last = tid + thr_bar->skip_per_level[1];
1707  if (last > nproc)
1708  last = nproc;
1709  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1710  ++child_tid) { // skip_per_level[0]=1
1711  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1713  KA_TRACE(
1714  20,
1715  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1716  " T#%d(%d:%d) go(%p): %u => %u\n",
1717  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1718  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1719  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1720  // Release child using child's b_go flag
1721  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1722  flag.release();
1723  }
1724  } else { // Release all children at once with leaf_state bits on my own
1725  // b_go flag
1726  thr_bar->b_go |= thr_bar->leaf_state;
1727  }
1728  }
1729  } else { // Blocktime is not infinite; do a simple hierarchical release
1730  for (int d = thr_bar->my_level - 1; d >= 0;
1731  --d) { // Release highest level threads first
1732  last = tid + thr_bar->skip_per_level[d + 1];
1733  kmp_uint32 skip = thr_bar->skip_per_level[d];
1734  if (last > nproc)
1735  last = nproc;
1736  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1737  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1738  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1739  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1740  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1741  gtid, team->t.t_id, tid,
1742  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1743  child_tid, &child_bar->b_go, child_bar->b_go,
1744  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1745  // Release child using child's b_go flag
1746  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1747  flag.release();
1748  }
1749  }
1750  }
1751 #if KMP_BARRIER_ICV_PUSH
1752  if (propagate_icvs && !KMP_MASTER_TID(tid))
1753  // non-leaves copy ICVs from fixed ICVs to local dest
1754  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1755  &thr_bar->th_fixed_icvs);
1756 #endif // KMP_BARRIER_ICV_PUSH
1757  }
1758  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1759  "barrier type %d\n",
1760  gtid, team->t.t_id, tid, bt));
1761 }
1762 
1763 // End of Barrier Algorithms
1764 
// Type trait carrying the cancellation status of a barrier.
// When cancellable == true it is an ordinary runtime boolean; when
// cancellable == false it collapses to the compile-time constant "false",
// letting the optimizer discard every cancellation branch.
template <bool cancellable> struct is_cancellable {};
template <> struct is_cancellable<true> {
  bool value = false; // set once the barrier observes a cancellation

  is_cancellable() = default;
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
    value = b;
    return *this;
  }
  operator bool() const { return value; }
};
template <> struct is_cancellable<false> {
  // Assignments are accepted but discarded; the value is always "false".
  is_cancellable &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};
1783 
1784 // Internal function to do a barrier.
1785 /* If is_split is true, do a split barrier, otherwise, do a plain barrier
1786  If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
1787  barrier
1788  When cancellable = false,
1789  Returns 0 if primary thread, 1 if worker thread.
1790  When cancellable = true
1791  Returns 0 if not cancelled, 1 if cancelled. */
template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
                                  size_t reduce_size, void *reduce_data,
                                  void (*reduce)(void *, void *)) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
  int status = 0;
  // Cancellation status; a compile-time constant false when !cancellable,
  // so all cancellation branches below fold away (see is_cancellable<>).
  is_cancellable<cancellable> cancelled;
#if OMPT_SUPPORT && OMPT_OPTIONAL
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  void *return_address;
  ompt_sync_region_t barrier_kind;
#endif

  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    // It is OK to report the barrier state after the barrier begin callback.
    // According to the OMPT specification, a compliant implementation may
    // even delay reporting this state until the barrier begins to wait.
    auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
    switch (barrier_kind) {
    case ompt_sync_region_barrier_explicit:
      ompt_thr_info->state = ompt_state_wait_barrier_explicit;
      break;
    case ompt_sync_region_barrier_implicit_workshare:
      ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
      break;
    case ompt_sync_region_barrier_implicit_parallel:
      ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
      break;
    case ompt_sync_region_barrier_teams:
      ompt_thr_info->state = ompt_state_wait_barrier_teams;
      break;
    case ompt_sync_region_barrier_implementation:
      [[fallthrough]];
    default:
      ompt_thr_info->state = ompt_state_wait_barrier_implementation;
    }
  }
#endif

#if ENABLE_LIBOMPTARGET
  // Give an opportunity to the offload runtime to make progress and create
  // proxy tasks if necessary
  if (UNLIKELY(kmp_target_sync_cb != NULL))
    (*kmp_target_sync_cb)(
        NULL, gtid, KMP_TASKDATA_TO_TASK(this_thr->th.th_current_task), NULL);
#endif

  // Serialized teams have no other threads to synchronize with; they take
  // the short path in the else-branch near the bottom of this function.
  if (!team->t.t_serialized) {
#if USE_ITT_BUILD
    // This value will be used in itt notify events below.
    void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
#endif
#endif /* USE_ITT_BUILD */
    if (__kmp_tasking_mode == tskm_extra_barrier) {
      __kmp_tasking_barrier(team, this_thr, gtid);
      KA_TRACE(15,
               ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
                __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
    }

    /* Copy the blocktime info to the thread, where __kmp_wait_template() can
       access it when the team struct is not guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical.
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }

#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_DEBUGGER
    // Let the debugger know: the thread arrived to the barrier and waiting.
    if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
      team->t.t_bar[bt].b_master_arrived += 1;
    } else {
      this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
    } // if
#endif /* USE_DEBUGGER */
    if (reduce != NULL) {
      // KMP_DEBUG_ASSERT( is_split == TRUE );  // #C69956
      this_thr->th.th_local.reduce_data = reduce_data;
    }

    if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
      __kmp_task_team_setup(this_thr, team);

    // Gather phase: the cancellable path only supports the linear pattern;
    // otherwise dispatch on the configured gather pattern for this barrier.
    if (cancellable) {
      cancelled = __kmp_linear_barrier_gather_cancellable(
          bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
    } else {
      switch (__kmp_barrier_gather_pattern[bt]) {
      case bp_dist_bar: {
        __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hyper_bar: {
        __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
                                   reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_gather(
            bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      case bp_tree_bar: {
        __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
                                  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
        break;
      }
      default: {
        __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
                                    reduce USE_ITT_BUILD_ARG(itt_sync_obj));
      }
      }
    }

    KMP_MB(); // Flush all pending memory write invalidates.

    if (KMP_MASTER_TID(tid)) {
      status = 0;
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
      }
#if USE_DEBUGGER
      // Let the debugger know: All threads are arrived and starting leaving the
      // barrier.
      team->t.t_bar[bt].b_team_arrived += 1;
#endif

      if (__kmp_omp_cancellation) {
        kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
        // Reset cancellation flag for worksharing constructs
        if (cancel_request == cancel_loop ||
            cancel_request == cancel_sections) {
          KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
        }
      }
#if USE_ITT_BUILD
      /* TODO: In case of split reduction barrier, primary thread may send
         acquired event early, before the final summation into the shared
         variable is done (final summation can be a long operation for array
         reductions). */
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
#if USE_ITT_BUILD && USE_ITT_NOTIFY
      // Barrier - report frame end (only if active_level == 1)
      if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode &&
          (this_thr->th.th_teams_microtask == NULL || // either not in teams
           this_thr->th.th_teams_size.nteams == 1) && // or inside single team
          team->t.t_active_level == 1) {
        ident_t *loc = __kmp_threads[gtid]->th.th_ident;
        kmp_uint64 cur_time = __itt_get_timestamp();
        kmp_info_t **other_threads = team->t.t_threads;
        int nproc = this_thr->th.th_team_nproc;
        int i;
        switch (__kmp_forkjoin_frames_mode) {
        case 1:
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        case 2: // AC 2015-01-19: currently does not work for hierarchical (to
          // be fixed)
          __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
                                 1, loc, nproc);
          break;
        case 3:
          if (__itt_metadata_add_ptr) {
            // Initialize with primary thread's wait time
            kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
            // Set arrive time to zero to be able to check it in
            // __kmp_invoke_task(); the same is done inside the loop below
            this_thr->th.th_bar_arrive_time = 0;
            for (i = 1; i < nproc; ++i) {
              delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
              other_threads[i]->th.th_bar_arrive_time = 0;
            }
            __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                         cur_time, delta,
                                         (kmp_uint64)(reduce != NULL));
          }
          __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                                 loc, nproc);
          this_thr->th.th_frame_time = cur_time;
          break;
        }
      }
#endif /* USE_ITT_BUILD */
    } else {
      status = 1;
#if USE_ITT_BUILD
      if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
        __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
    }
    // Release phase: workers (status == 1) always go through it; the primary
    // does so only for non-split barriers (a split barrier's release is
    // issued later from __kmp_end_split_barrier()).
    if ((status == 1 || !is_split) && !cancelled) {
      if (cancellable) {
        cancelled = __kmp_linear_barrier_release_cancellable(
            bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
      } else {
        switch (__kmp_barrier_release_pattern[bt]) {
        case bp_dist_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hyper_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                      FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_hierarchical_bar: {
          __kmp_hierarchical_barrier_release(
              bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        case bp_tree_bar: {
          KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
          __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
          break;
        }
        default: {
          __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                       FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
        }
        }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
        __kmp_task_team_sync(this_thr, team);
      }
    }

#if USE_ITT_BUILD
    /* GEH: TODO: Move this under if-condition above and also include in
       __kmp_end_split_barrier(). This will more accurately represent the actual
       release time of the threads for split barriers. */
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
  } else { // Team is serialized.
    status = 0;
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      if (this_thr->th.th_task_team != NULL) {
#if USE_ITT_NOTIFY
        void *itt_sync_obj = NULL;
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
          itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
          __kmp_itt_barrier_starting(gtid, itt_sync_obj);
        }
#endif

        // A task team only exists for a serialized team when proxy or
        // hidden-helper tasks were encountered (asserted here).
        KMP_DEBUG_ASSERT(
            this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
            this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
                TRUE);
        __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
        __kmp_task_team_setup(this_thr, team);

#if USE_ITT_BUILD
        if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */
      }
    }
  }
  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
                gtid, __kmp_team_from_gtid(gtid)->t.t_id,
                __kmp_tid_from_gtid(gtid), status));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
          return_address);
    }
#endif
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
  }
#endif

  if (cancellable)
    return (int)cancelled;
  return status;
}
2127 
2128 // Returns 0 if primary thread, 1 if worker thread.
2129 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2130  size_t reduce_size, void *reduce_data,
2131  void (*reduce)(void *, void *)) {
2132  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2133  reduce);
2134 }
2135 
2136 #if defined(KMP_GOMP_COMPAT)
2137 // Returns 1 if cancelled, 0 otherwise
2138 int __kmp_barrier_gomp_cancel(int gtid) {
2139  if (__kmp_omp_cancellation) {
2140  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2141  0, NULL, NULL);
2142  if (cancelled) {
2143  int tid = __kmp_tid_from_gtid(gtid);
2144  kmp_info_t *this_thr = __kmp_threads[gtid];
2145  if (KMP_MASTER_TID(tid)) {
2146  // Primary thread does not need to revert anything
2147  } else {
2148  // Workers need to revert their private b_arrived flag
2149  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2150  KMP_BARRIER_STATE_BUMP;
2151  }
2152  }
2153  return cancelled;
2154  }
2155  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2156  return FALSE;
2157 }
2158 #endif
2159 
// Finish a split barrier: performs the release half that __kmp_barrier()
// deferred for the primary thread (dispatching on the same
// __kmp_barrier_release_pattern the release phase there uses). No-op for
// worker threads and for serialized teams.
void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  if (!team->t.t_serialized) {
    if (KMP_MASTER_GTID(gtid)) {
      switch (__kmp_barrier_release_pattern[bt]) {
      case bp_dist_bar: {
        __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hyper_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
                                    FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_hierarchical_bar: {
        __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
                                           FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      case bp_tree_bar: {
        KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
        __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
                                   FALSE USE_ITT_BUILD_ARG(NULL));
        break;
      }
      default: {
        __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
                                     FALSE USE_ITT_BUILD_ARG(NULL));
      }
      }
      if (__kmp_tasking_mode != tskm_immediate_exec) {
        __kmp_task_team_sync(this_thr, team);
      } // if
    }
  }
}
2204 
// Barrier executed by every team member at the end of a parallel region.
// This is a gather-only barrier: there is no release phase here; workers
// are presumably released from __kmp_fork_barrier() when the next region
// starts (the itt sync object is the one created at the fork barrier).
void __kmp_join_barrier(int gtid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);

  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);

  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team;
  int tid;
#ifdef KMP_DEBUG
  int team_id;
#endif /* KMP_DEBUG */
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#if USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
    // Get object created at fork_barrier
    itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
#endif
#endif /* USE_ITT_BUILD */
#if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
  int nproc = this_thr->th.th_team_nproc;
#endif
  KMP_MB();

  // Get current info
  team = this_thr->th.th_team;
  KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
  tid = __kmp_tid_from_gtid(gtid);
#ifdef KMP_DEBUG
  team_id = team->t.t_id;
  kmp_info_t *master_thread = this_thr->th.th_team_master;
  if (master_thread != team->t.t_threads[0]) {
    __kmp_print_structure();
  }
#endif /* KMP_DEBUG */
  // Note: master_thread only exists under KMP_DEBUG; KMP_DEBUG_ASSERT is a
  // no-op in non-debug builds, so this line compiles away there.
  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
  KMP_MB();

  // Verify state
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
                gtid, team_id, tid));

#if OMPT_SUPPORT
  if (ompt_enabled.enabled) {
#if OMPT_OPTIONAL
    ompt_data_t *my_task_data;
    ompt_data_t *my_parallel_data;
    void *codeptr = NULL;
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team->t.ompt_team_info.master_return_address;
    my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    // Report a "teams" sync region instead of an implicit-parallel one when
    // this thread is part of a league (teams construct).
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
      sync_kind = ompt_sync_region_barrier_teams;
      ompt_state = ompt_state_wait_barrier_teams;
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
    }
    if (!KMP_MASTER_TID(ds_tid))
      this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
#endif
    this_thr->th.ompt_thread_info.state = ompt_state;
  }
#endif

  if (__kmp_tasking_mode == tskm_extra_barrier) {
    __kmp_tasking_barrier(team, this_thr, gtid);
    KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
                  gtid, team_id, tid));
  }
#ifdef KMP_DEBUG
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
                  "%p, th_task_team = %p\n",
                  __kmp_gtid_from_thread(this_thr), team_id,
                  team->t.t_task_team[this_thr->th.th_task_state],
                  this_thr->th.th_task_team));
    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
  }
#endif /* KMP_DEBUG */

  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
     access it when the team struct is not guaranteed to exist. Doing these
     loads causes a cache miss slows down EPCC parallel by 2x. As a workaround,
     we do not perform the copy if blocktime=infinite, since the values are not
     used by __kmp_wait_template() in that case. */
  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
    this_thr->th.th_team_bt_intervals =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
    this_thr->th.th_team_bt_set =
        team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
    this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
  }

#if USE_ITT_BUILD
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
    __kmp_itt_barrier_starting(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

  // Gather phase, dispatched on the configured fork/join gather pattern.
  // No reduction is performed at a join barrier (reduce argument is NULL).
  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hyper_bar: {
    __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                               NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                      NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                              NULL USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
                                NULL USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

  /* From this point on, the team data structure may be deallocated at any time
     by the primary thread - it is unsafe to reference it in any of the worker
     threads. Any per-team data items that need to be referenced before the
     end of the barrier should be moved to the kmp_task_team_t structs. */
  if (KMP_MASTER_TID(tid)) {
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
    }
    if (__kmp_display_affinity) {
      KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
    }
#if KMP_STATS_ENABLED
    // Have primary thread flag the workers to indicate they are now waiting for
    // next parallel region, Also wake them up so they switch their timers to
    // idle.
    for (int i = 0; i < team->t.t_nproc; ++i) {
      kmp_info_t *team_thread = team->t.t_threads[i];
      if (team_thread == this_thr)
        continue;
      team_thread->th.th_stats->setIdleFlag();
      if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
          team_thread->th.th_sleep_loc != NULL)
        __kmp_null_resume_wrapper(team_thread);
    }
#endif
#if USE_ITT_BUILD
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
#endif /* USE_ITT_BUILD */

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Join barrier - report frame end
    if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
        __kmp_forkjoin_frames_mode &&
        (this_thr->th.th_teams_microtask == NULL || // either not in teams
         this_thr->th.th_teams_size.nteams == 1) && // or inside single team
        team->t.t_active_level == 1) {
      kmp_uint64 cur_time = __itt_get_timestamp();
      ident_t *loc = team->t.t_ident;
      kmp_info_t **other_threads = team->t.t_threads;
      switch (__kmp_forkjoin_frames_mode) {
      case 1:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        break;
      case 2:
        __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
                               loc, nproc);
        break;
      case 3:
        if (__itt_metadata_add_ptr) {
          // Initialize with primary thread's wait time
          kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
          // Set arrive time to zero to be able to check it in
          // __kmp_invoke_task(); the same is done inside the loop below
          this_thr->th.th_bar_arrive_time = 0;
          for (int i = 1; i < nproc; ++i) {
            delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
            other_threads[i]->th.th_bar_arrive_time = 0;
          }
          __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
                                       cur_time, delta, 0);
        }
        __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
                               loc, nproc);
        this_thr->th.th_frame_time = cur_time;
        break;
      }
    }
#endif /* USE_ITT_BUILD */
  }
#if USE_ITT_BUILD
  else {
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
      __kmp_itt_barrier_middle(gtid, itt_sync_obj);
  }
#endif /* USE_ITT_BUILD */

#if KMP_DEBUG
  if (KMP_MASTER_TID(tid)) {
    KA_TRACE(
        15,
        ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
         gtid, team_id, tid, nproc));
  }
#endif /* KMP_DEBUG */

  // TODO now, mark worker threads as done so they may be disbanded
  KMP_MB(); // Flush all pending memory write invalidates.
  KA_TRACE(10,
           ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));

}
2442 
2443 // TODO release worker threads' fork barriers as we are ready instead of all at
2444 // once
// Fork barrier: the release half of the fork/join barrier pair. Worker
// threads wait here until the primary thread (tid 0) releases them into the
// next parallel region. Before releasing, the primary thread sets up the task
// team, propagates blocktime ICV changes, and (in debug builds) verifies that
// all workers' forkjoin-barrier go-flags are in the initial state. After the
// release, every thread re-reads its (possibly new) team pointer and tid,
// pulls ICVs if KMP_BARRIER_ICV_PULL is configured, syncs its task team, and
// applies affinity/display-affinity settings.
//   gtid: global thread id of the calling thread
//   tid:  thread id within the team on entry (0 == primary thread); recomputed
//         after the release because the thread may have moved to a new team
void __kmp_fork_barrier(int gtid, int tid) {
  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  // Only the primary thread can trust th_team before the release below;
  // workers leave team NULL until it is re-read after the barrier.
  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
#if USE_ITT_BUILD
  void *itt_sync_obj = NULL;
#endif /* USE_ITT_BUILD */
#ifdef KMP_DEBUG
  if (team)
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
                  (team != NULL) ? team->t.t_id : -1, tid));
#endif
  // th_team pointer only valid for primary thread here
  if (KMP_MASTER_TID(tid)) {
#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      // Create itt barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
      __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

#ifdef KMP_DEBUG
    // Debug-only sanity pass over the workers: each worker's forkjoin-barrier
    // go flag (ignoring the sleep bit) must still be KMP_INIT_BARRIER_STATE,
    // and each worker must already point at this team.
    KMP_DEBUG_ASSERT(team);
    kmp_info_t **other_threads = team->t.t_threads;
    int i;

    // Verify state
    KMP_MB();

    for (i = 1; i < team->t.t_nproc; ++i) {
      KA_TRACE(500,
               ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
                "== %u.\n",
                gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
                team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
                other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
      KMP_DEBUG_ASSERT(
          (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
           ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
      KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
    }
#endif

    if (__kmp_tasking_mode != tskm_immediate_exec)
      __kmp_task_team_setup(this_thr, team);

    /* The primary thread may have changed its blocktime between join barrier
       and fork barrier. Copy the blocktime info to the thread, where
       __kmp_wait_template() can access it when the team struct is not
       guaranteed to exist. */
    // See note about the corresponding code in __kmp_join_barrier() being
    // performance-critical
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
#if KMP_USE_MONITOR
      this_thr->th.th_team_bt_intervals =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
      this_thr->th.th_team_bt_set =
          team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
#else
      this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
#endif
    }
  } // primary thread

  // Release the workers (or, for a worker, wait to be released) using the
  // barrier release pattern configured for the forkjoin barrier.
  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
  case bp_dist_bar: {
    __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(NULL));
    break;
  }
  case bp_hyper_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_hierarchical_bar: {
    __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                       TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  case bp_tree_bar: {
    KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
    __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                               TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
    break;
  }
  default: {
    // Linear release is the fallback when no other pattern is selected.
    __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
                                 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
  }
  }

#if OMPT_SUPPORT
  // If an OMPT tool saw this thread waiting in the implicit-parallel or teams
  // barrier, report the end of that sync region (and, for workers, the end of
  // the implicit task) now that the thread has been released.
  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
  if (ompt_enabled.enabled &&
      (ompt_state == ompt_state_wait_barrier_teams ||
       ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    // team may be NULL for workers here; fall back to the thread-local
    // task_data in that case.
    ompt_data_t *task_data = (team)
                                 ? OMPT_CUR_TASK_DATA(this_thr)
                                 : &(this_thr->th.ompt_thread_info.task_data);
    this_thr->th.ompt_thread_info.state = ompt_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
    ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
    if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
      sync_kind = ompt_sync_region_barrier_teams;
    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          sync_kind, ompt_scope_end, NULL, task_data, codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid,
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
    }
  }
#endif

  // Early exit for reaping threads releasing forkjoin barrier
  if (TCR_4(__kmp_global.g.g_done)) {
    this_thr->th.th_task_team = NULL;

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
      if (!KMP_MASTER_TID(tid)) {
        itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
        if (itt_sync_obj)
          __kmp_itt_barrier_finished(gtid, itt_sync_obj);
      }
    }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
    KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
    return;
  }

  /* We can now assume that a valid team structure has been allocated by the
     primary thread and propagated to all worker threads. The current thread,
     however, may not be part of the team, so we can't blindly assume that the
     team pointer is non-null. */
  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
  KMP_DEBUG_ASSERT(team != NULL);
  // tid may have changed if this thread joined a different team.
  tid = __kmp_tid_from_gtid(gtid);

#if KMP_BARRIER_ICV_PULL
  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
     __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
     implicit task has this data before this function is called. We cannot
     modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
     thread struct, because it is not always the case that the threads arrays
     have been allocated when __kmp_fork_call() is executed. */
  {
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
    if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
      // Copy the initial ICVs from the primary thread's thread struct to the
      // implicit task for this tid.
      KA_TRACE(10,
               ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
      __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
                               tid, FALSE);
      copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                &team->t.t_threads[0]
                     ->th.th_bar[bs_forkjoin_barrier]
                     .bb.th_fixed_icvs);
    }
  }
#endif // KMP_BARRIER_ICV_PULL

  // Bring this thread's task-team pointer in line with the (possibly new)
  // team before any tasking happens in the region.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_task_team_sync(this_thr, team);
  }

#if KMP_AFFINITY_SUPPORTED
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  if (proc_bind == proc_bind_intel) {
    // Call dynamic affinity settings
    if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
      __kmp_balanced_affinity(this_thr, team->t.t_nproc);
    }
  } else if (proc_bind != proc_bind_false) {
    // Explicit proc-bind policy: rebind only if the assigned place changed.
    if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
      KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
                     __kmp_gtid_from_thread(this_thr),
                     this_thr->th.th_current_place));
    } else {
      __kmp_affinity_bind_place(gtid);
    }
  }
#endif // KMP_AFFINITY_SUPPORTED
  // Perform the display affinity functionality
  if (__kmp_display_affinity) {
    if (team->t.t_display_affinity
#if KMP_AFFINITY_SUPPORTED
        || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
#endif
    ) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(gtid, NULL);
      this_thr->th.th_prev_num_threads = team->t.t_nproc;
      this_thr->th.th_prev_level = team->t.t_level;
    }
  }
  // Workers adopt the team's default allocator; KMP_CHECK_UPDATE writes only
  // on change.
  if (!KMP_MASTER_TID(tid))
    KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
    if (!KMP_MASTER_TID(tid)) {
      // Get correct barrier object
      itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
      __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
    } // (prepare called inside barrier_release)
  }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
                team->t.t_id, tid));
}
2674 
// Stage the new ICV values so every thread in the team can pick them up,
// using whichever propagation scheme this build is configured for:
//   - KMP_BARRIER_ICV_PULL: stash one copy in the primary thread's
//     forkjoin-barrier struct; workers pull it in __kmp_fork_barrier().
//   - KMP_BARRIER_ICV_PUSH: nothing to do here; the fork barrier pushes them.
//   - otherwise: copy the ICVs into every worker's implicit task now
//     (O(new_nproc) work on the calling thread).
//   team:      team whose threads receive the ICVs (threads array must exist)
//   new_nproc: number of threads in the team
//   new_icvs:  ICV values to propagate
//   loc:       source location used when (re)initializing implicit tasks
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc) {
  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);

  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);

/* Primary thread's copy of the ICVs was set up on the implicit taskdata in
   __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
   implicit task has this data before this function is called. */
#if KMP_BARRIER_ICV_PULL
  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
     remains untouched), where all of the worker threads can access them and
     make their own copies after the barrier. */
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  copy_icvs(
      &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
      new_icvs);
  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#elif KMP_BARRIER_ICV_PUSH
  // The ICVs will be propagated in the fork barrier, so nothing needs to be
  // done here.
  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
                team->t.t_threads[0], team));
#else
  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
  // time.
  // ngo_load/ngo_store_icvs/ngo_sync expand to non-temporal vector ops on
  // KMP_MIC builds and to plain copy_icvs/no-ops elsewhere (see macros at the
  // top of this file); the load-before-loop / sync-after-loop order matters
  // for the MIC variant.
  ngo_load(new_icvs);
  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
  // allocated at this point
  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
    // TODO: GEH - pass in better source location info since usually NULL here
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
    __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
    ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
    KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
                  f, team->t.t_threads[f], team));
  }
  ngo_sync();
#endif // KMP_BARRIER_ICV_PULL
}
// Definition: kmp.h:230 (doxygen cross-reference residue; not part of the source)