1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp_wait_release.h"
14 #include "kmp_barrier.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19 // for distributed barrier
20 #include "kmp_affinity.h"
21 
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
26 
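// On KMP_MIC, ICVs and go flags are broadcast with non-globally-ordered (NGO)
// streaming stores (_mm512_storenrngo_pd); ngo_sync() supplies the ordering
// fence. On every other target the macros below degrade to plain copies and
// no-ops.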
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
39 
40 void __kmp_print_structure(void); // Forward declaration
41 
42 // ---------------------------- Barrier Algorithms ----------------------------
43 // Distributed barrier
44 
45 // Compute how many threads to have polling each cache-line.
46 // We want to limit the number of writes to IDEAL_GO_RESOLUTION.
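// Layout computed here: threads_per_go threads poll a single go flag (each go
// flag sits on its own cache line); gos_per_group consecutive go flags form a
// group handled by a group leader; num_groups groups cover all num_gos go
// flags for the team of n threads.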
47 void distributedBarrier::computeVarsForN(size_t n) {
48  int nsockets = 1;
49  if (__kmp_topology) {
50  int socket_level = __kmp_topology->get_level(KMP_HW_SOCKET);
51  int core_level = __kmp_topology->get_level(KMP_HW_CORE);
52  int ncores_per_socket =
53  __kmp_topology->calculate_ratio(core_level, socket_level);
54  nsockets = __kmp_topology->get_count(socket_level);
55 
56  if (nsockets <= 0)
57  nsockets = 1;
58  if (ncores_per_socket <= 0)
59  ncores_per_socket = 1;
60 
61  threads_per_go = ncores_per_socket >> 1;
62  if (!fix_threads_per_go) {
63  // Minimize num_gos
64  if (threads_per_go > 4) {
65  if (KMP_OPTIMIZE_FOR_REDUCTIONS) {
66  threads_per_go = threads_per_go >> 1;
67  }
68  if (threads_per_go > 4 && nsockets == 1)
69  threads_per_go = threads_per_go >> 1;
70  }
71  }
72  if (threads_per_go == 0)
73  threads_per_go = 1;
74  fix_threads_per_go = true;
75  num_gos = n / threads_per_go;
76  if (n % threads_per_go)
77  num_gos++;
78  if (nsockets == 1 || num_gos == 1)
79  num_groups = 1;
80  else {
81  num_groups = num_gos / nsockets;
82  if (num_gos % nsockets)
83  num_groups++;
84  }
85  if (num_groups <= 0)
86  num_groups = 1;
87  gos_per_group = num_gos / num_groups;
88  if (num_gos % num_groups)
89  gos_per_group++;
90  threads_per_group = threads_per_go * gos_per_group;
91  } else {
92  num_gos = n / threads_per_go;
93  if (n % threads_per_go)
94  num_gos++;
95  if (num_gos == 1)
96  num_groups = 1;
97  else {
98  num_groups = num_gos / 2;
99  if (num_gos % 2)
100  num_groups++;
101  }
102  gos_per_group = num_gos / num_groups;
103  if (num_gos % num_groups)
104  gos_per_group++;
105  threads_per_group = threads_per_go * gos_per_group;
106  }
107 }
108 
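// Picks the smallest num_gos with IDEAL_CONTENTION * num_gos >= n (so at most
// IDEAL_CONTENTION threads share a go flag), grows threads_per_go until
// num_gos fits within MAX_GOS, then finalizes the grouping via
// computeVarsForN(), which may further adjust threads_per_go from the machine
// topology.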
109 void distributedBarrier::computeGo(size_t n) {
110  // Minimize num_gos
111  for (num_gos = 1;; num_gos++)
112  if (IDEAL_CONTENTION * num_gos >= n)
113  break;
114  threads_per_go = n / num_gos;
115  if (n % num_gos)
116  threads_per_go++;
117  while (num_gos > MAX_GOS) {
118  threads_per_go++;
119  num_gos = n / threads_per_go;
120  if (n % threads_per_go)
121  num_gos++;
122  }
123  computeVarsForN(n);
124 }
125 
126 // Resizes the barrier arrays when the new number of threads exceeds
127 // max_threads, the current capacity of all the arrays
128 void distributedBarrier::resize(size_t nthr) {
129  KMP_DEBUG_ASSERT(nthr > max_threads);
130 
131  // expand to requested size * 2
132  max_threads = nthr * 2;
133 
134  // allocate arrays to new max threads
135  for (int i = 0; i < MAX_ITERS; ++i) {
136  if (flags[i])
137  flags[i] = (flags_s *)KMP_INTERNAL_REALLOC(flags[i],
138  max_threads * sizeof(flags_s));
139  else
140  flags[i] = (flags_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(flags_s));
141  }
142 
143  if (go)
144  go = (go_s *)KMP_INTERNAL_REALLOC(go, max_threads * sizeof(go_s));
145  else
146  go = (go_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(go_s));
147 
148  if (iter)
149  iter = (iter_s *)KMP_INTERNAL_REALLOC(iter, max_threads * sizeof(iter_s));
150  else
151  iter = (iter_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(iter_s));
152 
153  if (sleep)
154  sleep =
155  (sleep_s *)KMP_INTERNAL_REALLOC(sleep, max_threads * sizeof(sleep_s));
156  else
157  sleep = (sleep_s *)KMP_INTERNAL_MALLOC(max_threads * sizeof(sleep_s));
158 }
159 
160 // Sets all the go flags that threads might be waiting on. When blocktime is
161 // not infinite, this should be followed by a wake-up call to each thread so
162 // that any sleeping waiters see the new value.
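// The value stored below (thread 0's current iteration plus MAX_ITERS)
// matches the next_go value that waiters of the current iteration compute in
// __kmp_dist_barrier_release, so any thread spinning on a go flag is let
// through.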
163 kmp_uint64 distributedBarrier::go_release() {
164  kmp_uint64 next_go = iter[0].iter + distributedBarrier::MAX_ITERS;
165  for (size_t j = 0; j < num_gos; j++) {
166  go[j].go.store(next_go);
167  }
168  return next_go;
169 }
170 
171 void distributedBarrier::go_reset() {
172  for (size_t j = 0; j < max_threads; ++j) {
173  for (size_t i = 0; i < distributedBarrier::MAX_ITERS; ++i) {
174  flags[i][j].stillNeed = 1;
175  }
176  go[j].go.store(0);
177  iter[j].iter = 0;
178  }
179 }
180 
181 // Initializes/re-initializes the distributed barrier for a particular number
182 // of threads, resizing the arrays first if more space is needed.
183 void distributedBarrier::init(size_t nthr) {
184  size_t old_max = max_threads;
185  if (nthr > max_threads) { // need more space in arrays
186  resize(nthr);
187  }
188 
189  for (size_t i = 0; i < max_threads; i++) {
190  for (size_t j = 0; j < distributedBarrier::MAX_ITERS; j++) {
191  flags[j][i].stillNeed = 1;
192  }
193  go[i].go.store(0);
194  iter[i].iter = 0;
195  if (i >= old_max)
196  sleep[i].sleep = false;
197  }
198 
199  // Recalculate num_gos, etc. based on new nthr
200  computeVarsForN(nthr);
201 
202  num_threads = nthr;
203 
204  if (team_icvs == NULL)
205  team_icvs = __kmp_allocate(sizeof(kmp_internal_control_t));
206 }
207 
208 void distributedBarrier::deallocate(distributedBarrier *db) {
209  for (int i = 0; i < MAX_ITERS; ++i) {
210  if (db->flags[i])
211  KMP_INTERNAL_FREE(db->flags[i]);
212  db->flags[i] = NULL;
213  }
214  if (db->go) {
215  KMP_INTERNAL_FREE(db->go);
216  db->go = NULL;
217  }
218  if (db->iter) {
219  KMP_INTERNAL_FREE(db->iter);
220  db->iter = NULL;
221  }
222  if (db->sleep) {
223  KMP_INTERNAL_FREE(db->sleep);
224  db->sleep = NULL;
225  }
226  if (db->team_icvs) {
227  __kmp_free(db->team_icvs);
228  db->team_icvs = NULL;
229  }
230  KMP_ALIGNED_FREE(db);
231 }
232 
233 // This function is used only when KMP_BLOCKTIME is not infinite.
234 // static
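// Resumes the threads at team indices [start, stop) in steps of inc so that
// threads sleeping on their go flags observe the values just written by the
// caller.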
235 void __kmp_dist_barrier_wakeup(enum barrier_type bt, kmp_team_t *team,
236  size_t start, size_t stop, size_t inc,
237  size_t tid) {
238  KMP_DEBUG_ASSERT(__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME);
239  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
240  return;
241 
242  kmp_info_t **other_threads = team->t.t_threads;
243  for (size_t thr = start; thr < stop; thr += inc) {
244  KMP_DEBUG_ASSERT(other_threads[thr]);
245  int gtid = other_threads[thr]->th.th_info.ds.ds_gtid;
246  // Wake up the worker regardless of whether it appears to be sleeping
247  __kmp_atomic_resume_64(gtid, (kmp_atomic_flag_64<> *)NULL);
248  }
249 }
250 
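// Gather phase of the distributed barrier, in two stages:
//  1) each group leader (tid % threads_per_group == 0) spins on the stillNeed
//     flags of the other threads in its group, executing tasks while waiting;
//  2) every group leader then spins on the stillNeed flags of all other group
//     leaders.
// Reductions, when requested, are done by the group leader over its group and
// by the primary thread over the group leaders.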
251 static void __kmp_dist_barrier_gather(
252  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
253  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
254  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_gather);
255  kmp_team_t *team;
256  distributedBarrier *b;
257  kmp_info_t **other_threads;
258  kmp_uint64 my_current_iter, my_next_iter;
259  kmp_uint32 nproc;
260  bool group_leader;
261 
262  team = this_thr->th.th_team;
263  nproc = this_thr->th.th_team_nproc;
264  other_threads = team->t.t_threads;
265  b = team->t.b;
266  my_current_iter = b->iter[tid].iter;
267  my_next_iter = (my_current_iter + 1) % distributedBarrier::MAX_ITERS;
268  group_leader = ((tid % b->threads_per_group) == 0);
269 
270  KA_TRACE(20,
271  ("__kmp_dist_barrier_gather: T#%d(%d:%d) enter; barrier type %d\n",
272  gtid, team->t.t_id, tid, bt));
273 
274 #if USE_ITT_BUILD && USE_ITT_NOTIFY
275  // Barrier imbalance - save arrive time to the thread
276  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
277  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
278  __itt_get_timestamp();
279  }
280 #endif
281 
282  if (group_leader) {
283  // Start from the thread after the group leader
284  size_t group_start = tid + 1;
285  size_t group_end = tid + b->threads_per_group;
286  size_t threads_pending = 0;
287 
288  if (group_end > nproc)
289  group_end = nproc;
290  do { // wait for threads in my group
291  threads_pending = 0;
292  // Check all the flags every time to avoid branch misprediction
293  for (size_t thr = group_start; thr < group_end; thr++) {
294  // Each thread uses a different cache line
295  threads_pending += b->flags[my_current_iter][thr].stillNeed;
296  }
297  // Execute tasks here
298  if (__kmp_tasking_mode != tskm_immediate_exec) {
299  kmp_task_team_t *task_team = this_thr->th.th_task_team;
300  if (task_team != NULL) {
301  if (TCR_SYNC_4(task_team->tt.tt_active)) {
302  if (KMP_TASKING_ENABLED(task_team)) {
303  int tasks_completed = FALSE;
304  __kmp_atomic_execute_tasks_64(
305  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
306  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
307  } else
308  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
309  }
310  } else {
311  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
312  } // if
313  }
314  if (TCR_4(__kmp_global.g.g_done)) {
315  if (__kmp_global.g.g_abort)
316  __kmp_abort_thread();
317  break;
318  } else if (__kmp_tasking_mode != tskm_immediate_exec &&
319  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
320  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
321  }
322  } while (threads_pending > 0);
323 
324  if (reduce) { // Perform reduction if needed
325  OMPT_REDUCTION_DECL(this_thr, gtid);
326  OMPT_REDUCTION_BEGIN;
327  // Group leader reduces all threads in group
328  for (size_t thr = group_start; thr < group_end; thr++) {
329  (*reduce)(this_thr->th.th_local.reduce_data,
330  other_threads[thr]->th.th_local.reduce_data);
331  }
332  OMPT_REDUCTION_END;
333  }
334 
335  // Set flag for next iteration
336  b->flags[my_next_iter][tid].stillNeed = 1;
337  // Each thread uses a different cache line; reset stillNeed to 0 to
338  // indicate this thread has reached the barrier
339  b->flags[my_current_iter][tid].stillNeed = 0;
340 
341  do { // wait for all group leaders
342  threads_pending = 0;
343  for (size_t thr = 0; thr < nproc; thr += b->threads_per_group) {
344  threads_pending += b->flags[my_current_iter][thr].stillNeed;
345  }
346  // Execute tasks here
347  if (__kmp_tasking_mode != tskm_immediate_exec) {
348  kmp_task_team_t *task_team = this_thr->th.th_task_team;
349  if (task_team != NULL) {
350  if (TCR_SYNC_4(task_team->tt.tt_active)) {
351  if (KMP_TASKING_ENABLED(task_team)) {
352  int tasks_completed = FALSE;
353  __kmp_atomic_execute_tasks_64(
354  this_thr, gtid, (kmp_atomic_flag_64<> *)NULL, FALSE,
355  &tasks_completed USE_ITT_BUILD_ARG(itt_sync_obj), 0);
356  } else
357  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
358  }
359  } else {
360  this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
361  } // if
362  }
363  if (TCR_4(__kmp_global.g.g_done)) {
364  if (__kmp_global.g.g_abort)
365  __kmp_abort_thread();
366  break;
367  } else if (__kmp_tasking_mode != tskm_immediate_exec &&
368  this_thr->th.th_reap_state == KMP_SAFE_TO_REAP) {
369  this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
370  }
371  } while (threads_pending > 0);
372 
373  if (reduce) { // Perform reduction if needed
374  if (KMP_MASTER_TID(tid)) { // Master reduces over group leaders
375  OMPT_REDUCTION_DECL(this_thr, gtid);
376  OMPT_REDUCTION_BEGIN;
377  for (size_t thr = b->threads_per_group; thr < nproc;
378  thr += b->threads_per_group) {
379  (*reduce)(this_thr->th.th_local.reduce_data,
380  other_threads[thr]->th.th_local.reduce_data);
381  }
382  OMPT_REDUCTION_END;
383  }
384  }
385  } else {
386  // Set flag for next iteration
387  b->flags[my_next_iter][tid].stillNeed = 1;
388  // Each thread uses a different cache line; reset stillNeed to 0 to
389  // indicate this thread has reached the barrier
390  b->flags[my_current_iter][tid].stillNeed = 0;
391  }
392 
393  KMP_MFENCE();
394 
395  KA_TRACE(20,
396  ("__kmp_dist_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
397  gtid, team->t.t_id, tid, bt));
398 }
399 
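// Release phase of the distributed barrier. The primary thread bumps the go
// flags of all group leaders and then of its own group; each group leader, in
// turn, bumps the remaining go flags of its group. Workers that are not
// currently part of a team park on their th_used_in_team flag instead of a go
// flag until they are claimed again.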
400 static void __kmp_dist_barrier_release(
401  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
402  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
403  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_dist_release);
404  kmp_team_t *team;
405  distributedBarrier *b;
406  kmp_bstate_t *thr_bar;
407  kmp_uint64 my_current_iter, next_go;
408  size_t my_go_index;
409  bool group_leader;
410 
411  KA_TRACE(20, ("__kmp_dist_barrier_release: T#%d(%d) enter; barrier type %d\n",
412  gtid, tid, bt));
413 
414  thr_bar = &this_thr->th.th_bar[bt].bb;
415 
416  if (!KMP_MASTER_TID(tid)) {
417  // workers and non-master group leaders need to check their presence in team
418  do {
419  if (this_thr->th.th_used_in_team.load() != 1 &&
420  this_thr->th.th_used_in_team.load() != 3) {
421  // Thread is not in use in a team. Wait on location in tid's thread
422  // struct. The 0 value tells anyone looking that this thread is spinning
423  // or sleeping until this location becomes 3 again; 3 is the transition
424  // state to get to 1 which is waiting on go and being in the team
425  kmp_flag_32<false, false> my_flag(&(this_thr->th.th_used_in_team), 3);
426  if (KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 2,
427  0) ||
428  this_thr->th.th_used_in_team.load() == 0) {
429  my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
430  }
431 #if USE_ITT_BUILD && USE_ITT_NOTIFY
432  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
433  // In fork barrier where we could not get the object reliably
434  itt_sync_obj =
435  __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
436  // Cancel wait on previous parallel region...
437  __kmp_itt_task_starting(itt_sync_obj);
438 
439  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
440  return;
441 
442  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
443  if (itt_sync_obj != NULL)
444  // Call prepare as early as possible for "new" barrier
445  __kmp_itt_task_finished(itt_sync_obj);
446  } else
447 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
448  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
449  return;
450  }
451  if (this_thr->th.th_used_in_team.load() != 1 &&
452  this_thr->th.th_used_in_team.load() != 3) // spurious wake-up?
453  continue;
454  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
455  return;
456 
457  // At this point, the thread thinks it is in use in a team, or in
458  // transition to be used in a team, but it might have reached this barrier
459  // before it was marked unused by the team. Unused threads are awoken and
460  // shifted to wait on local thread struct elsewhere. It also might reach
461  // this point by being picked up for use by a different team. Either way,
462  // we need to update the tid.
463  tid = __kmp_tid_from_gtid(gtid);
464  team = this_thr->th.th_team;
465  KMP_DEBUG_ASSERT(tid >= 0);
466  KMP_DEBUG_ASSERT(team);
467  b = team->t.b;
468  my_current_iter = b->iter[tid].iter;
469  next_go = my_current_iter + distributedBarrier::MAX_ITERS;
470  my_go_index = tid / b->threads_per_go;
471  if (this_thr->th.th_used_in_team.load() == 3) {
472  (void)KMP_COMPARE_AND_STORE_ACQ32(&(this_thr->th.th_used_in_team), 3,
473  1);
474  }
475  // Check if go flag is set
476  if (b->go[my_go_index].go.load() != next_go) {
477  // Wait on go flag on team
478  kmp_atomic_flag_64<false, true> my_flag(
479  &(b->go[my_go_index].go), next_go, &(b->sleep[tid].sleep));
480  my_flag.wait(this_thr, true USE_ITT_BUILD_ARG(itt_sync_obj));
481  KMP_DEBUG_ASSERT(my_current_iter == b->iter[tid].iter ||
482  b->iter[tid].iter == 0);
483  KMP_DEBUG_ASSERT(b->sleep[tid].sleep == false);
484  }
485 
486  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
487  return;
488  // At this point, the thread's go location was set. This means the primary
489  // thread is safely in the barrier, and so this thread's data is
490  // up-to-date, but we should check again that this thread is really in
491  // use in the team, as it could have been woken up for the purpose of
492  // changing team size, or reaping threads at shutdown.
493  if (this_thr->th.th_used_in_team.load() == 1)
494  break;
495  } while (1);
496 
497  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
498  return;
499 
500  group_leader = ((tid % b->threads_per_group) == 0);
501  if (group_leader) {
502  // Tell all the threads in my group they can go!
503  for (size_t go_idx = my_go_index + 1;
504  go_idx < my_go_index + b->gos_per_group; go_idx++) {
505  b->go[go_idx].go.store(next_go);
506  }
507  // Fence added so that workers can see changes to go. sfence inadequate.
508  KMP_MFENCE();
509  }
510 
511 #if KMP_BARRIER_ICV_PUSH
512  if (propagate_icvs) { // copy ICVs to final dest
513  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
514  tid, FALSE);
515  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
516  (kmp_internal_control_t *)team->t.b->team_icvs);
517  copy_icvs(&thr_bar->th_fixed_icvs,
518  &team->t.t_implicit_task_taskdata[tid].td_icvs);
519  }
520 #endif
521  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && group_leader) {
522  // This thread is now awake and participating in the barrier;
523  // wake up the other threads in the group
524  size_t nproc = this_thr->th.th_team_nproc;
525  size_t group_end = tid + b->threads_per_group;
526  if (nproc < group_end)
527  group_end = nproc;
528  __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
529  }
530  } else { // Primary thread
531  team = this_thr->th.th_team;
532  b = team->t.b;
533  my_current_iter = b->iter[tid].iter;
534  next_go = my_current_iter + distributedBarrier::MAX_ITERS;
535 #if KMP_BARRIER_ICV_PUSH
536  if (propagate_icvs) {
537  // primary thread has ICVs in final destination; copy
538  copy_icvs(&thr_bar->th_fixed_icvs,
539  &team->t.t_implicit_task_taskdata[tid].td_icvs);
540  }
541 #endif
542  // Tell all the group leaders they can go!
543  for (size_t go_idx = 0; go_idx < b->num_gos; go_idx += b->gos_per_group) {
544  b->go[go_idx].go.store(next_go);
545  }
546 
547  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
548  // Wake-up the group leaders
549  size_t nproc = this_thr->th.th_team_nproc;
550  __kmp_dist_barrier_wakeup(bt, team, tid + b->threads_per_group, nproc,
551  b->threads_per_group, tid);
552  }
553 
554  // Tell all the threads in my group they can go!
555  for (size_t go_idx = 1; go_idx < b->gos_per_group; go_idx++) {
556  b->go[go_idx].go.store(next_go);
557  }
558 
559  // Fence added so that workers can see changes to go. sfence inadequate.
560  KMP_MFENCE();
561 
562  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
563  // Wake-up the other threads in my group
564  size_t nproc = this_thr->th.th_team_nproc;
565  size_t group_end = tid + b->threads_per_group;
566  if (nproc < group_end)
567  group_end = nproc;
568  __kmp_dist_barrier_wakeup(bt, team, tid + 1, group_end, 1, tid);
569  }
570  }
571  // Update to next iteration
572  KMP_ASSERT(my_current_iter == b->iter[tid].iter);
573  b->iter[tid].iter = (b->iter[tid].iter + 1) % distributedBarrier::MAX_ITERS;
574 
575  KA_TRACE(
576  20, ("__kmp_dist_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
577  gtid, team->t.t_id, tid, bt));
578 }
579 
580 // Linear Barrier
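// Workers flag their arrival on their own b_arrived and later wait on their
// own b_go; the primary thread waits for (and optionally reduces with) each
// worker in turn during gather and releases each worker in turn during
// release, i.e. O(nproc) work on the primary thread and O(1) per worker.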
581 template <bool cancellable = false>
582 static bool __kmp_linear_barrier_gather_template(
583  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
584  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
585  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
586  kmp_team_t *team = this_thr->th.th_team;
587  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
588  kmp_info_t **other_threads = team->t.t_threads;
589 
590  KA_TRACE(
591  20,
592  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
593  gtid, team->t.t_id, tid, bt));
594  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
595 
596 #if USE_ITT_BUILD && USE_ITT_NOTIFY
597  // Barrier imbalance - save arrive time to the thread
598  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
599  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
600  __itt_get_timestamp();
601  }
602 #endif
603  // We now perform a linear reduction to signal that all of the threads have
604  // arrived.
605  if (!KMP_MASTER_TID(tid)) {
606  KA_TRACE(20,
607  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
608  "arrived(%p): %llu => %llu\n",
609  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
610  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
611  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
612  // Mark arrival to primary thread
613  /* After performing this write, a worker thread may not assume that the team
614  is valid any more - it could be deallocated by the primary thread at any
615  time. */
616  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
617  flag.release();
618  } else {
619  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
620  int nproc = this_thr->th.th_team_nproc;
621  int i;
622  // Don't have to worry about sleep bit here or atomic since team setting
623  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
624 
625  // Collect all the worker team member threads.
626  for (i = 1; i < nproc; ++i) {
627 #if KMP_CACHE_MANAGE
628  // Prefetch next thread's arrived count
629  if (i + 1 < nproc)
630  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
631 #endif /* KMP_CACHE_MANAGE */
632  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
633  "arrived(%p) == %llu\n",
634  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
635  team->t.t_id, i,
636  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
637 
638  // Wait for worker thread to arrive
639  if (cancellable) {
640  kmp_flag_64<true, false> flag(
641  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
642  if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
643  return true;
644  } else {
645  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
646  new_state);
647  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
648  }
649 #if USE_ITT_BUILD && USE_ITT_NOTIFY
650  // Barrier imbalance - write min of the thread time and the other thread
651  // time to the thread.
652  if (__kmp_forkjoin_frames_mode == 2) {
653  this_thr->th.th_bar_min_time = KMP_MIN(
654  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
655  }
656 #endif
657  if (reduce) {
658  KA_TRACE(100,
659  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
660  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
661  team->t.t_id, i));
662  OMPT_REDUCTION_DECL(this_thr, gtid);
663  OMPT_REDUCTION_BEGIN;
664  (*reduce)(this_thr->th.th_local.reduce_data,
665  other_threads[i]->th.th_local.reduce_data);
666  OMPT_REDUCTION_END;
667  }
668  }
669  // Don't have to worry about sleep bit here or atomic since team setting
670  team_bar->b_arrived = new_state;
671  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
672  "arrived(%p) = %llu\n",
673  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
674  new_state));
675  }
676  KA_TRACE(
677  20,
678  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
679  gtid, team->t.t_id, tid, bt));
680  return false;
681 }
682 
683 template <bool cancellable = false>
684 static bool __kmp_linear_barrier_release_template(
685  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
686  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
687  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
688  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
689  kmp_team_t *team;
690 
691  if (KMP_MASTER_TID(tid)) {
692  unsigned int i;
693  kmp_uint32 nproc = this_thr->th.th_team_nproc;
694  kmp_info_t **other_threads;
695 
696  team = __kmp_threads[gtid]->th.th_team;
697  KMP_DEBUG_ASSERT(team != NULL);
698  other_threads = team->t.t_threads;
699 
700  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) primary enter for "
701  "barrier type %d\n",
702  gtid, team->t.t_id, tid, bt));
703 
704  if (nproc > 1) {
705 #if KMP_BARRIER_ICV_PUSH
706  {
707  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
708  if (propagate_icvs) {
709  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
710  for (i = 1; i < nproc; ++i) {
711  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
712  team, i, FALSE);
713  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
714  &team->t.t_implicit_task_taskdata[0].td_icvs);
715  }
716  ngo_sync();
717  }
718  }
719 #endif // KMP_BARRIER_ICV_PUSH
720 
721  // Now, release all of the worker threads
722  for (i = 1; i < nproc; ++i) {
723 #if KMP_CACHE_MANAGE
724  // Prefetch next thread's go flag
725  if (i + 1 < nproc)
726  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
727 #endif /* KMP_CACHE_MANAGE */
728  KA_TRACE(
729  20,
730  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
731  "go(%p): %u => %u\n",
732  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
733  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
734  other_threads[i]->th.th_bar[bt].bb.b_go,
735  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
736  kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
737  other_threads[i]);
738  flag.release();
739  }
740  }
741  } else { // Wait for the PRIMARY thread to release us
742  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
743  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
744  if (cancellable) {
745  kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
746  if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
747  return true;
748  } else {
749  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
750  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
751  }
752 #if USE_ITT_BUILD && USE_ITT_NOTIFY
753  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
754  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
755  // disabled)
756  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
757  // Cancel wait on previous parallel region...
758  __kmp_itt_task_starting(itt_sync_obj);
759 
760  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
761  return false;
762 
763  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
764  if (itt_sync_obj != NULL)
765  // Call prepare as early as possible for "new" barrier
766  __kmp_itt_task_finished(itt_sync_obj);
767  } else
768 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
769  // Early exit for reaping threads releasing forkjoin barrier
770  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
771  return false;
772 // The worker thread may now assume that the team is valid.
773 #ifdef KMP_DEBUG
774  tid = __kmp_tid_from_gtid(gtid);
775  team = __kmp_threads[gtid]->th.th_team;
776 #endif
777  KMP_DEBUG_ASSERT(team != NULL);
778  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
779  KA_TRACE(20,
780  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
781  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
782  KMP_MB(); // Flush all pending memory write invalidates.
783  }
784  KA_TRACE(
785  20,
786  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
787  gtid, team->t.t_id, tid, bt));
788  return false;
789 }
790 
791 static void __kmp_linear_barrier_gather(
792  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
793  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
794  __kmp_linear_barrier_gather_template<false>(
795  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
796 }
797 
798 static bool __kmp_linear_barrier_gather_cancellable(
799  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
800  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
801  return __kmp_linear_barrier_gather_template<true>(
802  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
803 }
804 
805 static void __kmp_linear_barrier_release(
806  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
807  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
808  __kmp_linear_barrier_release_template<false>(
809  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
810 }
811 
812 static bool __kmp_linear_barrier_release_cancellable(
813  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
814  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
815  return __kmp_linear_barrier_release_template<true>(
816  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
817 }
818 
819 // Tree barrier
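// The team forms a tree with fan-out branch_factor = 1 << branch_bits: the
// children of tid are tid * branch_factor + 1 .. tid * branch_factor +
// branch_factor, and the parent of tid is (tid - 1) >> branch_bits. Each
// parent gathers (and reduces with) its children before signalling its own
// parent; release walks the same tree from the root down.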
820 static void __kmp_tree_barrier_gather(
821  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
822  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
823  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
824  kmp_team_t *team = this_thr->th.th_team;
825  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
826  kmp_info_t **other_threads = team->t.t_threads;
827  kmp_uint32 nproc = this_thr->th.th_team_nproc;
828  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
829  kmp_uint32 branch_factor = 1 << branch_bits;
830  kmp_uint32 child;
831  kmp_uint32 child_tid;
832  kmp_uint64 new_state = 0;
833 
834  KA_TRACE(
835  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
836  gtid, team->t.t_id, tid, bt));
837  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
838 
839 #if USE_ITT_BUILD && USE_ITT_NOTIFY
840  // Barrier imbalance - save arrive time to the thread
841  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
842  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
843  __itt_get_timestamp();
844  }
845 #endif
846  // Perform tree gather to wait until all threads have arrived; reduce any
847  // required data as we go
848  child_tid = (tid << branch_bits) + 1;
849  if (child_tid < nproc) {
850  // Parent threads wait for all their children to arrive
851  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
852  child = 1;
853  do {
854  kmp_info_t *child_thr = other_threads[child_tid];
855  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
856 #if KMP_CACHE_MANAGE
857  // Prefetch next thread's arrived count
858  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
859  KMP_CACHE_PREFETCH(
860  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
861 #endif /* KMP_CACHE_MANAGE */
862  KA_TRACE(20,
863  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
864  "arrived(%p) == %llu\n",
865  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
866  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
867  // Wait for child to arrive
868  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
869  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
870 #if USE_ITT_BUILD && USE_ITT_NOTIFY
871  // Barrier imbalance - write min of the thread time and a child time to
872  // the thread.
873  if (__kmp_forkjoin_frames_mode == 2) {
874  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
875  child_thr->th.th_bar_min_time);
876  }
877 #endif
878  if (reduce) {
879  KA_TRACE(100,
880  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
881  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
882  team->t.t_id, child_tid));
883  OMPT_REDUCTION_DECL(this_thr, gtid);
884  OMPT_REDUCTION_BEGIN;
885  (*reduce)(this_thr->th.th_local.reduce_data,
886  child_thr->th.th_local.reduce_data);
887  OMPT_REDUCTION_END;
888  }
889  child++;
890  child_tid++;
891  } while (child <= branch_factor && child_tid < nproc);
892  }
893 
894  if (!KMP_MASTER_TID(tid)) { // Worker threads
895  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
896 
897  KA_TRACE(20,
898  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
899  "arrived(%p): %llu => %llu\n",
900  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
901  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
902  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
903 
904  // Mark arrival to parent thread
905  /* After performing this write, a worker thread may not assume that the team
906  is valid any more - it could be deallocated by the primary thread at any
907  time. */
908  kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
909  flag.release();
910  } else {
911  // Need to update the team arrived pointer if we are the primary thread
912  if (nproc > 1) // New value was already computed above
913  team->t.t_bar[bt].b_arrived = new_state;
914  else
915  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
916  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
917  "arrived(%p) = %llu\n",
918  gtid, team->t.t_id, tid, team->t.t_id,
919  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
920  }
921  KA_TRACE(20,
922  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
923  gtid, team->t.t_id, tid, bt));
924 }
925 
926 static void __kmp_tree_barrier_release(
927  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
928  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
929  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
930  kmp_team_t *team;
931  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
932  kmp_uint32 nproc;
933  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
934  kmp_uint32 branch_factor = 1 << branch_bits;
935  kmp_uint32 child;
936  kmp_uint32 child_tid;
937 
938  // Perform a tree release for all of the threads that have been gathered
939  if (!KMP_MASTER_TID(
940  tid)) { // Handle fork barrier workers who aren't part of a team yet
941  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
942  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
943  // Wait for parent thread to release us
944  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
945  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
946 #if USE_ITT_BUILD && USE_ITT_NOTIFY
947  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
948  // In fork barrier where we could not get the object reliably (or
949  // ITTNOTIFY is disabled)
950  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
951  // Cancel wait on previous parallel region...
952  __kmp_itt_task_starting(itt_sync_obj);
953 
954  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
955  return;
956 
957  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
958  if (itt_sync_obj != NULL)
959  // Call prepare as early as possible for "new" barrier
960  __kmp_itt_task_finished(itt_sync_obj);
961  } else
962 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
963  // Early exit for reaping threads releasing forkjoin barrier
964  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
965  return;
966 
967  // The worker thread may now assume that the team is valid.
968  team = __kmp_threads[gtid]->th.th_team;
969  KMP_DEBUG_ASSERT(team != NULL);
970  tid = __kmp_tid_from_gtid(gtid);
971 
972  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
973  KA_TRACE(20,
974  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
975  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
976  KMP_MB(); // Flush all pending memory write invalidates.
977  } else {
978  team = __kmp_threads[gtid]->th.th_team;
979  KMP_DEBUG_ASSERT(team != NULL);
980  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) primary enter for "
981  "barrier type %d\n",
982  gtid, team->t.t_id, tid, bt));
983  }
984  nproc = this_thr->th.th_team_nproc;
985  child_tid = (tid << branch_bits) + 1;
986 
987  if (child_tid < nproc) {
988  kmp_info_t **other_threads = team->t.t_threads;
989  child = 1;
990  // Parent threads release all their children
991  do {
992  kmp_info_t *child_thr = other_threads[child_tid];
993  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
994 #if KMP_CACHE_MANAGE
995  // Prefetch next thread's go count
996  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
997  KMP_CACHE_PREFETCH(
998  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
999 #endif /* KMP_CACHE_MANAGE */
1000 
1001 #if KMP_BARRIER_ICV_PUSH
1002  {
1003  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
1004  if (propagate_icvs) {
1005  __kmp_init_implicit_task(team->t.t_ident,
1006  team->t.t_threads[child_tid], team,
1007  child_tid, FALSE);
1008  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
1009  &team->t.t_implicit_task_taskdata[0].td_icvs);
1010  }
1011  }
1012 #endif // KMP_BARRIER_ICV_PUSH
1013  KA_TRACE(20,
1014  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1015  "go(%p): %u => %u\n",
1016  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1017  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1018  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1019  // Release child from barrier
1020  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1021  flag.release();
1022  child++;
1023  child_tid++;
1024  } while (child <= branch_factor && child_tid < nproc);
1025  }
1026  KA_TRACE(
1027  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1028  gtid, team->t.t_id, tid, bt));
1029 }
1030 
1031 // Hyper Barrier
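// Hypercube-embedded tree. At each level, threads are grouped into blocks of
// branch_factor subtrees; a thread whose index within its block is nonzero
// (((tid >> level) & (branch_factor - 1)) != 0) signals the lowest-numbered
// thread of the block and drops out, while that thread carries on to the next
// level. With KMP_REVERSE_HYPER_BAR (the default), release visits the levels
// in the opposite order of the gather.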
1032 static void __kmp_hyper_barrier_gather(
1033  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1034  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1035  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
1036  kmp_team_t *team = this_thr->th.th_team;
1037  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1038  kmp_info_t **other_threads = team->t.t_threads;
1039  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
1040  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
1041  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
1042  kmp_uint32 branch_factor = 1 << branch_bits;
1043  kmp_uint32 offset;
1044  kmp_uint32 level;
1045 
1046  KA_TRACE(
1047  20,
1048  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1049  gtid, team->t.t_id, tid, bt));
1050  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1051 
1052 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1053  // Barrier imbalance - save arrive time to the thread
1054  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1055  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
1056  __itt_get_timestamp();
1057  }
1058 #endif
1059  /* Perform a hypercube-embedded tree gather to wait until all of the threads
1060  have arrived, and reduce any required data as we go. */
1061  kmp_flag_64<> p_flag(&thr_bar->b_arrived);
1062  for (level = 0, offset = 1; offset < num_threads;
1063  level += branch_bits, offset <<= branch_bits) {
1064  kmp_uint32 child;
1065  kmp_uint32 child_tid;
1066 
1067  if (((tid >> level) & (branch_factor - 1)) != 0) {
1068  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
1069 
1070  KMP_MB(); // Synchronize parent and child threads.
1071  KA_TRACE(20,
1072  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1073  "arrived(%p): %llu => %llu\n",
1074  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
1075  team->t.t_id, parent_tid, &thr_bar->b_arrived,
1076  thr_bar->b_arrived,
1077  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1078  // Mark arrival to parent thread
1079  /* After performing this write (in the last iteration of the enclosing for
1080  loop), a worker thread may not assume that the team is valid any more
1081  - it could be deallocated by the primary thread at any time. */
1082  p_flag.set_waiter(other_threads[parent_tid]);
1083  p_flag.release();
1084  break;
1085  }
1086 
1087  // Parent threads wait for children to arrive
1088  if (new_state == KMP_BARRIER_UNUSED_STATE)
1089  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1090  for (child = 1, child_tid = tid + (1 << level);
1091  child < branch_factor && child_tid < num_threads;
1092  child++, child_tid += (1 << level)) {
1093  kmp_info_t *child_thr = other_threads[child_tid];
1094  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1095 #if KMP_CACHE_MANAGE
1096  kmp_uint32 next_child_tid = child_tid + (1 << level);
1097  // Prefetch next thread's arrived count
1098  if (child + 1 < branch_factor && next_child_tid < num_threads)
1099  KMP_CACHE_PREFETCH(
1100  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
1101 #endif /* KMP_CACHE_MANAGE */
1102  KA_TRACE(20,
1103  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1104  "arrived(%p) == %llu\n",
1105  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1106  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
1107  // Wait for child to arrive
1108  kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
1109  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1110  KMP_MB(); // Synchronize parent and child threads.
1111 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1112  // Barrier imbalance - write min of the thread time and a child time to
1113  // the thread.
1114  if (__kmp_forkjoin_frames_mode == 2) {
1115  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
1116  child_thr->th.th_bar_min_time);
1117  }
1118 #endif
1119  if (reduce) {
1120  KA_TRACE(100,
1121  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1122  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1123  team->t.t_id, child_tid));
1124  OMPT_REDUCTION_DECL(this_thr, gtid);
1125  OMPT_REDUCTION_BEGIN;
1126  (*reduce)(this_thr->th.th_local.reduce_data,
1127  child_thr->th.th_local.reduce_data);
1128  OMPT_REDUCTION_END;
1129  }
1130  }
1131  }
1132 
1133  if (KMP_MASTER_TID(tid)) {
1134  // Need to update the team arrived pointer if we are the primary thread
1135  if (new_state == KMP_BARRIER_UNUSED_STATE)
1136  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
1137  else
1138  team->t.t_bar[bt].b_arrived = new_state;
1139  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
1140  "arrived(%p) = %llu\n",
1141  gtid, team->t.t_id, tid, team->t.t_id,
1142  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1143  }
1144  KA_TRACE(
1145  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1146  gtid, team->t.t_id, tid, bt));
1147 }
1148 
1149 // The reverse versions seem to beat the forward versions overall
1150 #define KMP_REVERSE_HYPER_BAR
1151 static void __kmp_hyper_barrier_release(
1152  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1153  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1154  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
1155  kmp_team_t *team;
1156  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1157  kmp_info_t **other_threads;
1158  kmp_uint32 num_threads;
1159  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
1160  kmp_uint32 branch_factor = 1 << branch_bits;
1161  kmp_uint32 child;
1162  kmp_uint32 child_tid;
1163  kmp_uint32 offset;
1164  kmp_uint32 level;
1165 
1166  /* Perform a hypercube-embedded tree release for all of the threads that have
1167  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
1168  are released in the reverse order of the corresponding gather, otherwise
1169  threads are released in the same order. */
1170  if (KMP_MASTER_TID(tid)) { // primary thread
1171  team = __kmp_threads[gtid]->th.th_team;
1172  KMP_DEBUG_ASSERT(team != NULL);
1173  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) primary enter for "
1174  "barrier type %d\n",
1175  gtid, team->t.t_id, tid, bt));
1176 #if KMP_BARRIER_ICV_PUSH
1177  if (propagate_icvs) { // primary already has ICVs in final destination; copy
1178  copy_icvs(&thr_bar->th_fixed_icvs,
1179  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1180  }
1181 #endif
1182  } else { // Handle fork barrier workers who aren't part of a team yet
1183  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
1184  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
1185  // Wait for parent thread to release us
1186  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1187  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1188 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1189  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
1190  // In fork barrier where we could not get the object reliably
1191  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
1192  // Cancel wait on previous parallel region...
1193  __kmp_itt_task_starting(itt_sync_obj);
1194 
1195  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1196  return;
1197 
1198  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1199  if (itt_sync_obj != NULL)
1200  // Call prepare as early as possible for "new" barrier
1201  __kmp_itt_task_finished(itt_sync_obj);
1202  } else
1203 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1204  // Early exit for reaping threads releasing forkjoin barrier
1205  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1206  return;
1207 
1208  // The worker thread may now assume that the team is valid.
1209  team = __kmp_threads[gtid]->th.th_team;
1210  KMP_DEBUG_ASSERT(team != NULL);
1211  tid = __kmp_tid_from_gtid(gtid);
1212 
1213  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1214  KA_TRACE(20,
1215  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1216  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1217  KMP_MB(); // Flush all pending memory write invalidates.
1218  }
1219  num_threads = this_thr->th.th_team_nproc;
1220  other_threads = team->t.t_threads;
1221 
1222 #ifdef KMP_REVERSE_HYPER_BAR
1223  // Count up to correct level for parent
1224  for (level = 0, offset = 1;
1225  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
1226  level += branch_bits, offset <<= branch_bits)
1227  ;
1228 
1229  // Now go down from there
1230  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
1231  level -= branch_bits, offset >>= branch_bits)
1232 #else
1233  // Go down the tree, level by level
1234  for (level = 0, offset = 1; offset < num_threads;
1235  level += branch_bits, offset <<= branch_bits)
1236 #endif // KMP_REVERSE_HYPER_BAR
1237  {
1238 #ifdef KMP_REVERSE_HYPER_BAR
1239  /* Now go in reverse order through the children, highest to lowest.
1240  Initial setting of child is conservative here. */
1241  child = num_threads >> ((level == 0) ? level : level - 1);
1242  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
1243  child_tid = tid + (child << level);
1244  child >= 1; child--, child_tid -= (1 << level))
1245 #else
1246  if (((tid >> level) & (branch_factor - 1)) != 0)
1247  // No need to go lower than this, since this is the level parent would be
1248  // notified
1249  break;
1250  // Iterate through children on this level of the tree
1251  for (child = 1, child_tid = tid + (1 << level);
1252  child < branch_factor && child_tid < num_threads;
1253  child++, child_tid += (1 << level))
1254 #endif // KMP_REVERSE_HYPER_BAR
1255  {
1256  if (child_tid >= num_threads)
1257  continue; // Child doesn't exist so keep going
1258  else {
1259  kmp_info_t *child_thr = other_threads[child_tid];
1260  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1261 #if KMP_CACHE_MANAGE
1262  kmp_uint32 next_child_tid = child_tid - (1 << level);
1263 // Prefetch next thread's go count
1264 #ifdef KMP_REVERSE_HYPER_BAR
1265  if (child - 1 >= 1 && next_child_tid < num_threads)
1266 #else
1267  if (child + 1 < branch_factor && next_child_tid < num_threads)
1268 #endif // KMP_REVERSE_HYPER_BAR
1269  KMP_CACHE_PREFETCH(
1270  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
1271 #endif /* KMP_CACHE_MANAGE */
1272 
1273 #if KMP_BARRIER_ICV_PUSH
1274  if (propagate_icvs) // push my fixed ICVs to my child
1275  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1276 #endif // KMP_BARRIER_ICV_PUSH
1277 
1278  KA_TRACE(
1279  20,
1280  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
1281  "go(%p): %u => %u\n",
1282  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1283  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1284  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1285  // Release child from barrier
1286  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1287  flag.release();
1288  }
1289  }
1290  }
1291 #if KMP_BARRIER_ICV_PUSH
1292  if (propagate_icvs &&
1293  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
1294  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1295  FALSE);
1296  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1297  &thr_bar->th_fixed_icvs);
1298  }
1299 #endif
1300  KA_TRACE(
1301  20,
1302  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1303  gtid, team->t.t_id, tid, bt));
1304 }
1305 
1306 // Hierarchical Barrier
1307 
1308 // Initialize thread barrier data
1309 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
1310  Performs the minimum amount of initialization required based on how the team
1311  has changed. Returns true if leaf children will require both on-core and
1312  traditional wake-up mechanisms. For example, if the team size increases,
1313  threads already in the team will respond to on-core wakeup on their parent
1314  thread, but threads newly added to the team will only be listening on
1315  their local b_go. */
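// Besides the return value, this routine fills in thr_bar->my_level and
// thr_bar->parent_tid (the thread's position in the hierarchy),
// thr_bar->offset (the byte slot it occupies relative to its parent, counted
// from the high byte), and thr_bar->leaf_kids / leaf_state (how many direct
// leaf children this thread gathers and the byte mask they correspond to).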
1316 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
1317  kmp_bstate_t *thr_bar,
1318  kmp_uint32 nproc, int gtid,
1319  int tid, kmp_team_t *team) {
1320  // Checks to determine if (re-)initialization is needed
1321  bool uninitialized = thr_bar->team == NULL;
1322  bool team_changed = team != thr_bar->team;
1323  bool team_sz_changed = nproc != thr_bar->nproc;
1324  bool tid_changed = tid != thr_bar->old_tid;
1325  bool retval = false;
1326 
1327  if (uninitialized || team_sz_changed) {
1328  __kmp_get_hierarchy(nproc, thr_bar);
1329  }
1330 
1331  if (uninitialized || team_sz_changed || tid_changed) {
1332  thr_bar->my_level = thr_bar->depth - 1; // default for primary thread
1333  thr_bar->parent_tid = -1; // default for primary thread
1334  if (!KMP_MASTER_TID(tid)) {
1335  // if not primary thread, find parent thread in hierarchy
1336  kmp_uint32 d = 0;
1337  while (d < thr_bar->depth) { // find parent based on level of thread in
1338  // hierarchy, and note level
1339  kmp_uint32 rem;
1340  if (d == thr_bar->depth - 2) { // reached level right below the primary
1341  thr_bar->parent_tid = 0;
1342  thr_bar->my_level = d;
1343  break;
1344  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) != 0) {
1345  // TODO: can we make the above op faster?
1346  // thread is not a subtree root at next level, so this is max
1347  thr_bar->parent_tid = tid - rem;
1348  thr_bar->my_level = d;
1349  break;
1350  }
1351  ++d;
1352  }
1353  }
1354  __kmp_type_convert(7 - ((tid - thr_bar->parent_tid) /
1355  (thr_bar->skip_per_level[thr_bar->my_level])),
1356  &(thr_bar->offset));
1357  thr_bar->old_tid = tid;
1358  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1359  thr_bar->team = team;
1360  thr_bar->parent_bar =
1361  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1362  }
1363  if (uninitialized || team_changed || tid_changed) {
1364  thr_bar->team = team;
1365  thr_bar->parent_bar =
1366  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
1367  retval = true;
1368  }
1369  if (uninitialized || team_sz_changed || tid_changed) {
1370  thr_bar->nproc = nproc;
1371  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
1372  if (thr_bar->my_level == 0)
1373  thr_bar->leaf_kids = 0;
1374  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
1375  __kmp_type_convert(nproc - tid - 1, &(thr_bar->leaf_kids));
1376  thr_bar->leaf_state = 0;
1377  for (int i = 0; i < thr_bar->leaf_kids; ++i)
1378  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
1379  }
1380  return retval;
1381 }
1382 
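// Hierarchical (machine-hierarchy aware) gather. With infinite blocktime and
// the on-core barrier enabled, a parent collects all of its leaf children
// with a single flag wait on its own b_arrived: it waits until every
// leaf_state byte has been set and then clears those bytes. Non-leaf levels,
// and the finite-blocktime path, fall back to one b_arrived wait per child,
// as in the other algorithms.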
1383 static void __kmp_hierarchical_barrier_gather(
1384  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1385  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1386  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
1387  kmp_team_t *team = this_thr->th.th_team;
1388  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1389  kmp_uint32 nproc = this_thr->th.th_team_nproc;
1390  kmp_info_t **other_threads = team->t.t_threads;
1391  kmp_uint64 new_state = 0;
1392 
1393  int level = team->t.t_level;
1394  if (other_threads[0]
1395  ->th.th_teams_microtask) // are we inside the teams construct?
1396  if (this_thr->th.th_teams_size.nteams > 1)
1397  ++level; // level was not increased in teams construct for team_of_masters
1398  if (level == 1)
1399  thr_bar->use_oncore_barrier = 1;
1400  else
1401  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1402 
1403  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
1404  "barrier type %d\n",
1405  gtid, team->t.t_id, tid, bt));
1406  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
1407 
1408 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1409  // Barrier imbalance - save arrive time to the thread
1410  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
1411  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
1412  }
1413 #endif
1414 
1415  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
1416  team);
1417 
1418  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
1419  kmp_int32 child_tid;
1420  new_state =
1421  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1422  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1423  thr_bar->use_oncore_barrier) {
1424  if (thr_bar->leaf_kids) {
1425  // First, wait for leaf children to check-in on my b_arrived flag
1426  kmp_uint64 leaf_state =
1427  KMP_MASTER_TID(tid)
1428  ? thr_bar->b_arrived | thr_bar->leaf_state
1429  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
1430  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
1431  "for leaf kids\n",
1432  gtid, team->t.t_id, tid));
1433  kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
1434  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1435  if (reduce) {
1436  OMPT_REDUCTION_DECL(this_thr, gtid);
1437  OMPT_REDUCTION_BEGIN;
1438  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
1439  ++child_tid) {
1440  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1441  "T#%d(%d:%d)\n",
1442  gtid, team->t.t_id, tid,
1443  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1444  child_tid));
1445  (*reduce)(this_thr->th.th_local.reduce_data,
1446  other_threads[child_tid]->th.th_local.reduce_data);
1447  }
1448  OMPT_REDUCTION_END;
1449  }
1450  // clear leaf_state bits
1451  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
1452  }
1453  // Next, wait for higher level children on each child's b_arrived flag
1454  for (kmp_uint32 d = 1; d < thr_bar->my_level;
1455  ++d) { // gather lowest level threads first, but skip 0
1456  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1457  skip = thr_bar->skip_per_level[d];
1458  if (last > nproc)
1459  last = nproc;
1460  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1461  kmp_info_t *child_thr = other_threads[child_tid];
1462  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1463  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1464  "T#%d(%d:%d) "
1465  "arrived(%p) == %llu\n",
1466  gtid, team->t.t_id, tid,
1467  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1468  child_tid, &child_bar->b_arrived, new_state));
1469  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1470  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1471  if (reduce) {
1472  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1473  "T#%d(%d:%d)\n",
1474  gtid, team->t.t_id, tid,
1475  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1476  child_tid));
1477  (*reduce)(this_thr->th.th_local.reduce_data,
1478  child_thr->th.th_local.reduce_data);
1479  }
1480  }
1481  }
1482  } else { // Blocktime is not infinite
1483  for (kmp_uint32 d = 0; d < thr_bar->my_level;
1484  ++d) { // Gather lowest level threads first
1485  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
1486  skip = thr_bar->skip_per_level[d];
1487  if (last > nproc)
1488  last = nproc;
1489  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1490  kmp_info_t *child_thr = other_threads[child_tid];
1491  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1492  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
1493  "T#%d(%d:%d) "
1494  "arrived(%p) == %llu\n",
1495  gtid, team->t.t_id, tid,
1496  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1497  child_tid, &child_bar->b_arrived, new_state));
1498  kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
1499  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1500  if (reduce) {
1501  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
1502  "T#%d(%d:%d)\n",
1503  gtid, team->t.t_id, tid,
1504  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1505  child_tid));
1506  (*reduce)(this_thr->th.th_local.reduce_data,
1507  child_thr->th.th_local.reduce_data);
1508  }
1509  }
1510  }
1511  }
1512  }
1513  // All subordinates are gathered; now release parent if not primary thread
1514 
1515  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1516  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1517  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1518  gtid, team->t.t_id, tid,
1519  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1520  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1521  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1522  /* Mark arrival to parent: After performing this write, a worker thread may
1523  not assume that the team is valid any more - it could be deallocated by
1524  the primary thread at any time. */
1525  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1526  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1527  // flag; release it
1528  kmp_flag_64<> flag(&thr_bar->b_arrived,
1529  other_threads[thr_bar->parent_tid]);
1530  flag.release();
1531  } else {
1532  // Leaf does special release on "offset" bits of parent's b_arrived flag
1533  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1534  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived,
1535  thr_bar->offset + 1);
1536  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1537  flag.release();
1538  }
1539  } else { // Primary thread needs to update the team's b_arrived value
1540  team->t.t_bar[bt].b_arrived = new_state;
1541  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1542  "arrived(%p) = %llu\n",
1543  gtid, team->t.t_id, tid, team->t.t_id,
1544  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1545  }
1546  // Is the team access below unsafe or just technically invalid?
1547  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1548  "barrier type %d\n",
1549  gtid, team->t.t_id, tid, bt));
1550 }
1551 
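// Release phase of the hierarchical barrier. Workers first wait to be released
// by their parent, either on their own b_go flag or on their byte of the
// parent's b_go flag when the oncore optimization applies. After optionally
// picking up pushed ICVs, each non-leaf thread releases its own children: with
// infinite blocktime the primary does a flat sweep using NGO stores and leaf
// kids are released through leaf_state bits, otherwise children are released
// level by level from the top of the subtree down.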
1552 static void __kmp_hierarchical_barrier_release(
1553  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1554  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1555  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1556  kmp_team_t *team;
1557  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1558  kmp_uint32 nproc;
1559  bool team_change = false; // indicates on-core barrier shouldn't be used
1560 
1561  if (KMP_MASTER_TID(tid)) {
1562  team = __kmp_threads[gtid]->th.th_team;
1563  KMP_DEBUG_ASSERT(team != NULL);
1564  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) primary "
1565  "entered barrier type %d\n",
1566  gtid, team->t.t_id, tid, bt));
1567  } else { // Worker threads
1568  // Wait for parent thread to release me
1569  if (!thr_bar->use_oncore_barrier ||
1570  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1571  thr_bar->team == NULL) {
1572  // Use traditional method of waiting on my own b_go flag
1573  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1574  kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1575  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1576  TCW_8(thr_bar->b_go,
1577  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1578  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1579  // infinite, not nested
1580  // Wait on my "offset" bits on parent's b_go flag
1581  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1582  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1583  thr_bar->offset + 1, bt,
1584  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1585  flag.wait(this_thr, TRUE);
1586  if (thr_bar->wait_flag ==
1587  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1588  TCW_8(thr_bar->b_go,
1589  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1590  } else { // Reset my bits on parent's b_go flag
1591  (RCAST(volatile char *,
1592  &(thr_bar->parent_bar->b_go)))[thr_bar->offset + 1] = 0;
1593  }
1594  }
1595  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1596  // Early exit for reaping threads releasing forkjoin barrier
1597  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1598  return;
1599  // The worker thread may now assume that the team is valid.
1600  team = __kmp_threads[gtid]->th.th_team;
1601  KMP_DEBUG_ASSERT(team != NULL);
1602  tid = __kmp_tid_from_gtid(gtid);
1603 
1604  KA_TRACE(
1605  20,
1606  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1607  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1608  KMP_MB(); // Flush all pending memory write invalidates.
1609  }
1610 
1611  nproc = this_thr->th.th_team_nproc;
1612  int level = team->t.t_level;
1613  if (team->t.t_threads[0]
1614  ->th.th_teams_microtask) { // are we inside the teams construct?
1615  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1616  this_thr->th.th_teams_level == level)
1617  ++level; // level was not increased in teams construct for team_of_workers
1618  if (this_thr->th.th_teams_size.nteams > 1)
1619  ++level; // level was not increased in teams construct for team_of_masters
1620  }
1621  if (level == 1)
1622  thr_bar->use_oncore_barrier = 1;
1623  else
1624  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1625 
1626  // If the team size has increased, we still communicate with old leaves via
1627  // oncore barrier.
1628  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1629  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1630  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1631  tid, team);
1632  // But if the entire team changes, we won't use oncore barrier at all
1633  if (team_change)
1634  old_leaf_kids = 0;
1635 
1636 #if KMP_BARRIER_ICV_PUSH
1637  if (propagate_icvs) {
1638  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1639  FALSE);
1640  if (KMP_MASTER_TID(
1641  tid)) { // primary already has copy in final destination; copy
1642  copy_icvs(&thr_bar->th_fixed_icvs,
1643  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1644  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1645  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1646  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1647  // leaves (on-core children) pull parent's fixed ICVs directly to local
1648  // ICV store
1649  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1650  &thr_bar->parent_bar->th_fixed_icvs);
1651  // non-leaves will get ICVs piggybacked with b_go via NGO store
1652  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1653  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs so
1654  // children can access them
1655  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1656  else // leaves copy parent's fixed ICVs directly to local ICV store
1657  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1658  &thr_bar->parent_bar->th_fixed_icvs);
1659  }
1660  }
1661 #endif // KMP_BARRIER_ICV_PUSH
1662 
1663  // Now, release my children
1664  if (thr_bar->my_level) { // not a leaf
1665  kmp_int32 child_tid;
1666  kmp_uint32 last;
1667  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1668  thr_bar->use_oncore_barrier) {
1669  if (KMP_MASTER_TID(tid)) { // do a flat release
1670  // Set local b_go to bump children via NGO store of the cache line
1671  // containing ICVs and b_go.
1672  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1673  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1674  // the cache line
1675  ngo_load(&thr_bar->th_fixed_icvs);
1676  // This loops over all the threads skipping only the leaf nodes in the
1677  // hierarchy
1678  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1679  child_tid += thr_bar->skip_per_level[1]) {
1680  kmp_bstate_t *child_bar =
1681  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1682  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1683  "releasing T#%d(%d:%d)"
1684  " go(%p): %u => %u\n",
1685  gtid, team->t.t_id, tid,
1686  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1687  child_tid, &child_bar->b_go, child_bar->b_go,
1688  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1689  // Use ngo store (if available) to both store ICVs and release child
1690  // via child's b_go
1691  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1692  }
1693  ngo_sync();
1694  }
1695  TCW_8(thr_bar->b_go,
1696  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1697  // Now, release leaf children
1698  if (thr_bar->leaf_kids) { // if there are any
1699  // We test team_change on the off-chance that the level 1 team changed.
1700  if (team_change ||
1701  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1702  if (old_leaf_kids) { // release old leaf kids
1703  thr_bar->b_go |= old_leaf_state;
1704  }
1705  // Release new leaf kids
1706  last = tid + thr_bar->skip_per_level[1];
1707  if (last > nproc)
1708  last = nproc;
1709  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1710  ++child_tid) { // skip_per_level[0]=1
1711  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1712  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1713  KA_TRACE(
1714  20,
1715  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1716  " T#%d(%d:%d) go(%p): %u => %u\n",
1717  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1718  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1719  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1720  // Release child using child's b_go flag
1721  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1722  flag.release();
1723  }
1724  } else { // Release all children at once with leaf_state bits on my own
1725  // b_go flag
1726  thr_bar->b_go |= thr_bar->leaf_state;
1727  }
1728  }
1729  } else { // Blocktime is not infinite; do a simple hierarchical release
1730  for (int d = thr_bar->my_level - 1; d >= 0;
1731  --d) { // Release highest level threads first
1732  last = tid + thr_bar->skip_per_level[d + 1];
1733  kmp_uint32 skip = thr_bar->skip_per_level[d];
1734  if (last > nproc)
1735  last = nproc;
1736  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1737  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1738  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1739  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1740  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1741  gtid, team->t.t_id, tid,
1742  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1743  child_tid, &child_bar->b_go, child_bar->b_go,
1744  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1745  // Release child using child's b_go flag
1746  kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1747  flag.release();
1748  }
1749  }
1750  }
1751 #if KMP_BARRIER_ICV_PUSH
1752  if (propagate_icvs && !KMP_MASTER_TID(tid))
1753  // non-leaves copy ICVs from fixed ICVs to local dest
1754  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1755  &thr_bar->th_fixed_icvs);
1756 #endif // KMP_BARRIER_ICV_PUSH
1757  }
1758  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1759  "barrier type %d\n",
1760  gtid, team->t.t_id, tid, bt));
1761 }
1762 
1763 // End of Barrier Algorithms
1764 
1765 // Type traits for the cancellable template parameter:
1766 // if cancellable is true, is_cancellable behaves as a normal boolean variable;
1767 // if cancellable is false, is_cancellable is a compile-time constant false.
1768 template <bool cancellable> struct is_cancellable {};
1769 template <> struct is_cancellable<true> {
1770  bool value;
1771  is_cancellable() : value(false) {}
1772  is_cancellable(bool b) : value(b) {}
1773  is_cancellable &operator=(bool b) {
1774  value = b;
1775  return *this;
1776  }
1777  operator bool() const { return value; }
1778 };
1779 template <> struct is_cancellable<false> {
1780  is_cancellable &operator=(bool b) { return *this; }
1781  constexpr operator bool() const { return false; }
1782 };
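// With cancellable == false the wrapper is an empty type whose boolean
// conversion is a constexpr false, so checks such as "if (cancelled)" below
// can be folded away by the compiler; with cancellable == true it behaves
// like a plain bool. For example:
//   is_cancellable<false> c; c = true;  // assignment is a no-op, still false
//   is_cancellable<true>  d; d = true;  // ordinary boolean, now true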
1783 
1784 // Internal function to do a barrier.
1785 /* If is_split is true, do a split barrier; otherwise, do a plain barrier.
1786  If reduce is non-NULL, the reduction is performed during the gather
1787  phase; otherwise, no reduction is done.
1788  When cancellable = false,
1789  returns 0 if primary thread, 1 if worker thread.
1790  When cancellable = true,
1791  returns 0 if not cancelled, 1 if cancelled. */
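// The template body follows a gather/release structure: every thread runs the
// configured gather pattern, the primary thread then drains the task team,
// resets worksharing cancellation requests and emits ITT frames if enabled,
// and finally the configured release pattern runs (the primary thread skips
// it for the first half of a split barrier, which is completed later in
// __kmp_end_split_barrier).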
1792 template <bool cancellable = false>
1793 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1794  size_t reduce_size, void *reduce_data,
1795  void (*reduce)(void *, void *)) {
1796  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1797  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1798  int tid = __kmp_tid_from_gtid(gtid);
1799  kmp_info_t *this_thr = __kmp_threads[gtid];
1800  kmp_team_t *team = this_thr->th.th_team;
1801  int status = 0;
1802  is_cancellable<cancellable> cancelled;
1803 #if OMPT_SUPPORT && OMPT_OPTIONAL
1804  ompt_data_t *my_task_data;
1805  ompt_data_t *my_parallel_data;
1806  void *return_address;
1807  ompt_sync_region_t barrier_kind;
1808 #endif
1809 
1810  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1811  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1812 
1813 #if OMPT_SUPPORT
1814  if (ompt_enabled.enabled) {
1815 #if OMPT_OPTIONAL
1816  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1817  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1818  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1819  barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1820  if (ompt_enabled.ompt_callback_sync_region) {
1821  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1822  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1823  return_address);
1824  }
1825  if (ompt_enabled.ompt_callback_sync_region_wait) {
1826  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1827  barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1828  return_address);
1829  }
1830 #endif
1831  // It is OK to report the barrier state after the barrier begin callback.
1832  // According to the OMPT specification, a compliant implementation may
1833  // even delay reporting this state until the barrier begins to wait.
1834  auto *ompt_thr_info = &this_thr->th.ompt_thread_info;
1835  switch (barrier_kind) {
1836  case ompt_sync_region_barrier_explicit:
1837  ompt_thr_info->state = ompt_state_wait_barrier_explicit;
1838  break;
1839  case ompt_sync_region_barrier_implicit_workshare:
1840  ompt_thr_info->state = ompt_state_wait_barrier_implicit_workshare;
1841  break;
1842  case ompt_sync_region_barrier_implicit_parallel:
1843  ompt_thr_info->state = ompt_state_wait_barrier_implicit_parallel;
1844  break;
1845  case ompt_sync_region_barrier_teams:
1846  ompt_thr_info->state = ompt_state_wait_barrier_teams;
1847  break;
1848  case ompt_sync_region_barrier_implementation:
1849  [[fallthrough]];
1850  default:
1851  ompt_thr_info->state = ompt_state_wait_barrier_implementation;
1852  }
1853  }
1854 #endif
1855 
1856  if (!team->t.t_serialized) {
1857 #if USE_ITT_BUILD
1858  // This value will be used in itt notify events below.
1859  void *itt_sync_obj = NULL;
1860 #if USE_ITT_NOTIFY
1861  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1862  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1863 #endif
1864 #endif /* USE_ITT_BUILD */
1865  if (__kmp_tasking_mode == tskm_extra_barrier) {
1866  __kmp_tasking_barrier(team, this_thr, gtid);
1867  KA_TRACE(15,
1868  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1869  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1870  }
1871 
1872  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1873  access it when the team struct is not guaranteed to exist. */
1874  // See note about the corresponding code in __kmp_join_barrier() being
1875  // performance-critical.
1876  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1877 #if KMP_USE_MONITOR
1878  this_thr->th.th_team_bt_intervals =
1879  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1880  this_thr->th.th_team_bt_set =
1881  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1882 #else
1883  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1884 #endif
1885  }
1886 
1887 #if USE_ITT_BUILD
1888  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1889  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1890 #endif /* USE_ITT_BUILD */
1891 #if USE_DEBUGGER
1892  // Let the debugger know: the thread arrived at the barrier and is waiting.
1893  if (KMP_MASTER_TID(tid)) { // Primary thread counter stored in team struct
1894  team->t.t_bar[bt].b_master_arrived += 1;
1895  } else {
1896  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1897  } // if
1898 #endif /* USE_DEBUGGER */
1899  if (reduce != NULL) {
1900  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1901  this_thr->th.th_local.reduce_data = reduce_data;
1902  }
1903 
1904  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1905  __kmp_task_team_setup(this_thr, team);
1906 
1907  if (cancellable) {
1908  cancelled = __kmp_linear_barrier_gather_cancellable(
1909  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1910  } else {
1911  switch (__kmp_barrier_gather_pattern[bt]) {
1912  case bp_dist_bar: {
1913  __kmp_dist_barrier_gather(bt, this_thr, gtid, tid,
1914  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1915  break;
1916  }
1917  case bp_hyper_bar: {
1918  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1919  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1920  break;
1921  }
1922  case bp_hierarchical_bar: {
1923  __kmp_hierarchical_barrier_gather(
1924  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1925  break;
1926  }
1927  case bp_tree_bar: {
1928  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1929  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1930  break;
1931  }
1932  default: {
1933  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1934  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1935  }
1936  }
1937  }
1938 
1939  KMP_MB();
1940 
1941  if (KMP_MASTER_TID(tid)) {
1942  status = 0;
1943  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1944  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1945  }
1946 #if USE_DEBUGGER
1947  // Let the debugger know: all threads have arrived and are starting to
1948  // leave the barrier.
1949  team->t.t_bar[bt].b_team_arrived += 1;
1950 #endif
1951 
1952  if (__kmp_omp_cancellation) {
1953  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1954  // Reset cancellation flag for worksharing constructs
1955  if (cancel_request == cancel_loop ||
1956  cancel_request == cancel_sections) {
1957  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1958  }
1959  }
1960 #if USE_ITT_BUILD
1961  /* TODO: In case of split reduction barrier, primary thread may send
1962  acquired event early, before the final summation into the shared
1963  variable is done (final summation can be a long operation for array
1964  reductions). */
1965  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1966  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1967 #endif /* USE_ITT_BUILD */
1968 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1969  // Barrier - report frame end (only if active_level == 1)
1970  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1971  __kmp_forkjoin_frames_mode &&
1972  (this_thr->th.th_teams_microtask == NULL || // either not in teams
1973  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1974  team->t.t_active_level == 1) {
1975  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1976  kmp_uint64 cur_time = __itt_get_timestamp();
1977  kmp_info_t **other_threads = team->t.t_threads;
1978  int nproc = this_thr->th.th_team_nproc;
1979  int i;
1980  switch (__kmp_forkjoin_frames_mode) {
1981  case 1:
1982  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1983  loc, nproc);
1984  this_thr->th.th_frame_time = cur_time;
1985  break;
1986  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1987  // be fixed)
1988  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1989  1, loc, nproc);
1990  break;
1991  case 3:
1992  if (__itt_metadata_add_ptr) {
1993  // Initialize with primary thread's wait time
1994  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1995  // Set arrive time to zero to be able to check it in
1996  // __kmp_invoke_task(); the same is done inside the loop below
1997  this_thr->th.th_bar_arrive_time = 0;
1998  for (i = 1; i < nproc; ++i) {
1999  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2000  other_threads[i]->th.th_bar_arrive_time = 0;
2001  }
2002  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2003  cur_time, delta,
2004  (kmp_uint64)(reduce != NULL));
2005  }
2006  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2007  loc, nproc);
2008  this_thr->th.th_frame_time = cur_time;
2009  break;
2010  }
2011  }
2012 #endif /* USE_ITT_BUILD */
2013  } else {
2014  status = 1;
2015 #if USE_ITT_BUILD
2016  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2017  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2018 #endif /* USE_ITT_BUILD */
2019  }
2020  if ((status == 1 || !is_split) && !cancelled) {
2021  if (cancellable) {
2022  cancelled = __kmp_linear_barrier_release_cancellable(
2023  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2024  } else {
2025  switch (__kmp_barrier_release_pattern[bt]) {
2026  case bp_dist_bar: {
2027  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2028  __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2029  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2030  break;
2031  }
2032  case bp_hyper_bar: {
2033  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2034  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2035  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2036  break;
2037  }
2038  case bp_hierarchical_bar: {
2039  __kmp_hierarchical_barrier_release(
2040  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2041  break;
2042  }
2043  case bp_tree_bar: {
2044  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2045  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2046  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2047  break;
2048  }
2049  default: {
2050  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2051  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
2052  }
2053  }
2054  }
2055  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
2056  __kmp_task_team_sync(this_thr, team);
2057  }
2058  }
2059 
2060 #if USE_ITT_BUILD
2061  /* GEH: TODO: Move this under if-condition above and also include in
2062  __kmp_end_split_barrier(). This will more accurately represent the actual
2063  release time of the threads for split barriers. */
2064  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2065  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2066 #endif /* USE_ITT_BUILD */
2067  } else { // Team is serialized.
2068  status = 0;
2069  if (__kmp_tasking_mode != tskm_immediate_exec) {
2070  if (this_thr->th.th_task_team != NULL) {
2071 #if USE_ITT_NOTIFY
2072  void *itt_sync_obj = NULL;
2073  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2074  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
2075  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2076  }
2077 #endif
2078 
2079  KMP_DEBUG_ASSERT(
2080  this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE ||
2081  this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
2082  TRUE);
2083  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2084  __kmp_task_team_setup(this_thr, team);
2085 
2086 #if USE_ITT_BUILD
2087  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2088  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2089 #endif /* USE_ITT_BUILD */
2090  }
2091  }
2092  }
2093  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2094  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
2095  __kmp_tid_from_gtid(gtid), status));
2096 
2097 #if OMPT_SUPPORT
2098  if (ompt_enabled.enabled) {
2099 #if OMPT_OPTIONAL
2100  if (ompt_enabled.ompt_callback_sync_region_wait) {
2101  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2102  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2103  return_address);
2104  }
2105  if (ompt_enabled.ompt_callback_sync_region) {
2106  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2107  barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
2108  return_address);
2109  }
2110 #endif
2111  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
2112  }
2113 #endif
2114 
2115  if (cancellable)
2116  return (int)cancelled;
2117  return status;
2118 }
2119 
2120 // Returns 0 if primary thread, 1 if worker thread.
2121 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
2122  size_t reduce_size, void *reduce_data,
2123  void (*reduce)(void *, void *)) {
2124  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
2125  reduce);
2126 }
2127 
2128 #if defined(KMP_GOMP_COMPAT)
2129 // Returns 1 if cancelled, 0 otherwise
2130 int __kmp_barrier_gomp_cancel(int gtid) {
2131  if (__kmp_omp_cancellation) {
2132  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
2133  0, NULL, NULL);
2134  if (cancelled) {
2135  int tid = __kmp_tid_from_gtid(gtid);
2136  kmp_info_t *this_thr = __kmp_threads[gtid];
2137  if (KMP_MASTER_TID(tid)) {
2138  // Primary thread does not need to revert anything
2139  } else {
2140  // Workers need to revert their private b_arrived flag
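  // (the cancelled barrier never completed, so the arrival bump made during
  // the gather is rolled back here so that the next barrier starts from the
  // expected flag value)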
2141  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
2142  KMP_BARRIER_STATE_BUMP;
2143  }
2144  }
2145  return cancelled;
2146  }
2147  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
2148  return FALSE;
2149 }
2150 #endif
2151 
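// Second half of a split barrier: the primary thread runs the configured
// release pattern to wake the workers, which are still waiting in the release
// phase they entered from __kmp_barrier with is_split == TRUE.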
2152 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
2153  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
2154  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
2155  KMP_DEBUG_ASSERT(bt < bs_last_barrier);
2156  int tid = __kmp_tid_from_gtid(gtid);
2157  kmp_info_t *this_thr = __kmp_threads[gtid];
2158  kmp_team_t *team = this_thr->th.th_team;
2159 
2160  if (!team->t.t_serialized) {
2161  if (KMP_MASTER_GTID(gtid)) {
2162  switch (__kmp_barrier_release_pattern[bt]) {
2163  case bp_dist_bar: {
2164  __kmp_dist_barrier_release(bt, this_thr, gtid, tid,
2165  FALSE USE_ITT_BUILD_ARG(NULL));
2166  break;
2167  }
2168  case bp_hyper_bar: {
2169  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2170  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
2171  FALSE USE_ITT_BUILD_ARG(NULL));
2172  break;
2173  }
2174  case bp_hierarchical_bar: {
2175  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
2176  FALSE USE_ITT_BUILD_ARG(NULL));
2177  break;
2178  }
2179  case bp_tree_bar: {
2180  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
2181  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
2182  FALSE USE_ITT_BUILD_ARG(NULL));
2183  break;
2184  }
2185  default: {
2186  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
2187  FALSE USE_ITT_BUILD_ARG(NULL));
2188  }
2189  }
2190  if (__kmp_tasking_mode != tskm_immediate_exec) {
2191  __kmp_task_team_sync(this_thr, team);
2192  } // if
2193  }
2194  }
2195 }
2196 
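// Join barrier: gathers the team at the end of a parallel region. Only the
// gather half runs here; the matching release happens in the fork barrier of
// the next parallel region (or when the threads are reaped), so no release
// pattern is invoked in this routine.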
2197 void __kmp_join_barrier(int gtid) {
2198  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
2199  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2200 
2201  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
2202 
2203  kmp_info_t *this_thr = __kmp_threads[gtid];
2204  kmp_team_t *team;
2205  int tid;
2206 #ifdef KMP_DEBUG
2207  int team_id;
2208 #endif /* KMP_DEBUG */
2209 #if USE_ITT_BUILD
2210  void *itt_sync_obj = NULL;
2211 #if USE_ITT_NOTIFY
2212  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine unless needed
2213  // Get object created at fork_barrier
2214  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2215 #endif
2216 #endif /* USE_ITT_BUILD */
2217 #if ((USE_ITT_BUILD && USE_ITT_NOTIFY) || defined KMP_DEBUG)
2218  int nproc = this_thr->th.th_team_nproc;
2219 #endif
2220  KMP_MB();
2221 
2222  // Get current info
2223  team = this_thr->th.th_team;
2224  KMP_DEBUG_ASSERT(nproc == team->t.t_nproc);
2225  tid = __kmp_tid_from_gtid(gtid);
2226 #ifdef KMP_DEBUG
2227  team_id = team->t.t_id;
2228  kmp_info_t *master_thread = this_thr->th.th_team_master;
2229  if (master_thread != team->t.t_threads[0]) {
2230  __kmp_print_structure();
2231  }
2232 #endif /* KMP_DEBUG */
2233  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
2234  KMP_MB();
2235 
2236  // Verify state
2237  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
2238  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
2239  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
2240  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
2241  gtid, team_id, tid));
2242 
2243 #if OMPT_SUPPORT
2244  if (ompt_enabled.enabled) {
2245 #if OMPT_OPTIONAL
2246  ompt_data_t *my_task_data;
2247  ompt_data_t *my_parallel_data;
2248  void *codeptr = NULL;
2249  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2250  if (KMP_MASTER_TID(ds_tid) &&
2251  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2252  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2253  codeptr = team->t.ompt_team_info.master_return_address;
2254  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
2255  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
2256  ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2257  ompt_state_t ompt_state = ompt_state_wait_barrier_implicit_parallel;
2258  if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league) {
2259  sync_kind = ompt_sync_region_barrier_teams;
2260  ompt_state = ompt_state_wait_barrier_teams;
2261  }
2262  if (ompt_enabled.ompt_callback_sync_region) {
2263  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2264  sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2265  }
2266  if (ompt_enabled.ompt_callback_sync_region_wait) {
2267  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2268  sync_kind, ompt_scope_begin, my_parallel_data, my_task_data, codeptr);
2269  }
2270  if (!KMP_MASTER_TID(ds_tid))
2271  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
2272 #endif
2273  this_thr->th.ompt_thread_info.state = ompt_state;
2274  }
2275 #endif
2276 
2277  if (__kmp_tasking_mode == tskm_extra_barrier) {
2278  __kmp_tasking_barrier(team, this_thr, gtid);
2279  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
2280  gtid, team_id, tid));
2281  }
2282 #ifdef KMP_DEBUG
2283  if (__kmp_tasking_mode != tskm_immediate_exec) {
2284  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
2285  "%p, th_task_team = %p\n",
2286  __kmp_gtid_from_thread(this_thr), team_id,
2287  team->t.t_task_team[this_thr->th.th_task_state],
2288  this_thr->th.th_task_team));
2289  KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
2290  }
2291 #endif /* KMP_DEBUG */
2292 
2293  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
2294  access it when the team struct is not guaranteed to exist. Doing these
2295  loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
2296  we do not perform the copy if blocktime=infinite, since the values are not
2297  used by __kmp_wait_template() in that case. */
2298  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2299 #if KMP_USE_MONITOR
2300  this_thr->th.th_team_bt_intervals =
2301  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2302  this_thr->th.th_team_bt_set =
2303  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2304 #else
2305  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2306 #endif
2307  }
2308 
2309 #if USE_ITT_BUILD
2310  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2311  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
2312 #endif /* USE_ITT_BUILD */
2313 
2314  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
2315  case bp_dist_bar: {
2316  __kmp_dist_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2317  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2318  break;
2319  }
2320  case bp_hyper_bar: {
2321  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2322  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2323  break;
2324  }
2325  case bp_hierarchical_bar: {
2326  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2327  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2328  break;
2329  }
2330  case bp_tree_bar: {
2331  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2332  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2333  break;
2334  }
2335  default: {
2336  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
2337  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
2338  }
2339  }
2340 
2341  /* From this point on, the team data structure may be deallocated at any time
2342  by the primary thread - it is unsafe to reference it in any of the worker
2343  threads. Any per-team data items that need to be referenced before the
2344  end of the barrier should be moved to the kmp_task_team_t structs. */
2345  if (KMP_MASTER_TID(tid)) {
2346  if (__kmp_tasking_mode != tskm_immediate_exec) {
2347  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
2348  }
2349  if (__kmp_display_affinity) {
2350  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
2351  }
2352 #if KMP_STATS_ENABLED
2353  // Have the primary thread flag the workers to indicate they are now waiting
2354  // for the next parallel region; also wake them up so they switch their
2355  // timers to idle.
2356  for (int i = 0; i < team->t.t_nproc; ++i) {
2357  kmp_info_t *team_thread = team->t.t_threads[i];
2358  if (team_thread == this_thr)
2359  continue;
2360  team_thread->th.th_stats->setIdleFlag();
2361  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2362  team_thread->th.th_sleep_loc != NULL)
2363  __kmp_null_resume_wrapper(team_thread);
2364  }
2365 #endif
2366 #if USE_ITT_BUILD
2367  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2368  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2369 #endif /* USE_ITT_BUILD */
2370 
2371 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2372  // Join barrier - report frame end
2373  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2374  __kmp_forkjoin_frames_mode &&
2375  (this_thr->th.th_teams_microtask == NULL || // either not in teams
2376  this_thr->th.th_teams_size.nteams == 1) && // or inside single team
2377  team->t.t_active_level == 1) {
2378  kmp_uint64 cur_time = __itt_get_timestamp();
2379  ident_t *loc = team->t.t_ident;
2380  kmp_info_t **other_threads = team->t.t_threads;
2381  switch (__kmp_forkjoin_frames_mode) {
2382  case 1:
2383  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2384  loc, nproc);
2385  break;
2386  case 2:
2387  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
2388  loc, nproc);
2389  break;
2390  case 3:
2391  if (__itt_metadata_add_ptr) {
2392  // Initialize with primary thread's wait time
2393  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
2394  // Set arrive time to zero to be able to check it in
2395  // __kmp_invoke_task(); the same is done inside the loop below
2396  this_thr->th.th_bar_arrive_time = 0;
2397  for (int i = 1; i < nproc; ++i) {
2398  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
2399  other_threads[i]->th.th_bar_arrive_time = 0;
2400  }
2401  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
2402  cur_time, delta, 0);
2403  }
2404  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
2405  loc, nproc);
2406  this_thr->th.th_frame_time = cur_time;
2407  break;
2408  }
2409  }
2410 #endif /* USE_ITT_BUILD */
2411  }
2412 #if USE_ITT_BUILD
2413  else {
2414  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
2415  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
2416  }
2417 #endif /* USE_ITT_BUILD */
2418 
2419 #if KMP_DEBUG
2420  if (KMP_MASTER_TID(tid)) {
2421  KA_TRACE(
2422  15,
2423  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
2424  gtid, team_id, tid, nproc));
2425  }
2426 #endif /* KMP_DEBUG */
2427 
2428  // TODO now, mark worker threads as done so they may be disbanded
2429  KMP_MB(); // Flush all pending memory write invalidates.
2430  KA_TRACE(10,
2431  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
2432 
2433 }
2434 
2435 // TODO release worker threads' fork barriers as we are ready instead of all at
2436 // once
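// Fork barrier: the release half of the fork/join barrier. The primary thread
// sets up the task team and releases the workers; a worker waits here between
// parallel regions and, once released, picks up its new team, ICVs, and
// affinity settings before running the next microtask.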
2437 void __kmp_fork_barrier(int gtid, int tid) {
2438  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
2439  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
2440  kmp_info_t *this_thr = __kmp_threads[gtid];
2441  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
2442 #if USE_ITT_BUILD
2443  void *itt_sync_obj = NULL;
2444 #endif /* USE_ITT_BUILD */
2445 #ifdef KMP_DEBUG
2446  if (team)
2447  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
2448  (team != NULL) ? team->t.t_id : -1, tid));
2449 #endif
2450  // th_team pointer only valid for primary thread here
2451  if (KMP_MASTER_TID(tid)) {
2452 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2453  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2454  // Create itt barrier object
2455  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
2456  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
2457  }
2458 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2459 
2460 #ifdef KMP_DEBUG
2461  KMP_DEBUG_ASSERT(team);
2462  kmp_info_t **other_threads = team->t.t_threads;
2463  int i;
2464 
2465  // Verify state
2466  KMP_MB();
2467 
2468  for (i = 1; i < team->t.t_nproc; ++i) {
2469  KA_TRACE(500,
2470  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
2471  "== %u.\n",
2472  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
2473  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
2474  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
2475  KMP_DEBUG_ASSERT(
2476  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
2477  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
2478  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
2479  }
2480 #endif
2481 
2482  if (__kmp_tasking_mode != tskm_immediate_exec)
2483  __kmp_task_team_setup(this_thr, team);
2484 
2485  /* The primary thread may have changed its blocktime between join barrier
2486  and fork barrier. Copy the blocktime info to the thread, where
2487  __kmp_wait_template() can access it when the team struct is not
2488  guaranteed to exist. */
2489  // See note about the corresponding code in __kmp_join_barrier() being
2490  // performance-critical
2491  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
2492 #if KMP_USE_MONITOR
2493  this_thr->th.th_team_bt_intervals =
2494  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2495  this_thr->th.th_team_bt_set =
2496  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2497 #else
2498  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
2499 #endif
2500  }
2501  } // primary thread
2502 
2503  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
2504  case bp_dist_bar: {
2505  __kmp_dist_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2506  TRUE USE_ITT_BUILD_ARG(NULL));
2507  break;
2508  }
2509  case bp_hyper_bar: {
2510  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2511  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2512  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2513  break;
2514  }
2515  case bp_hierarchical_bar: {
2516  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2517  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2518  break;
2519  }
2520  case bp_tree_bar: {
2521  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
2522  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2523  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2524  break;
2525  }
2526  default: {
2527  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2528  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2529  }
2530  }
2531 
2532 #if OMPT_SUPPORT
2533  ompt_state_t ompt_state = this_thr->th.ompt_thread_info.state;
2534  if (ompt_enabled.enabled &&
2535  (ompt_state == ompt_state_wait_barrier_teams ||
2536  ompt_state == ompt_state_wait_barrier_implicit_parallel)) {
2537  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2538  ompt_data_t *task_data = (team)
2539  ? OMPT_CUR_TASK_DATA(this_thr)
2540  : &(this_thr->th.ompt_thread_info.task_data);
2541  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2542 #if OMPT_OPTIONAL
2543  void *codeptr = NULL;
2544  if (KMP_MASTER_TID(ds_tid) &&
2545  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2546  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2547  codeptr = team ? team->t.ompt_team_info.master_return_address : NULL;
2548  ompt_sync_region_t sync_kind = ompt_sync_region_barrier_implicit_parallel;
2549  if (this_thr->th.ompt_thread_info.parallel_flags & ompt_parallel_league)
2550  sync_kind = ompt_sync_region_barrier_teams;
2551  if (ompt_enabled.ompt_callback_sync_region_wait) {
2552  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2553  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2554  }
2555  if (ompt_enabled.ompt_callback_sync_region) {
2556  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2557  sync_kind, ompt_scope_end, NULL, task_data, codeptr);
2558  }
2559 #endif
2560  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2561  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2562  ompt_scope_end, NULL, task_data, 0, ds_tid,
2563  ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2564  }
2565  }
2566 #endif
2567 
2568  // Early exit for reaping threads releasing forkjoin barrier
2569  if (TCR_4(__kmp_global.g.g_done)) {
2570  this_thr->th.th_task_team = NULL;
2571 
2572 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2573  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2574  if (!KMP_MASTER_TID(tid)) {
2575  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2576  if (itt_sync_obj)
2577  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2578  }
2579  }
2580 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2581  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2582  return;
2583  }
2584 
2585  /* We can now assume that a valid team structure has been allocated by the
2586  primary thread and propagated to all worker threads. The current thread,
2587  however, may not be part of the team, so we can't blindly assume that the
2588  team pointer is non-null. */
2589  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2590  KMP_DEBUG_ASSERT(team != NULL);
2591  tid = __kmp_tid_from_gtid(gtid);
2592 
2593 #if KMP_BARRIER_ICV_PULL
2594  /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2595  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2596  implicit task has this data before this function is called. We cannot
2597  modify __kmp_fork_call() to look at the fixed ICVs in the primary thread's
2598  thread struct, because it is not always the case that the threads arrays
2599  have been allocated when __kmp_fork_call() is executed. */
2600  {
2601  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2602  if (!KMP_MASTER_TID(tid)) { // primary thread already has ICVs
2603  // Copy the initial ICVs from the primary thread's thread struct to the
2604  // implicit task for this tid.
2605  KA_TRACE(10,
2606  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2607  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2608  tid, FALSE);
2609  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2610  &team->t.t_threads[0]
2611  ->th.th_bar[bs_forkjoin_barrier]
2612  .bb.th_fixed_icvs);
2613  }
2614  }
2615 #endif // KMP_BARRIER_ICV_PULL
2616 
2617  if (__kmp_tasking_mode != tskm_immediate_exec) {
2618  __kmp_task_team_sync(this_thr, team);
2619  }
2620 
2621 #if KMP_AFFINITY_SUPPORTED
2622  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2623  if (proc_bind == proc_bind_intel) {
2624  // Call dynamic affinity settings
2625  if (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed) {
2626  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2627  }
2628  } else if (proc_bind != proc_bind_false) {
2629  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2630  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2631  __kmp_gtid_from_thread(this_thr),
2632  this_thr->th.th_current_place));
2633  } else {
2634  __kmp_affinity_bind_place(gtid);
2635  }
2636  }
2637 #endif // KMP_AFFINITY_SUPPORTED
2638  // Perform the display affinity functionality
2639  if (__kmp_display_affinity) {
2640  if (team->t.t_display_affinity
2641 #if KMP_AFFINITY_SUPPORTED
2642  || (__kmp_affinity.type == affinity_balanced && team->t.t_size_changed)
2643 #endif
2644  ) {
2645  // NULL means use the affinity-format-var ICV
2646  __kmp_aux_display_affinity(gtid, NULL);
2647  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2648  this_thr->th.th_prev_level = team->t.t_level;
2649  }
2650  }
2651  if (!KMP_MASTER_TID(tid))
2652  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2653 
2654 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2655  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2656  if (!KMP_MASTER_TID(tid)) {
2657  // Get correct barrier object
2658  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2659  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2660  } // (prepare called inside barrier_release)
2661  }
2662 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2663  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2664  team->t.t_id, tid));
2665 }
2666 
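// Set up propagation of ICVs from the primary thread to the workers of a
// (re)formed team. The mechanism depends on the build configuration: PULL
// publishes the ICVs once in the primary thread's th_fixed_icvs for workers
// to copy during the fork barrier, PUSH sends them along with the barrier
// release, and otherwise they are copied linearly into each worker's implicit
// task right here.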
2667 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2668  kmp_internal_control_t *new_icvs, ident_t *loc) {
2669  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2670 
2671  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2672  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2673 
2674 /* Primary thread's copy of the ICVs was set up on the implicit taskdata in
2675  __kmp_reinitialize_team. __kmp_fork_call() assumes the primary thread's
2676  implicit task has this data before this function is called. */
2677 #if KMP_BARRIER_ICV_PULL
2678  /* Copy ICVs to primary thread's thread structure into th_fixed_icvs (which
2679  remains untouched), where all of the worker threads can access them and
2680  make their own copies after the barrier. */
2681  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2682  // allocated at this point
2683  copy_icvs(
2684  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2685  new_icvs);
2686  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2687  team->t.t_threads[0], team));
2688 #elif KMP_BARRIER_ICV_PUSH
2689  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2690  // done here.
2691  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2692  team->t.t_threads[0], team));
2693 #else
2694  // Copy the ICVs to each of the non-primary threads. This takes O(nthreads)
2695  // time.
2696  ngo_load(new_icvs);
2697  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2698  // allocated at this point
2699  for (int f = 1; f < new_nproc; ++f) { // Skip the primary thread
2700  // TODO: GEH - pass in better source location info since usually NULL here
2701  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2702  f, team->t.t_threads[f], team));
2703  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2704  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2705  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2706  f, team->t.t_threads[f], team));
2707  }
2708  ngo_sync();
2709 #endif // KMP_BARRIER_ICV_PULL
2710 }