LLVM OpenMP* Runtime Library
kmp_barrier.cpp
1 /*
2  * kmp_barrier.cpp
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #if OMPT_SUPPORT
19 #include "ompt-specific.h"
20 #endif
21 
22 #if KMP_MIC
23 #include <immintrin.h>
24 #define USE_NGO_STORES 1
25 #endif // KMP_MIC
26 
27 #include "tsan_annotations.h"
28 
29 #if KMP_MIC && USE_NGO_STORES
30 // ICV copying
31 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
32 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
34 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
35 #else
36 #define ngo_load(src) ((void)0)
37 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
38 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
39 #define ngo_sync() ((void)0)
40 #endif /* KMP_MIC && USE_NGO_STORES */
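
The ngo_* macros above give the barrier release paths one spelling for "broadcast a cache line of ICVs to many children": on KMP_MIC they map to 512-bit non-globally-ordered stores plus an explicit fence, elsewhere they fall back to copy_icvs/KMP_MEMCPY and no-ops. The stand-alone sketch below (not part of the runtime; icv_block_t and broadcast_icvs are illustrative names) shows the load-once/store-many shape that the fallback path reduces to.

#include <cstring>

// Stand-in for one 64-byte cache line of internal control variables.
struct alignas(64) icv_block_t { char bytes[64]; };

// Fallback-path shape of the broadcast: "ngo_load" is a no-op, every
// "ngo_store_icvs" is a plain copy, and "ngo_sync" is a no-op.
static void broadcast_icvs(icv_block_t *dst, int nproc, const icv_block_t *src) {
  // ngo_load(src): on KMP_MIC this would pull the line into a vector register.
  for (int i = 1; i < nproc; ++i) {
    // ngo_store_icvs(&dst[i], src): plain memcpy/copy_icvs on the fallback path.
    std::memcpy(&dst[i], src, sizeof(icv_block_t));
  }
  // ngo_sync(): only the MIC path needs a fence after the NGO stores.
}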
41 
42 void __kmp_print_structure(void); // Forward declaration
43 
44 // ---------------------------- Barrier Algorithms ----------------------------
45 
46 // Linear Barrier
47 template <bool cancellable = false>
48 static bool __kmp_linear_barrier_gather_template(
49  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
50  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
51  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
52  kmp_team_t *team = this_thr->th.th_team;
53  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
54  kmp_info_t **other_threads = team->t.t_threads;
55 
56  KA_TRACE(
57  20,
58  ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
59  gtid, team->t.t_id, tid, bt));
60  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
61 
62 #if USE_ITT_BUILD && USE_ITT_NOTIFY
63  // Barrier imbalance - save arrive time to the thread
64  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
65  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
66  __itt_get_timestamp();
67  }
68 #endif
69  // We now perform a linear reduction to signal that all of the threads have
70  // arrived.
71  if (!KMP_MASTER_TID(tid)) {
72  KA_TRACE(20,
73  ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
74  "arrived(%p): %llu => %llu\n",
75  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
76  team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
77  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
78  // Mark arrival to master thread
79  /* After performing this write, a worker thread may not assume that the team
80  is valid any more - it could be deallocated by the master thread at any
81  time. */
82  ANNOTATE_BARRIER_BEGIN(this_thr);
83  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
84  flag.release();
85  } else {
86  kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
87  int nproc = this_thr->th.th_team_nproc;
88  int i;
89  // Don't have to worry about sleep bit here or atomic since team setting
90  kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
91 
92  // Collect all the worker team member threads.
93  for (i = 1; i < nproc; ++i) {
94 #if KMP_CACHE_MANAGE
95  // Prefetch next thread's arrived count
96  if (i + 1 < nproc)
97  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
98 #endif /* KMP_CACHE_MANAGE */
99  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
100  "arrived(%p) == %llu\n",
101  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
102  team->t.t_id, i,
103  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
104 
105  // Wait for worker thread to arrive
106  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
107  new_state);
108  if (cancellable) {
109  bool cancelled = flag.wait_cancellable_nosleep(
110  this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
111  if (cancelled)
112  return true;
113  } else {
114  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
115  }
116  ANNOTATE_BARRIER_END(other_threads[i]);
117 #if USE_ITT_BUILD && USE_ITT_NOTIFY
118  // Barrier imbalance - write min of the thread time and the other thread
119  // time to the thread.
120  if (__kmp_forkjoin_frames_mode == 2) {
121  this_thr->th.th_bar_min_time = KMP_MIN(
122  this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
123  }
124 #endif
125  if (reduce) {
126  KA_TRACE(100,
127  ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
128  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
129  team->t.t_id, i));
130  ANNOTATE_REDUCE_AFTER(reduce);
131  (*reduce)(this_thr->th.th_local.reduce_data,
132  other_threads[i]->th.th_local.reduce_data);
133  ANNOTATE_REDUCE_BEFORE(reduce);
134  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
135  }
136  }
137  // Don't have to worry about sleep bit here or atomic since team setting
138  team_bar->b_arrived = new_state;
139  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
140  "arrived(%p) = %llu\n",
141  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
142  new_state));
143  }
144  KA_TRACE(
145  20,
146  ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
147  gtid, team->t.t_id, tid, bt));
148  return false;
149 }
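
All of the gather algorithms in this file count barrier "epochs" in 64-bit b_arrived flags instead of toggling a sense bit: each instance computes new_state = b_arrived + KMP_BARRIER_STATE_BUMP and waits for each partner's counter to reach that value (the low bits are reserved for sleep/status use). Below is a simplified stand-alone sketch of that counting scheme, with std::atomic in place of kmp_flag_64 and an assumed bump constant.

#include <atomic>
#include <cstdint>

// Assumed example bump; the real KMP_BARRIER_STATE_BUMP is defined in kmp.h
// and leaves the low flag bits untouched.
static constexpr std::uint64_t TOY_BUMP = 1u << 2;

struct toy_counter_t {
  std::atomic<std::uint64_t> b_arrived{0};
};

// Arriving thread: advance my own counter by one epoch (kmp_flag_64::release
// bumps the flag and wakes a sleeping waiter).
static void toy_arrive(toy_counter_t &me) {
  me.b_arrived.fetch_add(TOY_BUMP, std::memory_order_release);
}

// Gathering thread: wait until the partner has reached the target epoch
// (the runtime's flag.wait may spin, yield, or sleep depending on blocktime).
static void toy_wait_for(const toy_counter_t &partner, std::uint64_t new_state) {
  while (partner.b_arrived.load(std::memory_order_acquire) < new_state) {
  }
}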
150 
151 template <bool cancellable = false>
152 static bool __kmp_linear_barrier_release_template(
153  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
154  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
155  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
156  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
157  kmp_team_t *team;
158 
159  if (KMP_MASTER_TID(tid)) {
160  unsigned int i;
161  kmp_uint32 nproc = this_thr->th.th_team_nproc;
162  kmp_info_t **other_threads;
163 
164  team = __kmp_threads[gtid]->th.th_team;
165  KMP_DEBUG_ASSERT(team != NULL);
166  other_threads = team->t.t_threads;
167 
168  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
169  "barrier type %d\n",
170  gtid, team->t.t_id, tid, bt));
171 
172  if (nproc > 1) {
173 #if KMP_BARRIER_ICV_PUSH
174  {
175  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
176  if (propagate_icvs) {
177  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
178  for (i = 1; i < nproc; ++i) {
179  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
180  team, i, FALSE);
181  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
182  &team->t.t_implicit_task_taskdata[0].td_icvs);
183  }
184  ngo_sync();
185  }
186  }
187 #endif // KMP_BARRIER_ICV_PUSH
188 
189  // Now, release all of the worker threads
190  for (i = 1; i < nproc; ++i) {
191 #if KMP_CACHE_MANAGE
192  // Prefetch next thread's go flag
193  if (i + 1 < nproc)
194  KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
195 #endif /* KMP_CACHE_MANAGE */
196  KA_TRACE(
197  20,
198  ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
199  "go(%p): %u => %u\n",
200  gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
201  team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
202  other_threads[i]->th.th_bar[bt].bb.b_go,
203  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
204  ANNOTATE_BARRIER_BEGIN(other_threads[i]);
205  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
206  other_threads[i]);
207  flag.release();
208  }
209  }
210  } else { // Wait for the MASTER thread to release us
211  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
212  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
213  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
214  if (cancellable) {
215  bool cancelled = flag.wait_cancellable_nosleep(
216  this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
217  if (cancelled) {
218  return true;
219  }
220  } else {
221  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
222  }
223  ANNOTATE_BARRIER_END(this_thr);
224 #if USE_ITT_BUILD && USE_ITT_NOTIFY
225  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
226  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
227  // disabled)
228  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
229  // Cancel wait on previous parallel region...
230  __kmp_itt_task_starting(itt_sync_obj);
231 
232  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
233  return false;
234 
235  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
236  if (itt_sync_obj != NULL)
237  // Call prepare as early as possible for "new" barrier
238  __kmp_itt_task_finished(itt_sync_obj);
239  } else
240 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
241  // Early exit for reaping threads releasing forkjoin barrier
242  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
243  return false;
244 // The worker thread may now assume that the team is valid.
245 #ifdef KMP_DEBUG
246  tid = __kmp_tid_from_gtid(gtid);
247  team = __kmp_threads[gtid]->th.th_team;
248 #endif
249  KMP_DEBUG_ASSERT(team != NULL);
250  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
251  KA_TRACE(20,
252  ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
253  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
254  KMP_MB(); // Flush all pending memory write invalidates.
255  }
256  KA_TRACE(
257  20,
258  ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
259  gtid, team->t.t_id, tid, bt));
260  return false;
261 }
262 
263 static void __kmp_linear_barrier_gather(
264  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
265  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
266  __kmp_linear_barrier_gather_template<false>(
267  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
268 }
269 
270 static bool __kmp_linear_barrier_gather_cancellable(
271  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
272  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
273  return __kmp_linear_barrier_gather_template<true>(
274  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
275 }
276 
277 static void __kmp_linear_barrier_release(
278  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
279  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
280  __kmp_linear_barrier_release_template<false>(
281  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
282 }
283 
284 static bool __kmp_linear_barrier_release_cancellable(
285  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
286  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
287  return __kmp_linear_barrier_release_template<true>(
288  bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
289 }
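
Taken together, the gather and release templates above implement the flat algorithm: every worker bumps its own b_arrived and then spins on its own b_go, while the master polls all workers' b_arrived flags and later stores into each worker's b_go, so both phases are O(nproc) on the master. A condensed stand-alone sketch of that handshake follows (toy_bar_t, epoch, and the yield-based spin are illustrative simplifications of kmp_bstate_t and kmp_flag_64).

#include <atomic>
#include <thread>

struct toy_bar_t {
  std::atomic<unsigned long long> arrived{0};
  std::atomic<unsigned long long> go{0};
};

// Call with epoch = 1, 2, 3, ... for successive barrier instances.
static void toy_linear_barrier(int tid, int nproc, toy_bar_t *bar,
                               unsigned long long epoch) {
  if (tid != 0) {
    // Worker: signal arrival on my own flag, then wait for my own go flag.
    bar[tid].arrived.store(epoch, std::memory_order_release);
    while (bar[tid].go.load(std::memory_order_acquire) != epoch)
      std::this_thread::yield();
  } else {
    // Master: gather every worker in turn, then release each one in turn.
    for (int i = 1; i < nproc; ++i)
      while (bar[i].arrived.load(std::memory_order_acquire) != epoch)
        std::this_thread::yield();
    for (int i = 1; i < nproc; ++i)
      bar[i].go.store(epoch, std::memory_order_release);
  }
}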
290 
291 // Tree barrier
292 static void
293 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
294  int tid, void (*reduce)(void *, void *)
295  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
296  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
297  kmp_team_t *team = this_thr->th.th_team;
298  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
299  kmp_info_t **other_threads = team->t.t_threads;
300  kmp_uint32 nproc = this_thr->th.th_team_nproc;
301  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
302  kmp_uint32 branch_factor = 1 << branch_bits;
303  kmp_uint32 child;
304  kmp_uint32 child_tid;
305  kmp_uint64 new_state;
306 
307  KA_TRACE(
308  20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
309  gtid, team->t.t_id, tid, bt));
310  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
311 
312 #if USE_ITT_BUILD && USE_ITT_NOTIFY
313  // Barrier imbalance - save arrive time to the thread
314  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
315  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
316  __itt_get_timestamp();
317  }
318 #endif
319  // Perform tree gather to wait until all threads have arrived; reduce any
320  // required data as we go
321  child_tid = (tid << branch_bits) + 1;
322  if (child_tid < nproc) {
323  // Parent threads wait for all their children to arrive
324  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
325  child = 1;
326  do {
327  kmp_info_t *child_thr = other_threads[child_tid];
328  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
329 #if KMP_CACHE_MANAGE
330  // Prefetch next thread's arrived count
331  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
332  KMP_CACHE_PREFETCH(
333  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
334 #endif /* KMP_CACHE_MANAGE */
335  KA_TRACE(20,
336  ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
337  "arrived(%p) == %llu\n",
338  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
339  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
340  // Wait for child to arrive
341  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
342  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
343  ANNOTATE_BARRIER_END(child_thr);
344 #if USE_ITT_BUILD && USE_ITT_NOTIFY
345  // Barrier imbalance - write min of the thread time and a child time to
346  // the thread.
347  if (__kmp_forkjoin_frames_mode == 2) {
348  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
349  child_thr->th.th_bar_min_time);
350  }
351 #endif
352  if (reduce) {
353  KA_TRACE(100,
354  ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
355  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
356  team->t.t_id, child_tid));
357  ANNOTATE_REDUCE_AFTER(reduce);
358  (*reduce)(this_thr->th.th_local.reduce_data,
359  child_thr->th.th_local.reduce_data);
360  ANNOTATE_REDUCE_BEFORE(reduce);
361  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
362  }
363  child++;
364  child_tid++;
365  } while (child <= branch_factor && child_tid < nproc);
366  }
367 
368  if (!KMP_MASTER_TID(tid)) { // Worker threads
369  kmp_int32 parent_tid = (tid - 1) >> branch_bits;
370 
371  KA_TRACE(20,
372  ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
373  "arrived(%p): %llu => %llu\n",
374  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
375  team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
376  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
377 
378  // Mark arrival to parent thread
379  /* After performing this write, a worker thread may not assume that the team
380  is valid any more - it could be deallocated by the master thread at any
381  time. */
382  ANNOTATE_BARRIER_BEGIN(this_thr);
383  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
384  flag.release();
385  } else {
386  // Need to update the team arrived pointer if we are the master thread
387  if (nproc > 1) // New value was already computed above
388  team->t.t_bar[bt].b_arrived = new_state;
389  else
390  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
391  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
392  "arrived(%p) = %llu\n",
393  gtid, team->t.t_id, tid, team->t.t_id,
394  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
395  }
396  KA_TRACE(20,
397  ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
398  gtid, team->t.t_id, tid, bt));
399 }
400 
401 static void __kmp_tree_barrier_release(
402  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
403  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
404  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
405  kmp_team_t *team;
406  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
407  kmp_uint32 nproc;
408  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
409  kmp_uint32 branch_factor = 1 << branch_bits;
410  kmp_uint32 child;
411  kmp_uint32 child_tid;
412 
413  // Perform a tree release for all of the threads that have been gathered
414  if (!KMP_MASTER_TID(
415  tid)) { // Handle fork barrier workers who aren't part of a team yet
416  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
417  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
418  // Wait for parent thread to release us
419  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
420  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
421  ANNOTATE_BARRIER_END(this_thr);
422 #if USE_ITT_BUILD && USE_ITT_NOTIFY
423  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
424  // In fork barrier where we could not get the object reliably (or
425  // ITTNOTIFY is disabled)
426  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
427  // Cancel wait on previous parallel region...
428  __kmp_itt_task_starting(itt_sync_obj);
429 
430  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
431  return;
432 
433  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
434  if (itt_sync_obj != NULL)
435  // Call prepare as early as possible for "new" barrier
436  __kmp_itt_task_finished(itt_sync_obj);
437  } else
438 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
439  // Early exit for reaping threads releasing forkjoin barrier
440  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
441  return;
442 
443  // The worker thread may now assume that the team is valid.
444  team = __kmp_threads[gtid]->th.th_team;
445  KMP_DEBUG_ASSERT(team != NULL);
446  tid = __kmp_tid_from_gtid(gtid);
447 
448  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
449  KA_TRACE(20,
450  ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
451  team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
452  KMP_MB(); // Flush all pending memory write invalidates.
453  } else {
454  team = __kmp_threads[gtid]->th.th_team;
455  KMP_DEBUG_ASSERT(team != NULL);
456  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
457  "barrier type %d\n",
458  gtid, team->t.t_id, tid, bt));
459  }
460  nproc = this_thr->th.th_team_nproc;
461  child_tid = (tid << branch_bits) + 1;
462 
463  if (child_tid < nproc) {
464  kmp_info_t **other_threads = team->t.t_threads;
465  child = 1;
466  // Parent threads release all their children
467  do {
468  kmp_info_t *child_thr = other_threads[child_tid];
469  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
470 #if KMP_CACHE_MANAGE
471  // Prefetch next thread's go count
472  if (child + 1 <= branch_factor && child_tid + 1 < nproc)
473  KMP_CACHE_PREFETCH(
474  &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
475 #endif /* KMP_CACHE_MANAGE */
476 
477 #if KMP_BARRIER_ICV_PUSH
478  {
479  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
480  if (propagate_icvs) {
481  __kmp_init_implicit_task(team->t.t_ident,
482  team->t.t_threads[child_tid], team,
483  child_tid, FALSE);
484  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
485  &team->t.t_implicit_task_taskdata[0].td_icvs);
486  }
487  }
488 #endif // KMP_BARRIER_ICV_PUSH
489  KA_TRACE(20,
490  ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
491  "go(%p): %u => %u\n",
492  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
493  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
494  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
495  // Release child from barrier
496  ANNOTATE_BARRIER_BEGIN(child_thr);
497  kmp_flag_64 flag(&child_bar->b_go, child_thr);
498  flag.release();
499  child++;
500  child_tid++;
501  } while (child <= branch_factor && child_tid < nproc);
502  }
503  KA_TRACE(
504  20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
505  gtid, team->t.t_id, tid, bt));
506 }
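
In both tree routines the shape of the tree is implicit in two index formulas: a thread's first child is (tid << branch_bits) + 1, it owns up to branch_factor = 1 << branch_bits consecutive children, and its parent is (tid - 1) >> branch_bits. The stand-alone sketch below just prints the resulting tree for an arbitrary example team (branch_bits = 2 and nproc = 10 are made-up values).

#include <cstdio>

int main() {
  const unsigned branch_bits = 2;                 // example value
  const unsigned branch_factor = 1u << branch_bits;
  const unsigned nproc = 10;                      // example team size
  for (unsigned tid = 0; tid < nproc; ++tid) {
    unsigned parent = (tid == 0) ? 0 : (tid - 1) >> branch_bits;
    std::printf("tid %u: parent %u, children:", tid, parent);
    // Same bounds as the gather/release loops: at most branch_factor
    // consecutive children, stopping at the end of the team.
    unsigned child_tid = (tid << branch_bits) + 1;
    for (unsigned child = 1; child <= branch_factor && child_tid < nproc;
         ++child, ++child_tid)
      std::printf(" %u", child_tid);
    std::printf("\n");
  }
  return 0;
}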
507 
508 // Hyper Barrier
509 static void
510 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
511  int tid, void (*reduce)(void *, void *)
512  USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
513  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
514  kmp_team_t *team = this_thr->th.th_team;
515  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
516  kmp_info_t **other_threads = team->t.t_threads;
517  kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
518  kmp_uint32 num_threads = this_thr->th.th_team_nproc;
519  kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
520  kmp_uint32 branch_factor = 1 << branch_bits;
521  kmp_uint32 offset;
522  kmp_uint32 level;
523 
524  KA_TRACE(
525  20,
526  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
527  gtid, team->t.t_id, tid, bt));
528  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
529 
530 #if USE_ITT_BUILD && USE_ITT_NOTIFY
531  // Barrier imbalance - save arrive time to the thread
532  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
533  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
534  __itt_get_timestamp();
535  }
536 #endif
537  /* Perform a hypercube-embedded tree gather to wait until all of the threads
538  have arrived, and reduce any required data as we go. */
539  kmp_flag_64 p_flag(&thr_bar->b_arrived);
540  for (level = 0, offset = 1; offset < num_threads;
541  level += branch_bits, offset <<= branch_bits) {
542  kmp_uint32 child;
543  kmp_uint32 child_tid;
544 
545  if (((tid >> level) & (branch_factor - 1)) != 0) {
546  kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
547 
548  KA_TRACE(20,
549  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
550  "arrived(%p): %llu => %llu\n",
551  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
552  team->t.t_id, parent_tid, &thr_bar->b_arrived,
553  thr_bar->b_arrived,
554  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
555  // Mark arrival to parent thread
556  /* After performing this write (in the last iteration of the enclosing for
557  loop), a worker thread may not assume that the team is valid any more
558  - it could be deallocated by the master thread at any time. */
559  ANNOTATE_BARRIER_BEGIN(this_thr);
560  p_flag.set_waiter(other_threads[parent_tid]);
561  p_flag.release();
562  break;
563  }
564 
565  // Parent threads wait for children to arrive
566  if (new_state == KMP_BARRIER_UNUSED_STATE)
567  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
568  for (child = 1, child_tid = tid + (1 << level);
569  child < branch_factor && child_tid < num_threads;
570  child++, child_tid += (1 << level)) {
571  kmp_info_t *child_thr = other_threads[child_tid];
572  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
573 #if KMP_CACHE_MANAGE
574  kmp_uint32 next_child_tid = child_tid + (1 << level);
575  // Prefetch next thread's arrived count
576  if (child + 1 < branch_factor && next_child_tid < num_threads)
577  KMP_CACHE_PREFETCH(
578  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
579 #endif /* KMP_CACHE_MANAGE */
580  KA_TRACE(20,
581  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
582  "arrived(%p) == %llu\n",
583  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
584  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
585  // Wait for child to arrive
586  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
587  c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
588  ANNOTATE_BARRIER_END(child_thr);
589 #if USE_ITT_BUILD && USE_ITT_NOTIFY
590  // Barrier imbalance - write min of the thread time and a child time to
591  // the thread.
592  if (__kmp_forkjoin_frames_mode == 2) {
593  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
594  child_thr->th.th_bar_min_time);
595  }
596 #endif
597  if (reduce) {
598  KA_TRACE(100,
599  ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
600  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
601  team->t.t_id, child_tid));
602  ANNOTATE_REDUCE_AFTER(reduce);
603  (*reduce)(this_thr->th.th_local.reduce_data,
604  child_thr->th.th_local.reduce_data);
605  ANNOTATE_REDUCE_BEFORE(reduce);
606  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
607  }
608  }
609  }
610 
611  if (KMP_MASTER_TID(tid)) {
612  // Need to update the team arrived pointer if we are the master thread
613  if (new_state == KMP_BARRIER_UNUSED_STATE)
614  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
615  else
616  team->t.t_bar[bt].b_arrived = new_state;
617  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
618  "arrived(%p) = %llu\n",
619  gtid, team->t.t_id, tid, team->t.t_id,
620  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
621  }
622  KA_TRACE(
623  20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
624  gtid, team->t.t_id, tid, bt));
625 }
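
The hyper gather walks a hypercube one dimension (level) at a time: a thread stays in the loop while its level digit is zero and collects the children at tid + (1 << level), tid + 2 * (1 << level), ...; as soon as its digit is non-zero it reports to tid & ~((1 << (level + branch_bits)) - 1) and drops out. The stand-alone sketch below replays that decision for an example team (branch_bits = 1 and num_threads = 8 are made-up values) and prints who reports to whom at which level.

#include <cstdio>

int main() {
  const unsigned branch_bits = 1;            // binary hypercube for the example
  const unsigned branch_factor = 1u << branch_bits;
  const unsigned num_threads = 8;            // example team size
  for (unsigned tid = 1; tid < num_threads; ++tid) {
    // Walk the levels exactly as the gather loop does until this thread hits
    // the level at which it signals its parent and breaks out. tid 0 never
    // drops out: it is the root of the embedded tree.
    for (unsigned level = 0, offset = 1; offset < num_threads;
         level += branch_bits, offset <<= branch_bits) {
      if (((tid >> level) & (branch_factor - 1)) != 0) {
        unsigned parent = tid & ~((1u << (level + branch_bits)) - 1);
        std::printf("level %u: tid %u reports to parent %u\n", level, tid,
                    parent);
        break;
      }
    }
  }
  return 0;
}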
626 
627 // The reverse versions seem to beat the forward versions overall
628 #define KMP_REVERSE_HYPER_BAR
629 static void __kmp_hyper_barrier_release(
630  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
631  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
632  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
633  kmp_team_t *team;
634  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
635  kmp_info_t **other_threads;
636  kmp_uint32 num_threads;
637  kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
638  kmp_uint32 branch_factor = 1 << branch_bits;
639  kmp_uint32 child;
640  kmp_uint32 child_tid;
641  kmp_uint32 offset;
642  kmp_uint32 level;
643 
644  /* Perform a hypercube-embedded tree release for all of the threads that have
645  been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
646  are released in the reverse order of the corresponding gather, otherwise
647  threads are released in the same order. */
648  if (KMP_MASTER_TID(tid)) { // master
649  team = __kmp_threads[gtid]->th.th_team;
650  KMP_DEBUG_ASSERT(team != NULL);
651  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
652  "barrier type %d\n",
653  gtid, team->t.t_id, tid, bt));
654 #if KMP_BARRIER_ICV_PUSH
655  if (propagate_icvs) { // master already has ICVs in final destination; copy
656  copy_icvs(&thr_bar->th_fixed_icvs,
657  &team->t.t_implicit_task_taskdata[tid].td_icvs);
658  }
659 #endif
660  } else { // Handle fork barrier workers who aren't part of a team yet
661  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
662  &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
663  // Wait for parent thread to release us
664  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
665  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
666  ANNOTATE_BARRIER_END(this_thr);
667 #if USE_ITT_BUILD && USE_ITT_NOTIFY
668  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
669  // In fork barrier where we could not get the object reliably
670  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
671  // Cancel wait on previous parallel region...
672  __kmp_itt_task_starting(itt_sync_obj);
673 
674  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
675  return;
676 
677  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
678  if (itt_sync_obj != NULL)
679  // Call prepare as early as possible for "new" barrier
680  __kmp_itt_task_finished(itt_sync_obj);
681  } else
682 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
683  // Early exit for reaping threads releasing forkjoin barrier
684  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
685  return;
686 
687  // The worker thread may now assume that the team is valid.
688  team = __kmp_threads[gtid]->th.th_team;
689  KMP_DEBUG_ASSERT(team != NULL);
690  tid = __kmp_tid_from_gtid(gtid);
691 
692  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
693  KA_TRACE(20,
694  ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
695  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
696  KMP_MB(); // Flush all pending memory write invalidates.
697  }
698  num_threads = this_thr->th.th_team_nproc;
699  other_threads = team->t.t_threads;
700 
701 #ifdef KMP_REVERSE_HYPER_BAR
702  // Count up to correct level for parent
703  for (level = 0, offset = 1;
704  offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
705  level += branch_bits, offset <<= branch_bits)
706  ;
707 
708  // Now go down from there
709  for (level -= branch_bits, offset >>= branch_bits; offset != 0;
710  level -= branch_bits, offset >>= branch_bits)
711 #else
712  // Go down the tree, level by level
713  for (level = 0, offset = 1; offset < num_threads;
714  level += branch_bits, offset <<= branch_bits)
715 #endif // KMP_REVERSE_HYPER_BAR
716  {
717 #ifdef KMP_REVERSE_HYPER_BAR
718  /* Now go in reverse order through the children, highest to lowest.
719  Initial setting of child is conservative here. */
720  child = num_threads >> ((level == 0) ? level : level - 1);
721  for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
722  child_tid = tid + (child << level);
723  child >= 1; child--, child_tid -= (1 << level))
724 #else
725  if (((tid >> level) & (branch_factor - 1)) != 0)
726  // No need to go lower than this, since this is the level parent would be
727  // notified
728  break;
729  // Iterate through children on this level of the tree
730  for (child = 1, child_tid = tid + (1 << level);
731  child < branch_factor && child_tid < num_threads;
732  child++, child_tid += (1 << level))
733 #endif // KMP_REVERSE_HYPER_BAR
734  {
735  if (child_tid >= num_threads)
736  continue; // Child doesn't exist so keep going
737  else {
738  kmp_info_t *child_thr = other_threads[child_tid];
739  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
740 #if KMP_CACHE_MANAGE
741  kmp_uint32 next_child_tid = child_tid - (1 << level);
742 // Prefetch next thread's go count
743 #ifdef KMP_REVERSE_HYPER_BAR
744  if (child - 1 >= 1 && next_child_tid < num_threads)
745 #else
746  if (child + 1 < branch_factor && next_child_tid < num_threads)
747 #endif // KMP_REVERSE_HYPER_BAR
748  KMP_CACHE_PREFETCH(
749  &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
750 #endif /* KMP_CACHE_MANAGE */
751 
752 #if KMP_BARRIER_ICV_PUSH
753  if (propagate_icvs) // push my fixed ICVs to my child
754  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
755 #endif // KMP_BARRIER_ICV_PUSH
756 
757  KA_TRACE(
758  20,
759  ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
760  "go(%p): %u => %u\n",
761  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
762  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
763  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
764  // Release child from barrier
765  ANNOTATE_BARRIER_BEGIN(child_thr);
766  kmp_flag_64 flag(&child_bar->b_go, child_thr);
767  flag.release();
768  }
769  }
770  }
771 #if KMP_BARRIER_ICV_PUSH
772  if (propagate_icvs &&
773  !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
774  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
775  FALSE);
776  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
777  &thr_bar->th_fixed_icvs);
778  }
779 #endif
780  KA_TRACE(
781  20,
782  ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
783  gtid, team->t.t_id, tid, bt));
784 }
785 
786 // Hierarchical Barrier
787 
788 // Initialize thread barrier data
789 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
790  Performs the minimum amount of initialization required based on how the team
791  has changed. Returns true if leaf children will require both on-core and
792  traditional wake-up mechanisms. For example, if the team size increases,
793  threads already in the team will respond to on-core wakeup on their parent
794  thread, but threads newly added to the team will only be listening on
795  their local b_go. */
796 static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
797  kmp_bstate_t *thr_bar,
798  kmp_uint32 nproc, int gtid,
799  int tid, kmp_team_t *team) {
800  // Checks to determine if (re-)initialization is needed
801  bool uninitialized = thr_bar->team == NULL;
802  bool team_changed = team != thr_bar->team;
803  bool team_sz_changed = nproc != thr_bar->nproc;
804  bool tid_changed = tid != thr_bar->old_tid;
805  bool retval = false;
806 
807  if (uninitialized || team_sz_changed) {
808  __kmp_get_hierarchy(nproc, thr_bar);
809  }
810 
811  if (uninitialized || team_sz_changed || tid_changed) {
812  thr_bar->my_level = thr_bar->depth - 1; // default for master
813  thr_bar->parent_tid = -1; // default for master
814  if (!KMP_MASTER_TID(
815  tid)) { // if not master, find parent thread in hierarchy
816  kmp_uint32 d = 0;
817  while (d < thr_bar->depth) { // find parent based on level of thread in
818  // hierarchy, and note level
819  kmp_uint32 rem;
820  if (d == thr_bar->depth - 2) { // reached level right below the master
821  thr_bar->parent_tid = 0;
822  thr_bar->my_level = d;
823  break;
824  } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
825  0) { // TODO: can we make this op faster?
826  // thread is not a subtree root at next level, so this is max
827  thr_bar->parent_tid = tid - rem;
828  thr_bar->my_level = d;
829  break;
830  }
831  ++d;
832  }
833  }
834  thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
835  thr_bar->old_tid = tid;
836  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
837  thr_bar->team = team;
838  thr_bar->parent_bar =
839  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
840  }
841  if (uninitialized || team_changed || tid_changed) {
842  thr_bar->team = team;
843  thr_bar->parent_bar =
844  &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
845  retval = true;
846  }
847  if (uninitialized || team_sz_changed || tid_changed) {
848  thr_bar->nproc = nproc;
849  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
850  if (thr_bar->my_level == 0)
851  thr_bar->leaf_kids = 0;
852  if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
853  thr_bar->leaf_kids = nproc - tid - 1;
854  thr_bar->leaf_state = 0;
855  for (int i = 0; i < thr_bar->leaf_kids; ++i)
856  ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
857  }
858  return retval;
859 }
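
The initializer above encodes each leaf child as one byte of its parent's 64-bit b_arrived word: a child whose rank below the parent is r gets offset = 7 - r, and leaf_state is the mask with those bytes set to 1, which is what the oncore flag later checks and what KMP_TEST_THEN_AND64 clears in the gather. A stand-alone sketch of just the encoding (single-threaded; the real code manipulates the flag word atomically):

#include <cstdint>
#include <cstdio>

int main() {
  const int leaf_kids = 3;                       // example: 3 leaf children
  std::uint64_t leaf_state = 0;                  // mask the parent waits for
  for (int i = 0; i < leaf_kids; ++i)
    reinterpret_cast<char *>(&leaf_state)[7 - i] = 1;   // one byte per leaf

  std::uint64_t b_arrived = 0;                   // parent's flag word
  for (int r = 0; r < leaf_kids; ++r) {
    int offset = 7 - r;                  // thr_bar->offset for child rank r
    reinterpret_cast<char *>(&b_arrived)[offset] = 1;   // leaf r checks in
  }
  // All leaves have arrived once every leaf_state byte is present; the parent
  // then clears exactly those bytes, as KMP_TEST_THEN_AND64 does.
  std::printf("all leaves arrived: %s\n",
              ((b_arrived & leaf_state) == leaf_state) ? "yes" : "no");
  b_arrived &= ~leaf_state;
  return 0;
}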
860 
861 static void __kmp_hierarchical_barrier_gather(
862  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
863  void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
864  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
865  kmp_team_t *team = this_thr->th.th_team;
866  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
867  kmp_uint32 nproc = this_thr->th.th_team_nproc;
868  kmp_info_t **other_threads = team->t.t_threads;
869  kmp_uint64 new_state;
870 
871  int level = team->t.t_level;
872 #if OMP_40_ENABLED
873  if (other_threads[0]
874  ->th.th_teams_microtask) // are we inside the teams construct?
875  if (this_thr->th.th_teams_size.nteams > 1)
876  ++level; // level was not increased in teams construct for team_of_masters
877 #endif
878  if (level == 1)
879  thr_bar->use_oncore_barrier = 1;
880  else
881  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
882 
883  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
884  "barrier type %d\n",
885  gtid, team->t.t_id, tid, bt));
886  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
887 
888 #if USE_ITT_BUILD && USE_ITT_NOTIFY
889  // Barrier imbalance - save arrive time to the thread
890  if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
891  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
892  }
893 #endif
894 
895  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
896  team);
897 
898  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
899  kmp_int32 child_tid;
900  new_state =
901  (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
902  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
903  thr_bar->use_oncore_barrier) {
904  if (thr_bar->leaf_kids) {
905  // First, wait for leaf children to check-in on my b_arrived flag
906  kmp_uint64 leaf_state =
907  KMP_MASTER_TID(tid)
908  ? thr_bar->b_arrived | thr_bar->leaf_state
909  : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
910  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
911  "for leaf kids\n",
912  gtid, team->t.t_id, tid));
913  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
914  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
915  if (reduce) {
916  ANNOTATE_REDUCE_AFTER(reduce);
917  for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
918  ++child_tid) {
919  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
920  "T#%d(%d:%d)\n",
921  gtid, team->t.t_id, tid,
922  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
923  child_tid));
924  ANNOTATE_BARRIER_END(other_threads[child_tid]);
925  (*reduce)(this_thr->th.th_local.reduce_data,
926  other_threads[child_tid]->th.th_local.reduce_data);
927  }
928  ANNOTATE_REDUCE_BEFORE(reduce);
929  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
930  }
931  // clear leaf_state bits
932  KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
933  }
934  // Next, wait for higher level children on each child's b_arrived flag
935  for (kmp_uint32 d = 1; d < thr_bar->my_level;
936  ++d) { // gather lowest level threads first, but skip 0
937  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
938  skip = thr_bar->skip_per_level[d];
939  if (last > nproc)
940  last = nproc;
941  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
942  kmp_info_t *child_thr = other_threads[child_tid];
943  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
944  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
945  "T#%d(%d:%d) "
946  "arrived(%p) == %llu\n",
947  gtid, team->t.t_id, tid,
948  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
949  child_tid, &child_bar->b_arrived, new_state));
950  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
951  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
952  ANNOTATE_BARRIER_END(child_thr);
953  if (reduce) {
954  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
955  "T#%d(%d:%d)\n",
956  gtid, team->t.t_id, tid,
957  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
958  child_tid));
959  ANNOTATE_REDUCE_AFTER(reduce);
960  (*reduce)(this_thr->th.th_local.reduce_data,
961  child_thr->th.th_local.reduce_data);
962  ANNOTATE_REDUCE_BEFORE(reduce);
963  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
964  }
965  }
966  }
967  } else { // Blocktime is not infinite
968  for (kmp_uint32 d = 0; d < thr_bar->my_level;
969  ++d) { // Gather lowest level threads first
970  kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
971  skip = thr_bar->skip_per_level[d];
972  if (last > nproc)
973  last = nproc;
974  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
975  kmp_info_t *child_thr = other_threads[child_tid];
976  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
977  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
978  "T#%d(%d:%d) "
979  "arrived(%p) == %llu\n",
980  gtid, team->t.t_id, tid,
981  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
982  child_tid, &child_bar->b_arrived, new_state));
983  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
984  flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
985  ANNOTATE_BARRIER_END(child_thr);
986  if (reduce) {
987  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
988  "T#%d(%d:%d)\n",
989  gtid, team->t.t_id, tid,
990  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
991  child_tid));
992  ANNOTATE_REDUCE_AFTER(reduce);
993  (*reduce)(this_thr->th.th_local.reduce_data,
994  child_thr->th.th_local.reduce_data);
995  ANNOTATE_REDUCE_BEFORE(reduce);
996  ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
997  }
998  }
999  }
1000  }
1001  }
1002  // All subordinates are gathered; now release parent if not master thread
1003 
1004  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1005  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1006  " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1007  gtid, team->t.t_id, tid,
1008  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1009  thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1010  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1011  /* Mark arrival to parent: After performing this write, a worker thread may
1012  not assume that the team is valid any more - it could be deallocated by
1013  the master thread at any time. */
1014  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1015  !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1016  // flag; release it
1017  ANNOTATE_BARRIER_BEGIN(this_thr);
1018  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
1019  flag.release();
1020  } else {
1021  // Leaf does special release on "offset" bits of parent's b_arrived flag
1022  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1023  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1024  flag.set_waiter(other_threads[thr_bar->parent_tid]);
1025  flag.release();
1026  }
1027  } else { // Master thread needs to update the team's b_arrived value
1028  team->t.t_bar[bt].b_arrived = new_state;
1029  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1030  "arrived(%p) = %llu\n",
1031  gtid, team->t.t_id, tid, team->t.t_id,
1032  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1033  }
1034  // Is the team access below unsafe or just technically invalid?
1035  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1036  "barrier type %d\n",
1037  gtid, team->t.t_id, tid, bt));
1038 }
1039 
1040 static void __kmp_hierarchical_barrier_release(
1041  enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1042  int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1043  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1044  kmp_team_t *team;
1045  kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1046  kmp_uint32 nproc;
1047  bool team_change = false; // indicates on-core barrier shouldn't be used
1048 
1049  if (KMP_MASTER_TID(tid)) {
1050  team = __kmp_threads[gtid]->th.th_team;
1051  KMP_DEBUG_ASSERT(team != NULL);
1052  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1053  "entered barrier type %d\n",
1054  gtid, team->t.t_id, tid, bt));
1055  } else { // Worker threads
1056  // Wait for parent thread to release me
1057  if (!thr_bar->use_oncore_barrier ||
1058  __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1059  thr_bar->team == NULL) {
1060  // Use traditional method of waiting on my own b_go flag
1061  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1062  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1063  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1064  ANNOTATE_BARRIER_END(this_thr);
1065  TCW_8(thr_bar->b_go,
1066  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1067  } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1068  // infinite, not nested
1069  // Wait on my "offset" bits on parent's b_go flag
1070  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1071  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1072  thr_bar->offset, bt,
1073  this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1074  flag.wait(this_thr, TRUE);
1075  if (thr_bar->wait_flag ==
1076  KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1077  TCW_8(thr_bar->b_go,
1078  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1079  } else { // Reset my bits on parent's b_go flag
1080  (RCAST(volatile char *,
1081  &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1082  }
1083  }
1084  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1085  // Early exit for reaping threads releasing forkjoin barrier
1086  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1087  return;
1088  // The worker thread may now assume that the team is valid.
1089  team = __kmp_threads[gtid]->th.th_team;
1090  KMP_DEBUG_ASSERT(team != NULL);
1091  tid = __kmp_tid_from_gtid(gtid);
1092 
1093  KA_TRACE(
1094  20,
1095  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1096  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1097  KMP_MB(); // Flush all pending memory write invalidates.
1098  }
1099 
1100  nproc = this_thr->th.th_team_nproc;
1101  int level = team->t.t_level;
1102 #if OMP_40_ENABLED
1103  if (team->t.t_threads[0]
1104  ->th.th_teams_microtask) { // are we inside the teams construct?
1105  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1106  this_thr->th.th_teams_level == level)
1107  ++level; // level was not increased in teams construct for team_of_workers
1108  if (this_thr->th.th_teams_size.nteams > 1)
1109  ++level; // level was not increased in teams construct for team_of_masters
1110  }
1111 #endif
1112  if (level == 1)
1113  thr_bar->use_oncore_barrier = 1;
1114  else
1115  thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1116 
1117  // If the team size has increased, we still communicate with old leaves via
1118  // oncore barrier.
1119  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1120  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1121  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1122  tid, team);
1123  // But if the entire team changes, we won't use oncore barrier at all
1124  if (team_change)
1125  old_leaf_kids = 0;
1126 
1127 #if KMP_BARRIER_ICV_PUSH
1128  if (propagate_icvs) {
1129  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1130  FALSE);
1131  if (KMP_MASTER_TID(
1132  tid)) { // master already has copy in final destination; copy
1133  copy_icvs(&thr_bar->th_fixed_icvs,
1134  &team->t.t_implicit_task_taskdata[tid].td_icvs);
1135  } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1136  thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1137  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1138  // leaves (on-core children) pull parent's fixed ICVs directly to local
1139  // ICV store
1140  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1141  &thr_bar->parent_bar->th_fixed_icvs);
1142  // non-leaves will get ICVs piggybacked with b_go via NGO store
1143  } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1144  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1145  // access
1146  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1147  else // leaves copy parent's fixed ICVs directly to local ICV store
1148  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1149  &thr_bar->parent_bar->th_fixed_icvs);
1150  }
1151  }
1152 #endif // KMP_BARRIER_ICV_PUSH
1153 
1154  // Now, release my children
1155  if (thr_bar->my_level) { // not a leaf
1156  kmp_int32 child_tid;
1157  kmp_uint32 last;
1158  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1159  thr_bar->use_oncore_barrier) {
1160  if (KMP_MASTER_TID(tid)) { // do a flat release
1161  // Set local b_go to bump children via NGO store of the cache line
1162  // containing IVCs and b_go.
1163  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1164  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1165  // the cache line
1166  ngo_load(&thr_bar->th_fixed_icvs);
1167  // This loops over all the threads skipping only the leaf nodes in the
1168  // hierarchy
1169  for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1170  child_tid += thr_bar->skip_per_level[1]) {
1171  kmp_bstate_t *child_bar =
1172  &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1173  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1174  "releasing T#%d(%d:%d)"
1175  " go(%p): %u => %u\n",
1176  gtid, team->t.t_id, tid,
1177  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1178  child_tid, &child_bar->b_go, child_bar->b_go,
1179  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1180  // Use ngo store (if available) to both store ICVs and release child
1181  // via child's b_go
1182  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1183  }
1184  ngo_sync();
1185  }
1186  TCW_8(thr_bar->b_go,
1187  KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1188  // Now, release leaf children
1189  if (thr_bar->leaf_kids) { // if there are any
1190  // We test team_change on the off-chance that the level 1 team changed.
1191  if (team_change ||
1192  old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1193  if (old_leaf_kids) { // release old leaf kids
1194  thr_bar->b_go |= old_leaf_state;
1195  }
1196  // Release new leaf kids
1197  last = tid + thr_bar->skip_per_level[1];
1198  if (last > nproc)
1199  last = nproc;
1200  for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1201  ++child_tid) { // skip_per_level[0]=1
1202  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1203  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1204  KA_TRACE(
1205  20,
1206  ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1207  " T#%d(%d:%d) go(%p): %u => %u\n",
1208  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1209  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1210  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1211  // Release child using child's b_go flag
1212  ANNOTATE_BARRIER_BEGIN(child_thr);
1213  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1214  flag.release();
1215  }
1216  } else { // Release all children at once with leaf_state bits on my own
1217  // b_go flag
1218  thr_bar->b_go |= thr_bar->leaf_state;
1219  }
1220  }
1221  } else { // Blocktime is not infinite; do a simple hierarchical release
1222  for (int d = thr_bar->my_level - 1; d >= 0;
1223  --d) { // Release highest level threads first
1224  last = tid + thr_bar->skip_per_level[d + 1];
1225  kmp_uint32 skip = thr_bar->skip_per_level[d];
1226  if (last > nproc)
1227  last = nproc;
1228  for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1229  kmp_info_t *child_thr = team->t.t_threads[child_tid];
1230  kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1231  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1232  "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1233  gtid, team->t.t_id, tid,
1234  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1235  child_tid, &child_bar->b_go, child_bar->b_go,
1236  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1237  // Release child using child's b_go flag
1238  ANNOTATE_BARRIER_BEGIN(child_thr);
1239  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1240  flag.release();
1241  }
1242  }
1243  }
1244 #if KMP_BARRIER_ICV_PUSH
1245  if (propagate_icvs && !KMP_MASTER_TID(tid))
1246  // non-leaves copy ICVs from fixed ICVs to local dest
1247  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1248  &thr_bar->th_fixed_icvs);
1249 #endif // KMP_BARRIER_ICV_PUSH
1250  }
1251  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1252  "barrier type %d\n",
1253  gtid, team->t.t_id, tid, bt));
1254 }
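
The oncore release path above leans on layout: b_go lives in the last 8 bytes of the cache line that holds th_fixed_icvs, so a single ngo_store_go of that line both publishes the ICVs and flips the child's go flag. The sketch below illustrates the idea with a toy struct and a plain memcpy (field names and sizes are illustrative; the real code relies on the NGO store or KMP_MEMCPY of CACHE_LINE bytes plus ngo_sync for ordering).

#include <cstdint>
#include <cstring>

// Toy layout: payload plus a go flag packed into one 64-byte line.
struct alignas(64) line_with_flag_t {
  char icvs[56];            // stand-in for the copied ICVs
  std::uint64_t b_go;       // go flag occupies the last 8 bytes of the line
};

// One line-sized store both hands the child its ICVs and releases it.
static void release_child(line_with_flag_t *child,
                          const line_with_flag_t *parent,
                          std::uint64_t go_value) {
  line_with_flag_t staged = *parent;   // parent's line, ICVs already in place
  staged.b_go = go_value;              // the bump value being broadcast
  std::memcpy(child, &staged, sizeof(staged));
}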
1255 
1256 // End of Barrier Algorithms
1257 
1258 // type traits for cancellable value
1259 // if cancellable is true, then is_cancellable is a normal boolean variable
1260 // if cancellable is false, then is_cancellable is a compile time constant
1261 template <bool cancellable> struct is_cancellable {};
1262 template <> struct is_cancellable<true> {
1263  bool value;
1264  is_cancellable() : value(false) {}
1265  is_cancellable(bool b) : value(b) {}
1266  is_cancellable &operator=(bool b) {
1267  value = b;
1268  return *this;
1269  }
1270  operator bool() const { return value; }
1271 };
1272 template <> struct is_cancellable<false> {
1273  is_cancellable &operator=(bool b) { return *this; }
1274  constexpr operator bool() const { return false; }
1275 };
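
These two specializations let __kmp_barrier_template carry a "was this cancelled?" result without any cost when cancellable == false: the false case has no storage, assignment is a no-op, and the conversion is a constexpr false, so every if (cancelled) branch folds away. A minimal stand-alone sketch of the same pattern (maybe_cancelled and toy_barrier are illustrative names):

#include <cstdio>

template <bool cancellable> struct maybe_cancelled {};
template <> struct maybe_cancelled<true> {
  bool value = false;
  maybe_cancelled &operator=(bool b) { value = b; return *this; }
  operator bool() const { return value; }
};
template <> struct maybe_cancelled<false> {
  maybe_cancelled &operator=(bool) { return *this; }
  constexpr operator bool() const { return false; }
};

template <bool cancellable> static int toy_barrier(bool request_cancel) {
  maybe_cancelled<cancellable> cancelled;
  cancelled = request_cancel;   // no-op when cancellable == false
  if (cancelled)                // constant-folds to if (false) when disabled
    return 1;                   // cancelled
  return 0;                     // completed normally
}

int main() {
  // Prints "1 0": only the cancellable instantiation can observe the request.
  std::printf("%d %d\n", toy_barrier<true>(true), toy_barrier<false>(true));
  return 0;
}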
1276 
1277 // Internal function to do a barrier.
1278 /* If is_split is true, do a split barrier, otherwise, do a plain barrier.
1279  If reduce is non-NULL, the barrier also performs a reduction using the
1280  supplied reduce callback.
1281  When cancellable = false,
1282  Returns 0 if master thread, 1 if worker thread.
1283  When cancellable = true
1284  Returns 0 if not cancelled, 1 if cancelled. */
1285 template <bool cancellable = false>
1286 static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1287  size_t reduce_size, void *reduce_data,
1288  void (*reduce)(void *, void *)) {
1289  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1290  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1291  int tid = __kmp_tid_from_gtid(gtid);
1292  kmp_info_t *this_thr = __kmp_threads[gtid];
1293  kmp_team_t *team = this_thr->th.th_team;
1294  int status = 0;
1295  is_cancellable<cancellable> cancelled;
1296 #if OMPT_SUPPORT && OMPT_OPTIONAL
1297  ompt_data_t *my_task_data;
1298  ompt_data_t *my_parallel_data;
1299  void *return_address;
1300 #endif
1301 
1302  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1303  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1304 
1305  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1306 #if OMPT_SUPPORT
1307  if (ompt_enabled.enabled) {
1308 #if OMPT_OPTIONAL
1309  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1310  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1311  return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1312  if (ompt_enabled.ompt_callback_sync_region) {
1313  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1314  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1315  my_task_data, return_address);
1316  }
1317  if (ompt_enabled.ompt_callback_sync_region_wait) {
1318  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1319  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1320  my_task_data, return_address);
1321  }
1322 #endif
1323  // It is OK to report the barrier state after the barrier begin callback.
1324  // According to the OMPT specification, a compliant implementation may
1325  // even delay reporting this state until the barrier begins to wait.
1326  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1327  }
1328 #endif
1329 
1330  if (!team->t.t_serialized) {
1331 #if USE_ITT_BUILD
1332  // This value will be used in itt notify events below.
1333  void *itt_sync_obj = NULL;
1334 #if USE_ITT_NOTIFY
1335  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1336  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1337 #endif
1338 #endif /* USE_ITT_BUILD */
1339  if (__kmp_tasking_mode == tskm_extra_barrier) {
1340  __kmp_tasking_barrier(team, this_thr, gtid);
1341  KA_TRACE(15,
1342  ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1343  __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1344  }
1345 
1346  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1347  access it when the team struct is not guaranteed to exist. */
1348  // See note about the corresponding code in __kmp_join_barrier() being
1349  // performance-critical.
1350  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1351 #if KMP_USE_MONITOR
1352  this_thr->th.th_team_bt_intervals =
1353  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1354  this_thr->th.th_team_bt_set =
1355  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1356 #else
1357  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1358 #endif
1359  }
1360 
1361 #if USE_ITT_BUILD
1362  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1363  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1364 #endif /* USE_ITT_BUILD */
1365 #if USE_DEBUGGER
1366  // Let the debugger know: the thread arrived to the barrier and waiting.
1367  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1368  team->t.t_bar[bt].b_master_arrived += 1;
1369  } else {
1370  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1371  } // if
1372 #endif /* USE_DEBUGGER */
1373  if (reduce != NULL) {
1374  // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1375  this_thr->th.th_local.reduce_data = reduce_data;
1376  }
1377 
1378  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1379  // use 0 to only set up the current team if nthreads > 1
1380  __kmp_task_team_setup(this_thr, team, 0);
1381 
1382  if (cancellable) {
1383  cancelled = __kmp_linear_barrier_gather_cancellable(
1384  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1385  } else {
1386  switch (__kmp_barrier_gather_pattern[bt]) {
1387  case bp_hyper_bar: {
1388  // don't set branch bits to 0; use linear
1389  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1390  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1391  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1392  break;
1393  }
1394  case bp_hierarchical_bar: {
1395  __kmp_hierarchical_barrier_gather(
1396  bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1397  break;
1398  }
1399  case bp_tree_bar: {
1400  // don't set branch bits to 0; use linear
1401  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1402  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1403  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1404  break;
1405  }
1406  default: {
1407  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1408  reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1409  }
1410  }
1411  }
1412 
1413  KMP_MB();
1414 
1415  if (KMP_MASTER_TID(tid)) {
1416  status = 0;
1417  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1418  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1419  }
1420 #if USE_DEBUGGER
1421  // Let the debugger know: all threads have arrived and are starting to
1422  // leave the barrier.
1423  team->t.t_bar[bt].b_team_arrived += 1;
1424 #endif
1425 
1426 #if OMP_40_ENABLED
1427  if (__kmp_omp_cancellation) {
1428  kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1429  // Reset cancellation flag for worksharing constructs
1430  if (cancel_request == cancel_loop ||
1431  cancel_request == cancel_sections) {
1432  KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1433  }
1434  }
1435 #endif
1436 #if USE_ITT_BUILD
1437  /* TODO: In the case of a split reduction barrier, the master thread may
1438  send the acquired event early, before the final summation into the shared
1439  variable is done (the final summation can be a long operation for array
1440  reductions). */
1441  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1442  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1443 #endif /* USE_ITT_BUILD */
1444 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1445  // Barrier - report frame end (only if active_level == 1)
1446  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1447  __kmp_forkjoin_frames_mode &&
1448 #if OMP_40_ENABLED
1449  this_thr->th.th_teams_microtask == NULL &&
1450 #endif
1451  team->t.t_active_level == 1) {
1452  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1453  kmp_uint64 cur_time = __itt_get_timestamp();
1454  kmp_info_t **other_threads = team->t.t_threads;
1455  int nproc = this_thr->th.th_team_nproc;
1456  int i;
1457  switch (__kmp_forkjoin_frames_mode) {
1458  case 1:
1459  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1460  loc, nproc);
1461  this_thr->th.th_frame_time = cur_time;
1462  break;
1463  case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1464  // be fixed)
1465  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1466  1, loc, nproc);
1467  break;
1468  case 3:
1469  if (__itt_metadata_add_ptr) {
1470  // Initialize with master's wait time
1471  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1472  // Set arrive time to zero to be able to check it in
1473  // __kmp_invoke_task(); the same is done inside the loop below
1474  this_thr->th.th_bar_arrive_time = 0;
1475  for (i = 1; i < nproc; ++i) {
1476  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1477  other_threads[i]->th.th_bar_arrive_time = 0;
1478  }
1479  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1480  cur_time, delta,
1481  (kmp_uint64)(reduce != NULL));
1482  }
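// Worked example of the mode-3 imbalance metric computed above (numbers are
// illustrative only): with cur_time == 100 and per-thread arrive times
// {90, 70, 80, 95}, delta = 10 + 30 + 20 + 5 = 65 timestamp units of
// aggregate wait, which is what __kmp_itt_metadata_imbalance() just reported
// for the frame ending at cur_time.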
1483  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1484  loc, nproc);
1485  this_thr->th.th_frame_time = cur_time;
1486  break;
1487  }
1488  }
1489 #endif /* USE_ITT_BUILD */
1490  } else {
1491  status = 1;
1492 #if USE_ITT_BUILD
1493  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1494  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1495 #endif /* USE_ITT_BUILD */
1496  }
1497  if ((status == 1 || !is_split) && !cancelled) {
1498  if (cancellable) {
1499  cancelled = __kmp_linear_barrier_release_cancellable(
1500  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1501  } else {
1502  switch (__kmp_barrier_release_pattern[bt]) {
1503  case bp_hyper_bar: {
1504  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1505  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1506  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1507  break;
1508  }
1509  case bp_hierarchical_bar: {
1510  __kmp_hierarchical_barrier_release(
1511  bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1512  break;
1513  }
1514  case bp_tree_bar: {
1515  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1516  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1517  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1518  break;
1519  }
1520  default: {
1521  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1522  FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1523  }
1524  }
1525  }
1526  if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1527  __kmp_task_team_sync(this_thr, team);
1528  }
1529  }
1530 
1531 #if USE_ITT_BUILD
1532  /* GEH: TODO: Move this under the if-condition above and also include it in
1533  __kmp_end_split_barrier(). This will more accurately represent the actual
1534  release time of the threads for split barriers. */
1535  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1536  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1537 #endif /* USE_ITT_BUILD */
1538  } else { // Team is serialized.
1539  status = 0;
1540  if (__kmp_tasking_mode != tskm_immediate_exec) {
1541 #if OMP_45_ENABLED
1542  if (this_thr->th.th_task_team != NULL) {
1543 #if USE_ITT_NOTIFY
1544  void *itt_sync_obj = NULL;
1545  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1546  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1547  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1548  }
1549 #endif
1550 
1551  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1552  TRUE);
1553  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1554  __kmp_task_team_setup(this_thr, team, 0);
1555 
1556 #if USE_ITT_BUILD
1557  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1558  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1559 #endif /* USE_ITT_BUILD */
1560  }
1561 #else
1562  // The task team should be NULL for serialized code (tasks will be
1563  // executed immediately)
1564  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1565  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1566 #endif
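// Note (reading of the code above): for a serialized team a non-NULL
// th_task_team can only mean outstanding proxy tasks (see the
// tt_found_proxy_tasks assertion), so instead of a full barrier the thread
// only waits for those tasks to finish and then re-arms its task team.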
1567  }
1568  }
1569  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1570  gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1571  __kmp_tid_from_gtid(gtid), status));
1572 
1573 #if OMPT_SUPPORT
1574  if (ompt_enabled.enabled) {
1575 #if OMPT_OPTIONAL
1576  if (ompt_enabled.ompt_callback_sync_region_wait) {
1577  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1578  ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1579  my_task_data, return_address);
1580  }
1581  if (ompt_enabled.ompt_callback_sync_region) {
1582  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1583  ompt_sync_region_barrier, ompt_scope_end, my_parallel_data,
1584  my_task_data, return_address);
1585  }
1586 #endif
1587  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1588  }
1589 #endif
1590  ANNOTATE_BARRIER_END(&team->t.t_bar);
1591 
1592  if (cancellable)
1593  return (int)cancelled;
1594  return status;
1595 }
1596 
1597 // Returns 0 if master thread, 1 if worker thread.
1598 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1599  size_t reduce_size, void *reduce_data,
1600  void (*reduce)(void *, void *)) {
1601  return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1602  reduce);
1603 }
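// Illustrative sketch (hypothetical caller, not part of this file): the reduce
// callback passed to __kmp_barrier() folds one thread's partial result into
// another's during the gather phase. Assuming the caller packs a single int
// partial sum into reduce_data, a matching callback could be:
//
//   static void my_int_sum(void *lhs_data, void *rhs_data) {
//     *(int *)lhs_data += *(int *)rhs_data; // fold rhs partial into lhs
//   }
//
//   // ... status = __kmp_barrier(bs_plain_barrier, gtid, TRUE /*is_split*/,
//   //                            sizeof(int), &my_partial_sum, my_int_sum);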
1604 
1605 #if defined(KMP_GOMP_COMPAT)
1606 // Returns 1 if cancelled, 0 otherwise
1607 int __kmp_barrier_gomp_cancel(int gtid) {
1608  if (__kmp_omp_cancellation) {
1609  int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1610  0, NULL, NULL);
1611  if (cancelled) {
1612  int tid = __kmp_tid_from_gtid(gtid);
1613  kmp_info_t *this_thr = __kmp_threads[gtid];
1614  if (KMP_MASTER_TID(tid)) {
1615  // Master does not need to revert anything
1616  } else {
1617  // Workers need to revert their private b_arrived flag
1618  this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1619  KMP_BARRIER_STATE_BUMP;
1620  }
1621  }
1622  return cancelled;
1623  }
1624  __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1625  return FALSE;
1626 }
1627 #endif
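// Illustrative use (sketch): a GOMP_barrier_cancel-style entry point in the
// compatibility layer can branch on the return value to take the cancellation
// exit path, e.g.
//
//   if (__kmp_barrier_gomp_cancel(gtid))
//     return true; // the enclosing region was cancelled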
1628 
1629 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1630  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1631  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1632  int tid = __kmp_tid_from_gtid(gtid);
1633  kmp_info_t *this_thr = __kmp_threads[gtid];
1634  kmp_team_t *team = this_thr->th.th_team;
1635 
1636  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1637  if (!team->t.t_serialized) {
1638  if (KMP_MASTER_GTID(gtid)) {
1639  switch (__kmp_barrier_release_pattern[bt]) {
1640  case bp_hyper_bar: {
1641  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1642  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1643  FALSE USE_ITT_BUILD_ARG(NULL));
1644  break;
1645  }
1646  case bp_hierarchical_bar: {
1647  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1648  FALSE USE_ITT_BUILD_ARG(NULL));
1649  break;
1650  }
1651  case bp_tree_bar: {
1652  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1653  __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1654  FALSE USE_ITT_BUILD_ARG(NULL));
1655  break;
1656  }
1657  default: {
1658  __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1659  FALSE USE_ITT_BUILD_ARG(NULL));
1660  }
1661  }
1662  if (__kmp_tasking_mode != tskm_immediate_exec) {
1663  __kmp_task_team_sync(this_thr, team);
1664  } // if
1665  }
1666  }
1667  ANNOTATE_BARRIER_END(&team->t.t_bar);
1668 }
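// Illustrative sequence (sketch; size, data, and reduce are placeholders):
// with a split reduction barrier, the gather half runs inside __kmp_barrier()
// with is_split == TRUE, and only after the master has finished the final
// combine into the shared result are the workers released here:
//
//   if (__kmp_barrier(bs_plain_barrier, gtid, TRUE, size, data, reduce) == 0) {
//     // master: complete the reduction into the shared variable
//     __kmp_end_split_barrier(bs_plain_barrier, gtid);
//   }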
1669 
1670 void __kmp_join_barrier(int gtid) {
1671  KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1672  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1673  kmp_info_t *this_thr = __kmp_threads[gtid];
1674  kmp_team_t *team;
1675  kmp_uint nproc;
1676  kmp_info_t *master_thread;
1677  int tid;
1678 #ifdef KMP_DEBUG
1679  int team_id;
1680 #endif /* KMP_DEBUG */
1681 #if USE_ITT_BUILD
1682  void *itt_sync_obj = NULL;
1683 #if USE_ITT_NOTIFY
1684  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unnecessarily
1685  // Get object created at fork_barrier
1686  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1687 #endif
1688 #endif /* USE_ITT_BUILD */
1689  KMP_MB();
1690 
1691  // Get current info
1692  team = this_thr->th.th_team;
1693  nproc = this_thr->th.th_team_nproc;
1694  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1695  tid = __kmp_tid_from_gtid(gtid);
1696 #ifdef KMP_DEBUG
1697  team_id = team->t.t_id;
1698 #endif /* KMP_DEBUG */
1699  master_thread = this_thr->th.th_team_master;
1700 #ifdef KMP_DEBUG
1701  if (master_thread != team->t.t_threads[0]) {
1702  __kmp_print_structure();
1703  }
1704 #endif /* KMP_DEBUG */
1705  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1706  KMP_MB();
1707 
1708  // Verify state
1709  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1710  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1711  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1712  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1713  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1714  gtid, team_id, tid));
1715 
1716  ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1717 #if OMPT_SUPPORT
1718  if (ompt_enabled.enabled) {
1719 #if OMPT_OPTIONAL
1720  ompt_data_t *my_task_data;
1721  ompt_data_t *my_parallel_data;
1722  void *codeptr = NULL;
1723  int ds_tid = this_thr->th.th_info.ds.ds_tid;
1724  if (KMP_MASTER_TID(ds_tid) &&
1725  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1726  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1727  codeptr = team->t.ompt_team_info.master_return_address;
1728  my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1729  my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1730  if (ompt_enabled.ompt_callback_sync_region) {
1731  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1732  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1733  my_task_data, codeptr);
1734  }
1735  if (ompt_enabled.ompt_callback_sync_region_wait) {
1736  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1737  ompt_sync_region_barrier, ompt_scope_begin, my_parallel_data,
1738  my_task_data, codeptr);
1739  }
1740  if (!KMP_MASTER_TID(ds_tid))
1741  this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1742 #endif
1743  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1744  }
1745 #endif
1746 
1747  if (__kmp_tasking_mode == tskm_extra_barrier) {
1748  __kmp_tasking_barrier(team, this_thr, gtid);
1749  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1750  team_id, tid));
1751  }
1752 #ifdef KMP_DEBUG
1753  if (__kmp_tasking_mode != tskm_immediate_exec) {
1754  KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1755  "%p, th_task_team = %p\n",
1756  __kmp_gtid_from_thread(this_thr), team_id,
1757  team->t.t_task_team[this_thr->th.th_task_state],
1758  this_thr->th.th_task_team));
1759  KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1760  team->t.t_task_team[this_thr->th.th_task_state]);
1761  }
1762 #endif /* KMP_DEBUG */
1763 
1764  /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1765  access it when the team struct is not guaranteed to exist. Doing these
1766  loads causes a cache miss and slows down EPCC parallel by 2x. As a workaround,
1767  we do not perform the copy if blocktime=infinite, since the values are not
1768  used by __kmp_wait_template() in that case. */
1769  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1770 #if KMP_USE_MONITOR
1771  this_thr->th.th_team_bt_intervals =
1772  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1773  this_thr->th.th_team_bt_set =
1774  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1775 #else
1776  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1777 #endif
1778  }
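// For reference (hedged): __kmp_dflt_blocktime equals KMP_MAX_BLOCKTIME when
// the user requests an infinite blocktime (e.g. KMP_BLOCKTIME=infinite), in
// which case waiting threads never go to sleep and the cached values would
// never be read by __kmp_wait_template() - hence the skip above.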
1779 
1780 #if USE_ITT_BUILD
1781  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1782  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1783 #endif /* USE_ITT_BUILD */
1784 
1785  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1786  case bp_hyper_bar: {
1787  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1788  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1789  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1790  break;
1791  }
1792  case bp_hierarchical_bar: {
1793  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1794  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1795  break;
1796  }
1797  case bp_tree_bar: {
1798  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1799  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1800  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1801  break;
1802  }
1803  default: {
1804  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1805  NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1806  }
1807  }
1808 
1809  /* From this point on, the team data structure may be deallocated at any time
1810  by the master thread - it is unsafe to reference it in any of the worker
1811  threads. Any per-team data items that need to be referenced before the
1812  end of the barrier should be moved to the kmp_task_team_t structs. */
1813  if (KMP_MASTER_TID(tid)) {
1814  if (__kmp_tasking_mode != tskm_immediate_exec) {
1815  __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1816  }
1817 #if OMP_50_ENABLED
1818  if (__kmp_display_affinity) {
1819  KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1820  }
1821 #endif
1822 #if KMP_STATS_ENABLED
1823  // Have the master thread flag the workers to indicate they are now waiting
1824  // for the next parallel region, and wake them up so they switch their
1825  // timers to idle.
1826  for (int i = 0; i < team->t.t_nproc; ++i) {
1827  kmp_info_t *team_thread = team->t.t_threads[i];
1828  if (team_thread == this_thr)
1829  continue;
1830  team_thread->th.th_stats->setIdleFlag();
1831  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1832  team_thread->th.th_sleep_loc != NULL)
1833  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1834  team_thread->th.th_sleep_loc);
1835  }
1836 #endif
1837 #if USE_ITT_BUILD
1838  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1839  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1840 #endif /* USE_ITT_BUILD */
1841 
1842 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1843  // Join barrier - report frame end
1844  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1845  __kmp_forkjoin_frames_mode &&
1846 #if OMP_40_ENABLED
1847  this_thr->th.th_teams_microtask == NULL &&
1848 #endif
1849  team->t.t_active_level == 1) {
1850  kmp_uint64 cur_time = __itt_get_timestamp();
1851  ident_t *loc = team->t.t_ident;
1852  kmp_info_t **other_threads = team->t.t_threads;
1853  int nproc = this_thr->th.th_team_nproc;
1854  int i;
1855  switch (__kmp_forkjoin_frames_mode) {
1856  case 1:
1857  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1858  loc, nproc);
1859  break;
1860  case 2:
1861  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1862  loc, nproc);
1863  break;
1864  case 3:
1865  if (__itt_metadata_add_ptr) {
1866  // Initialize with master's wait time
1867  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1868  // Set arrive time to zero to be able to check it in
1869  // __kmp_invoke_task(); the same is done inside the loop below
1870  this_thr->th.th_bar_arrive_time = 0;
1871  for (i = 1; i < nproc; ++i) {
1872  delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1873  other_threads[i]->th.th_bar_arrive_time = 0;
1874  }
1875  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1876  cur_time, delta, 0);
1877  }
1878  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1879  loc, nproc);
1880  this_thr->th.th_frame_time = cur_time;
1881  break;
1882  }
1883  }
1884 #endif /* USE_ITT_BUILD */
1885  }
1886 #if USE_ITT_BUILD
1887  else {
1888  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1889  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1890  }
1891 #endif /* USE_ITT_BUILD */
1892 
1893 #if KMP_DEBUG
1894  if (KMP_MASTER_TID(tid)) {
1895  KA_TRACE(
1896  15,
1897  ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1898  gtid, team_id, tid, nproc));
1899  }
1900 #endif /* KMP_DEBUG */
1901 
1902  // TODO now, mark worker threads as done so they may be disbanded
1903  KMP_MB(); // Flush all pending memory write invalidates.
1904  KA_TRACE(10,
1905  ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1906 
1907  ANNOTATE_BARRIER_END(&team->t.t_bar);
1908 }
1909 
1910 // TODO release worker threads' fork barriers as we are ready instead of all at
1911 // once
1912 void __kmp_fork_barrier(int gtid, int tid) {
1913  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1914  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1915  kmp_info_t *this_thr = __kmp_threads[gtid];
1916  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1917 #if USE_ITT_BUILD
1918  void *itt_sync_obj = NULL;
1919 #endif /* USE_ITT_BUILD */
1920  if (team)
1921  ANNOTATE_BARRIER_END(&team->t.t_bar);
1922 
1923  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1924  (team != NULL) ? team->t.t_id : -1, tid));
1925 
1926  // The th_team pointer is only valid for the master thread here
1927  if (KMP_MASTER_TID(tid)) {
1928 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1929  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1930  // Create itt barrier object
1931  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1932  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1933  }
1934 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1935 
1936 #ifdef KMP_DEBUG
1937  kmp_info_t **other_threads = team->t.t_threads;
1938  int i;
1939 
1940  // Verify state
1941  KMP_MB();
1942 
1943  for (i = 1; i < team->t.t_nproc; ++i) {
1944  KA_TRACE(500,
1945  ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1946  "== %u.\n",
1947  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1948  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1949  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1950  KMP_DEBUG_ASSERT(
1951  (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1952  ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1953  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1954  }
1955 #endif
1956 
1957  if (__kmp_tasking_mode != tskm_immediate_exec) {
1958  // 0 indicates: set up the current task team only if nthreads > 1
1959  __kmp_task_team_setup(this_thr, team, 0);
1960  }
1961 
1962  /* The master thread may have changed its blocktime between the join barrier
1963  and the fork barrier. Copy the blocktime info to the thread, where
1964  __kmp_wait_template() can access it when the team struct is not
1965  guaranteed to exist. */
1966  // See note about the corresponding code in __kmp_join_barrier() being
1967  // performance-critical
1968  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1969 #if KMP_USE_MONITOR
1970  this_thr->th.th_team_bt_intervals =
1971  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1972  this_thr->th.th_team_bt_set =
1973  team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1974 #else
1975  this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1976 #endif
1977  }
1978  } // master
1979 
1980  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1981  case bp_hyper_bar: {
1982  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1983  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1984  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1985  break;
1986  }
1987  case bp_hierarchical_bar: {
1988  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1989  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1990  break;
1991  }
1992  case bp_tree_bar: {
1993  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1994  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1995  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1996  break;
1997  }
1998  default: {
1999  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
2000  TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
2001  }
2002  }
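// Note (assumption, based on the runtime's documented settings): the gather
// and release patterns chosen by these switches (linear, tree, hyper,
// hierarchical) are normally selected through environment variables such as
// KMP_FORKJOIN_BARRIER_PATTERN, with the matching KMP_FORKJOIN_BARRIER value
// supplying the branch bits asserted above.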
2003 
2004 #if OMPT_SUPPORT
2005  if (ompt_enabled.enabled &&
2006  this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2007  int ds_tid = this_thr->th.th_info.ds.ds_tid;
2008  ompt_data_t *task_data = (team)
2009  ? OMPT_CUR_TASK_DATA(this_thr)
2010  : &(this_thr->th.ompt_thread_info.task_data);
2011  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2012 #if OMPT_OPTIONAL
2013  void *codeptr = NULL;
2014  if (KMP_MASTER_TID(ds_tid) &&
2015  (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2016  ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2017  codeptr = team->t.ompt_team_info.master_return_address;
2018  if (ompt_enabled.ompt_callback_sync_region_wait) {
2019  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2020  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
2021  }
2022  if (ompt_enabled.ompt_callback_sync_region) {
2023  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2024  ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
2025  }
2026 #endif
2027  if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2028  ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2029  ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2030  }
2031  }
2032 #endif
2033 
2034  // Early exit for reaping threads releasing forkjoin barrier
2035  if (TCR_4(__kmp_global.g.g_done)) {
2036  this_thr->th.th_task_team = NULL;
2037 
2038 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2039  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2040  if (!KMP_MASTER_TID(tid)) {
2041  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2042  if (itt_sync_obj)
2043  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2044  }
2045  }
2046 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2047  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2048  return;
2049  }
2050 
2051  /* We can now assume that a valid team structure has been allocated by the
2052  master and propagated to all worker threads. The current thread, however,
2053  may not be part of the team, so we can't blindly assume that the team
2054  pointer is non-null. */
2055  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2056  KMP_DEBUG_ASSERT(team != NULL);
2057  tid = __kmp_tid_from_gtid(gtid);
2058 
2059 #if KMP_BARRIER_ICV_PULL
2060  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2061  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2062  implicit task has this data before this function is called. We cannot
2063  modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2064  struct, because it is not always the case that the threads arrays have
2065  been allocated when __kmp_fork_call() is executed. */
2066  {
2067  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2068  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2069  // Copy the initial ICVs from the master's thread struct to the implicit
2070  // task for this tid.
2071  KA_TRACE(10,
2072  ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2073  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2074  tid, FALSE);
2075  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2076  &team->t.t_threads[0]
2077  ->th.th_bar[bs_forkjoin_barrier]
2078  .bb.th_fixed_icvs);
2079  }
2080  }
2081 #endif // KMP_BARRIER_ICV_PULL
2082 
2083  if (__kmp_tasking_mode != tskm_immediate_exec) {
2084  __kmp_task_team_sync(this_thr, team);
2085  }
2086 
2087 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2088  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2089  if (proc_bind == proc_bind_intel) {
2090 #endif
2091 #if KMP_AFFINITY_SUPPORTED
2092  // Apply dynamic affinity settings
2093  if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2094  __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2095  }
2096 #endif // KMP_AFFINITY_SUPPORTED
2097 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2098  } else if (proc_bind != proc_bind_false) {
2099  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2100  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2101  __kmp_gtid_from_thread(this_thr),
2102  this_thr->th.th_current_place));
2103  } else {
2104  __kmp_affinity_set_place(gtid);
2105  }
2106  }
2107 #endif
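// Note (sketch): affinity_balanced corresponds to KMP_AFFINITY=balanced; the
// rebinding above runs only when the team size changed since the previous
// fork (team->t.t_size_changed), so a steady-state team pays no extra cost
// here.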
2108 #if OMP_50_ENABLED
2109  // Perform the display affinity functionality
2110  if (__kmp_display_affinity) {
2111  if (team->t.t_display_affinity
2112 #if KMP_AFFINITY_SUPPORTED
2113  || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2114 #endif
2115  ) {
2116  // NULL means use the affinity-format-var ICV
2117  __kmp_aux_display_affinity(gtid, NULL);
2118  this_thr->th.th_prev_num_threads = team->t.t_nproc;
2119  this_thr->th.th_prev_level = team->t.t_level;
2120  }
2121  }
2122  if (!KMP_MASTER_TID(tid))
2123  KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2124 #endif
2125 
2126 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2127  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2128  if (!KMP_MASTER_TID(tid)) {
2129  // Get correct barrier object
2130  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2131  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2132  } // (prepare called inside barrier_release)
2133  }
2134 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2135  ANNOTATE_BARRIER_END(&team->t.t_bar);
2136  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2137  team->t.t_id, tid));
2138 }
2139 
2140 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2141  kmp_internal_control_t *new_icvs, ident_t *loc) {
2142  KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2143 
2144  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2145  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2146 
2147 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2148  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2149  implicit task has this data before this function is called. */
2150 #if KMP_BARRIER_ICV_PULL
2151 /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which
2152  remains untouched), where all of the worker threads can access them and make
2153  their own copies after the barrier. */
2154  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2155  // allocated at this point
2156  copy_icvs(
2157  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2158  new_icvs);
2159  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2160  team->t.t_threads[0], team));
2161 #elif KMP_BARRIER_ICV_PUSH
2162  // The ICVs will be propagated in the fork barrier, so nothing needs to be
2163  // done here.
2164  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2165  team->t.t_threads[0], team));
2166 #else
2167  // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2168  // time.
2169  ngo_load(new_icvs);
2170  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2171  // allocated at this point
2172  for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2173  // TODO: GEH - pass in better source location info since usually NULL here
2174  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2175  f, team->t.t_threads[f], team));
2176  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2177  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2178  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2179  f, team->t.t_threads[f], team));
2180  }
2181  ngo_sync();
2182 #endif // KMP_BARRIER_ICV_PULL
2183 }
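// Summary of the three compile-time ICV propagation strategies handled above:
//  - KMP_BARRIER_ICV_PULL: the master publishes the new ICVs in its
//    th_fixed_icvs slot here, and each worker copies them for itself during
//    the fork barrier (see __kmp_fork_barrier above).
//  - KMP_BARRIER_ICV_PUSH: the ICVs travel with the fork-barrier release
//    itself, so nothing needs to be done in this function.
//  - otherwise: the master copies the ICVs into every worker's implicit task
//    right here, an O(nthreads) loop using the ngo_* store helpers.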