LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #if ENABLE_LIBOMPTARGET
25 static void (*tgt_target_nowait_query)(void **);
26 
27 void __kmp_init_target_task() {
28  *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
29 }
30 #endif
31 
32 /* forward declaration */
33 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34  kmp_info_t *this_thr);
35 static void __kmp_alloc_task_deque(kmp_info_t *thread,
36  kmp_thread_data_t *thread_data);
37 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
38  kmp_task_team_t *task_team);
39 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
40 #if OMPX_TASKGRAPH
41 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42 int __kmp_taskloop_task(int gtid, void *ptask);
43 #endif
44 
45 #ifdef BUILD_TIED_TASK_STACK
46 
47 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
48 // from top to bottom
49 //
50 // gtid: global thread identifier for thread containing stack
51 // thread_data: thread data for task team thread containing stack
52 // threshold: value above which the trace statement triggers
53 // location: string identifying call site of this function (for trace)
54 static void __kmp_trace_task_stack(kmp_int32 gtid,
55  kmp_thread_data_t *thread_data,
56  int threshold, char *location) {
57  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
58  kmp_taskdata_t **stack_top = task_stack->ts_top;
59  kmp_int32 entries = task_stack->ts_entries;
60  kmp_taskdata_t *tied_task;
61 
62  KA_TRACE(
63  threshold,
64  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
65  "first_block = %p, stack_top = %p \n",
66  location, gtid, entries, task_stack->ts_first_block, stack_top));
67 
68  KMP_DEBUG_ASSERT(stack_top != NULL);
69  KMP_DEBUG_ASSERT(entries > 0);
70 
71  while (entries != 0) {
72  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
73  // fix up ts_top if we need to pop from previous block
74  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
75  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
76 
77  stack_block = stack_block->sb_prev;
78  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
79  }
80 
81  // finish bookkeeping
82  stack_top--;
83  entries--;
84 
85  tied_task = *stack_top;
86 
87  KMP_DEBUG_ASSERT(tied_task != NULL);
88  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
89 
90  KA_TRACE(threshold,
91  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
92  "stack_top=%p, tied_task=%p\n",
93  location, gtid, entries, stack_top, tied_task));
94  }
95  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
96 
97  KA_TRACE(threshold,
98  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
99  location, gtid));
100 }
101 
102 // __kmp_init_task_stack: initialize the task stack for the first time
103 // after a thread_data structure is created.
104 // It should not be necessary to do this again (assuming the stack works).
105 //
106 // gtid: global thread identifier of calling thread
107 // thread_data: thread data for task team thread containing stack
108 static void __kmp_init_task_stack(kmp_int32 gtid,
109  kmp_thread_data_t *thread_data) {
110  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
111  kmp_stack_block_t *first_block;
112 
113  // set up the first block of the stack
114  first_block = &task_stack->ts_first_block;
115  task_stack->ts_top = (kmp_taskdata_t **)first_block;
116  memset((void *)first_block, '\0',
117  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
118 
119  // initialize the stack to be empty
120  task_stack->ts_entries = TASK_STACK_EMPTY;
121  first_block->sb_next = NULL;
122  first_block->sb_prev = NULL;
123 }
124 
125 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126 //
127 // gtid: global thread identifier for calling thread
128 // thread_data: thread info for thread containing stack
129 static void __kmp_free_task_stack(kmp_int32 gtid,
130  kmp_thread_data_t *thread_data) {
131  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
132  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
133 
134  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
135  // free from the second block of the stack
136  while (stack_block != NULL) {
137  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
138 
139  stack_block->sb_next = NULL;
140  stack_block->sb_prev = NULL;
141  if (stack_block != &task_stack->ts_first_block) {
142  __kmp_thread_free(__kmp_thread_from_gtid(gtid),
143  stack_block); // free the block, if not the first
144  }
145  stack_block = next_block;
146  }
147  // initialize the stack to be empty
148  task_stack->ts_entries = 0;
149  task_stack->ts_top = NULL;
150 }
151 
152 // __kmp_push_task_stack: Push the tied task onto the task stack.
153 // Grow the stack if necessary by allocating another block.
154 //
155 // gtid: global thread identifier for calling thread
156 // thread: thread info for thread containing stack
157 // tied_task: the task to push on the stack
158 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
159  kmp_taskdata_t *tied_task) {
160  // GEH - need to consider what to do if tt_threads_data not allocated yet
161  kmp_thread_data_t *thread_data =
162  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
163  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
164 
165  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
166  return; // Don't push anything on stack if team or team tasks are serialized
167  }
168 
169  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
170  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
171 
172  KA_TRACE(20,
173  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
174  gtid, thread, tied_task));
175  // Store entry
176  *(task_stack->ts_top) = tied_task;
177 
178  // Do bookkeeping for next push
179  task_stack->ts_top++;
180  task_stack->ts_entries++;
181 
182  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
183  // Find beginning of this task block
184  kmp_stack_block_t *stack_block =
185  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
186 
187  // Check if we already have a block
188  if (stack_block->sb_next !=
189  NULL) { // reset ts_top to beginning of next block
190  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
191  } else { // Alloc new block and link it up
192  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
193  thread, sizeof(kmp_stack_block_t));
194 
195  task_stack->ts_top = &new_block->sb_block[0];
196  stack_block->sb_next = new_block;
197  new_block->sb_prev = stack_block;
198  new_block->sb_next = NULL;
199 
200  KA_TRACE(
201  30,
202  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
203  gtid, tied_task, new_block));
204  }
205  }
206  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
207  tied_task));
208 }
209 
210 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
211 // the task, just check to make sure it matches the ending task passed in.
212 //
213 // gtid: global thread identifier for the calling thread
214 // thread: thread info structure containing stack
215 // tied_task: the task popped off the stack
216 // ending_task: the task that is ending (should match popped task)
217 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
218  kmp_taskdata_t *ending_task) {
219  // GEH - need to consider what to do if tt_threads_data not allocated yet
220  kmp_thread_data_t *thread_data =
221  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
222  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
223  kmp_taskdata_t *tied_task;
224 
225  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
226  // Don't pop anything from stack if team or team tasks are serialized
227  return;
228  }
229 
230  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
231  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
232 
233  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
234  thread));
235 
236  // fix up ts_top if we need to pop from previous block
237  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
238  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
239 
240  stack_block = stack_block->sb_prev;
241  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
242  }
243 
244  // finish bookkeeping
245  task_stack->ts_top--;
246  task_stack->ts_entries--;
247 
248  tied_task = *(task_stack->ts_top);
249 
250  KMP_DEBUG_ASSERT(tied_task != NULL);
251  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
252  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
253 
254  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
255  tied_task));
256  return;
257 }
258 #endif /* BUILD_TIED_TASK_STACK */
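/* [Editorial sketch, not part of the upstream source.]
   The tied-task stack above keeps task pointers in fixed-size blocks linked
   through sb_prev/sb_next and grows by appending a block whenever the current
   one fills up. A self-contained illustration of that layout follows; every
   name in it is invented for the example and nothing here is used by the
   runtime.

   #include <cstddef>

   struct Block {
     void *slots[64];
     Block *prev = nullptr, *next = nullptr;
   };

   struct BlockStack {
     Block first;                // first block lives inline, like ts_first_block
     Block *top_block = &first;
     std::size_t top_index = 0;  // next free slot in top_block

     void push(void *p) {
       if (top_index == 64) {    // current block full: move to the next block
         if (!top_block->next) { // allocate and link a new block on first use
           Block *nb = new Block();
           nb->prev = top_block;
           top_block->next = nb;
         }
         top_block = top_block->next;
         top_index = 0;
       }
       top_block->slots[top_index++] = p;
     }

     void *pop() {
       if (top_index == 0) {     // step back into the previous block
         top_block = top_block->prev;
         top_index = 64;
       }
       return top_block->slots[--top_index];
     }
   };
*/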
259 
260 // returns 1 if new task is allowed to execute, 0 otherwise
261 // checks Task Scheduling constraint (if requested) and
262 // mutexinoutset dependencies if any
263 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
264  const kmp_taskdata_t *tasknew,
265  const kmp_taskdata_t *taskcurr) {
266  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
267  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
268  // only a descendant of all deferred tied tasks can be scheduled; checking
269  // the last one is enough, as it in turn is a descendant of all the others
270  kmp_taskdata_t *current = taskcurr->td_last_tied;
271  KMP_DEBUG_ASSERT(current != NULL);
272  // check if the task is not suspended on barrier
273  if (current->td_flags.tasktype == TASK_EXPLICIT ||
274  current->td_taskwait_thread > 0) { // <= 0 on barrier
275  kmp_int32 level = current->td_level;
276  kmp_taskdata_t *parent = tasknew->td_parent;
277  while (parent != current && parent->td_level > level) {
278  // check generation up to the level of the current task
279  parent = parent->td_parent;
280  KMP_DEBUG_ASSERT(parent != NULL);
281  }
282  if (parent != current)
283  return false;
284  }
285  }
286  // Check mutexinoutset dependencies, acquire locks
287  kmp_depnode_t *node = tasknew->td_depnode;
288 #if OMPX_TASKGRAPH
289  if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
290 #else
291  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
292 #endif
293  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
294  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
295  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
296  continue;
297  // could not get the lock, release previous locks
298  for (int j = i - 1; j >= 0; --j)
299  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
300  return false;
301  }
302  // negative num_locks means all locks acquired successfully
303  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
304  }
305  return true;
306 }
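/* [Editorial sketch, not part of the upstream source.]
   The mutexinoutset handling above is an all-or-nothing protocol: try each
   lock in order, and if any try-lock fails, release everything acquired so
   far and report that the task may not run yet. The same pattern in isolation,
   with std::mutex standing in for the kmp locks and all names invented for
   the example:

   #include <cstddef>
   #include <mutex>
   #include <vector>

   // Returns true only if every mutex in `locks` was acquired; otherwise
   // releases the ones already taken and returns false.
   static bool try_acquire_all(std::vector<std::mutex *> &locks) {
     for (std::size_t i = 0; i < locks.size(); ++i) {
       if (locks[i]->try_lock())
         continue;
       for (std::size_t j = i; j-- > 0;) // roll back locks [0, i)
         locks[j]->unlock();
       return false;
     }
     return true;
   }
*/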
307 
308 // __kmp_realloc_task_deque:
309 // Re-allocates a task deque for a particular thread, copies the content from
310 // the old deque and adjusts the necessary data structures relating to the
311 // deque. This operation must be done with the deque_lock being held
312 static void __kmp_realloc_task_deque(kmp_info_t *thread,
313  kmp_thread_data_t *thread_data) {
314  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
315  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
316  kmp_int32 new_size = 2 * size;
317 
318  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
319  "%d] for thread_data %p\n",
320  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
321 
322  kmp_taskdata_t **new_deque =
323  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
324 
325  int i, j;
326  for (i = thread_data->td.td_deque_head, j = 0; j < size;
327  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
328  new_deque[j] = thread_data->td.td_deque[i];
329 
330  __kmp_free(thread_data->td.td_deque);
331 
332  thread_data->td.td_deque_head = 0;
333  thread_data->td.td_deque_tail = size;
334  thread_data->td.td_deque = new_deque;
335  thread_data->td.td_deque_size = new_size;
336 }
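/* [Editorial sketch, not part of the upstream source.]
   __kmp_realloc_task_deque doubles a power-of-two ring buffer and copies the
   live entries so that the oldest one lands at index 0, which is why head is
   reset to 0 and tail to the old size. A standalone version of the same copy
   loop, with invented names:

   #include <cstddef>

   template <class T>
   static void grow_ring(T *&buf, std::size_t &cap, std::size_t &head,
                         std::size_t &tail, std::size_t count) {
     std::size_t new_cap = 2 * cap;
     T *nb = new T[new_cap];
     // Walk the old buffer starting at head, wrapping with the power-of-two mask.
     for (std::size_t i = head, j = 0; j < count; i = (i + 1) & (cap - 1), ++j)
       nb[j] = buf[i];
     delete[] buf;
     buf = nb;
     cap = new_cap;
     head = 0;     // entries are now linearized
     tail = count; // next free slot
   }
*/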
337 
338 static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
339  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
340  kmp_thread_data_t *thread_data = &l->td;
341  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
342  thread_data->td.td_deque_last_stolen = -1;
343  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
344  "for thread_data %p\n",
345  __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
346  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
347  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
348  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
349  return l;
350 }
351 
352 // The function finds the deque of priority tasks with given priority, or
353 // allocates a new deque and puts it into the sorted (high -> low) list of deques.
354 // Deques of non-default priority tasks are shared between all threads in the team,
355 // as opposed to per-thread deques of tasks with default priority.
356 // The function is called under the lock task_team->tt.tt_task_pri_lock.
357 static kmp_thread_data_t *
358 __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
359  kmp_thread_data_t *thread_data;
360  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
361  if (lst->priority == pri) {
362  // Found queue of tasks with given priority.
363  thread_data = &lst->td;
364  } else if (lst->priority < pri) {
365  // All current priority queues contain tasks with lower priority.
366  // Allocate new one for given priority tasks.
367  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
368  thread_data = &list->td;
369  list->priority = pri;
370  list->next = lst;
371  task_team->tt.tt_task_pri_list = list;
372  } else { // task_team->tt.tt_task_pri_list->priority > pri
373  kmp_task_pri_t *next_queue = lst->next;
374  while (next_queue && next_queue->priority > pri) {
375  lst = next_queue;
376  next_queue = lst->next;
377  }
378  // lst->priority > pri && (next == NULL || pri >= next->priority)
379  if (next_queue == NULL) {
380  // No queue with pri priority, need to allocate new one.
381  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
382  thread_data = &list->td;
383  list->priority = pri;
384  list->next = NULL;
385  lst->next = list;
386  } else if (next_queue->priority == pri) {
387  // Found queue of tasks with given priority.
388  thread_data = &next_queue->td;
389  } else { // lst->priority > pri > next->priority
390  // insert the newly allocated queue between existing queues
391  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
392  thread_data = &list->td;
393  list->priority = pri;
394  list->next = next_queue;
395  lst->next = list;
396  }
397  }
398  return thread_data;
399 }
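/* [Editorial sketch, not part of the upstream source.]
   The priority deques form a singly linked list kept sorted from highest to
   lowest priority; lookup either finds an existing node or splices a new one
   in at the right position. The same list discipline in isolation (invented
   names; no locking shown, since the runtime holds tt_task_pri_lock around
   this):

   struct PriNode {
     int priority;
     PriNode *next;
   };

   // Returns the node for `pri`, inserting one if necessary, while keeping the
   // list sorted high -> low. `head` must be non-null on entry, matching the
   // precondition of __kmp_get_priority_deque_data.
   static PriNode *find_or_insert(PriNode *&head, int pri) {
     if (head->priority == pri)
       return head;
     if (head->priority < pri) { // new highest priority becomes the head
       head = new PriNode{pri, head};
       return head;
     }
     PriNode *prev = head;
     while (prev->next && prev->next->priority > pri)
       prev = prev->next;
     if (prev->next && prev->next->priority == pri)
       return prev->next;
     prev->next = new PriNode{pri, prev->next}; // splice between prev and next
     return prev->next;
   }
*/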
400 
401 // __kmp_push_priority_task: Add a task to the team's priority task deque
402 static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
403  kmp_taskdata_t *taskdata,
404  kmp_task_team_t *task_team,
405  kmp_int32 pri) {
406  kmp_thread_data_t *thread_data = NULL;
407  KA_TRACE(20,
408  ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
409  gtid, taskdata, pri));
410 
411  // Find task queue specific to priority value
412  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
413  if (UNLIKELY(lst == NULL)) {
414  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
415  if (task_team->tt.tt_task_pri_list == NULL) {
416  // List of queues is still empty, allocate one.
417  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
418  thread_data = &list->td;
419  list->priority = pri;
420  list->next = NULL;
421  task_team->tt.tt_task_pri_list = list;
422  } else {
423  // Another thread initialized a queue. Check if it fits and get thread_data.
424  thread_data = __kmp_get_priority_deque_data(task_team, pri);
425  }
426  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
427  } else {
428  if (lst->priority == pri) {
429  // Found queue of tasks with given priority.
430  thread_data = &lst->td;
431  } else {
432  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
433  thread_data = __kmp_get_priority_deque_data(task_team, pri);
434  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
435  }
436  }
437  KMP_DEBUG_ASSERT(thread_data);
438 
439  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
440  // Check if deque is full
441  if (TCR_4(thread_data->td.td_deque_ntasks) >=
442  TASK_DEQUE_SIZE(thread_data->td)) {
443  if (__kmp_enable_task_throttling &&
444  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
445  thread->th.th_current_task)) {
446  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
447  KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
448  "TASK_NOT_PUSHED for task %p\n",
449  gtid, taskdata));
450  return TASK_NOT_PUSHED;
451  } else {
452  // expand deque to push the task which is not allowed to execute
453  __kmp_realloc_task_deque(thread, thread_data);
454  }
455  }
456  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
457  TASK_DEQUE_SIZE(thread_data->td));
458  // Push taskdata.
459  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
460  // Wrap index.
461  thread_data->td.td_deque_tail =
462  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
463  TCW_4(thread_data->td.td_deque_ntasks,
464  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
465  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
466  KMP_FSYNC_RELEASING(taskdata); // releasing child
467  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
468  "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
469  gtid, taskdata, thread_data->td.td_deque_ntasks,
470  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
471  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
472  task_team->tt.tt_num_task_pri++; // atomic inc
473  return TASK_SUCCESSFULLY_PUSHED;
474 }
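/* [Editorial sketch, not part of the upstream source.]
   The list-head initialization above follows a double-checked pattern: read
   tt_task_pri_list without the lock, and only if it looks empty take
   tt_task_pri_lock and re-check before allocating, so the common case pays no
   lock. A self-contained equivalent with invented names (std::atomic makes
   the unlocked read well-defined in portable C++):

   #include <atomic>
   #include <mutex>

   struct List { int dummy; };
   static std::atomic<List *> g_list{nullptr};
   static std::mutex g_list_lock;

   static List *get_or_create_list() {
     List *l = g_list.load(std::memory_order_acquire);
     if (l == nullptr) {                   // fast path saw no list
       std::lock_guard<std::mutex> guard(g_list_lock);
       l = g_list.load(std::memory_order_relaxed);
       if (l == nullptr) {                 // re-check under the lock
         l = new List();
         g_list.store(l, std::memory_order_release);
       }
     }
     return l;
   }
*/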
475 
476 // __kmp_push_task: Add a task to the thread's deque
477 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
478  kmp_info_t *thread = __kmp_threads[gtid];
479  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
480 
481  // If we encounter a hidden helper task, and the current thread is not a
482  // hidden helper thread, we have to give the task to any hidden helper thread
483  // starting from its shadow one.
484  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
485  !KMP_HIDDEN_HELPER_THREAD(gtid))) {
486  kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
487  __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
488  // Signal the hidden helper threads.
489  __kmp_hidden_helper_worker_thread_signal();
490  return TASK_SUCCESSFULLY_PUSHED;
491  }
492 
493  kmp_task_team_t *task_team = thread->th.th_task_team;
494  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
495  kmp_thread_data_t *thread_data;
496 
497  KA_TRACE(20,
498  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
499 
500  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
501  // untied task needs to increment counter so that the task structure is not
502  // freed prematurely
503  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
504  KMP_DEBUG_USE_VAR(counter);
505  KA_TRACE(
506  20,
507  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
508  gtid, counter, taskdata));
509  }
510 
511  // The first check avoids building task_team thread data if serialized
512  if (UNLIKELY(taskdata->td_flags.task_serial)) {
513  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
514  "TASK_NOT_PUSHED for task %p\n",
515  gtid, taskdata));
516  return TASK_NOT_PUSHED;
517  }
518 
519  // Now that serialized tasks have returned, we can assume that we are not in
520  // immediate exec mode
521  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
522  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
523  __kmp_enable_tasking(task_team, thread);
524  }
525  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
526  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
527 
528  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
529  __kmp_max_task_priority > 0) {
530  int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
531  return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
532  }
533 
534  // Find tasking deque specific to encountering thread
535  thread_data = &task_team->tt.tt_threads_data[tid];
536 
537  // No lock needed since only owner can allocate. If the task is hidden_helper,
538  // we don't need it either because we have initialized the deque for hidden
539  // helper thread data.
540  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
541  __kmp_alloc_task_deque(thread, thread_data);
542  }
543 
544  int locked = 0;
545  // Check if deque is full
546  if (TCR_4(thread_data->td.td_deque_ntasks) >=
547  TASK_DEQUE_SIZE(thread_data->td)) {
548  if (__kmp_enable_task_throttling &&
549  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
550  thread->th.th_current_task)) {
551  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
552  "TASK_NOT_PUSHED for task %p\n",
553  gtid, taskdata));
554  return TASK_NOT_PUSHED;
555  } else {
556  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
557  locked = 1;
558  if (TCR_4(thread_data->td.td_deque_ntasks) >=
559  TASK_DEQUE_SIZE(thread_data->td)) {
560  // expand deque to push the task which is not allowed to execute
561  __kmp_realloc_task_deque(thread, thread_data);
562  }
563  }
564  }
565  // Lock the deque for the task push operation
566  if (!locked) {
567  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
568  // Need to recheck as we can get a proxy task from thread outside of OpenMP
569  if (TCR_4(thread_data->td.td_deque_ntasks) >=
570  TASK_DEQUE_SIZE(thread_data->td)) {
571  if (__kmp_enable_task_throttling &&
572  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
573  thread->th.th_current_task)) {
574  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
575  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
576  "returning TASK_NOT_PUSHED for task %p\n",
577  gtid, taskdata));
578  return TASK_NOT_PUSHED;
579  } else {
580  // expand deque to push the task which is not allowed to execute
581  __kmp_realloc_task_deque(thread, thread_data);
582  }
583  }
584  }
585  // Must have room since no thread other than the calling thread can add tasks
586  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
587  TASK_DEQUE_SIZE(thread_data->td));
588 
589  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
590  taskdata; // Push taskdata
591  // Wrap index.
592  thread_data->td.td_deque_tail =
593  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
594  TCW_4(thread_data->td.td_deque_ntasks,
595  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
596  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
597  KMP_FSYNC_RELEASING(taskdata); // releasing child
598  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
599  "task=%p ntasks=%d head=%u tail=%u\n",
600  gtid, taskdata, thread_data->td.td_deque_ntasks,
601  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
602 
603  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
604 
605  return TASK_SUCCESSFULLY_PUSHED;
606 }
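/* [Editorial sketch, not part of the upstream source.]
   __kmp_push_task checks for a full deque before taking td_deque_lock, then
   re-checks after acquiring it, because a proxy task pushed from outside the
   team can change the count in between. Stripped to its essentials, with
   invented names and std::mutex in place of the bootstrap lock:

   #include <mutex>

   struct Deque {
     std::mutex lock;
     int ntasks = 0;
     int capacity = 256;
     void grow() { capacity *= 2; } // stand-in for __kmp_realloc_task_deque
   };

   // Returns false if the push is refused (caller executes the task instead).
   static bool push_with_recheck(Deque &d, bool throttling_allows_refusal) {
     if (d.ntasks >= d.capacity && throttling_allows_refusal)
       return false;                  // cheap unlocked check
     std::lock_guard<std::mutex> g(d.lock);
     if (d.ntasks >= d.capacity) {    // re-check under the lock
       if (throttling_allows_refusal)
         return false;
       d.grow();                      // must make room for the task
     }
     ++d.ntasks;                      // ... enqueue the task here ...
     return true;
   }
*/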
607 
608 // __kmp_pop_current_task_from_thread: set up current task from called thread
609 // when team ends
610 //
611 // this_thr: thread structure to set current_task in.
612 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
613  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
614  "this_thread=%p, curtask=%p, "
615  "curtask_parent=%p\n",
616  0, this_thr, this_thr->th.th_current_task,
617  this_thr->th.th_current_task->td_parent));
618 
619  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
620 
621  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
622  "this_thread=%p, curtask=%p, "
623  "curtask_parent=%p\n",
624  0, this_thr, this_thr->th.th_current_task,
625  this_thr->th.th_current_task->td_parent));
626 }
627 
628 // __kmp_push_current_task_to_thread: set up current task in called thread for a
629 // new team
630 //
631 // this_thr: thread structure to set up
632 // team: team for implicit task data
633 // tid: thread within team to set up
634 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
635  int tid) {
636  // the current task of the thread is the parent of the newly created implicit
637  // tasks of the new team
638  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
639  "curtask=%p "
640  "parent_task=%p\n",
641  tid, this_thr, this_thr->th.th_current_task,
642  team->t.t_implicit_task_taskdata[tid].td_parent));
643 
644  KMP_DEBUG_ASSERT(this_thr != NULL);
645 
646  if (tid == 0) {
647  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
648  team->t.t_implicit_task_taskdata[0].td_parent =
649  this_thr->th.th_current_task;
650  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
651  }
652  } else {
653  team->t.t_implicit_task_taskdata[tid].td_parent =
654  team->t.t_implicit_task_taskdata[0].td_parent;
655  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
656  }
657 
658  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
659  "curtask=%p "
660  "parent_task=%p\n",
661  tid, this_thr, this_thr->th.th_current_task,
662  team->t.t_implicit_task_taskdata[tid].td_parent));
663 }
664 
665 // __kmp_task_start: bookkeeping for a task starting execution
666 //
667 // GTID: global thread id of calling thread
668 // task: task starting execution
669 // current_task: task suspending
670 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
671  kmp_taskdata_t *current_task) {
672  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
673  kmp_info_t *thread = __kmp_threads[gtid];
674 
675  KA_TRACE(10,
676  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
677  gtid, taskdata, current_task));
678 
679  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
680 
681  // mark currently executing task as suspended
682  // TODO: GEH - make sure root team implicit task is initialized properly.
683  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
684  current_task->td_flags.executing = 0;
685 
686 // Add task to stack if tied
687 #ifdef BUILD_TIED_TASK_STACK
688  if (taskdata->td_flags.tiedness == TASK_TIED) {
689  __kmp_push_task_stack(gtid, thread, taskdata);
690  }
691 #endif /* BUILD_TIED_TASK_STACK */
692 
693  // mark starting task as executing and as current task
694  thread->th.th_current_task = taskdata;
695 
696  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
697  taskdata->td_flags.tiedness == TASK_UNTIED);
698  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
699  taskdata->td_flags.tiedness == TASK_UNTIED);
700  taskdata->td_flags.started = 1;
701  taskdata->td_flags.executing = 1;
702  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
703  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
704 
705  // GEH TODO: shouldn't we pass some sort of location identifier here?
706  // APT: yes, we will pass location here.
707  // need to store current thread state (in a thread or taskdata structure)
708  // before setting work_state, otherwise wrong state is set after end of task
709 
710  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
711 
712  return;
713 }
714 
715 #if OMPT_SUPPORT
716 //------------------------------------------------------------------------------
717 
718 // __ompt_task_start:
719 // Build and trigger task-begin event
720 static inline void __ompt_task_start(kmp_task_t *task,
721  kmp_taskdata_t *current_task,
722  kmp_int32 gtid) {
723  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
724  ompt_task_status_t status = ompt_task_switch;
725  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
726  status = ompt_task_yield;
727  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
728  }
729  /* let OMPT know that we're about to run this task */
730  if (ompt_enabled.ompt_callback_task_schedule) {
731  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
732  &(current_task->ompt_task_info.task_data), status,
733  &(taskdata->ompt_task_info.task_data));
734  }
735  taskdata->ompt_task_info.scheduling_parent = current_task;
736 }
737 
738 // __ompt_task_finish:
739 // Build and trigger final task-schedule event
740 static inline void __ompt_task_finish(kmp_task_t *task,
741  kmp_taskdata_t *resumed_task,
742  ompt_task_status_t status) {
743  if (ompt_enabled.ompt_callback_task_schedule) {
744  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
745  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
746  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
747  status = ompt_task_cancel;
748  }
749 
750  /* let OMPT know that we're returning to the callee task */
751  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
752  &(taskdata->ompt_task_info.task_data), status,
753  (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
754  }
755 }
756 #endif
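/* [Editorial sketch, not part of the upstream source.]
   __ompt_task_start and __ompt_task_finish above drive the
   ompt_callback_task_schedule event. On the tool side, a callback with the
   signature defined by the OMPT interface observes every task switch,
   completion, cancellation or detach; registration would typically happen in
   the tool's ompt_initialize via the ompt_set_callback entry point. The
   callback name below is invented for the example:

   #include <omp-tools.h>
   #include <stdio.h>

   static void on_task_schedule(ompt_data_t *prior_task_data,
                                ompt_task_status_t prior_task_status,
                                ompt_data_t *next_task_data) {
     // prior_task_data/next_task_data are the tool-owned slots passed to the
     // runtime callbacks above.
     printf("task %p ends with status %d, task %p resumes\n",
            prior_task_data ? prior_task_data->ptr : (void *)0,
            (int)prior_task_status,
            next_task_data ? next_task_data->ptr : (void *)0);
   }
*/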
757 
758 template <bool ompt>
759 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
760  kmp_task_t *task,
761  void *frame_address,
762  void *return_address) {
763  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
764  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
765 
766  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
767  "current_task=%p\n",
768  gtid, loc_ref, taskdata, current_task));
769 
770  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
771  // untied task needs to increment counter so that the task structure is not
772  // freed prematurely
773  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
774  KMP_DEBUG_USE_VAR(counter);
775  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
776  "incremented for task %p\n",
777  gtid, counter, taskdata));
778  }
779 
780  taskdata->td_flags.task_serial =
781  1; // Execute this task immediately, not deferred.
782  __kmp_task_start(gtid, task, current_task);
783 
784 #if OMPT_SUPPORT
785  if (ompt) {
786  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
787  current_task->ompt_task_info.frame.enter_frame.ptr =
788  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
789  current_task->ompt_task_info.frame.enter_frame_flags =
790  taskdata->ompt_task_info.frame.exit_frame_flags =
791  OMPT_FRAME_FLAGS_APP;
792  }
793  if (ompt_enabled.ompt_callback_task_create) {
794  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
795  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
796  &(parent_info->task_data), &(parent_info->frame),
797  &(taskdata->ompt_task_info.task_data),
798  TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
799  }
800  __ompt_task_start(task, current_task, gtid);
801  }
802 #endif // OMPT_SUPPORT
803 
804  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
805  loc_ref, taskdata));
806 }
807 
808 #if OMPT_SUPPORT
809 OMPT_NOINLINE
810 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
811  kmp_task_t *task,
812  void *frame_address,
813  void *return_address) {
814  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
815  return_address);
816 }
817 #endif // OMPT_SUPPORT
818 
819 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
820 // execution
821 //
822 // loc_ref: source location information; points to beginning of task block.
823 // gtid: global thread number.
824 // task: task thunk for the started task.
825 #ifdef __s390x__
826 // This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
827 // In order for it to work correctly, the caller also needs to be compiled with
828 // backchain. If a caller is compiled without backchain,
829 // OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
830 // crash.
831 __attribute__((target("backchain")))
832 #endif
833 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
834  kmp_task_t *task) {
835 #if OMPT_SUPPORT
836  if (UNLIKELY(ompt_enabled.enabled)) {
837  OMPT_STORE_RETURN_ADDRESS(gtid);
838  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
839  OMPT_GET_FRAME_ADDRESS(1),
840  OMPT_LOAD_RETURN_ADDRESS(gtid));
841  return;
842  }
843 #endif
844  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
845 }
846 
847 #ifdef TASK_UNUSED
848 // __kmpc_omp_task_begin: report that a given task has started execution
849 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
850 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
851  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
852 
853  KA_TRACE(
854  10,
855  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
856  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
857 
858  __kmp_task_start(gtid, task, current_task);
859 
860  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
861  loc_ref, KMP_TASK_TO_TASKDATA(task)));
862  return;
863 }
864 #endif // TASK_UNUSED
865 
866 // __kmp_free_task: free the current task space and the space for shareds
867 //
868 // gtid: Global thread ID of calling thread
869 // taskdata: task to free
870 // thread: thread data structure of caller
871 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
872  kmp_info_t *thread) {
873  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
874  taskdata));
875 
876  // Check to make sure all flags and counters have the correct values
877  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
878  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
879  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
880  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
881  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
882  taskdata->td_flags.task_serial == 1);
883  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
884  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
885  // Clear data to not be re-used later by mistake.
886  task->data1.destructors = NULL;
887  task->data2.priority = 0;
888 
889  taskdata->td_flags.freed = 1;
890 #if OMPX_TASKGRAPH
891  // do not free tasks in taskgraph
892  if (!taskdata->is_taskgraph) {
893 #endif
894 // deallocate the taskdata and shared variable blocks associated with this task
895 #if USE_FAST_MEMORY
896  __kmp_fast_free(thread, taskdata);
897 #else /* ! USE_FAST_MEMORY */
898  __kmp_thread_free(thread, taskdata);
899 #endif
900 #if OMPX_TASKGRAPH
901  } else {
902  taskdata->td_flags.complete = 0;
903  taskdata->td_flags.started = 0;
904  taskdata->td_flags.freed = 0;
905  taskdata->td_flags.executing = 0;
906  taskdata->td_flags.task_serial =
907  (taskdata->td_parent->td_flags.final ||
908  taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
909 
910  // taskdata->td_allow_completion_event.pending_events_count = 1;
911  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
912  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
913  // start at one because counts current task and children
914  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
915  }
916 #endif
917 
918  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
919 }
920 
921 // __kmp_free_task_and_ancestors: free the current task and ancestors without
922 // children
923 //
924 // gtid: Global thread ID of calling thread
925 // taskdata: task to free
926 // thread: thread data structure of caller
927 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
928  kmp_taskdata_t *taskdata,
929  kmp_info_t *thread) {
930  // Proxy tasks must always be allowed to free their parents
931  // because they can be run in the background even in serial mode.
932  kmp_int32 team_serial =
933  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
934  !taskdata->td_flags.proxy;
935  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
936 
937  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
938  KMP_DEBUG_ASSERT(children >= 0);
939 
940  // Now, go up the ancestor tree to see if any ancestors can now be freed.
941  while (children == 0) {
942  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
943 
944  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
945  "and freeing itself\n",
946  gtid, taskdata));
947 
948  // --- Deallocate my ancestor task ---
949  __kmp_free_task(gtid, taskdata, thread);
950 
951  taskdata = parent_taskdata;
952 
953  if (team_serial)
954  return;
955  // Stop checking ancestors at implicit task instead of walking up ancestor
956  // tree to avoid premature deallocation of ancestors.
957  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
958  if (taskdata->td_dephash) { // do we need to cleanup dephash?
959  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
960  kmp_tasking_flags_t flags_old = taskdata->td_flags;
961  if (children == 0 && flags_old.complete == 1) {
962  kmp_tasking_flags_t flags_new = flags_old;
963  flags_new.complete = 0;
964  if (KMP_COMPARE_AND_STORE_ACQ32(
965  RCAST(kmp_int32 *, &taskdata->td_flags),
966  *RCAST(kmp_int32 *, &flags_old),
967  *RCAST(kmp_int32 *, &flags_new))) {
968  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
969  "dephash of implicit task %p\n",
970  gtid, taskdata));
971  // cleanup dephash of finished implicit task
972  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
973  }
974  }
975  }
976  return;
977  }
978  // Predecrement simulated by "- 1" calculation
979  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
980  KMP_DEBUG_ASSERT(children >= 0);
981  }
982 
983  KA_TRACE(
984  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
985  "not freeing it yet\n",
986  gtid, taskdata, children));
987 }
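/* [Editorial sketch, not part of the upstream source.]
   td_allocated_child_tasks is effectively a reference count: one for the task
   itself plus one per allocated child. When a task's count reaches zero it can
   be freed, and the release then cascades to its parent, which is what the
   loop above implements. The bare pattern, with invented names (the runtime
   additionally stops at implicit tasks and in serialized teams):

   #include <atomic>

   struct Node {
     Node *parent;
     std::atomic<int> refs; // 1 (self) + number of live children
   };

   static void release(Node *n) {
     while (n && n->refs.fetch_sub(1) - 1 == 0) {
       Node *parent = n->parent;
       delete n;   // nobody references this node anymore
       n = parent; // freeing the child also drops one reference on the parent
     }
   }
*/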
988 
989 // Only need to keep track of child task counts if any of the following:
990 // 1. team parallel and tasking not serialized;
991 // 2. it is a proxy or detachable or hidden helper task
992 // 3. the children counter of its parent task is greater than 0.
993 // The reason for the 3rd one is for serialized team that found detached task,
994 // hidden helper task, T. In this case, the execution of T is still deferred,
995 // and it is also possible that a regular task depends on T. In this case, if we
996 // don't track the children, task synchronization will be broken.
997 static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
998  kmp_tasking_flags_t flags = taskdata->td_flags;
999  bool ret = !(flags.team_serial || flags.tasking_ser);
1000  ret = ret || flags.proxy == TASK_PROXY ||
1001  flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
1002  ret = ret ||
1003  KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
1004 #if OMPX_TASKGRAPH
1005  if (taskdata->td_taskgroup && taskdata->is_taskgraph)
1006  ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
1007 #endif
1008  return ret;
1009 }
1010 
1011 // __kmp_task_finish: bookkeeping to do when a task finishes execution
1012 //
1013 // gtid: global thread ID for calling thread
1014 // task: task to be finished
1015 // resumed_task: task to be resumed. (may be NULL if task is serialized)
1016 //
1017 // template<ompt>: effectively ompt_enabled.enabled!=0
1018 // the version with ompt=false is inlined, allowing to optimize away all ompt
1019 // code in this case
1020 template <bool ompt>
1021 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
1022  kmp_taskdata_t *resumed_task) {
1023  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1024  kmp_info_t *thread = __kmp_threads[gtid];
1025  kmp_task_team_t *task_team =
1026  thread->th.th_task_team; // might be NULL for serial teams...
1027 #if OMPX_TASKGRAPH
1028  // to avoid seg fault when we need to access taskdata->td_flags after free when using vanilla taskloop
1029  bool is_taskgraph;
1030 #endif
1031 #if KMP_DEBUG
1032  kmp_int32 children = 0;
1033 #endif
1034  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
1035  "task %p\n",
1036  gtid, taskdata, resumed_task));
1037 
1038  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1039 
1040 #if OMPX_TASKGRAPH
1041  is_taskgraph = taskdata->is_taskgraph;
1042 #endif
1043 
1044 // Pop task from stack if tied
1045 #ifdef BUILD_TIED_TASK_STACK
1046  if (taskdata->td_flags.tiedness == TASK_TIED) {
1047  __kmp_pop_task_stack(gtid, thread, taskdata);
1048  }
1049 #endif /* BUILD_TIED_TASK_STACK */
1050 
1051  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1052  // untied task needs to check the counter so that the task structure is not
1053  // freed prematurely
1054  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
1055  KA_TRACE(
1056  20,
1057  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1058  gtid, counter, taskdata));
1059  if (counter > 0) {
1060  // untied task is not done, to be continued possibly by other thread, do
1061  // not free it now
1062  if (resumed_task == NULL) {
1063  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1064  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1065  // task is the parent
1066  }
1067  thread->th.th_current_task = resumed_task; // restore current_task
1068  resumed_task->td_flags.executing = 1; // resume previous task
1069  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1070  "resuming task %p\n",
1071  gtid, taskdata, resumed_task));
1072  return;
1073  }
1074  }
1075 
1076  // bookkeeping for resuming task:
1077  // GEH - note tasking_ser => task_serial
1078  KMP_DEBUG_ASSERT(
1079  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
1080  taskdata->td_flags.task_serial);
1081  if (taskdata->td_flags.task_serial) {
1082  if (resumed_task == NULL) {
1083  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1084  // task is the parent
1085  }
1086  } else {
1087  KMP_DEBUG_ASSERT(resumed_task !=
1088  NULL); // verify that resumed task is passed as argument
1089  }
1090 
1091  /* If the tasks' destructor thunk flag has been set, we need to invoke the
1092  destructor thunk that has been generated by the compiler. The code is
1093  placed here, since at this point other tasks might have been released
1094  hence overlapping the destructor invocations with some other work in the
1095  released tasks. The OpenMP spec is not specific on when the destructors
1096  are invoked, so we should be free to choose. */
1097  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1098  kmp_routine_entry_t destr_thunk = task->data1.destructors;
1099  KMP_ASSERT(destr_thunk);
1100  destr_thunk(gtid, task);
1101  }
1102 
1103  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
1104  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
1105  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
1106 
1107  bool completed = true;
1108  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1109  if (taskdata->td_allow_completion_event.type ==
1110  KMP_EVENT_ALLOW_COMPLETION) {
1111  // event hasn't been fulfilled yet. Try to detach task.
1112  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1113  if (taskdata->td_allow_completion_event.type ==
1114  KMP_EVENT_ALLOW_COMPLETION) {
1115  // task finished execution
1116  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1117  taskdata->td_flags.executing = 0; // suspend the finishing task
1118 
1119 #if OMPT_SUPPORT
1120  // For a detached task that is not yet completed, we switch back when
1121  // omp_fulfill_event signals completion;
1122  // locking is necessary to avoid a race with ompt_task_late_fulfill
1123  if (ompt)
1124  __ompt_task_finish(task, resumed_task, ompt_task_detach);
1125 #endif
1126 
1127  // no access to taskdata after this point!
1128  // __kmp_fulfill_event might free taskdata at any time from now
1129 
1130  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1131  completed = false;
1132  }
1133  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1134  }
1135  }
1136 
1137  // Tasks with valid target async handles must be re-enqueued.
1138  if (taskdata->td_target_data.async_handle != NULL) {
1139  // Note: no need to translate gtid to its shadow. If the current thread is a
1140  // hidden helper one, then the gtid is already correct. Otherwise, hidden
1141  // helper threads are disabled, and gtid refers to an OpenMP thread.
1142 #if OMPT_SUPPORT
1143  if (ompt) {
1144  __ompt_task_finish(task, resumed_task, ompt_task_switch);
1145  }
1146 #endif
1147  __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
1148  if (KMP_HIDDEN_HELPER_THREAD(gtid))
1149  __kmp_hidden_helper_worker_thread_signal();
1150  completed = false;
1151  }
1152 
1153  if (completed) {
1154  taskdata->td_flags.complete = 1; // mark the task as completed
1155 #if OMPX_TASKGRAPH
1156  taskdata->td_flags.onced = 1; // mark the task as ran once already
1157 #endif
1158 
1159 #if OMPT_SUPPORT
1160  // This is not a detached task, we are done here
1161  if (ompt)
1162  __ompt_task_finish(task, resumed_task, ompt_task_complete);
1163 #endif
1164  // TODO: What would be the balance between the conditions in the function
1165  // and an atomic operation?
1166  if (__kmp_track_children_task(taskdata)) {
1167  __kmp_release_deps(gtid, taskdata);
1168  // Predecrement simulated by "- 1" calculation
1169 #if KMP_DEBUG
1170  children = -1 +
1171 #endif
1172  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1173  KMP_DEBUG_ASSERT(children >= 0);
1174 #if OMPX_TASKGRAPH
1175  if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
1176 #else
1177  if (taskdata->td_taskgroup)
1178 #endif
1179  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1180  } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
1181  task_team->tt.tt_hidden_helper_task_encountered)) {
1182  // if we found proxy or hidden helper tasks there could exist a dependency
1183  // chain with the proxy task as origin
1184  __kmp_release_deps(gtid, taskdata);
1185  }
1186  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
1187  // called. Otherwise, if a task is executed immediately from the
1188  // release_deps code, the flag will be reset to 1 again by this same
1189  // function
1190  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1191  taskdata->td_flags.executing = 0; // suspend the finishing task
1192 
1193  // Decrement the counter of hidden helper tasks to be executed.
1194  if (taskdata->td_flags.hidden_helper) {
1195  // Hidden helper tasks can only be executed by hidden helper threads.
1196  KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1197  KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1198  }
1199  }
1200 
1201  KA_TRACE(
1202  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1203  gtid, taskdata, children));
1204 
1205  // Free this task and then ancestor tasks if they have no children.
1206  // Restore th_current_task first as suggested by John:
1207  // johnmc: if an asynchronous inquiry peers into the runtime system
1208  // it doesn't see the freed task as the current task.
1209  thread->th.th_current_task = resumed_task;
1210  if (completed)
1211  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
1212 
1213  // TODO: GEH - make sure root team implicit task is initialized properly.
1214  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1215  resumed_task->td_flags.executing = 1; // resume previous task
1216 
1217 #if OMPX_TASKGRAPH
1218  if (is_taskgraph && __kmp_track_children_task(taskdata) &&
1219  taskdata->td_taskgroup) {
1220  // TDG: we only release taskgroup barrier here because
1221  // free_task_and_ancestors will call
1222  // __kmp_free_task, which resets all task parameters such as
1223  // taskdata->started, etc. If we release the barrier earlier, these
1224  // parameters could be read before being reset. This is not an issue for
1225  // non-TDG implementation because we never reuse a task(data) structure
1226  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1227  }
1228 #endif
1229 
1230  KA_TRACE(
1231  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1232  gtid, taskdata, resumed_task));
1233 
1234  return;
1235 }
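/* [Editorial sketch, not part of the upstream source.]
   The TASK_DETACHABLE branch above implements the OpenMP `detach` clause: if
   the allow-completion event has not been fulfilled when the task body
   returns, the task is turned into a proxy and only completes later, when
   omp_fulfill_event is called. From user code this looks roughly like the
   snippet below; start_async_io/io_done are hypothetical asynchronous APIs,
   and the pointer round-trip assumes omp_event_handle_t fits in a pointer, as
   it does in this runtime.

   #include <omp.h>
   #include <stdint.h>

   extern void start_async_io(void (*done_cb)(void *), void *arg); // hypothetical

   static void io_done(void *arg) {
     omp_fulfill_event((omp_event_handle_t)(uintptr_t)arg); // completes the task
   }

   void example(void) {
     omp_event_handle_t ev;
     #pragma omp task detach(ev)
     {
       // The task body returns immediately; completion is deferred until
       // io_done() fulfills the event.
       start_async_io(io_done, (void *)(uintptr_t)ev);
     }
     #pragma omp taskwait // also waits until the event has been fulfilled
   }
*/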
1236 
1237 template <bool ompt>
1238 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1239  kmp_int32 gtid,
1240  kmp_task_t *task) {
1241  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1242  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1243  KMP_DEBUG_ASSERT(gtid >= 0);
1244  // this routine will provide task to resume
1245  __kmp_task_finish<ompt>(gtid, task, NULL);
1246 
1247  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1248  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1249 
1250 #if OMPT_SUPPORT
1251  if (ompt) {
1252  ompt_frame_t *ompt_frame;
1253  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1254  ompt_frame->enter_frame = ompt_data_none;
1255  ompt_frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1256  }
1257 #endif
1258 
1259  return;
1260 }
1261 
1262 #if OMPT_SUPPORT
1263 OMPT_NOINLINE
1264 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1265  kmp_task_t *task) {
1266  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1267 }
1268 #endif // OMPT_SUPPORT
1269 
1270 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1271 //
1272 // loc_ref: source location information; points to end of task block.
1273 // gtid: global thread number.
1274 // task: task thunk for the completed task.
1275 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1276  kmp_task_t *task) {
1277 #if OMPT_SUPPORT
1278  if (UNLIKELY(ompt_enabled.enabled)) {
1279  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1280  return;
1281  }
1282 #endif
1283  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1284 }
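/* [Editorial sketch, not part of the upstream source.]
   __kmpc_omp_task_begin_if0 / __kmpc_omp_task_complete_if0 bracket a task that
   the compiler decided to run undeferred (for example, an if(0) task): the
   task is still allocated, but the encountering thread invokes the entry point
   directly between the two calls instead of pushing the task onto a deque.
   The outline below is only an approximation of real compiler output:

   kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, flags, sizeof_kmp_task_t,
                                         sizeof_shareds, &task_entry);
   // ... copy firstprivates and shared-variable pointers into t ...
   __kmpc_omp_task_begin_if0(loc, gtid, t);
   task_entry(gtid, t); // run the task body inline on the encountering thread
   __kmpc_omp_task_complete_if0(loc, gtid, t);
*/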
1285 
1286 #ifdef TASK_UNUSED
1287 // __kmpc_omp_task_complete: report that a task has completed execution
1288 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1289 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1290  kmp_task_t *task) {
1291  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1292  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1293 
1294  __kmp_task_finish<false>(gtid, task,
1295  NULL); // Not sure how to find task to resume
1296 
1297  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1298  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1299  return;
1300 }
1301 #endif // TASK_UNUSED
1302 
1303 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1304 // task for a given thread
1305 //
1306 // loc_ref: reference to source location of parallel region
1307 // this_thr: thread data structure corresponding to implicit task
1308 // team: team for this_thr
1309 // tid: thread id of given thread within team
1310 // set_curr_task: TRUE if need to push current task to thread
1311 // NOTE: Routine does not set up the implicit task ICVs. This is assumed to
1312 // have already been done elsewhere.
1313 // TODO: Get better loc_ref. Value passed in may be NULL
1314 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1315  kmp_team_t *team, int tid, int set_curr_task) {
1316  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1317 
1318  KF_TRACE(
1319  10,
1320  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1321  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1322 
1323  task->td_task_id = KMP_GEN_TASK_ID();
1324  task->td_team = team;
1325  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1326  // in debugger)
1327  task->td_ident = loc_ref;
1328  task->td_taskwait_ident = NULL;
1329  task->td_taskwait_counter = 0;
1330  task->td_taskwait_thread = 0;
1331 
1332  task->td_flags.tiedness = TASK_TIED;
1333  task->td_flags.tasktype = TASK_IMPLICIT;
1334  task->td_flags.proxy = TASK_FULL;
1335 
1336  // All implicit tasks are executed immediately, not deferred
1337  task->td_flags.task_serial = 1;
1338  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1339  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1340 
1341  task->td_flags.started = 1;
1342  task->td_flags.executing = 1;
1343  task->td_flags.complete = 0;
1344  task->td_flags.freed = 0;
1345 #if OMPX_TASKGRAPH
1346  task->td_flags.onced = 0;
1347 #endif
1348 
1349  task->td_depnode = NULL;
1350  task->td_last_tied = task;
1351  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1352 
1353  if (set_curr_task) { // only do this init first time thread is created
1354  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1355  // Not used: don't need to deallocate implicit task
1356  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1357  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1358  task->td_dephash = NULL;
1359  __kmp_push_current_task_to_thread(this_thr, team, tid);
1360  } else {
1361  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1362  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1363  }
1364 
1365 #if OMPT_SUPPORT
1366  if (UNLIKELY(ompt_enabled.enabled))
1367  __ompt_task_init(task, tid);
1368 #endif
1369 
1370  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1371  team, task));
1372 }
1373 
1374 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1375 // at the end of parallel regions. Some resources are kept for reuse in the next
1376 // parallel region.
1377 //
1378 // thread: thread data structure corresponding to implicit task
1379 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1380  kmp_taskdata_t *task = thread->th.th_current_task;
1381  if (task->td_dephash) {
1382  int children;
1383  task->td_flags.complete = 1;
1384 #if OMPX_TASKGRAPH
1385  task->td_flags.onced = 1;
1386 #endif
1387  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1388  kmp_tasking_flags_t flags_old = task->td_flags;
1389  if (children == 0 && flags_old.complete == 1) {
1390  kmp_tasking_flags_t flags_new = flags_old;
1391  flags_new.complete = 0;
1392  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1393  *RCAST(kmp_int32 *, &flags_old),
1394  *RCAST(kmp_int32 *, &flags_new))) {
1395  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1396  "dephash of implicit task %p\n",
1397  thread->th.th_info.ds.ds_gtid, task));
1398  __kmp_dephash_free_entries(thread, task->td_dephash);
1399  }
1400  }
1401  }
1402 }
1403 
1404 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1405 // when these regions are destroyed
1406 //
1407 // thread: thread data structure corresponding to implicit task
1408 void __kmp_free_implicit_task(kmp_info_t *thread) {
1409  kmp_taskdata_t *task = thread->th.th_current_task;
1410  if (task && task->td_dephash) {
1411  __kmp_dephash_free(thread, task->td_dephash);
1412  task->td_dephash = NULL;
1413  }
1414 }
1415 
1416 // Round up a size to a power of two specified by val: Used to insert padding
1417 // between structures co-allocated using a single malloc() call
1418 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1419  if (size & (val - 1)) {
1420  size &= ~(val - 1);
1421  if (size <= KMP_SIZE_T_MAX - val) {
1422  size += val; // Round up if there is no overflow.
1423  }
1424  }
1425  return size;
1426 } // __kmp_round_up_to_val
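// Example (editorial addition): with val == sizeof(void *) == 8,
// __kmp_round_up_to_val(52, 8) returns 56 and __kmp_round_up_to_val(64, 8)
// returns 64, so the shareds block co-allocated after the taskdata/task
// structs starts on a pointer-aligned boundary.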
1427 
1428 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1429 //
1430 // loc_ref: source location information
1431 // gtid: global thread number.
1432 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1433 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1434 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1435 // private vars accessed in task.
1436 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1437 // in task.
1438 // task_entry: Pointer to task code entry point generated by compiler.
1439 // returns: a pointer to the allocated kmp_task_t structure (task).
1440 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1441  kmp_tasking_flags_t *flags,
1442  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1443  kmp_routine_entry_t task_entry) {
1444  kmp_task_t *task;
1445  kmp_taskdata_t *taskdata;
1446  kmp_info_t *thread = __kmp_threads[gtid];
1447  kmp_team_t *team = thread->th.th_team;
1448  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1449  size_t shareds_offset;
1450 
1451  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1452  __kmp_middle_initialize();
1453 
1454  if (flags->hidden_helper) {
1455  if (__kmp_enable_hidden_helper) {
1456  if (!TCR_4(__kmp_init_hidden_helper))
1457  __kmp_hidden_helper_initialize();
1458  } else {
1459  // If the hidden helper task is not enabled, reset the flag to FALSE.
1460  flags->hidden_helper = FALSE;
1461  }
1462  }
1463 
1464  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1465  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1466  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1467  sizeof_shareds, task_entry));
1468 
1469  KMP_DEBUG_ASSERT(parent_task);
1470  if (parent_task->td_flags.final) {
1471  if (flags->merged_if0) {
1472  }
1473  flags->final = 1;
1474  }
1475 
1476  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1477  // Untied task encountered causes the TSC algorithm to check entire deque of
1478  // the victim thread. If no untied task encountered, then checking the head
1479  // of the deque should be enough.
1480  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1481  }
1482 
1483  // Detachable tasks are not proxy tasks yet but could be in the future.
1484  // Doing the tasking setup when that happens would be too late, so it is
1485  // done here at allocation time.
1486  if (UNLIKELY(flags->proxy == TASK_PROXY ||
1487  flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1488  if (flags->proxy == TASK_PROXY) {
1489  flags->tiedness = TASK_UNTIED;
1490  flags->merged_if0 = 1;
1491  }
1492  /* are we running in a serialized parallel region or tskm_immediate_exec
1493  mode... we need tasking support enabled */
1494  if ((thread->th.th_task_team) == NULL) {
1495  /* This should only happen if the team is serialized
1496  setup a task team and propagate it to the thread */
1497  KMP_DEBUG_ASSERT(team->t.t_serialized);
1498  KA_TRACE(30,
1499  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1500  gtid));
1501  __kmp_task_team_setup(thread, team);
1502  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1503  }
1504  kmp_task_team_t *task_team = thread->th.th_task_team;
1505 
1506  /* tasking must be enabled now as the task might not be pushed */
1507  if (!KMP_TASKING_ENABLED(task_team)) {
1508  KA_TRACE(
1509  30,
1510  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1511  __kmp_enable_tasking(task_team, thread);
1512  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1513  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1514  // No lock needed since only owner can allocate
1515  if (thread_data->td.td_deque == NULL) {
1516  __kmp_alloc_task_deque(thread, thread_data);
1517  }
1518  }
1519 
1520  if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1521  task_team->tt.tt_found_proxy_tasks == FALSE)
1522  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1523  if (flags->hidden_helper &&
1524  task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1525  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1526  }
1527 
1528  // Calculate shared structure offset including padding after kmp_task_t struct
1529  // to align pointers in shared struct
1530  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1531  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1532 
1533  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1534  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1535  shareds_offset));
1536  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1537  sizeof_shareds));
1538 
1539  // Avoid double allocation here by combining shareds with taskdata
1540 #if USE_FAST_MEMORY
1541  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1542  sizeof_shareds);
1543 #else /* ! USE_FAST_MEMORY */
1544  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1545  sizeof_shareds);
1546 #endif /* USE_FAST_MEMORY */
1547 
1548  task = KMP_TASKDATA_TO_TASK(taskdata);
1549 
1550 // Make sure task & taskdata are aligned appropriately
1551 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
1552  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1553  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1554 #else
1555  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1556  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1557 #endif
1558  if (sizeof_shareds > 0) {
1559  // Avoid double allocation here by combining shareds with taskdata
1560  task->shareds = &((char *)taskdata)[shareds_offset];
1561  // Make sure shareds struct is aligned to pointer size
1562  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1563  0);
1564  } else {
1565  task->shareds = NULL;
1566  }
1567  task->routine = task_entry;
1568  task->part_id = 0; // AC: Always start with 0 part id
1569 
1570  taskdata->td_task_id = KMP_GEN_TASK_ID();
1571  taskdata->td_team = thread->th.th_team;
1572  taskdata->td_alloc_thread = thread;
1573  taskdata->td_parent = parent_task;
1574  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1575  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1576  taskdata->td_ident = loc_ref;
1577  taskdata->td_taskwait_ident = NULL;
1578  taskdata->td_taskwait_counter = 0;
1579  taskdata->td_taskwait_thread = 0;
1580  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1581  // avoid copying icvs for proxy tasks
1582  if (flags->proxy == TASK_FULL)
1583  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1584 
1585  taskdata->td_flags = *flags;
1586  taskdata->td_task_team = thread->th.th_task_team;
1587  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1588  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1589  // If it is hidden helper task, we need to set the team and task team
1590  // correspondingly.
1591  if (flags->hidden_helper) {
1592  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1593  taskdata->td_team = shadow_thread->th.th_team;
1594  taskdata->td_task_team = shadow_thread->th.th_task_team;
1595  }
1596 
1597  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1598  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1599 
1600  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1601  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1602 
1603  // GEH - Note we serialize the task if the team is serialized to make sure
1604  // implicit parallel region tasks are not left until program termination to
1605  // execute. Also, it helps locality to execute immediately.
1606 
1607  taskdata->td_flags.task_serial =
1608  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1609  taskdata->td_flags.tasking_ser || flags->merged_if0);
1610 
1611  taskdata->td_flags.started = 0;
1612  taskdata->td_flags.executing = 0;
1613  taskdata->td_flags.complete = 0;
1614  taskdata->td_flags.freed = 0;
1615 #if OMPX_TASKGRAPH
1616  taskdata->td_flags.onced = 0;
1617 #endif
1618  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1619  // start at one because counts current task and children
1620  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1621  taskdata->td_taskgroup =
1622  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1623  taskdata->td_dephash = NULL;
1624  taskdata->td_depnode = NULL;
1625  taskdata->td_target_data.async_handle = NULL;
1626  if (flags->tiedness == TASK_UNTIED)
1627  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1628  else
1629  taskdata->td_last_tied = taskdata;
1630  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1631 #if OMPT_SUPPORT
1632  if (UNLIKELY(ompt_enabled.enabled))
1633  __ompt_task_init(taskdata, gtid);
1634 #endif
1635  // TODO: What would be the balance between the conditions in the function and
1636  // an atomic operation?
1637  if (__kmp_track_children_task(taskdata)) {
1638  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1639  if (parent_task->td_taskgroup)
1640  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1641  // Only need to keep track of allocated child tasks for explicit tasks since
1642  // implicit tasks are not deallocated
1643  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1644  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1645  }
1646  if (flags->hidden_helper) {
1647  taskdata->td_flags.task_serial = FALSE;
1648  // Increment the number of hidden helper tasks to be executed
1649  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1650  }
1651  }
1652 
1653 #if OMPX_TASKGRAPH
1654  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1655  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1656  (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1657  taskdata->is_taskgraph = 1;
1658  taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1659  taskdata->td_task_id = KMP_GEN_TASK_ID();
1660  taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1661  }
1662 #endif
1663  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1664  gtid, taskdata, taskdata->td_parent));
1665 
1666  return task;
1667 }
1668 
1669 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1670  kmp_int32 flags, size_t sizeof_kmp_task_t,
1671  size_t sizeof_shareds,
1672  kmp_routine_entry_t task_entry) {
1673  kmp_task_t *retval;
1674  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1675  __kmp_assert_valid_gtid(gtid);
1676  input_flags->native = FALSE;
1677  // __kmp_task_alloc() sets up all other runtime flags
1678  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1679  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1680  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1681  input_flags->proxy ? "proxy" : "",
1682  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1683  sizeof_shareds, task_entry));
1684 
1685  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1686  sizeof_shareds, task_entry);
1687 
1688  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1689 
1690  return retval;
1691 }
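// Illustrative sketch (editorial addition, not part of the upstream source):
// how a compiler might lower "#pragma omp task shared(x)" onto
// __kmpc_omp_task_alloc() and __kmpc_omp_task() (defined later in this file).
// The names `my_shareds_t`, `task_entry_fn` and `emit_task_example` are
// hypothetical; the block is kept under "#if 0" so it does not affect the
// build.
#if 0
typedef struct {
  int *x; // address of the shared variable
} my_shareds_t;

static kmp_int32 task_entry_fn(kmp_int32 gtid, kmp_task_t *t) {
  my_shareds_t *sh = (my_shareds_t *)t->shareds;
  *(sh->x) += 1; // outlined task body
  return 0;
}

static void emit_task_example(ident_t *loc, kmp_int32 gtid, int *x) {
  kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, /*flags=*/1 /*tied*/,
                                        sizeof(kmp_task_t),
                                        sizeof(my_shareds_t),
                                        (kmp_routine_entry_t)&task_entry_fn);
  ((my_shareds_t *)t->shareds)->x = x; // capture the shared variable
  __kmpc_omp_task(loc, gtid, t); // defer, or run immediately if not queueable
}
#endif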
1692 
1693 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1694  kmp_int32 flags,
1695  size_t sizeof_kmp_task_t,
1696  size_t sizeof_shareds,
1697  kmp_routine_entry_t task_entry,
1698  kmp_int64 device_id) {
1699  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1700  // target tasks are untied, as defined in the specification
1701  input_flags.tiedness = TASK_UNTIED;
1702  input_flags.target = 1;
1703 
1704  if (__kmp_enable_hidden_helper)
1705  input_flags.hidden_helper = TRUE;
1706 
1707  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1708  sizeof_shareds, task_entry);
1709 }
1710 
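// __kmpc_omp_reg_task_with_affinity: register, on behalf of the compiler,
// affinity information (affin_list/naffins) for a task. The current
// implementation does not use this information and simply returns 0.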
1724 kmp_int32
1725 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1726  kmp_task_t *new_task, kmp_int32 naffins,
1727  kmp_task_affinity_info_t *affin_list) {
1728  return 0;
1729 }
1730 
1731 // __kmp_invoke_task: invoke the specified task
1732 //
1733 // gtid: global thread ID of caller
1734 // task: the task to invoke
1735 // current_task: the task to resume after task invocation
1736 #ifdef __s390x__
1737 __attribute__((target("backchain")))
1738 #endif
1739 static void
1740 __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1741  kmp_taskdata_t *current_task) {
1742  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1743  kmp_info_t *thread;
1744  int discard = 0 /* false */;
1745  KA_TRACE(
1746  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1747  gtid, taskdata, current_task));
1748  KMP_DEBUG_ASSERT(task);
1749  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1750  taskdata->td_flags.complete == 1)) {
1751  // This is a proxy task that was already completed but it needs to run
1752  // its bottom-half finish
1753  KA_TRACE(
1754  30,
1755  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1756  gtid, taskdata));
1757 
1758  __kmp_bottom_half_finish_proxy(gtid, task);
1759 
1760  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1761  "proxy task %p, resuming task %p\n",
1762  gtid, taskdata, current_task));
1763 
1764  return;
1765  }
1766 
1767 #if OMPT_SUPPORT
1768  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1769  // does not execute code.
1770  ompt_thread_info_t oldInfo;
1771  if (UNLIKELY(ompt_enabled.enabled)) {
1772  // Store the threads states and restore them after the task
1773  thread = __kmp_threads[gtid];
1774  oldInfo = thread->th.ompt_thread_info;
1775  thread->th.ompt_thread_info.wait_id = 0;
1776  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1777  ? ompt_state_work_serial
1778  : ompt_state_work_parallel;
1779  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1780  }
1781 #endif
1782 
1783  // Proxy tasks are not handled by the runtime
1784  if (taskdata->td_flags.proxy != TASK_PROXY) {
1785  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1786  }
1787 
1788  // TODO: cancel tasks if the parallel region has also been cancelled
1789  // TODO: check if this sequence can be hoisted above __kmp_task_start
1790  // if cancellation has been enabled for this run ...
1791  if (UNLIKELY(__kmp_omp_cancellation)) {
1792  thread = __kmp_threads[gtid];
1793  kmp_team_t *this_team = thread->th.th_team;
1794  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1795  if ((taskgroup && taskgroup->cancel_request) ||
1796  (this_team->t.t_cancel_request == cancel_parallel)) {
1797 #if OMPT_SUPPORT && OMPT_OPTIONAL
1798  ompt_data_t *task_data;
1799  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1800  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1801  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1802  task_data,
1803  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1804  : ompt_cancel_parallel) |
1805  ompt_cancel_discarded_task,
1806  NULL);
1807  }
1808 #endif
1809  KMP_COUNT_BLOCK(TASK_cancelled);
1810  // this task belongs to a task group and we need to cancel it
1811  discard = 1 /* true */;
1812  }
1813  }
1814 
1815  // Invoke the task routine and pass in relevant data.
1816  // Thunks generated by gcc take a different argument list.
1817  if (!discard) {
1818  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1819  taskdata->td_last_tied = current_task->td_last_tied;
1820  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1821  }
1822 #if KMP_STATS_ENABLED
1823  KMP_COUNT_BLOCK(TASK_executed);
1824  switch (KMP_GET_THREAD_STATE()) {
1825  case FORK_JOIN_BARRIER:
1826  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1827  break;
1828  case PLAIN_BARRIER:
1829  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1830  break;
1831  case TASKYIELD:
1832  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1833  break;
1834  case TASKWAIT:
1835  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1836  break;
1837  case TASKGROUP:
1838  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1839  break;
1840  default:
1841  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1842  break;
1843  }
1844 #endif // KMP_STATS_ENABLED
1845 
1846 // OMPT task begin
1847 #if OMPT_SUPPORT
1848  if (UNLIKELY(ompt_enabled.enabled))
1849  __ompt_task_start(task, current_task, gtid);
1850 #endif
1851 #if OMPT_SUPPORT && OMPT_OPTIONAL
1852  if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1853  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1854  ompt_data_t instance = ompt_data_none;
1855  instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1856  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1857  ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1858  &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1859  ompt_dispatch_taskloop_chunk, instance);
1860  taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1861  }
1862 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1863 
1864 #if OMPD_SUPPORT
1865  if (ompd_state & OMPD_ENABLE_BP)
1866  ompd_bp_task_begin();
1867 #endif
1868 
1869 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1870  kmp_uint64 cur_time;
1871  kmp_int32 kmp_itt_count_task =
1872  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1873  current_task->td_flags.tasktype == TASK_IMPLICIT;
1874  if (kmp_itt_count_task) {
1875  thread = __kmp_threads[gtid];
1876  // Time outer level explicit task on barrier for adjusting imbalance time
1877  if (thread->th.th_bar_arrive_time)
1878  cur_time = __itt_get_timestamp();
1879  else
1880  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1881  }
1882  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1883 #endif
1884 
1885 #if ENABLE_LIBOMPTARGET
1886  if (taskdata->td_target_data.async_handle != NULL) {
1887  // If we have a valid target async handle, that means that we have already
1888  // executed the task routine once. We must query for the handle completion
1889  // instead of re-executing the routine.
1890  KMP_ASSERT(tgt_target_nowait_query);
1891  tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1892  } else
1893 #endif
1894  if (task->routine != NULL) {
1895 #ifdef KMP_GOMP_COMPAT
1896  if (taskdata->td_flags.native) {
1897  ((void (*)(void *))(*(task->routine)))(task->shareds);
1898  } else
1899 #endif /* KMP_GOMP_COMPAT */
1900  {
1901  (*(task->routine))(gtid, task);
1902  }
1903  }
1904  KMP_POP_PARTITIONED_TIMER();
1905 
1906 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1907  if (kmp_itt_count_task) {
1908  // Barrier imbalance - adjust arrive time with the task duration
1909  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1910  }
1911  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1912  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1913 #endif
1914  }
1915 
1916 #if OMPD_SUPPORT
1917  if (ompd_state & OMPD_ENABLE_BP)
1918  ompd_bp_task_end();
1919 #endif
1920 
1921  // Proxy tasks are not handled by the runtime
1922  if (taskdata->td_flags.proxy != TASK_PROXY) {
1923 #if OMPT_SUPPORT
1924  if (UNLIKELY(ompt_enabled.enabled)) {
1925  thread->th.ompt_thread_info = oldInfo;
1926  if (taskdata->td_flags.tiedness == TASK_TIED) {
1927  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1928  }
1929  __kmp_task_finish<true>(gtid, task, current_task);
1930  } else
1931 #endif
1932  __kmp_task_finish<false>(gtid, task, current_task);
1933  }
1934 #if OMPT_SUPPORT
1935  else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1936  __ompt_task_finish(task, current_task, ompt_task_switch);
1937  }
1938 #endif
1939 
1940  KA_TRACE(
1941  30,
1942  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1943  gtid, taskdata, current_task));
1944  return;
1945 }
1946 
1947 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1948 //
1949 // loc_ref: location of original task pragma (ignored)
1950 // gtid: Global Thread ID of encountering thread
1951 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1952 // Returns:
1953 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1954 // be resumed later.
1955 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1956 // resumed later.
1957 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1958  kmp_task_t *new_task) {
1959  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1960 
1961  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1962  loc_ref, new_taskdata));
1963 
1964 #if OMPT_SUPPORT
1965  kmp_taskdata_t *parent;
1966  if (UNLIKELY(ompt_enabled.enabled)) {
1967  parent = new_taskdata->td_parent;
1968  if (ompt_enabled.ompt_callback_task_create) {
1969  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1970  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1971  &(new_taskdata->ompt_task_info.task_data),
1972  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1973  OMPT_GET_RETURN_ADDRESS(0));
1974  }
1975  }
1976 #endif
1977 
1978  /* Should we execute the new task or queue it? For now, let's just always try
1979  to queue it. If the queue fills up, then we'll execute it. */
1980 
1981  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1982  { // Execute this task immediately
1983  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1984  new_taskdata->td_flags.task_serial = 1;
1985  __kmp_invoke_task(gtid, new_task, current_task);
1986  }
1987 
1988  KA_TRACE(
1989  10,
1990  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1991  "loc=%p task=%p\n",
1992  gtid, loc_ref, new_taskdata));
1993 
1994 #if OMPT_SUPPORT
1995  if (UNLIKELY(ompt_enabled.enabled)) {
1996  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1997  parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1998  }
1999 #endif
2000  return TASK_CURRENT_NOT_QUEUED;
2001 }
2002 
2003 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
2004 //
2005 // gtid: Global Thread ID of encountering thread
2006 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
2007 // serialize_immediate: if TRUE then if the task is executed immediately its
2008 // execution will be serialized
2009 // Returns:
2010 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2011 // be resumed later.
2012 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2013 // resumed later.
2014 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
2015  bool serialize_immediate) {
2016  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2017 
2018 #if OMPX_TASKGRAPH
2019  if (new_taskdata->is_taskgraph &&
2020  __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
2021  kmp_tdg_info_t *tdg = new_taskdata->tdg;
2022  // extend the record_map if needed
2023  if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
2024  __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
2025  // map_size could have been updated by another thread if recursive
2026  // taskloop
2027  if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
2028  kmp_uint old_size = tdg->map_size;
2029  kmp_uint new_size = old_size * 2;
2030  kmp_node_info_t *old_record = tdg->record_map;
2031  kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
2032  new_size * sizeof(kmp_node_info_t));
2033 
2034  KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
2035  tdg->record_map = new_record;
2036 
2037  __kmp_free(old_record);
2038 
2039  for (kmp_int i = old_size; i < new_size; i++) {
2040  kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
2041  __kmp_successors_size * sizeof(kmp_int32));
2042  new_record[i].task = nullptr;
2043  new_record[i].successors = successorsList;
2044  new_record[i].nsuccessors = 0;
2045  new_record[i].npredecessors = 0;
2046  new_record[i].successors_size = __kmp_successors_size;
2047  KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
2048  }
2049  // update the size at the end, so that other threads do not index the old
2050  // record_map with the already-updated map_size
2051  tdg->map_size = new_size;
2052  }
2053  __kmp_release_bootstrap_lock(&tdg->graph_lock);
2054  }
2055  // record a task
2056  if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
2057  tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
2058  tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
2059  new_taskdata->td_parent;
2060  KMP_ATOMIC_INC(&tdg->num_tasks);
2061  }
2062  }
2063 #endif
2064 
2065  /* Should we execute the new task or queue it? For now, let's just always try
2066  to queue it. If the queue fills up, then we'll execute it. */
2067  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
2068  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
2069  { // Execute this task immediately
2070  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
2071  if (serialize_immediate)
2072  new_taskdata->td_flags.task_serial = 1;
2073  __kmp_invoke_task(gtid, new_task, current_task);
2074  } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2075  __kmp_wpolicy_passive) {
2076  kmp_info_t *this_thr = __kmp_threads[gtid];
2077  kmp_team_t *team = this_thr->th.th_team;
2078  kmp_int32 nthreads = this_thr->th.th_team_nproc;
2079  for (int i = 0; i < nthreads; ++i) {
2080  kmp_info_t *thread = team->t.t_threads[i];
2081  if (thread == this_thr)
2082  continue;
2083  if (thread->th.th_sleep_loc != NULL) {
2084  __kmp_null_resume_wrapper(thread);
2085  break; // awake one thread at a time
2086  }
2087  }
2088  }
2089  return TASK_CURRENT_NOT_QUEUED;
2090 }
2091 
2092 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
2093 // non-thread-switchable task from the parent thread only!
2094 //
2095 // loc_ref: location of original task pragma (ignored)
2096 // gtid: Global Thread ID of encountering thread
2097 // new_task: non-thread-switchable task thunk allocated by
2098 // __kmp_omp_task_alloc()
2099 // Returns:
2100 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2101 // be resumed later.
2102 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2103 // resumed later.
2104 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
2105  kmp_task_t *new_task) {
2106  kmp_int32 res;
2107  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2108 
2109 #if KMP_DEBUG || OMPT_SUPPORT
2110  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2111 #endif
2112  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2113  new_taskdata));
2114  __kmp_assert_valid_gtid(gtid);
2115 
2116 #if OMPT_SUPPORT
2117  kmp_taskdata_t *parent = NULL;
2118  if (UNLIKELY(ompt_enabled.enabled)) {
2119  if (!new_taskdata->td_flags.started) {
2120  OMPT_STORE_RETURN_ADDRESS(gtid);
2121  parent = new_taskdata->td_parent;
2122  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
2123  parent->ompt_task_info.frame.enter_frame.ptr =
2124  OMPT_GET_FRAME_ADDRESS(0);
2125  }
2126  if (ompt_enabled.ompt_callback_task_create) {
2127  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2128  &(parent->ompt_task_info.task_data),
2129  &(parent->ompt_task_info.frame),
2130  &(new_taskdata->ompt_task_info.task_data),
2131  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2132  OMPT_LOAD_RETURN_ADDRESS(gtid));
2133  }
2134  } else {
2135  // We are scheduling the continuation of an UNTIED task.
2136  // Scheduling back to the parent task.
2137  __ompt_task_finish(new_task,
2138  new_taskdata->ompt_task_info.scheduling_parent,
2139  ompt_task_switch);
2140  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
2141  }
2142  }
2143 #endif
2144 
2145  res = __kmp_omp_task(gtid, new_task, true);
2146 
2147  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
2148  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2149  gtid, loc_ref, new_taskdata));
2150 #if OMPT_SUPPORT
2151  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2152  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2153  }
2154 #endif
2155  return res;
2156 }
2157 
2158 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
2159 // a taskloop task with the correct OMPT return address
2160 //
2161 // loc_ref: location of original task pragma (ignored)
2162 // gtid: Global Thread ID of encountering thread
2163 // new_task: non-thread-switchable task thunk allocated by
2164 // __kmp_omp_task_alloc()
2165 // codeptr_ra: return address for OMPT callback
2166 // Returns:
2167 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2168 // be resumed later.
2169 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2170 // resumed later.
2171 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
2172  kmp_task_t *new_task, void *codeptr_ra) {
2173  kmp_int32 res;
2174  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2175 
2176 #if KMP_DEBUG || OMPT_SUPPORT
2177  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2178 #endif
2179  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid,
2180  loc_ref, new_taskdata));
2181 
2182 #if OMPT_SUPPORT
2183  kmp_taskdata_t *parent = NULL;
2184  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
2185  parent = new_taskdata->td_parent;
2186  if (!parent->ompt_task_info.frame.enter_frame.ptr)
2187  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2188  if (ompt_enabled.ompt_callback_task_create) {
2189  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2190  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2191  &(new_taskdata->ompt_task_info.task_data),
2192  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
2193  }
2194  }
2195 #endif
2196 
2197  res = __kmp_omp_task(gtid, new_task, true);
2198 
2199  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
2200  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2201  gtid, loc_ref, new_taskdata));
2202 #if OMPT_SUPPORT
2203  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2204  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2205  }
2206 #endif
2207  return res;
2208 }
2209 
2210 template <bool ompt>
2211 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2212  void *frame_address,
2213  void *return_address) {
2214  kmp_taskdata_t *taskdata = nullptr;
2215  kmp_info_t *thread;
2216  int thread_finished = FALSE;
2217  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2218 
2219  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2220  KMP_DEBUG_ASSERT(gtid >= 0);
2221 
2222  if (__kmp_tasking_mode != tskm_immediate_exec) {
2223  thread = __kmp_threads[gtid];
2224  taskdata = thread->th.th_current_task;
2225 
2226 #if OMPT_SUPPORT && OMPT_OPTIONAL
2227  ompt_data_t *my_task_data;
2228  ompt_data_t *my_parallel_data;
2229 
2230  if (ompt) {
2231  my_task_data = &(taskdata->ompt_task_info.task_data);
2232  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2233 
2234  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2235 
2236  if (ompt_enabled.ompt_callback_sync_region) {
2237  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2238  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2239  my_task_data, return_address);
2240  }
2241 
2242  if (ompt_enabled.ompt_callback_sync_region_wait) {
2243  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2244  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2245  my_task_data, return_address);
2246  }
2247  }
2248 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2249 
2250 // Debugger: The taskwait is active. Store the location and the thread that
2251 // encountered the taskwait.
2252 #if USE_ITT_BUILD
2253 // Note: These values are used by ITT events as well.
2254 #endif /* USE_ITT_BUILD */
2255  taskdata->td_taskwait_counter += 1;
2256  taskdata->td_taskwait_ident = loc_ref;
2257  taskdata->td_taskwait_thread = gtid + 1;
2258 
2259 #if USE_ITT_BUILD
2260  void *itt_sync_obj = NULL;
2261 #if USE_ITT_NOTIFY
2262  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2263 #endif /* USE_ITT_NOTIFY */
2264 #endif /* USE_ITT_BUILD */
2265 
2266  bool must_wait =
2267  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2268 
2269  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2270  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2271  // If hidden helper thread is encountered, we must enable wait here.
2272  must_wait =
2273  must_wait ||
2274  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2275  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2276 
2277  if (must_wait) {
2278  kmp_flag_32<false, false> flag(
2279  RCAST(std::atomic<kmp_uint32> *,
2280  &(taskdata->td_incomplete_child_tasks)),
2281  0U);
2282  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2283  flag.execute_tasks(thread, gtid, FALSE,
2284  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2285  __kmp_task_stealing_constraint);
2286  }
2287  }
2288 #if USE_ITT_BUILD
2289  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2290  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2291 #endif /* USE_ITT_BUILD */
2292 
2293  // Debugger: The taskwait is completed. Location remains, but thread is
2294  // negated.
2295  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2296 
2297 #if OMPT_SUPPORT && OMPT_OPTIONAL
2298  if (ompt) {
2299  if (ompt_enabled.ompt_callback_sync_region_wait) {
2300  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2301  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2302  my_task_data, return_address);
2303  }
2304  if (ompt_enabled.ompt_callback_sync_region) {
2305  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2306  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2307  my_task_data, return_address);
2308  }
2309  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2310  }
2311 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2312  }
2313 
2314  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2315  "returning TASK_CURRENT_NOT_QUEUED\n",
2316  gtid, taskdata));
2317 
2318  return TASK_CURRENT_NOT_QUEUED;
2319 }
2320 
2321 #if OMPT_SUPPORT && OMPT_OPTIONAL
2322 OMPT_NOINLINE
2323 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2324  void *frame_address,
2325  void *return_address) {
2326  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2327  return_address);
2328 }
2329 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2330 
2331 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2332 // complete
2333 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2334 #if OMPT_SUPPORT && OMPT_OPTIONAL
2335  if (UNLIKELY(ompt_enabled.enabled)) {
2336  OMPT_STORE_RETURN_ADDRESS(gtid);
2337  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2338  OMPT_LOAD_RETURN_ADDRESS(gtid));
2339  }
2340 #endif
2341  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2342 }
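// Illustrative sketch (editorial addition): "#pragma omp taskwait" typically
// lowers to a single call of this entry point, e.g.
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
// where `loc` is the compiler-generated source location descriptor.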
2343 
2344 // __kmpc_omp_taskyield: switch to a different task
2345 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2346  kmp_taskdata_t *taskdata = NULL;
2347  kmp_info_t *thread;
2348  int thread_finished = FALSE;
2349 
2350  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2351  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2352 
2353  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2354  gtid, loc_ref, end_part));
2355  __kmp_assert_valid_gtid(gtid);
2356 
2357  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2358  thread = __kmp_threads[gtid];
2359  taskdata = thread->th.th_current_task;
2360 // Should we model this as a task wait or not?
2361 // Debugger: The taskwait is active. Store the location and the thread that
2362 // encountered the taskwait.
2363 #if USE_ITT_BUILD
2364 // Note: These values are used by ITT events as well.
2365 #endif /* USE_ITT_BUILD */
2366  taskdata->td_taskwait_counter += 1;
2367  taskdata->td_taskwait_ident = loc_ref;
2368  taskdata->td_taskwait_thread = gtid + 1;
2369 
2370 #if USE_ITT_BUILD
2371  void *itt_sync_obj = NULL;
2372 #if USE_ITT_NOTIFY
2373  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2374 #endif /* USE_ITT_NOTIFY */
2375 #endif /* USE_ITT_BUILD */
2376  if (!taskdata->td_flags.team_serial) {
2377  kmp_task_team_t *task_team = thread->th.th_task_team;
2378  if (task_team != NULL) {
2379  if (KMP_TASKING_ENABLED(task_team)) {
2380 #if OMPT_SUPPORT
2381  if (UNLIKELY(ompt_enabled.enabled))
2382  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2383 #endif
2384  __kmp_execute_tasks_32(
2385  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2386  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2387  __kmp_task_stealing_constraint);
2388 #if OMPT_SUPPORT
2389  if (UNLIKELY(ompt_enabled.enabled))
2390  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2391 #endif
2392  }
2393  }
2394  }
2395 #if USE_ITT_BUILD
2396  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2397 #endif /* USE_ITT_BUILD */
2398 
2399  // Debugger: The taskwait is completed. Location remains, but thread is
2400  // negated.
2401  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2402  }
2403 
2404  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2405  "returning TASK_CURRENT_NOT_QUEUED\n",
2406  gtid, taskdata));
2407 
2408  return TASK_CURRENT_NOT_QUEUED;
2409 }
2410 
2411 // Task Reduction implementation
2412 //
2413 // Note: initial implementation didn't take into account the possibility
2414 // to specify omp_orig for initializer of the UDR (user defined reduction).
2415 // Corrected implementation takes into account the omp_orig object.
2416 // Compiler is free to use old implementation if omp_orig is not specified.
2417 
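// kmp_taskred_flags: per-item flags for task reduction set up by the
// compiler; lazy_priv requests lazy allocation/initialization of the
// thread-specific copies.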
2426 typedef struct kmp_taskred_flags {
2428  unsigned lazy_priv : 1;
2429  unsigned reserved31 : 31;
2430 } kmp_taskred_flags_t;
2431 
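// kmp_task_red_input: reduction item descriptor passed by the compiler to
// __kmpc_task_reduction_init (old interface, no omp_orig pointer).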
2435 typedef struct kmp_task_red_input {
2436  void *reduce_shar;
2437  size_t reduce_size;
2438  // three compiler-generated routines (init, fini are optional):
2439  void *reduce_init;
2440  void *reduce_fini;
2441  void *reduce_comb;
2442  kmp_taskred_flags_t flags;
2443 } kmp_task_red_input_t;
2444 
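// kmp_taskred_data: internal per-item reduction bookkeeping kept by the
// runtime in the taskgroup's reduce_data array.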
2448 typedef struct kmp_taskred_data {
2449  void *reduce_shar;
2450  size_t reduce_size;
2451  kmp_taskred_flags_t flags;
2452  void *reduce_priv;
2453  void *reduce_pend;
2454  // three compiler-generated routines (init, fini are optional):
2455  void *reduce_comb;
2456  void *reduce_init;
2457  void *reduce_fini;
2458  void *reduce_orig;
2459 } kmp_taskred_data_t;
2460 
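// kmp_taskred_input: reduction item descriptor passed by the compiler to
// __kmpc_taskred_init (new interface; reduce_orig carries omp_orig).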
2466 typedef struct kmp_taskred_input {
2467  void *reduce_shar;
2468  void *reduce_orig;
2469  size_t reduce_size;
2470  // three compiler-generated routines (init, fini are optional):
2471  void *reduce_init;
2472  void *reduce_fini;
2473  void *reduce_comb;
2474  kmp_taskred_flags_t flags;
2475 } kmp_taskred_input_t;
2480 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2481 template <>
2482 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2483  kmp_task_red_input_t &src) {
2484  item.reduce_orig = NULL;
2485 }
2486 template <>
2487 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2488  kmp_taskred_input_t &src) {
2489  if (src.reduce_orig != NULL) {
2490  item.reduce_orig = src.reduce_orig;
2491  } else {
2492  item.reduce_orig = src.reduce_shar;
2493  } // non-NULL reduce_orig means new interface used
2494 }
2495 
2496 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2497 template <>
2498 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2499  size_t offset) {
2500  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2501 }
2502 template <>
2503 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2504  size_t offset) {
2505  ((void (*)(void *, void *))item.reduce_init)(
2506  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2507 }
2508 
2509 template <typename T>
2510 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2511  __kmp_assert_valid_gtid(gtid);
2512  kmp_info_t *thread = __kmp_threads[gtid];
2513  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2514  kmp_uint32 nth = thread->th.th_team_nproc;
2515  kmp_taskred_data_t *arr;
2516 
2517  // check input data just in case
2518  KMP_ASSERT(tg != NULL);
2519  KMP_ASSERT(data != NULL);
2520  KMP_ASSERT(num > 0);
2521  if (nth == 1 && !__kmp_enable_hidden_helper) {
2522  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2523  gtid, tg));
2524  return (void *)tg;
2525  }
2526  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2527  gtid, tg, num));
2528  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2529  thread, num * sizeof(kmp_taskred_data_t));
2530  for (int i = 0; i < num; ++i) {
2531  size_t size = data[i].reduce_size - 1;
2532  // round the size up to cache line per thread-specific item
2533  size += CACHE_LINE - size % CACHE_LINE;
2534  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2535  arr[i].reduce_shar = data[i].reduce_shar;
2536  arr[i].reduce_size = size;
2537  arr[i].flags = data[i].flags;
2538  arr[i].reduce_comb = data[i].reduce_comb;
2539  arr[i].reduce_init = data[i].reduce_init;
2540  arr[i].reduce_fini = data[i].reduce_fini;
2541  __kmp_assign_orig<T>(arr[i], data[i]);
2542  if (!arr[i].flags.lazy_priv) {
2543  // allocate cache-line aligned block and fill it with zeros
2544  arr[i].reduce_priv = __kmp_allocate(nth * size);
2545  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2546  if (arr[i].reduce_init != NULL) {
2547  // initialize all thread-specific items
2548  for (size_t j = 0; j < nth; ++j) {
2549  __kmp_call_init<T>(arr[i], j * size);
2550  }
2551  }
2552  } else {
2553  // only allocate space for pointers now,
2554  // objects will be lazily allocated/initialized if/when requested
2555  // note that __kmp_allocate zeroes the allocated memory
2556  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2557  }
2558  }
2559  tg->reduce_data = (void *)arr;
2560  tg->reduce_num_data = num;
2561  return (void *)tg;
2562 }
2563 
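// __kmpc_task_reduction_init: initialize task reduction for the current
// taskgroup from an array of kmp_task_red_input_t items (old interface);
// returns the taskgroup handle used by __kmpc_task_reduction_get_th_data().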
2578 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2579 #if OMPX_TASKGRAPH
2580  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2581  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2582  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2583  this_tdg->rec_taskred_data =
2584  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2585  this_tdg->rec_num_taskred = num;
2586  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2587  sizeof(kmp_task_red_input_t) * num);
2588  }
2589 #endif
2590  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2591 }
2592 
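// __kmpc_taskred_init: same as __kmpc_task_reduction_init, but takes
// kmp_taskred_input_t items (new interface carrying the omp_orig object).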
2605 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2606 #if OMPX_TASKGRAPH
2607  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2608  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2609  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2610  this_tdg->rec_taskred_data =
2611  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2612  this_tdg->rec_num_taskred = num;
2613  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2614  sizeof(kmp_task_red_input_t) * num);
2615  }
2616 #endif
2617  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2618 }
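// Illustrative sketch (editorial addition, not part of the upstream source):
// how a compiler could lower "#pragma omp taskgroup task_reduction(+ : x)"
// for an int x onto the entry point above. The names `plus_init`,
// `plus_comb` and `taskred_example` are hypothetical; the block is kept
// under "#if 0" so it does not affect the build.
#if 0
static void plus_init(void *priv, void *orig) { *(int *)priv = 0; }
static void plus_comb(void *shar, void *priv) { *(int *)shar += *(int *)priv; }

static void *taskred_example(kmp_int32 gtid, int *x) {
  // called inside the enclosing taskgroup (see __kmpc_taskgroup below)
  kmp_taskred_input_t in;
  in.reduce_shar = x;
  in.reduce_orig = x;
  in.reduce_size = sizeof(int);
  in.reduce_init = (void *)&plus_init;
  in.reduce_fini = NULL;
  in.reduce_comb = (void *)&plus_comb;
  in.flags = {}; // no lazy allocation of private copies
  return __kmpc_taskred_init(gtid, /*num=*/1, &in);
}
#endif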
2619 
2620 // Copy task reduction data (except for shared pointers).
2621 template <typename T>
2622 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2623  kmp_taskgroup_t *tg, void *reduce_data) {
2624  kmp_taskred_data_t *arr;
2625  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2626  " from data %p\n",
2627  thr, tg, reduce_data));
2628  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2629  thr, num * sizeof(kmp_taskred_data_t));
2630  // threads will share private copies, thunk routines, sizes, flags, etc.:
2631  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2632  for (int i = 0; i < num; ++i) {
2633  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2634  }
2635  tg->reduce_data = (void *)arr;
2636  tg->reduce_num_data = num;
2637 }
2638 
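// __kmpc_task_reduction_get_th_data: return the calling thread's private copy
// of the reduction item identified by data (a shared or thread-specific
// address), walking parent taskgroups if needed and allocating lazily
// privatized copies on first use.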
2648 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2649  __kmp_assert_valid_gtid(gtid);
2650  kmp_info_t *thread = __kmp_threads[gtid];
2651  kmp_int32 nth = thread->th.th_team_nproc;
2652  if (nth == 1)
2653  return data; // nothing to do
2654 
2655  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2656  if (tg == NULL)
2657  tg = thread->th.th_current_task->td_taskgroup;
2658  KMP_ASSERT(tg != NULL);
2659  kmp_taskred_data_t *arr;
2660  kmp_int32 num;
2661  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2662 
2663 #if OMPX_TASKGRAPH
2664  if ((thread->th.th_current_task->is_taskgraph) &&
2665  (!__kmp_tdg_is_recording(
2666  __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2667  tg = thread->th.th_current_task->td_taskgroup;
2668  KMP_ASSERT(tg != NULL);
2669  KMP_ASSERT(tg->reduce_data != NULL);
2670  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2671  num = tg->reduce_num_data;
2672  }
2673 #endif
2674 
2675  KMP_ASSERT(data != NULL);
2676  while (tg != NULL) {
2677  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2678  num = tg->reduce_num_data;
2679  for (int i = 0; i < num; ++i) {
2680  if (!arr[i].flags.lazy_priv) {
2681  if (data == arr[i].reduce_shar ||
2682  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2683  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2684  } else {
2685  // check shared location first
2686  void **p_priv = (void **)(arr[i].reduce_priv);
2687  if (data == arr[i].reduce_shar)
2688  goto found;
2689  // check if we get some thread specific location as parameter
2690  for (int j = 0; j < nth; ++j)
2691  if (data == p_priv[j])
2692  goto found;
2693  continue; // not found, continue search
2694  found:
2695  if (p_priv[tid] == NULL) {
2696  // allocate thread specific object lazily
2697  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2698  if (arr[i].reduce_init != NULL) {
2699  if (arr[i].reduce_orig != NULL) { // new interface
2700  ((void (*)(void *, void *))arr[i].reduce_init)(
2701  p_priv[tid], arr[i].reduce_orig);
2702  } else { // old interface (single parameter)
2703  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2704  }
2705  }
2706  }
2707  return p_priv[tid];
2708  }
2709  }
2710  KMP_ASSERT(tg->parent);
2711  tg = tg->parent;
2712  }
2713  KMP_ASSERT2(0, "Unknown task reduction item");
2714  return NULL; // ERROR, this line never executed
2715 }
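// Illustrative sketch (editorial addition): inside the outlined body of a
// participating task, references to the reduction variable are redirected to
// the thread-specific copy returned by this routine, e.g.
//   int *p = (int *)__kmpc_task_reduction_get_th_data(gtid, tg, &x);
//   *p += 1; // combined into x when the taskgroup finishes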
2716 
2717 // Finalize task reduction.
2718 // Called from __kmpc_end_taskgroup()
2719 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2720  kmp_int32 nth = th->th.th_team_nproc;
2721  KMP_DEBUG_ASSERT(
2722  nth > 1 ||
2723  __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2724  // are using hidden helper threads
2725  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2726  kmp_int32 num = tg->reduce_num_data;
2727  for (int i = 0; i < num; ++i) {
2728  void *sh_data = arr[i].reduce_shar;
2729  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2730  void (*f_comb)(void *, void *) =
2731  (void (*)(void *, void *))(arr[i].reduce_comb);
2732  if (!arr[i].flags.lazy_priv) {
2733  void *pr_data = arr[i].reduce_priv;
2734  size_t size = arr[i].reduce_size;
2735  for (int j = 0; j < nth; ++j) {
2736  void *priv_data = (char *)pr_data + j * size;
2737  f_comb(sh_data, priv_data); // combine results
2738  if (f_fini)
2739  f_fini(priv_data); // finalize if needed
2740  }
2741  } else {
2742  void **pr_data = (void **)(arr[i].reduce_priv);
2743  for (int j = 0; j < nth; ++j) {
2744  if (pr_data[j] != NULL) {
2745  f_comb(sh_data, pr_data[j]); // combine results
2746  if (f_fini)
2747  f_fini(pr_data[j]); // finalize if needed
2748  __kmp_free(pr_data[j]);
2749  }
2750  }
2751  }
2752  __kmp_free(arr[i].reduce_priv);
2753  }
2754  __kmp_thread_free(th, arr);
2755  tg->reduce_data = NULL;
2756  tg->reduce_num_data = 0;
2757 }
2758 
2759 // Cleanup task reduction data for parallel or worksharing,
2760 // do not touch task private data other threads still working with.
2761 // Called from __kmpc_end_taskgroup()
2762 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2763  __kmp_thread_free(th, tg->reduce_data);
2764  tg->reduce_data = NULL;
2765  tg->reduce_num_data = 0;
2766 }
2767 
2768 template <typename T>
2769 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2770  int num, T *data) {
2771  __kmp_assert_valid_gtid(gtid);
2772  kmp_info_t *thr = __kmp_threads[gtid];
2773  kmp_int32 nth = thr->th.th_team_nproc;
2774  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2775  if (nth == 1) {
2776  KA_TRACE(10,
2777  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2778  gtid, thr->th.th_current_task->td_taskgroup));
2779  return (void *)thr->th.th_current_task->td_taskgroup;
2780  }
2781  kmp_team_t *team = thr->th.th_team;
2782  void *reduce_data;
2783  kmp_taskgroup_t *tg;
2784  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2785  if (reduce_data == NULL &&
2786  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2787  (void *)1)) {
2788  // single thread enters this block to initialize common reduction data
2789  KMP_DEBUG_ASSERT(reduce_data == NULL);
2790  // first initialize own data, then make a copy other threads can use
2791  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2792  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2793  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2794  // fini counters should be 0 at this point
2795  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2796  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2797  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2798  } else {
2799  while (
2800  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2801  (void *)1) { // wait for task reduction initialization
2802  KMP_CPU_PAUSE();
2803  }
2804  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2805  tg = thr->th.th_current_task->td_taskgroup;
2806  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2807  }
2808  return tg;
2809 }
2810 
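// __kmpc_task_reduction_modifier_init: initialize task reduction for a
// parallel (is_ws == 0) or worksharing (is_ws == 1) construct with the task
// reduction modifier, old interface; an inner taskgroup is formed first.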
2827 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2828  int num, void *data) {
2829  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2830  (kmp_task_red_input_t *)data);
2831 }
2832 
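// __kmpc_taskred_modifier_init: same as __kmpc_task_reduction_modifier_init,
// but takes kmp_taskred_input_t items (new interface).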
2847 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2848  void *data) {
2849  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2850  (kmp_taskred_input_t *)data);
2851 }
2852 
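// __kmpc_task_reduction_modifier_fini: finish task reduction for a parallel
// or worksharing construct; ends the taskgroup started by the matching
// *_modifier_init call.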
2861 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2862  __kmpc_end_taskgroup(loc, gtid);
2863 }
2864 
2865 // __kmpc_taskgroup: Start a new taskgroup
2866 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2867  __kmp_assert_valid_gtid(gtid);
2868  kmp_info_t *thread = __kmp_threads[gtid];
2869  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2870  kmp_taskgroup_t *tg_new =
2871  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2872  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2873  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2874  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2875  tg_new->parent = taskdata->td_taskgroup;
2876  tg_new->reduce_data = NULL;
2877  tg_new->reduce_num_data = 0;
2878  tg_new->gomp_data = NULL;
2879  taskdata->td_taskgroup = tg_new;
2880 
2881 #if OMPT_SUPPORT && OMPT_OPTIONAL
2882  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2883  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2884  if (!codeptr)
2885  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2886  kmp_team_t *team = thread->th.th_team;
2887  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2888  // FIXME: I think this is wrong for lwt!
2889  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2890 
2891  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2892  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2893  &(my_task_data), codeptr);
2894  }
2895 #endif
2896 }
2897 
2898 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2899 // and its descendants are complete
2900 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2901  __kmp_assert_valid_gtid(gtid);
2902  kmp_info_t *thread = __kmp_threads[gtid];
2903  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2904  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2905  int thread_finished = FALSE;
2906 
2907 #if OMPT_SUPPORT && OMPT_OPTIONAL
2908  kmp_team_t *team;
2909  ompt_data_t my_task_data;
2910  ompt_data_t my_parallel_data;
2911  void *codeptr = nullptr;
2912  if (UNLIKELY(ompt_enabled.enabled)) {
2913  team = thread->th.th_team;
2914  my_task_data = taskdata->ompt_task_info.task_data;
2915  // FIXME: I think this is wrong for lwt!
2916  my_parallel_data = team->t.ompt_team_info.parallel_data;
2917  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2918  if (!codeptr)
2919  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2920  }
2921 #endif
2922 
2923  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2924  KMP_DEBUG_ASSERT(taskgroup != NULL);
2925  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2926 
2927  if (__kmp_tasking_mode != tskm_immediate_exec) {
2928  // mark task as waiting not on a barrier
2929  taskdata->td_taskwait_counter += 1;
2930  taskdata->td_taskwait_ident = loc;
2931  taskdata->td_taskwait_thread = gtid + 1;
2932 #if USE_ITT_BUILD
2933  // For ITT the taskgroup wait is similar to taskwait until we need to
2934  // distinguish them
2935  void *itt_sync_obj = NULL;
2936 #if USE_ITT_NOTIFY
2937  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2938 #endif /* USE_ITT_NOTIFY */
2939 #endif /* USE_ITT_BUILD */
2940 
2941 #if OMPT_SUPPORT && OMPT_OPTIONAL
2942  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2943  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2944  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2945  &(my_task_data), codeptr);
2946  }
2947 #endif
2948 
2949  if (!taskdata->td_flags.team_serial ||
2950  (thread->th.th_task_team != NULL &&
2951  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2952  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2953  kmp_flag_32<false, false> flag(
2954  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2955  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2956  flag.execute_tasks(thread, gtid, FALSE,
2957  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2958  __kmp_task_stealing_constraint);
2959  }
2960  }
2961  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2962 
2963 #if OMPT_SUPPORT && OMPT_OPTIONAL
2964  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2965  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2966  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2967  &(my_task_data), codeptr);
2968  }
2969 #endif
2970 
2971 #if USE_ITT_BUILD
2972  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2973  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2974 #endif /* USE_ITT_BUILD */
2975  }
2976  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2977 
2978  if (taskgroup->reduce_data != NULL &&
2979  !taskgroup->gomp_data) { // need to reduce?
2980  int cnt;
2981  void *reduce_data;
2982  kmp_team_t *t = thread->th.th_team;
2983  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2984  // check if <priv> data of the first reduction variable is shared for the team
2985  void *priv0 = arr[0].reduce_priv;
2986  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2987  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2988  // finishing task reduction on parallel
2989  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2990  if (cnt == thread->th.th_team_nproc - 1) {
2991  // we are the last thread passing __kmpc_reduction_modifier_fini()
2992  // finalize task reduction:
2993  __kmp_task_reduction_fini(thread, taskgroup);
2994  // cleanup fields in the team structure:
2995  // TODO: is relaxed store enough here (whole barrier should follow)?
2996  __kmp_thread_free(thread, reduce_data);
2997  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2998  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2999  } else {
3000  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3001  // so do not finalize reduction, just clean own copy of the data
3002  __kmp_task_reduction_clean(thread, taskgroup);
3003  }
3004  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
3005  NULL &&
3006  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
3007  // finishing task reduction on worksharing
3008  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
3009  if (cnt == thread->th.th_team_nproc - 1) {
3010  // we are the last thread passing __kmpc_reduction_modifier_fini()
3011  __kmp_task_reduction_fini(thread, taskgroup);
3012  // cleanup fields in team structure:
3013  // TODO: is relaxed store enough here (whole barrier should follow)?
3014  __kmp_thread_free(thread, reduce_data);
3015  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
3016  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
3017  } else {
3018  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3019  // so do not finalize reduction, just clean own copy of the data
3020  __kmp_task_reduction_clean(thread, taskgroup);
3021  }
3022  } else {
3023  // finishing task reduction on taskgroup
3024  __kmp_task_reduction_fini(thread, taskgroup);
3025  }
3026  }
3027  // Restore parent taskgroup for the current task
3028  taskdata->td_taskgroup = taskgroup->parent;
3029  __kmp_thread_free(thread, taskgroup);
3030 
3031  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
3032  gtid, taskdata));
3033 
3034 #if OMPT_SUPPORT && OMPT_OPTIONAL
3035  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
3036  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
3037  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
3038  &(my_task_data), codeptr);
3039  }
3040 #endif
3041 }
3042 
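// An illustrative user-level sketch (not part of the runtime): compilers such
// as clang lower a taskgroup construct into a paired __kmpc_taskgroup() /
// __kmpc_end_taskgroup() call, so the wait above corresponds to the implicit
// join at the closing brace of the region below, and the reduction branch
// above finalizes the task_reduction data.
//
//   int sum = 0;
//   #pragma omp parallel
//   #pragma omp single
//   {
//     #pragma omp taskgroup task_reduction(+ : sum)
//     {
//       for (int i = 0; i < 100; ++i) {
//         #pragma omp task in_reduction(+ : sum) firstprivate(i)
//         sum += i;
//       }
//     } // taskgroup end: waits for all generated tasks, then finalizes "sum"
//   }
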
3043 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
3044  kmp_task_team_t *task_team,
3045  kmp_int32 is_constrained) {
3046  kmp_task_t *task = NULL;
3047  kmp_taskdata_t *taskdata;
3048  kmp_taskdata_t *current;
3049  kmp_thread_data_t *thread_data;
3050  int ntasks = task_team->tt.tt_num_task_pri;
3051  if (ntasks == 0) {
3052  KA_TRACE(
3053  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
3054  return NULL;
3055  }
3056  do {
3057  // decrement num_tasks to "reserve" one task for execution
3058  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
3059  ntasks - 1))
3060  break;
3061  ntasks = task_team->tt.tt_num_task_pri;
3062  } while (ntasks > 0);
3063  if (ntasks == 0) {
3064  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
3065  __kmp_get_gtid()));
3066  return NULL;
3067  }
3068  // We got a "ticket" to get a "reserved" priority task
3069  int deque_ntasks;
3070  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3071  do {
3072  KMP_ASSERT(list != NULL);
3073  thread_data = &list->td;
3074  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3075  deque_ntasks = thread_data->td.td_deque_ntasks;
3076  if (deque_ntasks == 0) {
3077  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3078  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
3079  __kmp_get_gtid(), thread_data));
3080  list = list->next;
3081  }
3082  } while (deque_ntasks == 0);
3083  KMP_DEBUG_ASSERT(deque_ntasks);
3084  int target = thread_data->td.td_deque_head;
3085  current = __kmp_threads[gtid]->th.th_current_task;
3086  taskdata = thread_data->td.td_deque[target];
3087  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3088  // Bump head pointer and wrap.
3089  thread_data->td.td_deque_head =
3090  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3091  } else {
3092  if (!task_team->tt.tt_untied_task_encountered) {
3093  // The TSC does not allow stealing the victim task
3094  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3095  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
3096  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
3097  gtid, thread_data, task_team, deque_ntasks, target,
3098  thread_data->td.td_deque_tail));
3099  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3100  return NULL;
3101  }
3102  int i;
3103  // walk through the deque trying to steal any task
3104  taskdata = NULL;
3105  for (i = 1; i < deque_ntasks; ++i) {
3106  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3107  taskdata = thread_data->td.td_deque[target];
3108  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3109  break; // found task to execute
3110  } else {
3111  taskdata = NULL;
3112  }
3113  }
3114  if (taskdata == NULL) {
3115  // No appropriate candidate found to execute
3116  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3117  KA_TRACE(
3118  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
3119  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
3120  gtid, thread_data, task_team, deque_ntasks,
3121  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3122  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3123  return NULL;
3124  }
3125  int prev = target;
3126  for (i = i + 1; i < deque_ntasks; ++i) {
3127  // shift remaining tasks in the deque left by 1
3128  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3129  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
3130  prev = target;
3131  }
3132  KMP_DEBUG_ASSERT(
3133  thread_data->td.td_deque_tail ==
3134  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
3135  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
3136  }
3137  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
3138  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3139  task = KMP_TASKDATA_TO_TASK(taskdata);
3140  return task;
3141 }
3142 
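// Illustrative user-level sketch (not part of the runtime): tasks created with
// a priority clause are queued on the separate tt_task_pri_list consulted
// above, rather than on the per-thread deques, and the scheduler prefers them.
// Priorities are honored only up to OMP_MAX_TASK_PRIORITY (default 0, i.e.
// ignored); process() below is a hypothetical user function.
//
//   #pragma omp parallel
//   #pragma omp single
//   for (int i = 0; i < n; ++i) {
//     #pragma omp task priority(i % 8) firstprivate(i)
//     process(i);
//   }
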
3143 // __kmp_remove_my_task: remove a task from my own deque
3144 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
3145  kmp_task_team_t *task_team,
3146  kmp_int32 is_constrained) {
3147  kmp_task_t *task;
3148  kmp_taskdata_t *taskdata;
3149  kmp_thread_data_t *thread_data;
3150  kmp_uint32 tail;
3151 
3152  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3153  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
3154  NULL); // Caller should check this condition
3155 
3156  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
3157 
3158  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
3159  gtid, thread_data->td.td_deque_ntasks,
3160  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3161 
3162  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3163  KA_TRACE(10,
3164  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
3165  "ntasks=%d head=%u tail=%u\n",
3166  gtid, thread_data->td.td_deque_ntasks,
3167  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3168  return NULL;
3169  }
3170 
3171  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3172 
3173  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3174  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3175  KA_TRACE(10,
3176  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
3177  "ntasks=%d head=%u tail=%u\n",
3178  gtid, thread_data->td.td_deque_ntasks,
3179  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3180  return NULL;
3181  }
3182 
3183  tail = (thread_data->td.td_deque_tail - 1) &
3184  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
3185  taskdata = thread_data->td.td_deque[tail];
3186 
3187  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
3188  thread->th.th_current_task)) {
3189  // The TSC does not allow stealing the victim task
3190  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3191  KA_TRACE(10,
3192  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
3193  "ntasks=%d head=%u tail=%u\n",
3194  gtid, thread_data->td.td_deque_ntasks,
3195  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3196  return NULL;
3197  }
3198 
3199  thread_data->td.td_deque_tail = tail;
3200  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
3201 
3202  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3203 
3204  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3205  "ntasks=%d head=%u tail=%u\n",
3206  gtid, taskdata, thread_data->td.td_deque_ntasks,
3207  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3208 
3209  task = KMP_TASKDATA_TO_TASK(taskdata);
3210  return task;
3211 }
3212 
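// A minimal sketch of the deque discipline used here (assuming, as the masking
// implies, that td_deque_size is a power of two): the owning thread pops LIFO
// from the tail, while thieves (__kmp_steal_task below) take FIFO from the
// head, which tends to keep recently pushed, cache-hot tasks with their
// producer.
//
//   // owner (this function):
//   tail = (tail - 1) & mask;   // step back one slot, wrapping
//   task = deque[tail];
//   // thief (__kmp_steal_task):
//   task = deque[head];
//   head = (head + 1) & mask;   // advance one slot, wrapping
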
3213 // __kmp_steal_task: remove a task from another thread's deque
3214 // Assumes that the calling thread has already checked that the task_team's
3215 // thread_data exists before calling this routine.
3216 static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
3217  kmp_task_team_t *task_team,
3218  std::atomic<kmp_int32> *unfinished_threads,
3219  int *thread_finished,
3220  kmp_int32 is_constrained) {
3221  kmp_task_t *task;
3222  kmp_taskdata_t *taskdata;
3223  kmp_taskdata_t *current;
3224  kmp_thread_data_t *victim_td, *threads_data;
3225  kmp_int32 target;
3226  kmp_info_t *victim_thr;
3227 
3228  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3229 
3230  threads_data = task_team->tt.tt_threads_data;
3231  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3232  KMP_DEBUG_ASSERT(victim_tid >= 0);
3233  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
3234 
3235  victim_td = &threads_data[victim_tid];
3236  victim_thr = victim_td->td.td_thr;
3237  (void)victim_thr; // Used in TRACE messages, which aren't always enabled.
3238 
3239  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3240  "task_team=%p ntasks=%d head=%u tail=%u\n",
3241  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3242  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3243  victim_td->td.td_deque_tail));
3244 
3245  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3246  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3247  "task_team=%p ntasks=%d head=%u tail=%u\n",
3248  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3249  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3250  victim_td->td.td_deque_tail));
3251  return NULL;
3252  }
3253 
3254  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3255 
3256  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3257  // Check again after we acquire the lock
3258  if (ntasks == 0) {
3259  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3260  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3261  "task_team=%p ntasks=%d head=%u tail=%u\n",
3262  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3263  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3264  return NULL;
3265  }
3266 
3267  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3268  current = __kmp_threads[gtid]->th.th_current_task;
3269  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3270  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3271  // Bump head pointer and wrap.
3272  victim_td->td.td_deque_head =
3273  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3274  } else {
3275  if (!task_team->tt.tt_untied_task_encountered) {
3276  // The TSC does not allow stealing the victim task
3277  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3278  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3279  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3280  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3281  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3282  return NULL;
3283  }
3284  int i;
3285  // walk through victim's deque trying to steal any task
3286  target = victim_td->td.td_deque_head;
3287  taskdata = NULL;
3288  for (i = 1; i < ntasks; ++i) {
3289  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3290  taskdata = victim_td->td.td_deque[target];
3291  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3292  break; // found victim task
3293  } else {
3294  taskdata = NULL;
3295  }
3296  }
3297  if (taskdata == NULL) {
3298  // No appropriate candidate to steal found
3299  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3300  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3301  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3302  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3303  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3304  return NULL;
3305  }
3306  int prev = target;
3307  for (i = i + 1; i < ntasks; ++i) {
3308  // shift remaining tasks in the deque left by 1
3309  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3310  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3311  prev = target;
3312  }
3313  KMP_DEBUG_ASSERT(
3314  victim_td->td.td_deque_tail ==
3315  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3316  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3317  }
3318  if (*thread_finished) {
3319  // We need to un-mark this victim as a finished victim. This must be done
3320  // before releasing the lock, or else other threads (starting with the
3321  // primary thread victim) might be prematurely released from the barrier!!!
3322 #if KMP_DEBUG
3323  kmp_int32 count =
3324 #endif
3325  KMP_ATOMIC_INC(unfinished_threads);
3326  KA_TRACE(
3327  20,
3328  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3329  gtid, count + 1, task_team));
3330  *thread_finished = FALSE;
3331  }
3332  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3333 
3334  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3335 
3336  KMP_COUNT_BLOCK(TASK_stolen);
3337  KA_TRACE(10,
3338  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3339  "task_team=%p ntasks=%d head=%u tail=%u\n",
3340  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3341  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3342 
3343  task = KMP_TASKDATA_TO_TASK(taskdata);
3344  return task;
3345 }
3346 
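// The __kmp_task_is_allowed() checks above implement OpenMP's task scheduling
// constraint. Roughly, and only as an illustration of the intent (not the
// exact implementation), a constrained thread may only pick a candidate that
// descends from its current tied task; allowed_sketch() below is a
// hypothetical helper written for this comment:
//
//   static bool allowed_sketch(int is_constrained, kmp_taskdata_t *candidate,
//                              kmp_taskdata_t *current) {
//     if (!is_constrained)
//       return true;
//     for (kmp_taskdata_t *p = candidate->td_parent; p; p = p->td_parent)
//       if (p == current)
//         return true; // candidate is a descendant of the current task
//     return false;
//   }
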
3347 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3348 // condition is satisfied (return true) or there are none left (return false).
3349 //
3350 // final_spin is TRUE if this is the spin at the release barrier.
3351 // thread_finished indicates whether the thread is finished executing all
3352 // the tasks it has on its deque, and is at the release barrier.
3353 // spinner is the location on which to spin.
3354 // spinner == NULL means only execute a single task and return.
3355 // checker is the value to check to terminate the spin.
3356 template <class C>
3357 static inline int __kmp_execute_tasks_template(
3358  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3359  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3360  kmp_int32 is_constrained) {
3361  kmp_task_team_t *task_team = thread->th.th_task_team;
3362  kmp_thread_data_t *threads_data;
3363  kmp_task_t *task;
3364  kmp_info_t *other_thread;
3365  kmp_taskdata_t *current_task = thread->th.th_current_task;
3366  std::atomic<kmp_int32> *unfinished_threads;
3367  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3368  tid = thread->th.th_info.ds.ds_tid;
3369 
3370  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3371  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3372 
3373  if (task_team == NULL || current_task == NULL)
3374  return FALSE;
3375 
3376  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3377  "*thread_finished=%d\n",
3378  gtid, final_spin, *thread_finished));
3379 
3380  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3381  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3382 
3383  KMP_DEBUG_ASSERT(threads_data != NULL);
3384 
3385  nthreads = task_team->tt.tt_nproc;
3386  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3387  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3388 
3389  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3390  // getting tasks from target constructs
3391  while (1) { // Inner loop to find a task and execute it
3392  task = NULL;
3393  if (task_team->tt.tt_num_task_pri) { // get priority task first
3394  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3395  }
3396  if (task == NULL && use_own_tasks) { // check own queue next
3397  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3398  }
3399  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3400  int asleep = 1;
3401  use_own_tasks = 0;
3402  // Try to steal from the last place I stole from successfully.
3403  if (victim_tid == -2) { // haven't stolen anything yet
3404  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3405  if (victim_tid !=
3406  -1) // if we have a last stolen from victim, get the thread
3407  other_thread = threads_data[victim_tid].td.td_thr;
3408  }
3409  if (victim_tid != -1) { // found last victim
3410  asleep = 0;
3411  } else if (!new_victim) { // no recent steals and we haven't already
3412  // used a new victim; select a random thread
3413  do { // Find a different thread to steal work from.
3414  // Pick a random thread. Initial plan was to cycle through all the
3415  // threads, and only return if we tried to steal from every thread,
3416  // and failed. Arch says that's not such a great idea.
3417  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3418  if (victim_tid >= tid) {
3419  ++victim_tid; // Adjusts random distribution to exclude self
3420  }
3421  // Found a potential victim
3422  other_thread = threads_data[victim_tid].td.td_thr;
3423  // There is a slight chance that __kmp_enable_tasking() did not wake
3424  // up all threads waiting at the barrier. If victim is sleeping,
3425  // then wake it up. Since we were going to pay the cache miss
3426  // penalty for referencing another thread's kmp_info_t struct
3427  // anyway,
3428  // the check shouldn't cost too much performance at this point. In
3429  // extra barrier mode, tasks do not sleep at the separate tasking
3430  // barrier, so this isn't a problem.
3431  asleep = 0;
3432  if ((__kmp_tasking_mode == tskm_task_teams) &&
3433  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3434  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3435  NULL)) {
3436  asleep = 1;
3437  __kmp_null_resume_wrapper(other_thread);
3438  // A sleeping thread should not have any tasks on its queue.
3439  // There is a slight possibility that it resumes, steals a task
3440  // from another thread, which spawns more tasks, all in the time
3441  // that it takes this thread to check => don't write an assertion
3442  // that the victim's queue is empty. Try stealing from a
3443  // different thread.
3444  }
3445  } while (asleep);
3446  }
3447 
3448  if (!asleep) {
3449  // We have a victim to try to steal from
3450  task =
3451  __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3452  thread_finished, is_constrained);
3453  }
3454  if (task != NULL) { // set last stolen to victim
3455  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3456  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3457  // The pre-refactored code did not try more than 1 successful new
3458  // victim, unless the last one generated more local tasks;
3459  // new_victim keeps track of this
3460  new_victim = 1;
3461  }
3462  } else { // No tasks found; unset last_stolen
3463  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3464  victim_tid = -2; // no successful victim found
3465  }
3466  }
3467 
3468  if (task == NULL)
3469  break; // break out of tasking loop
3470 
3471 // Found a task; execute it
3472 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3473  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3474  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3475  // get the object reliably
3476  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3477  }
3478  __kmp_itt_task_starting(itt_sync_obj);
3479  }
3480 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3481  __kmp_invoke_task(gtid, task, current_task);
3482 #if USE_ITT_BUILD
3483  if (itt_sync_obj != NULL)
3484  __kmp_itt_task_finished(itt_sync_obj);
3485 #endif /* USE_ITT_BUILD */
3486  // If this thread is only partway through the barrier and the condition is
3487  // met, then return now, so that the barrier gather/release pattern can
3488  // proceed. If this thread is in the last spin loop in the barrier,
3489  // waiting to be released, we know that the termination condition will not
3490  // be satisfied, so don't waste any cycles checking it.
3491  if (flag == NULL || (!final_spin && flag->done_check())) {
3492  KA_TRACE(
3493  15,
3494  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3495  gtid));
3496  return TRUE;
3497  }
3498  if (thread->th.th_task_team == NULL) {
3499  break;
3500  }
3501  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3502  // If execution of a stolen task results in more tasks being placed on our
3503  // run queue, reset use_own_tasks
3504  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3505  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3506  "other tasks, restart\n",
3507  gtid));
3508  use_own_tasks = 1;
3509  new_victim = 0;
3510  }
3511  }
3512 
3513  // The task source has been exhausted. If in final spin loop of barrier,
3514  // check if termination condition is satisfied. The work queue may be empty
3515  // but there might be proxy tasks still executing.
3516  if (final_spin &&
3517  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3518  // First, decrement the #unfinished threads, if that has not already been
3519  // done. This decrement might be to the spin location, and result in the
3520  // termination condition being satisfied.
3521  if (!*thread_finished) {
3522 #if KMP_DEBUG
3523  kmp_int32 count = -1 +
3524 #endif
3525  KMP_ATOMIC_DEC(unfinished_threads);
3526  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3527  "unfinished_threads to %d task_team=%p\n",
3528  gtid, count, task_team));
3529  *thread_finished = TRUE;
3530  }
3531 
3532  // It is now unsafe to reference thread->th.th_team !!!
3533  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3534  // thread to pass through the barrier, where it might reset each thread's
3535  // th.th_team field for the next parallel region. If we can steal more
3536  // work, we know that this has not happened yet.
3537  if (flag != NULL && flag->done_check()) {
3538  KA_TRACE(
3539  15,
3540  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3541  gtid));
3542  return TRUE;
3543  }
3544  }
3545 
3546  // If this thread's task team is NULL, primary thread has recognized that
3547  // there are no more tasks; bail out
3548  if (thread->th.th_task_team == NULL) {
3549  KA_TRACE(15,
3550  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3551  return FALSE;
3552  }
3553 
3554  // Check the flag again to see if it is already satisfied, to avoid being
3555  // trapped in an infinite loop when an if0 task depends on a hidden helper task
3556  // outside any parallel region. Detached tasks are not impacted in this case
3557  // because the only thread executing this function has to execute the proxy
3558  // task so it is in another code path that has the same check.
3559  if (flag == NULL || (!final_spin && flag->done_check())) {
3560  KA_TRACE(15,
3561  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3562  gtid));
3563  return TRUE;
3564  }
3565 
3566  // We could be getting tasks from target constructs; if this is the only
3567  // thread, keep trying to execute tasks from own queue
3568  if (nthreads == 1 &&
3569  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3570  use_own_tasks = 1;
3571  else {
3572  KA_TRACE(15,
3573  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3574  return FALSE;
3575  }
3576  }
3577 }
3578 
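// Typical caller pattern (as used by __kmpc_end_taskgroup() above and by the
// barrier wait code): wrap the spin location in a kmp_flag_* object and keep
// executing tasks until the flag's condition is met.
//
//   kmp_flag_32<false, false> flag(
//       RCAST(std::atomic<kmp_uint32> *, &taskgroup->count), 0U);
//   while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0)
//     flag.execute_tasks(thread, gtid, FALSE,
//                        &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
//                        __kmp_task_stealing_constraint);
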
3579 template <bool C, bool S>
3580 int __kmp_execute_tasks_32(
3581  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3582  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3583  kmp_int32 is_constrained) {
3584  return __kmp_execute_tasks_template(
3585  thread, gtid, flag, final_spin,
3586  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3587 }
3588 
3589 template <bool C, bool S>
3590 int __kmp_execute_tasks_64(
3591  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3592  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3593  kmp_int32 is_constrained) {
3594  return __kmp_execute_tasks_template(
3595  thread, gtid, flag, final_spin,
3596  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3597 }
3598 
3599 template <bool C, bool S>
3600 int __kmp_atomic_execute_tasks_64(
3601  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3602  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3603  kmp_int32 is_constrained) {
3604  return __kmp_execute_tasks_template(
3605  thread, gtid, flag, final_spin,
3606  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3607 }
3608 
3609 int __kmp_execute_tasks_oncore(
3610  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3611  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3612  kmp_int32 is_constrained) {
3613  return __kmp_execute_tasks_template(
3614  thread, gtid, flag, final_spin,
3615  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3616 }
3617 
3618 template int
3619 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3620  kmp_flag_32<false, false> *, int,
3621  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3622 
3623 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3624  kmp_flag_64<false, true> *,
3625  int,
3626  int *USE_ITT_BUILD_ARG(void *),
3627  kmp_int32);
3628 
3629 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3630  kmp_flag_64<true, false> *,
3631  int,
3632  int *USE_ITT_BUILD_ARG(void *),
3633  kmp_int32);
3634 
3635 template int __kmp_atomic_execute_tasks_64<false, true>(
3636  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3637  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3638 
3639 template int __kmp_atomic_execute_tasks_64<true, false>(
3640  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3641  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3642 
3643 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3644 // next barrier so they can assist in executing enqueued tasks.
3645 // The first thread to arrive allocates the task team atomically.
3646 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3647  kmp_info_t *this_thr) {
3648  kmp_thread_data_t *threads_data;
3649  int nthreads, i, is_init_thread;
3650 
3651  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3652  __kmp_gtid_from_thread(this_thr)));
3653 
3654  KMP_DEBUG_ASSERT(task_team != NULL);
3655  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3656 
3657  nthreads = task_team->tt.tt_nproc;
3658  KMP_DEBUG_ASSERT(nthreads > 0);
3659  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3660 
3661  // Allocate or increase the size of threads_data if necessary
3662  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3663 
3664  if (!is_init_thread) {
3665  // Some other thread already set up the array.
3666  KA_TRACE(
3667  20,
3668  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3669  __kmp_gtid_from_thread(this_thr)));
3670  return;
3671  }
3672  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3673  KMP_DEBUG_ASSERT(threads_data != NULL);
3674 
3675  if (__kmp_tasking_mode == tskm_task_teams &&
3676  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3677  // Release any threads sleeping at the barrier, so that they can steal
3678  // tasks and execute them. In extra barrier mode, tasks do not sleep
3679  // at the separate tasking barrier, so this isn't a problem.
3680  for (i = 0; i < nthreads; i++) {
3681  void *sleep_loc;
3682  kmp_info_t *thread = threads_data[i].td.td_thr;
3683 
3684  if (i == this_thr->th.th_info.ds.ds_tid) {
3685  continue;
3686  }
3687  // Since we haven't locked the thread's suspend mutex at this
3688  // point, there is a small window where a thread might be putting
3689  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3690  // To work around this, __kmp_execute_tasks_template() periodically checks to
3691  // see if other threads are sleeping (using the same random mechanism that
3692  // is used for task stealing) and awakens them if they are.
3693  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3694  NULL) {
3695  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3696  __kmp_gtid_from_thread(this_thr),
3697  __kmp_gtid_from_thread(thread)));
3698  __kmp_null_resume_wrapper(thread);
3699  } else {
3700  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3701  __kmp_gtid_from_thread(this_thr),
3702  __kmp_gtid_from_thread(thread)));
3703  }
3704  }
3705  }
3706 
3707  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3708  __kmp_gtid_from_thread(this_thr)));
3709 }
3710 
3711 /* // TODO: Check the comment consistency
3712  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3713  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3714  * After a child thread checks into a barrier and calls __kmp_release() from
3715  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3716  * longer assume that the kmp_team_t structure is intact (at any moment, the
3717  * primary thread may exit the barrier code and free the team data structure,
3718  * and return the threads to the thread pool).
3719  *
3720  * This does not work with the tasking code, as the thread is still
3721  * expected to participate in the execution of any tasks that may have been
3722  * spawned by a member of the team, and the thread still needs access to
3723  * each thread in the team, so that it can steal work from it.
3724  *
3725  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3726  * counting mechanism, and is allocated by the primary thread before calling
3727  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3728  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3729  * of the kmp_task_team_t structs for consecutive barriers can overlap
3730  * (and will, unless the primary thread is the last thread to exit the barrier
3731  * release phase, which is not typical). The existence of such a struct is
3732  * useful outside the context of tasking.
3733  *
3734  * We currently use the existence of the threads array as an indicator that
3735  * tasks were spawned since the last barrier. If the structure is to be
3736  * useful outside the context of tasking, then this will have to change, but
3737  * not setting the field minimizes the performance impact of tasking on
3738  * barriers, when no explicit tasks were spawned (pushed, actually).
3739  */
3740 
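// A short sketch of the two-slot ("parity") scheme described above, as used by
// __kmp_task_team_setup() and __kmp_task_team_sync() later in this file: each
// kmp_team_t carries t_task_team[2], and a thread's th_task_state (0 or 1)
// selects the slot in use for the current barrier interval while the other
// slot is prepared for the next one.
//
//   // after the barrier release phase (see __kmp_task_team_sync):
//   this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
//   this_thr->th.th_task_team =
//       team->t.t_task_team[this_thr->th.th_task_state];
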
3741 static kmp_task_team_t *__kmp_free_task_teams =
3742  NULL; // Free list for task_team data structures
3743 // Lock for task team data structures
3744 kmp_bootstrap_lock_t __kmp_task_team_lock =
3745  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3746 
3747 // __kmp_alloc_task_deque:
3748 // Allocates a task deque for a particular thread, and initializes the necessary
3749 // data structures relating to the deque. This only happens once per thread
3750 // per task team since task teams are recycled. No lock is needed during
3751 // allocation since each thread allocates its own deque.
3752 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3753  kmp_thread_data_t *thread_data) {
3754  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3755  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3756 
3757  // Initialize last stolen task field to "none"
3758  thread_data->td.td_deque_last_stolen = -1;
3759 
3760  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3761  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3762  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3763 
3764  KE_TRACE(
3765  10,
3766  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3767  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3768  // Allocate space for task deque, and zero the deque
3769  // Cannot use __kmp_thread_calloc() because threads not around for
3770  // kmp_reap_task_team( ).
3771  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3772  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3773  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3774 }
3775 
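// The index wrapping used throughout this file relies on td_deque_size staying
// a power of two (INITIAL_TASK_DEQUE_SIZE is presumed to be one); conceptually,
// as assumed here for illustration (the real macros are defined in kmp.h):
//
//   // TASK_DEQUE_SIZE(td) == td.td_deque_size
//   // TASK_DEQUE_MASK(td) == td.td_deque_size - 1, so
//   index = (index + 1) & TASK_DEQUE_MASK(td); // cheap modulo for 2^k sizes
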
3776 // __kmp_free_task_deque:
3777 // Deallocates a task deque for a particular thread. Happens at library
3778 // deallocation so don't need to reset all thread data fields.
3779 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3780  if (thread_data->td.td_deque != NULL) {
3781  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3782  TCW_4(thread_data->td.td_deque_ntasks, 0);
3783  __kmp_free(thread_data->td.td_deque);
3784  thread_data->td.td_deque = NULL;
3785  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3786  }
3787 
3788 #ifdef BUILD_TIED_TASK_STACK
3789  // GEH: Figure out what to do here for td_susp_tied_tasks
3790  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3791  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3792  }
3793 #endif // BUILD_TIED_TASK_STACK
3794 }
3795 
3796 // __kmp_realloc_task_threads_data:
3797 // Allocates a threads_data array for a task team, either by allocating an
3798 // initial array or enlarging an existing array. Only the first thread to get
3799 // the lock allocs or enlarges the array and re-initializes the array elements.
3800 // That thread returns "TRUE", the rest return "FALSE".
3801 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3802 // The current size is given by task_team -> tt.tt_max_threads.
3803 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3804  kmp_task_team_t *task_team) {
3805  kmp_thread_data_t **threads_data_p;
3806  kmp_int32 nthreads, maxthreads;
3807  int is_init_thread = FALSE;
3808 
3809  if (TCR_4(task_team->tt.tt_found_tasks)) {
3810  // Already reallocated and initialized.
3811  return FALSE;
3812  }
3813 
3814  threads_data_p = &task_team->tt.tt_threads_data;
3815  nthreads = task_team->tt.tt_nproc;
3816  maxthreads = task_team->tt.tt_max_threads;
3817 
3818  // All threads must lock when they encounter the first task of the implicit
3819  // task region to make sure threads_data fields are (re)initialized before
3820  // used.
3821  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3822 
3823  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3824  // first thread to enable tasking
3825  kmp_team_t *team = thread->th.th_team;
3826  int i;
3827 
3828  is_init_thread = TRUE;
3829  if (maxthreads < nthreads) {
3830 
3831  if (*threads_data_p != NULL) {
3832  kmp_thread_data_t *old_data = *threads_data_p;
3833  kmp_thread_data_t *new_data = NULL;
3834 
3835  KE_TRACE(
3836  10,
3837  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3838  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3839  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3840  // Reallocate threads_data to have more elements than current array
3841  // Cannot use __kmp_thread_realloc() because threads not around for
3842  // kmp_reap_task_team( ). Note all new array entries are initialized
3843  // to zero by __kmp_allocate().
3844  new_data = (kmp_thread_data_t *)__kmp_allocate(
3845  nthreads * sizeof(kmp_thread_data_t));
3846  // copy old data to new data
3847  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3848  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3849 
3850 #ifdef BUILD_TIED_TASK_STACK
3851  // GEH: Figure out if this is the right thing to do
3852  for (i = maxthreads; i < nthreads; i++) {
3853  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3854  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3855  }
3856 #endif // BUILD_TIED_TASK_STACK
3857  // Install the new data and free the old data
3858  (*threads_data_p) = new_data;
3859  __kmp_free(old_data);
3860  } else {
3861  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3862  "threads data for task_team %p, size = %d\n",
3863  __kmp_gtid_from_thread(thread), task_team, nthreads));
3864  // Make the initial allocate for threads_data array, and zero entries
3865  // Cannot use __kmp_thread_calloc() because threads not around for
3866  // kmp_reap_task_team( ).
3867  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3868  nthreads * sizeof(kmp_thread_data_t));
3869 #ifdef BUILD_TIED_TASK_STACK
3870  // GEH: Figure out if this is the right thing to do
3871  for (i = 0; i < nthreads; i++) {
3872  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3873  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3874  }
3875 #endif // BUILD_TIED_TASK_STACK
3876  }
3877  task_team->tt.tt_max_threads = nthreads;
3878  } else {
3879  // If array has (more than) enough elements, go ahead and use it
3880  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3881  }
3882 
3883  // initialize threads_data pointers back to thread_info structures
3884  for (i = 0; i < nthreads; i++) {
3885  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3886  thread_data->td.td_thr = team->t.t_threads[i];
3887 
3888  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3889  // The last stolen field survives across teams / barrier, and the number
3890  // of threads may have changed. It's possible (likely?) that a new
3891  // parallel region will exhibit the same behavior as the previous region.
3892  thread_data->td.td_deque_last_stolen = -1;
3893  }
3894  }
3895 
3896  KMP_MB();
3897  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3898  }
3899 
3900  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3901  return is_init_thread;
3902 }
3903 
3904 // __kmp_free_task_threads_data:
3905 // Deallocates a threads_data array for a task team, including any attached
3906 // tasking deques. Only occurs at library shutdown.
3907 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3908  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3909  if (task_team->tt.tt_threads_data != NULL) {
3910  int i;
3911  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3912  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3913  }
3914  __kmp_free(task_team->tt.tt_threads_data);
3915  task_team->tt.tt_threads_data = NULL;
3916  }
3917  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3918 }
3919 
3920 // __kmp_free_task_pri_list:
3921 // Deallocates tasking deques used for priority tasks.
3922 // Only occurs at library shutdown.
3923 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3924  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3925  if (task_team->tt.tt_task_pri_list != NULL) {
3926  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3927  while (list != NULL) {
3928  kmp_task_pri_t *next = list->next;
3929  __kmp_free_task_deque(&list->td);
3930  __kmp_free(list);
3931  list = next;
3932  }
3933  task_team->tt.tt_task_pri_list = NULL;
3934  }
3935  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3936 }
3937 
3938 static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3939  kmp_team_t *team) {
3940  int team_nth = team->t.t_nproc;
3941  // Only need to init if task team isn't active or team size changed
3942  if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3943  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3944  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3945  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3946  TCW_4(task_team->tt.tt_nproc, team_nth);
3947  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3948  TCW_4(task_team->tt.tt_active, TRUE);
3949  }
3950 }
3951 
3952 // __kmp_allocate_task_team:
3953 // Allocates a task team associated with a specific team, taking it from
3954 // the global task team free list if possible. Also initializes data
3955 // structures.
3956 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3957  kmp_team_t *team) {
3958  kmp_task_team_t *task_team = NULL;
3959 
3960  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3961  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3962 
3963  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3964  // Take a task team from the task team pool
3965  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3966  if (__kmp_free_task_teams != NULL) {
3967  task_team = __kmp_free_task_teams;
3968  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3969  task_team->tt.tt_next = NULL;
3970  }
3971  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3972  }
3973 
3974  if (task_team == NULL) {
3975  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3976  "task team for team %p\n",
3977  __kmp_gtid_from_thread(thread), team));
3978  // Allocate a new task team if one is not available. Cannot use
3979  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3980  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3981  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3982  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3983 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3984  // suppress race conditions detection on synchronization flags in debug mode
3985  // this helps to analyze library internals eliminating false positives
3986  __itt_suppress_mark_range(
3987  __itt_suppress_range, __itt_suppress_threading_errors,
3988  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3989  __itt_suppress_mark_range(__itt_suppress_range,
3990  __itt_suppress_threading_errors,
3991  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3992  sizeof(task_team->tt.tt_active));
3993 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3994  // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3995  // task_team->tt.tt_threads_data = NULL;
3996  // task_team->tt.tt_max_threads = 0;
3997  // task_team->tt.tt_next = NULL;
3998  }
3999 
4000  __kmp_task_team_init(task_team, team);
4001 
4002  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
4003  "unfinished_threads init'd to %d\n",
4004  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
4005  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
4006  return task_team;
4007 }
4008 
4009 // __kmp_free_task_team:
4010 // Frees the task team associated with a specific thread, and adds it
4011 // to the global task team free list.
4012 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
4013  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
4014  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
4015 
4016  // Put task team back on free list
4017  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4018 
4019  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
4020  task_team->tt.tt_next = __kmp_free_task_teams;
4021  TCW_PTR(__kmp_free_task_teams, task_team);
4022 
4023  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4024 }
4025 
4026 // __kmp_reap_task_teams:
4027 // Free all the task teams on the task team free list.
4028 // Should only be done during library shutdown.
4029 // Cannot do anything that needs a thread structure or gtid since they are
4030 // already gone.
4031 void __kmp_reap_task_teams(void) {
4032  kmp_task_team_t *task_team;
4033 
4034  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
4035  // Free all task_teams on the free list
4036  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4037  while ((task_team = __kmp_free_task_teams) != NULL) {
4038  __kmp_free_task_teams = task_team->tt.tt_next;
4039  task_team->tt.tt_next = NULL;
4040 
4041  // Free threads_data if necessary
4042  if (task_team->tt.tt_threads_data != NULL) {
4043  __kmp_free_task_threads_data(task_team);
4044  }
4045  if (task_team->tt.tt_task_pri_list != NULL) {
4046  __kmp_free_task_pri_list(task_team);
4047  }
4048  __kmp_free(task_team);
4049  }
4050  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4051  }
4052 }
4053 
4054 // View the array of two task team pointers as a pair of pointers:
4055 // 1) a single task_team pointer
4056 // 2) next pointer for stack
4057 // Serial teams can create a stack of task teams for nested serial teams.
4058 void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
4059  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4060  kmp_task_team_list_t *current =
4061  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
4062  kmp_task_team_list_t *node =
4063  (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
4064  node->task_team = current->task_team;
4065  node->next = current->next;
4066  thread->th.th_task_team = current->task_team = NULL;
4067  current->next = node;
4068 }
4069 
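// The cast above treats the serial team's two-element t_task_team array as a
// list node; conceptually (field layout assumed here for illustration, the
// real kmp_task_team_list_t is declared in kmp.h):
//
//   typedef struct kmp_task_team_list_t {
//     kmp_task_team_t *task_team;        // overlays t_task_team[0]
//     struct kmp_task_team_list_t *next; // overlays t_task_team[1]
//   } kmp_task_team_list_t;
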
4070 // Serial team pops a task team off the stack
4071 void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
4072  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4073  kmp_task_team_list_t *current =
4074  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
4075  if (current->task_team) {
4076  __kmp_free_task_team(thread, current->task_team);
4077  }
4078  kmp_task_team_list_t *next = current->next;
4079  if (next) {
4080  current->task_team = next->task_team;
4081  current->next = next->next;
4082  KMP_DEBUG_ASSERT(next != current);
4083  __kmp_free(next);
4084  thread->th.th_task_team = current->task_team;
4085  }
4086 }
4087 
4088 // __kmp_wait_to_unref_task_teams:
4089 // Some threads could still be in the fork barrier release code, possibly
4090 // trying to steal tasks. Wait for each thread to unreference its task team.
4091 void __kmp_wait_to_unref_task_teams(void) {
4092  kmp_info_t *thread;
4093  kmp_uint32 spins;
4094  kmp_uint64 time;
4095  int done;
4096 
4097  KMP_INIT_YIELD(spins);
4098  KMP_INIT_BACKOFF(time);
4099 
4100  for (;;) {
4101  done = TRUE;
4102 
4103  // TODO: GEH - this may be wrong because some sync would be necessary
4104  // in case threads are added to the pool during the traversal. Need to
4105  // verify that lock for thread pool is held when calling this routine.
4106  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
4107  thread = thread->th.th_next_pool) {
4108 #if KMP_OS_WINDOWS
4109  DWORD exit_val;
4110 #endif
4111  if (TCR_PTR(thread->th.th_task_team) == NULL) {
4112  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
4113  __kmp_gtid_from_thread(thread)));
4114  continue;
4115  }
4116 #if KMP_OS_WINDOWS
4117  // TODO: GEH - add this check for Linux* OS / OS X* as well?
4118  if (!__kmp_is_thread_alive(thread, &exit_val)) {
4119  thread->th.th_task_team = NULL;
4120  continue;
4121  }
4122 #endif
4123 
4124  done = FALSE; // Because th_task_team pointer is not NULL for this thread
4125 
4126  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
4127  "unreference task_team\n",
4128  __kmp_gtid_from_thread(thread)));
4129 
4130  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
4131  void *sleep_loc;
4132  // If the thread is sleeping, awaken it.
4133  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
4134  NULL) {
4135  KA_TRACE(
4136  10,
4137  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
4138  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
4139  __kmp_null_resume_wrapper(thread);
4140  }
4141  }
4142  }
4143  if (done) {
4144  break;
4145  }
4146 
4147  // If oversubscribed or have waited a bit, yield.
4148  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
4149  }
4150 }
4151 
4152 // __kmp_task_team_setup: Create a task_team for the current team, but use
4153 // an already created, unused one if it already exists.
4154 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
4155  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4156 
4157  // For the serial and root teams, set up the first task team pointer to point
4158  // to the task team. The other pointer is a stack of task teams from previous
4159  // serial levels.
4160  if (team == this_thr->th.th_serial_team ||
4161  team == this_thr->th.th_root->r.r_root_team) {
4162  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4163  if (team->t.t_task_team[0] == NULL) {
4164  team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
4165  KA_TRACE(
4166  20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4167  " for serial/root team %p\n",
4168  __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
4169 
4170  } else
4171  __kmp_task_team_init(team->t.t_task_team[0], team);
4172  return;
4173  }
4174 
4175  // If this task_team hasn't been created yet, allocate it. It will be used in
4176  // the region after the next.
4177  // If it exists, it is the current task team and shouldn't be touched yet as
4178  // it may still be in use.
4179  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
4180  team->t.t_task_team[this_thr->th.th_task_state] =
4181  __kmp_allocate_task_team(this_thr, team);
4182  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4183  " for team %d at parity=%d\n",
4184  __kmp_gtid_from_thread(this_thr),
4185  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
4186  this_thr->th.th_task_state));
4187  }
4188 
4189  // After threads exit the release, they will call sync, and then point to this
4190  // other task_team; make sure it is allocated and properly initialized. As
4191  // threads spin in the barrier release phase, they will continue to use the
4192  // previous task_team struct (above), until they receive the signal to stop
4193  // checking for tasks (they can't safely reference the kmp_team_t struct,
4194  // which could be reallocated by the primary thread).
4195  int other_team = 1 - this_thr->th.th_task_state;
4196  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
4197  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
4198  team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
4199  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
4200  "task_team %p for team %d at parity=%d\n",
4201  __kmp_gtid_from_thread(this_thr),
4202  team->t.t_task_team[other_team], team->t.t_id, other_team));
4203  } else { // Leave the old task team struct in place for the upcoming region;
4204  // adjust as needed
4205  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
4206  __kmp_task_team_init(task_team, team);
4207  // if team size has changed, the first thread to enable tasking will
4208  // realloc threads_data if necessary
4209  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
4210  "%p for team %d at parity=%d\n",
4211  __kmp_gtid_from_thread(this_thr),
4212  team->t.t_task_team[other_team], team->t.t_id, other_team));
4213  }
4214 
4215  // For a regular thread, task enabling should be called when the task is going
4216  // to be pushed to a deque. However, for the hidden helper thread, we need
4217  // it ahead of time so that some operations can be performed without race
4218  // condition.
4219  if (this_thr == __kmp_hidden_helper_main_thread) {
4220  for (int i = 0; i < 2; ++i) {
4221  kmp_task_team_t *task_team = team->t.t_task_team[i];
4222  if (KMP_TASKING_ENABLED(task_team)) {
4223  continue;
4224  }
4225  __kmp_enable_tasking(task_team, this_thr);
4226  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4227  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4228  if (thread_data->td.td_deque == NULL) {
4229  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
4230  }
4231  }
4232  }
4233  }
4234 }
4235 
4236 // __kmp_task_team_sync: Propagation of task team data from team to threads
4237 // which happens just after the release phase of a team barrier. This may be
4238 // called by any thread. This is not called for serial or root teams.
4239 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
4240  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4241  KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
4242  KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
4243 
4244  // Toggle the th_task_state field, to switch which task_team this thread
4245  // refers to
4246  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4247 
4248  // It is now safe to propagate the task team pointer from the team struct to
4249  // the current thread.
4250  TCW_PTR(this_thr->th.th_task_team,
4251  team->t.t_task_team[this_thr->th.th_task_state]);
4252  KA_TRACE(20,
4253  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4254  "%p from Team #%d (parity=%d)\n",
4255  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4256  team->t.t_id, this_thr->th.th_task_state));
4257 }
4258 
4259 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4260 // barrier gather phase. Only called by the primary thread.
4261 //
4262 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4263 // by passing in 0 optionally as the last argument. When wait is zero, primary
4264 // thread does not wait for unfinished_threads to reach 0.
4265 void __kmp_task_team_wait(
4266  kmp_info_t *this_thr,
4267  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4268  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4269 
4270  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4271  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4272 
4273  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4274  if (wait) {
4275  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4276  "(for unfinished_threads to reach 0) on task_team = %p\n",
4277  __kmp_gtid_from_thread(this_thr), task_team));
4278  // Worker threads may have dropped through to release phase, but could
4279  // still be executing tasks. Wait here for tasks to complete. To avoid
4280  // memory contention, only primary thread checks termination condition.
4281  kmp_flag_32<false, false> flag(
4282  RCAST(std::atomic<kmp_uint32> *,
4283  &task_team->tt.tt_unfinished_threads),
4284  0U);
4285  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4286  }
4287  // Deactivate the old task team, so that the worker threads will stop
4288  // referencing it while spinning.
4289  KA_TRACE(
4290  20,
4291  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4292  "setting active to false, setting local and team's pointer to NULL\n",
4293  __kmp_gtid_from_thread(this_thr), task_team));
4294  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4295  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4296  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4297  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4298  KMP_MB();
4299 
4300  TCW_PTR(this_thr->th.th_task_team, NULL);
4301  }
4302 }
4303 
4304 // __kmp_tasking_barrier:
4305 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4306 // Internal function to execute all tasks prior to a regular barrier or a join
4307 // barrier. It is a full barrier itself, which unfortunately turns regular
4308 // barriers into double barriers and join barriers into 1 1/2 barriers.
4309 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4310  std::atomic<kmp_uint32> *spin = RCAST(
4311  std::atomic<kmp_uint32> *,
4312  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4313  int flag = FALSE;
4314  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4315 
4316 #if USE_ITT_BUILD
4317  KMP_FSYNC_SPIN_INIT(spin, NULL);
4318 #endif /* USE_ITT_BUILD */
4319  kmp_flag_32<false, false> spin_flag(spin, 0U);
4320  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4321  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4322 #if USE_ITT_BUILD
4323  // TODO: What about itt_sync_obj??
4324  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4325 #endif /* USE_ITT_BUILD */
4326 
4327  if (TCR_4(__kmp_global.g.g_done)) {
4328  if (__kmp_global.g.g_abort)
4329  __kmp_abort_thread();
4330  break;
4331  }
4332  KMP_YIELD(TRUE);
4333  }
4334 #if USE_ITT_BUILD
4335  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4336 #endif /* USE_ITT_BUILD */
4337 }
4338 
4339 // __kmp_give_task puts a task into a given thread queue if:
4340 // - the queue for that thread was created
4341 // - there's space in that queue
4342 // Because of this, __kmp_push_task needs to check if there's space after
4343 // getting the lock
4344 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4345  kmp_int32 pass) {
4346  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4347  kmp_task_team_t *task_team = taskdata->td_task_team;
4348 
4349  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4350  taskdata, tid));
4351 
4352  // If task_team is NULL, something went really wrong...
4353  KMP_DEBUG_ASSERT(task_team != NULL);
4354 
4355  bool result = false;
4356  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4357 
4358  if (thread_data->td.td_deque == NULL) {
4359  // There's no queue in this thread, go find another one
4360  // We're guaranteed that at least one thread has a queue
4361  KA_TRACE(30,
4362  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4363  tid, taskdata));
4364  return result;
4365  }
4366 
4367  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4368  TASK_DEQUE_SIZE(thread_data->td)) {
4369  KA_TRACE(
4370  30,
4371  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4372  taskdata, tid));
4373 
4374  // if this deque has already grown beyond the pass ratio, give another
4375  // thread a chance
4376  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4377  return result;
4378 
4379  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4380  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4381  TASK_DEQUE_SIZE(thread_data->td)) {
4382  // expand deque to push the task which is not allowed to execute
4383  __kmp_realloc_task_deque(thread, thread_data);
4384  }
4385 
4386  } else {
4387 
4388  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4389 
4390  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4391  TASK_DEQUE_SIZE(thread_data->td)) {
4392  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4393  "thread %d.\n",
4394  taskdata, tid));
4395 
4396  // if this deque has already grown beyond the pass ratio, give another
4397  // thread a chance
4398  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4399  goto release_and_exit;
4400 
4401  __kmp_realloc_task_deque(thread, thread_data);
4402  }
4403  }
4404 
4405  // lock is held here, and there is space in the deque
4406 
4407  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4408  // Wrap index.
4409  thread_data->td.td_deque_tail =
4410  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4411  TCW_4(thread_data->td.td_deque_ntasks,
4412  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4413 
4414  result = true;
4415  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4416  taskdata, tid));
4417 
4418 release_and_exit:
4419  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4420 
4421  return result;
4422 }
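#if 0
// A minimal sketch (not part of the runtime) of the grow-or-skip decision
// used above: a deque that has already grown to `pass` times its initial
// capacity is skipped so another thread gets a chance; otherwise it is
// reallocated under the lock so the task can be queued. The toy_* name is
// hypothetical.
static bool toy_should_try_next_thread(kmp_int32 deque_capacity,
                                       kmp_int32 initial_capacity,
                                       kmp_int32 pass) {
  // mirrors: TASK_DEQUE_SIZE(td) / INITIAL_TASK_DEQUE_SIZE >= pass
  return (deque_capacity / initial_capacity) >= pass;
}
#endif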
4423 
4424 #define PROXY_TASK_FLAG 0x40000000
4425 /* The finish of a proxy task is divided into two pieces:
4426  - the top half is the one that can be done from a thread outside the team
4427  - the bottom half must be run from a thread within the team
4428
4429  In order to run the bottom half the task gets queued back into one of the
4430  threads of the team. Once the td_incomplete_child_tasks counter of the
4431  parent is decremented the threads can leave the barriers. So, the bottom
4432  half needs to be queued before the counter is decremented. The top half is
4433  therefore divided into two parts:
4434  - things that can be run before queuing the bottom half
4435  - things that must be run after queuing the bottom half
4436
4437  This creates a second race as the bottom half can free the task before the
4438  second top half is executed. To avoid this we use the
4439  td_incomplete_child_tasks counter of the proxy task to synchronize the top
4440  and bottom half. */
4441 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4442  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4443  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4444  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4445  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4446 
4447  taskdata->td_flags.complete = 1; // mark the task as completed
4448 #if OMPX_TASKGRAPH
4449  taskdata->td_flags.onced = 1;
4450 #endif
4451 
4452  if (taskdata->td_taskgroup)
4453  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4454 
4455  // Create an imaginary child for this task so the bottom half cannot
4456  // release the task before we have completed the second top half
4457  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4458 }
4459 
4460 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4461 #if KMP_DEBUG
4462  kmp_int32 children = 0;
4463  // Predecrement simulated by "- 1" calculation
4464  children = -1 +
4465 #endif
4466  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4467  KMP_DEBUG_ASSERT(children >= 0);
4468 
4469  // Remove the imaginary child
4470  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4471 }
4472 
4473 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4474  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4475  kmp_info_t *thread = __kmp_threads[gtid];
4476 
4477  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4478  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4479  1); // top half must run before bottom half
4480 
4481  // We need to wait to make sure the top half is finished
4482  // Spinning here should be ok as this should happen quickly
4483  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4484  PROXY_TASK_FLAG) > 0)
4485  ;
4486 
4487  __kmp_release_deps(gtid, taskdata);
4488  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4489 }
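#if 0
// A minimal sketch (not part of the runtime) of the PROXY_TASK_FLAG
// handshake implemented by the three helpers above, written with C++11
// atomics. The toy_* names are hypothetical.
#include <atomic>
static void toy_first_top_half(std::atomic<kmp_int32> &children) {
  children |= PROXY_TASK_FLAG; // imaginary child: forbid freeing the task
}
static void toy_second_top_half(std::atomic<kmp_int32> &children) {
  children &= ~PROXY_TASK_FLAG; // top half fully done: allow freeing
}
static void toy_bottom_half(std::atomic<kmp_int32> &children) {
  while (children.load(std::memory_order_acquire) & PROXY_TASK_FLAG)
    ; // spin until the second top half has run
  // ... only now is it safe to release dependences and free the task
}
#endif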
4490 
4499 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4500  KMP_DEBUG_ASSERT(ptask != NULL);
4501  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4502  KA_TRACE(
4503  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4504  gtid, taskdata));
4505  __kmp_assert_valid_gtid(gtid);
4506  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4507 
4508  __kmp_first_top_half_finish_proxy(taskdata);
4509  __kmp_second_top_half_finish_proxy(taskdata);
4510  __kmp_bottom_half_finish_proxy(gtid, ptask);
4511 
4512  KA_TRACE(10,
4513  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4514  gtid, taskdata));
4515 }
4516 
4517 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4518  KMP_DEBUG_ASSERT(ptask != NULL);
4519  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4520 
4521  // Enqueue task to complete bottom half completion from a thread within the
4522  // corresponding team
4523  kmp_team_t *team = taskdata->td_team;
4524  kmp_int32 nthreads = team->t.t_nproc;
4525  kmp_info_t *thread;
4526 
4527  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4528  // but we cannot use __kmp_get_random here
4529  kmp_int32 start_k = start % nthreads;
4530  kmp_int32 pass = 1;
4531  kmp_int32 k = start_k;
4532 
4533  do {
4534  // For now we're just linearly trying to find a thread
4535  thread = team->t.t_threads[k];
4536  k = (k + 1) % nthreads;
4537 
4538  // we did a full pass through all the threads
4539  if (k == start_k)
4540  pass = pass << 1;
4541 
4542  } while (!__kmp_give_task(thread, k, ptask, pass));
4543 
4544  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4545  // wake at least one thread to execute the given task
4546  for (int i = 0; i < nthreads; ++i) {
4547  thread = team->t.t_threads[i];
4548  if (thread->th.th_sleep_loc != NULL) {
4549  __kmp_null_resume_wrapper(thread);
4550  break;
4551  }
4552  }
4553  }
4554 }
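// A worked example (not from the sources) of the selection loop above: with
// nthreads = 4 and start = 2, candidate threads are probed in the order
// 2, 3, 0, 1, 2, 3, ... and `pass` doubles (1, 2, 4, ...) each time a full
// cycle completes without success, so __kmp_give_task tolerates, and then
// grows, progressively fuller deques until the bottom half is placed.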
4555 
4563 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4564  KMP_DEBUG_ASSERT(ptask != NULL);
4565  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4566 
4567  KA_TRACE(
4568  10,
4569  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4570  taskdata));
4571 
4572  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4573 
4574  __kmp_first_top_half_finish_proxy(taskdata);
4575 
4576  __kmpc_give_task(ptask);
4577 
4578  __kmp_second_top_half_finish_proxy(taskdata);
4579 
4580  KA_TRACE(
4581  10,
4582  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4583  taskdata));
4584 }
4585 
4586 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4587  kmp_task_t *task) {
4588  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4589  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4590  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4591  td->td_allow_completion_event.ed.task = task;
4592  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4593  }
4594  return &td->td_allow_completion_event;
4595 }
4596 
4597 void __kmp_fulfill_event(kmp_event_t *event) {
4598  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4599  kmp_task_t *ptask = event->ed.task;
4600  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4601  bool detached = false;
4602  int gtid = __kmp_get_gtid();
4603 
4604  // The associated task might have completed or could be completing at this
4605  // point.
4606  // We need to take the lock to avoid races
4607  __kmp_acquire_tas_lock(&event->lock, gtid);
4608  if (taskdata->td_flags.proxy == TASK_PROXY) {
4609  detached = true;
4610  } else {
4611 #if OMPT_SUPPORT
4612  // The OMPT event must occur under mutual exclusion,
4613  // otherwise the tool might access ptask after free
4614  if (UNLIKELY(ompt_enabled.enabled))
4615  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4616 #endif
4617  }
4618  event->type = KMP_EVENT_UNINITIALIZED;
4619  __kmp_release_tas_lock(&event->lock, gtid);
4620 
4621  if (detached) {
4622 #if OMPT_SUPPORT
4623  // We free ptask afterwards and know the task is finished,
4624  // so locking is not necessary
4625  if (UNLIKELY(ompt_enabled.enabled))
4626  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4627 #endif
4628  // If the task detached, complete the proxy task
4629  if (gtid >= 0) {
4630  kmp_team_t *team = taskdata->td_team;
4631  kmp_info_t *thread = __kmp_get_thread();
4632  if (thread->th.th_team == team) {
4633  __kmpc_proxy_task_completed(gtid, ptask);
4634  return;
4635  }
4636  }
4637 
4638  // fallback
4639  __kmpc_proxy_task_completed_ooo(ptask);
4640  }
4641  }
4642 }
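#if 0
// A minimal user-level sketch (not from the sources) of the OpenMP 5.0
// pattern that reaches __kmp_fulfill_event(): a detachable task whose event
// is fulfilled later, possibly from a thread outside the team, at which
// point the proxy-task completion paths above are taken. The names g_event
// and on_io_complete are hypothetical; detach(evt) and omp_fulfill_event()
// are the standard API.
#include <omp.h>
static omp_event_handle_t g_event;
static void on_io_complete(void) { // e.g. called from an I/O callback thread
  omp_fulfill_event(g_event);      // ends up in __kmp_fulfill_event()
}
static void producer(void) {
  omp_event_handle_t evt;
#pragma omp task detach(evt)
  { g_event = evt; } // the body returns, but the task stays incomplete
#pragma omp taskwait // released only after on_io_complete() has run
}
#endif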
4643 
4644 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4645 // for taskloop
4646 //
4647 // thread: allocating thread
4648 // task_src: pointer to source task to be duplicated
4649 // taskloop_recur: used only when dealing with taskgraph,
4650 // indicating whether we need to update task->td_task_id
4651 // returns: a pointer to the allocated kmp_task_t structure (task).
4652 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4653 #if OMPX_TASKGRAPH
4654  , int taskloop_recur
4655 #endif
4656 ) {
4657  kmp_task_t *task;
4658  kmp_taskdata_t *taskdata;
4659  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4660  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4661  size_t shareds_offset;
4662  size_t task_size;
4663 
4664  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4665  task_src));
4666  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4667  TASK_FULL); // it should not be proxy task
4668  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4669  task_size = taskdata_src->td_size_alloc;
4670 
4671  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4672  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4673  task_size));
4674 #if USE_FAST_MEMORY
4675  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4676 #else
4677  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4678 #endif /* USE_FAST_MEMORY */
4679  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4680 
4681  task = KMP_TASKDATA_TO_TASK(taskdata);
4682 
4683  // Initialize new task (only specific fields not affected by memcpy)
4684 #if OMPX_TASKGRAPH
4685  if (taskdata->is_taskgraph && !taskloop_recur &&
4686  __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4687  taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4688 #endif
4689  taskdata->td_task_id = KMP_GEN_TASK_ID();
4690  if (task->shareds != NULL) { // need to set up shareds pointer
4691  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4692  task->shareds = &((char *)taskdata)[shareds_offset];
4693  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4694  0);
4695  }
4696  taskdata->td_alloc_thread = thread;
4697  taskdata->td_parent = parent_task;
4698  // task inherits the taskgroup from the parent task
4699  taskdata->td_taskgroup = parent_task->td_taskgroup;
4700  // tied task needs to initialize the td_last_tied at creation,
4701  // untied one does this when it is scheduled for execution
4702  if (taskdata->td_flags.tiedness == TASK_TIED)
4703  taskdata->td_last_tied = taskdata;
4704 
4705  // Only need to keep track of child task counts if team parallel and tasking
4706  // not serialized
4707  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4708  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4709  if (parent_task->td_taskgroup)
4710  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4711  // Only need to keep track of allocated child tasks for explicit tasks since
4712  // implicit tasks are not deallocated
4713  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4714  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4715  }
4716 
4717  KA_TRACE(20,
4718  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4719  thread, taskdata, taskdata->td_parent));
4720 #if OMPT_SUPPORT
4721  if (UNLIKELY(ompt_enabled.enabled))
4722  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4723 #endif
4724  return task;
4725 }
4726 
4727 // Routine optionally generated by the compiler for setting the lastprivate flag
4728 // and calling needed constructors for private/firstprivate objects
4729 // (used to form taskloop tasks from pattern task)
4730 // Parameters: dest task, src task, lastprivate flag.
4731 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4732 
4733 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4734 
4735 // class to encapsulate manipulating loop bounds in a taskloop task.
4736 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4737 // the loop bound variables.
4738 class kmp_taskloop_bounds_t {
4739  kmp_task_t *task;
4740  const kmp_taskdata_t *taskdata;
4741  size_t lower_offset;
4742  size_t upper_offset;
4743 
4744 public:
4745  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4746  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4747  lower_offset((char *)lb - (char *)task),
4748  upper_offset((char *)ub - (char *)task) {
4749  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4750  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4751  }
4752  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4753  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4754  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4755  size_t get_lower_offset() const { return lower_offset; }
4756  size_t get_upper_offset() const { return upper_offset; }
4757  kmp_uint64 get_lb() const {
4758  kmp_int64 retval;
4759 #if defined(KMP_GOMP_COMPAT)
4760  // Intel task just returns the lower bound normally
4761  if (!taskdata->td_flags.native) {
4762  retval = *(kmp_int64 *)((char *)task + lower_offset);
4763  } else {
4764  // GOMP task has to take into account the sizeof(long)
4765  if (taskdata->td_size_loop_bounds == 4) {
4766  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4767  retval = (kmp_int64)*lb;
4768  } else {
4769  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4770  retval = (kmp_int64)*lb;
4771  }
4772  }
4773 #else
4774  (void)taskdata;
4775  retval = *(kmp_int64 *)((char *)task + lower_offset);
4776 #endif // defined(KMP_GOMP_COMPAT)
4777  return retval;
4778  }
4779  kmp_uint64 get_ub() const {
4780  kmp_int64 retval;
4781 #if defined(KMP_GOMP_COMPAT)
4782  // Intel task just returns the upper bound normally
4783  if (!taskdata->td_flags.native) {
4784  retval = *(kmp_int64 *)((char *)task + upper_offset);
4785  } else {
4786  // GOMP task has to take into account the sizeof(long)
4787  if (taskdata->td_size_loop_bounds == 4) {
4788  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4789  retval = (kmp_int64)*ub;
4790  } else {
4791  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4792  retval = (kmp_int64)*ub;
4793  }
4794  }
4795 #else
4796  retval = *(kmp_int64 *)((char *)task + upper_offset);
4797 #endif // defined(KMP_GOMP_COMPAT)
4798  return retval;
4799  }
4800  void set_lb(kmp_uint64 lb) {
4801 #if defined(KMP_GOMP_COMPAT)
4802  // Intel task just sets the lower bound normally
4803  if (!taskdata->td_flags.native) {
4804  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4805  } else {
4806  // GOMP task has to take into account the sizeof(long)
4807  if (taskdata->td_size_loop_bounds == 4) {
4808  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4809  *lower = (kmp_uint32)lb;
4810  } else {
4811  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4812  *lower = (kmp_uint64)lb;
4813  }
4814  }
4815 #else
4816  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4817 #endif // defined(KMP_GOMP_COMPAT)
4818  }
4819  void set_ub(kmp_uint64 ub) {
4820 #if defined(KMP_GOMP_COMPAT)
4821  // Intel task just sets the upper bound normally
4822  if (!taskdata->td_flags.native) {
4823  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4824  } else {
4825  // GOMP task has to take into account the sizeof(long)
4826  if (taskdata->td_size_loop_bounds == 4) {
4827  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4828  *upper = (kmp_uint32)ub;
4829  } else {
4830  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4831  *upper = (kmp_uint64)ub;
4832  }
4833  }
4834 #else
4835  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4836 #endif // defined(KMP_GOMP_COMPAT)
4837  }
4838 };
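#if 0
// A minimal sketch (not from the sources) of the GOMP layout handled by the
// accessors above when td_flags.native is set: the loop bounds are the first
// two 'long'-sized slots of task->shareds (width given by
// td_size_loop_bounds), whereas the Intel/LLVM pattern task keeps lb/ub at
// fixed byte offsets inside the task itself. The struct is illustrative only.
struct toy_gomp_taskloop_shareds {
  long lb; // read/written by get_lb()/set_lb() through task->shareds
  long ub; // read/written by get_ub()/set_ub() through task->shareds + 1
  // ... followed by the captured firstprivate/shared data
};
#endif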
4839 
4840 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4841 //
4842 // loc Source location information
4843 // gtid Global thread ID
4844 // task Pattern task, exposes the loop iteration range
4845 // lb Pointer to loop lower bound in task structure
4846 // ub Pointer to loop upper bound in task structure
4847 // st Loop stride
4848 // ub_glob Global upper bound (used for lastprivate check)
4849 // num_tasks Number of tasks to execute
4850 // grainsize Number of loop iterations per task
4851 // extras Number of chunks with grainsize+1 iterations
4852 // last_chunk Reduction of grainsize for last task
4853 // tc Iterations count
4854 // task_dup Tasks duplication routine
4855 // codeptr_ra Return address for OMPT events
4856 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4857  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4858  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4859  kmp_uint64 grainsize, kmp_uint64 extras,
4860  kmp_int64 last_chunk, kmp_uint64 tc,
4861 #if OMPT_SUPPORT
4862  void *codeptr_ra,
4863 #endif
4864  void *task_dup) {
4865  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4866  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4867  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4868  // compiler provides global bounds here
4869  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4870  kmp_uint64 lower = task_bounds.get_lb();
4871  kmp_uint64 upper = task_bounds.get_ub();
4872  kmp_uint64 i;
4873  kmp_info_t *thread = __kmp_threads[gtid];
4874  kmp_taskdata_t *current_task = thread->th.th_current_task;
4875  kmp_task_t *next_task;
4876  kmp_int32 lastpriv = 0;
4877 
4878  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4879  (last_chunk < 0 ? last_chunk : extras));
4880  KMP_DEBUG_ASSERT(num_tasks > extras);
4881  KMP_DEBUG_ASSERT(num_tasks > 0);
4882  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4883  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4884  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4885  ub_glob, st, task_dup));
4886 
4887  // Launch num_tasks tasks, assign grainsize iterations to each task
4888  for (i = 0; i < num_tasks; ++i) {
4889  kmp_uint64 chunk_minus_1;
4890  if (extras == 0) {
4891  chunk_minus_1 = grainsize - 1;
4892  } else {
4893  chunk_minus_1 = grainsize;
4894  --extras; // first extras iterations get bigger chunk (grainsize+1)
4895  }
4896  upper = lower + st * chunk_minus_1;
4897  if (upper > *ub) {
4898  upper = *ub;
4899  }
4900  if (i == num_tasks - 1) {
4901  // schedule the last task, set lastprivate flag if needed
4902  if (st == 1) { // most common case
4903  KMP_DEBUG_ASSERT(upper == *ub);
4904  if (upper == ub_glob)
4905  lastpriv = 1;
4906  } else if (st > 0) { // positive loop stride
4907  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4908  if ((kmp_uint64)st > ub_glob - upper)
4909  lastpriv = 1;
4910  } else { // negative loop stride
4911  KMP_DEBUG_ASSERT(upper + st < *ub);
4912  if (upper - ub_glob < (kmp_uint64)(-st))
4913  lastpriv = 1;
4914  }
4915  }
4916 
4917 #if OMPX_TASKGRAPH
4918  next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4919 #else
4920  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4921 #endif
4922 
4923  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4924  kmp_taskloop_bounds_t next_task_bounds =
4925  kmp_taskloop_bounds_t(next_task, task_bounds);
4926 
4927  // adjust task-specific bounds
4928  next_task_bounds.set_lb(lower);
4929  if (next_taskdata->td_flags.native) {
4930  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4931  } else {
4932  next_task_bounds.set_ub(upper);
4933  }
4934  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4935  // etc.
4936  ptask_dup(next_task, task, lastpriv);
4937  KA_TRACE(40,
4938  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4939  "upper %lld stride %lld, (offsets %p %p)\n",
4940  gtid, i, next_task, lower, upper, st,
4941  next_task_bounds.get_lower_offset(),
4942  next_task_bounds.get_upper_offset()));
4943 #if OMPT_SUPPORT
4944  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4945  codeptr_ra); // schedule new task
4946 #if OMPT_OPTIONAL
4947  if (ompt_enabled.ompt_callback_dispatch) {
4948  OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4949  lower, upper, st);
4950  }
4951 #endif // OMPT_OPTIONAL
4952 #else
4953  __kmp_omp_task(gtid, next_task, true); // schedule new task
4954 #endif
4955  lower = upper + st; // adjust lower bound for the next iteration
4956  }
4957  // free the pattern task and exit
4958  __kmp_task_start(gtid, task, current_task); // do internal bookkeeping
4959  // do not execute the pattern task, just do internal bookkeeping
4960  __kmp_task_finish<false>(gtid, task, current_task);
4961 }
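// A worked example (not from the sources) of the chunking above: with
// tc = 10, num_tasks = 3, st = 1 and no strict modifier, the caller computes
// grainsize = 10 / 3 = 3, extras = 10 % 3 = 1 and last_chunk = 0. The first
// `extras` tasks get grainsize + 1 iterations, so the generated tasks cover
// 4, 3 and 3 iterations, and only the final task may set the lastprivate
// flag.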
4962 
4963 // Structure to keep taskloop parameters for auxiliary task
4964 // kept in the shareds of the task structure.
4965 typedef struct __taskloop_params {
4966  kmp_task_t *task;
4967  kmp_uint64 *lb;
4968  kmp_uint64 *ub;
4969  void *task_dup;
4970  kmp_int64 st;
4971  kmp_uint64 ub_glob;
4972  kmp_uint64 num_tasks;
4973  kmp_uint64 grainsize;
4974  kmp_uint64 extras;
4975  kmp_int64 last_chunk;
4976  kmp_uint64 tc;
4977  kmp_uint64 num_t_min;
4978 #if OMPT_SUPPORT
4979  void *codeptr_ra;
4980 #endif
4981 } __taskloop_params_t;
4982 
4983 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4984  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4985  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4986  kmp_uint64,
4987 #if OMPT_SUPPORT
4988  void *,
4989 #endif
4990  void *);
4991 
4992 // Execute part of the taskloop submitted as a task.
4993 int __kmp_taskloop_task(int gtid, void *ptask) {
4994  __taskloop_params_t *p =
4995  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4996  kmp_task_t *task = p->task;
4997  kmp_uint64 *lb = p->lb;
4998  kmp_uint64 *ub = p->ub;
4999  void *task_dup = p->task_dup;
5000  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5001  kmp_int64 st = p->st;
5002  kmp_uint64 ub_glob = p->ub_glob;
5003  kmp_uint64 num_tasks = p->num_tasks;
5004  kmp_uint64 grainsize = p->grainsize;
5005  kmp_uint64 extras = p->extras;
5006  kmp_int64 last_chunk = p->last_chunk;
5007  kmp_uint64 tc = p->tc;
5008  kmp_uint64 num_t_min = p->num_t_min;
5009 #if OMPT_SUPPORT
5010  void *codeptr_ra = p->codeptr_ra;
5011 #endif
5012 #if KMP_DEBUG
5013  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5014  KMP_DEBUG_ASSERT(task != NULL);
5015  KA_TRACE(20,
5016  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
5017  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5018  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5019  st, task_dup));
5020 #endif
5021  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
5022  if (num_tasks > num_t_min)
5023  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5024  grainsize, extras, last_chunk, tc, num_t_min,
5025 #if OMPT_SUPPORT
5026  codeptr_ra,
5027 #endif
5028  task_dup);
5029  else
5030  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5031  grainsize, extras, last_chunk, tc,
5032 #if OMPT_SUPPORT
5033  codeptr_ra,
5034 #endif
5035  task_dup);
5036 
5037  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
5038  return 0;
5039 }
5040 
5041 // Schedule part of the taskloop as a task,
5042 // execute the rest of the taskloop.
5043 //
5044 // loc Source location information
5045 // gtid Global thread ID
5046 // task Pattern task, exposes the loop iteration range
5047 // lb Pointer to loop lower bound in task structure
5048 // ub Pointer to loop upper bound in task structure
5049 // st Loop stride
5050 // ub_glob Global upper bound (used for lastprivate check)
5051 // num_tasks Number of tasks to execute
5052 // grainsize Number of loop iterations per task
5053 // extras Number of chunks with grainsize+1 iterations
5054 // last_chunk Reduction of grainsize for last task
5055 // tc Iterations count
5056 // num_t_min Threshold to launch tasks recursively
5057 // task_dup Tasks duplication routine
5058 // codeptr_ra Return address for OMPT events
5059 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
5060  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5061  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
5062  kmp_uint64 grainsize, kmp_uint64 extras,
5063  kmp_int64 last_chunk, kmp_uint64 tc,
5064  kmp_uint64 num_t_min,
5065 #if OMPT_SUPPORT
5066  void *codeptr_ra,
5067 #endif
5068  void *task_dup) {
5069  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5070  KMP_DEBUG_ASSERT(task != NULL);
5071  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
5072  KA_TRACE(20,
5073  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
5074  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5075  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5076  st, task_dup));
5077  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5078  kmp_uint64 lower = *lb;
5079  kmp_info_t *thread = __kmp_threads[gtid];
5080  // kmp_taskdata_t *current_task = thread->th.th_current_task;
5081  kmp_task_t *next_task;
5082  size_t lower_offset =
5083  (char *)lb - (char *)task; // remember offset of lb in the task structure
5084  size_t upper_offset =
5085  (char *)ub - (char *)task; // remember offset of ub in the task structure
5086 
5087  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5088  (last_chunk < 0 ? last_chunk : extras));
5089  KMP_DEBUG_ASSERT(num_tasks > extras);
5090  KMP_DEBUG_ASSERT(num_tasks > 0);
5091 
5092  // split the loop into two halves
5093  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
5094  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
5095  kmp_uint64 gr_size0 = grainsize;
5096  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
5097  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
5098  if (last_chunk < 0) {
5099  ext0 = ext1 = 0;
5100  last_chunk1 = last_chunk;
5101  tc0 = grainsize * n_tsk0;
5102  tc1 = tc - tc0;
5103  } else if (n_tsk0 <= extras) {
5104  gr_size0++; // integrate extras into grainsize
5105  ext0 = 0; // no extra iters in 1st half
5106  ext1 = extras - n_tsk0; // remaining extras
5107  tc0 = gr_size0 * n_tsk0;
5108  tc1 = tc - tc0;
5109  } else { // n_tsk0 > extras
5110  ext1 = 0; // no extra iters in 2nd half
5111  ext0 = extras;
5112  tc1 = grainsize * n_tsk1;
5113  tc0 = tc - tc1;
5114  }
5115  ub0 = lower + st * (tc0 - 1);
5116  lb1 = ub0 + st;
5117 
5118  // create pattern task for 2nd half of the loop
5119 #if OMPX_TASKGRAPH
5120  next_task = __kmp_task_dup_alloc(thread, task,
5121  /* taskloop_recur */ 1);
5122 #else
5123  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
5124 #endif
5125  // adjust lower bound (upper bound is not changed) for the 2nd half
5126  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
5127  if (ptask_dup != NULL) // construct firstprivates, etc.
5128  ptask_dup(next_task, task, 0);
5129  *ub = ub0; // adjust upper bound for the 1st half
5130 
5131  // create auxiliary task for 2nd half of the loop
5132  // make sure new task has same parent task as the pattern task
5133  kmp_taskdata_t *current_task = thread->th.th_current_task;
5134  thread->th.th_current_task = taskdata->td_parent;
5135  kmp_task_t *new_task =
5136  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
5137  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
5138  // restore current task
5139  thread->th.th_current_task = current_task;
5140  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
5141  p->task = next_task;
5142  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
5143  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
5144  p->task_dup = task_dup;
5145  p->st = st;
5146  p->ub_glob = ub_glob;
5147  p->num_tasks = n_tsk1;
5148  p->grainsize = grainsize;
5149  p->extras = ext1;
5150  p->last_chunk = last_chunk1;
5151  p->tc = tc1;
5152  p->num_t_min = num_t_min;
5153 #if OMPT_SUPPORT
5154  p->codeptr_ra = codeptr_ra;
5155 #endif
5156 
5157 #if OMPX_TASKGRAPH
5158  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
5159  new_task_data->tdg = taskdata->tdg;
5160  new_task_data->is_taskgraph = 0;
5161 #endif
5162 
5163 #if OMPT_SUPPORT
5164  // schedule new task with correct return address for OMPT events
5165  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
5166 #else
5167  __kmp_omp_task(gtid, new_task, true); // schedule new task
5168 #endif
5169 
5170  // execute the 1st half of current subrange
5171  if (n_tsk0 > num_t_min)
5172  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
5173  ext0, last_chunk0, tc0, num_t_min,
5174 #if OMPT_SUPPORT
5175  codeptr_ra,
5176 #endif
5177  task_dup);
5178  else
5179  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
5180  gr_size0, ext0, last_chunk0, tc0,
5181 #if OMPT_SUPPORT
5182  codeptr_ra,
5183 #endif
5184  task_dup);
5185 
5186  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
5187 }
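// A worked example (not from the sources) of the split above: with
// num_tasks = 7, grainsize = 3, extras = 2, last_chunk = 0 and tc = 23,
// n_tsk0 = 7 >> 1 = 3 and n_tsk1 = 4. Since n_tsk0 > extras: ext0 = 2,
// ext1 = 0, tc1 = 3 * 4 = 12 and tc0 = 23 - 12 = 11. The first half
// (3 tasks, 11 iterations, both extras) is handled by this call, while the
// second half (4 tasks, 12 iterations) is packaged into the auxiliary
// __kmp_taskloop_task and scheduled as a task.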
5188 
5189 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5190  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5191  int nogroup, int sched, kmp_uint64 grainsize,
5192  int modifier, void *task_dup) {
5193  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5194  KMP_DEBUG_ASSERT(task != NULL);
5195  if (nogroup == 0) {
5196 #if OMPT_SUPPORT && OMPT_OPTIONAL
5197  OMPT_STORE_RETURN_ADDRESS(gtid);
5198 #endif
5199  __kmpc_taskgroup(loc, gtid);
5200  }
5201 
5202 #if OMPX_TASKGRAPH
5203  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
5204 #endif
5205  // =========================================================================
5206  // calculate loop parameters
5207  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
5208  kmp_uint64 tc;
5209  // compiler provides global bounds here
5210  kmp_uint64 lower = task_bounds.get_lb();
5211  kmp_uint64 upper = task_bounds.get_ub();
5212  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
5213  kmp_uint64 num_tasks = 0, extras = 0;
5214  kmp_int64 last_chunk =
5215  0; // reduce grainsize of last task by last_chunk in strict mode
5216  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
5217  kmp_info_t *thread = __kmp_threads[gtid];
5218  kmp_taskdata_t *current_task = thread->th.th_current_task;
5219 
5220  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5221  "grain %llu(%d, %d), dup %p\n",
5222  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5223  task_dup));
5224 
5225  // compute trip count
5226  if (st == 1) { // most common case
5227  tc = upper - lower + 1;
5228  } else if (st < 0) {
5229  tc = (lower - upper) / (-st) + 1;
5230  } else { // st > 0
5231  tc = (upper - lower) / st + 1;
5232  }
5233  if (tc == 0) {
5234  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5235  // free the pattern task and exit
5236  __kmp_task_start(gtid, task, current_task);
5237  // do not execute anything for zero-trip loop
5238  __kmp_task_finish<false>(gtid, task, current_task);
5239  return;
5240  }
5241 
5242 #if OMPT_SUPPORT && OMPT_OPTIONAL
5243  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
5244  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
5245  if (ompt_enabled.ompt_callback_work) {
5246  ompt_callbacks.ompt_callback(ompt_callback_work)(
5247  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5248  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5249  }
5250 #endif
5251 
5252  if (num_tasks_min == 0)
5253  // TODO: can we choose a better default heuristic?
5254  num_tasks_min =
5255  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5256 
5257  // compute num_tasks/grainsize based on the input provided
5258  switch (sched) {
5259  case 0: // no schedule clause specified, we can choose the default
5260  // let's try to schedule (team_size*10) tasks
5261  grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
5262  KMP_FALLTHROUGH();
5263  case 2: // num_tasks provided
5264  if (grainsize > tc) {
5265  num_tasks = tc; // too big num_tasks requested, adjust values
5266  grainsize = 1;
5267  extras = 0;
5268  } else {
5269  num_tasks = grainsize;
5270  grainsize = tc / num_tasks;
5271  extras = tc % num_tasks;
5272  }
5273  break;
5274  case 1: // grainsize provided
5275  if (grainsize > tc) {
5276  num_tasks = 1;
5277  grainsize = tc; // too big grainsize requested, adjust values
5278  extras = 0;
5279  } else {
5280  if (modifier) {
5281  num_tasks = (tc + grainsize - 1) / grainsize;
5282  last_chunk = tc - (num_tasks * grainsize);
5283  extras = 0;
5284  } else {
5285  num_tasks = tc / grainsize;
5286  // adjust grainsize for balanced distribution of iterations
5287  grainsize = tc / num_tasks;
5288  extras = tc % num_tasks;
5289  }
5290  }
5291  break;
5292  default:
5293  KMP_ASSERT2(0, "unknown scheduling of taskloop");
5294  }
5295 
5296  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5297  (last_chunk < 0 ? last_chunk : extras));
5298  KMP_DEBUG_ASSERT(num_tasks > extras);
5299  KMP_DEBUG_ASSERT(num_tasks > 0);
5300  // =========================================================================
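// Worked examples (not from the sources) of the computation above, assuming
// tc = 100:
//   sched == 2, num_tasks(8):       num_tasks = 8, grainsize = 12, extras = 4
//   sched == 1, grainsize(30):      num_tasks = 3, grainsize = 33, extras = 1
//   sched == 1, grainsize(strict: 30): num_tasks = 4, grainsize = 30,
//       last_chunk = 100 - 4 * 30 = -20, so the last task runs 10 iterations
// Each case satisfies tc == num_tasks * grainsize +
// (last_chunk < 0 ? last_chunk : extras).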
5301 
5302  // check the if clause value first
5303  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5304  if (if_val == 0) { // if(0) specified, mark task as serial
5305  taskdata->td_flags.task_serial = 1;
5306  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5307  // always start serial tasks linearly
5308  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5309  grainsize, extras, last_chunk, tc,
5310 #if OMPT_SUPPORT
5311  OMPT_GET_RETURN_ADDRESS(0),
5312 #endif
5313  task_dup);
5314  // !taskdata->td_flags.native => currently force linear spawning of tasks
5315  // for GOMP_taskloop
5316  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5317  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5318  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5319  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5320  last_chunk));
5321  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5322  grainsize, extras, last_chunk, tc, num_tasks_min,
5323 #if OMPT_SUPPORT
5324  OMPT_GET_RETURN_ADDRESS(0),
5325 #endif
5326  task_dup);
5327  } else {
5328  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5329  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5330  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5331  last_chunk));
5332  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5333  grainsize, extras, last_chunk, tc,
5334 #if OMPT_SUPPORT
5335  OMPT_GET_RETURN_ADDRESS(0),
5336 #endif
5337  task_dup);
5338  }
5339 
5340 #if OMPT_SUPPORT && OMPT_OPTIONAL
5341  if (ompt_enabled.ompt_callback_work) {
5342  ompt_callbacks.ompt_callback(ompt_callback_work)(
5343  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5344  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5345  }
5346 #endif
5347 
5348  if (nogroup == 0) {
5349 #if OMPT_SUPPORT && OMPT_OPTIONAL
5350  OMPT_STORE_RETURN_ADDRESS(gtid);
5351 #endif
5352  __kmpc_end_taskgroup(loc, gtid);
5353  }
5354  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5355 }
5356 
5373 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5374  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5375  int sched, kmp_uint64 grainsize, void *task_dup) {
5376  __kmp_assert_valid_gtid(gtid);
5377  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5378  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5379  0, task_dup);
5380  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5381 }
5382 
5400 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5401  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5402  int nogroup, int sched, kmp_uint64 grainsize,
5403  int modifier, void *task_dup) {
5404  __kmp_assert_valid_gtid(gtid);
5405  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5406  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5407  modifier, task_dup);
5408  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5409 }
5410 
5419 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5420  if (gtid == KMP_GTID_DNE)
5421  return NULL;
5422 
5423  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5424  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5425 
5426  if (!taskdata)
5427  return NULL;
5428 
5429  return &taskdata->td_target_data.async_handle;
5430 }
5431 
5440 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5441  if (gtid == KMP_GTID_DNE)
5442  return FALSE;
5443 
5444  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5445  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5446 
5447  if (!taskdata)
5448  return FALSE;
5449 
5450  return taskdata->td_task_team != NULL;
5451 }
5452 
5453 #if OMPX_TASKGRAPH
5454 // __kmp_find_tdg: identify a TDG through its ID
5455 // tdg_id: ID of the TDG to look up
5456 // returns: if a TDG corresponding to this ID is found and it is not in its
5457 // initial state, a pointer to it; otherwise nullptr
5458 // (the table of TDGs is lazily allocated on the first lookup)
5459 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5460  kmp_tdg_info_t *res = nullptr;
5461  if (__kmp_max_tdgs == 0)
5462  return res;
5463 
5464  if (__kmp_global_tdgs == NULL)
5465  __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5466  sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5467 
5468  if ((__kmp_global_tdgs[tdg_id]) &&
5469  (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5470  res = __kmp_global_tdgs[tdg_id];
5471  return res;
5472 }
5473 
5474 // __kmp_print_tdg_dot: prints the TDG to a dot file
5475 // tdg: Pointer to the TDG to print
5476 // gtid: Global Thread ID
5477 void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
5478  kmp_int32 tdg_id = tdg->tdg_id;
5479  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5480 
5481  char file_name[20];
5482  sprintf(file_name, "tdg_%d.dot", tdg_id);
5483  kmp_safe_raii_file_t tdg_file(file_name, "w");
5484 
5485  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5486  fprintf(tdg_file,
5487  "digraph TDG {\n"
5488  " compound=true\n"
5489  " subgraph cluster {\n"
5490  " label=TDG_%d\n",
5491  tdg_id);
5492  for (kmp_int32 i = 0; i < num_tasks; i++) {
5493  fprintf(tdg_file, " %d[style=bold]\n", i);
5494  }
5495  fprintf(tdg_file, " }\n");
5496  for (kmp_int32 i = 0; i < num_tasks; i++) {
5497  kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5498  kmp_int32 *successors = tdg->record_map[i].successors;
5499  if (nsuccessors > 0) {
5500  for (kmp_int32 j = 0; j < nsuccessors; j++)
5501  fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5502  }
5503  }
5504  fprintf(tdg_file, "}");
5505  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5506 }
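// Illustrative output (not from the sources) of the routine above for a
// three-task TDG with id 0 and recorded edges 0->1 and 0->2:
//   digraph TDG {
//    compound=true
//    subgraph cluster {
//    label=TDG_0
//    0[style=bold]
//    1[style=bold]
//    2[style=bold]
//    }
//    0 -> 1
//    0 -> 2
//   }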
5507 
5508 // __kmp_exec_tdg: launch the execution of a previously
5509 // recorded TDG
5510 // gtid: Global Thread ID
5511 // tdg: Pointer to the TDG to execute
5512 void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5513  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5514  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5515  tdg->tdg_id, tdg->num_roots));
5516  kmp_node_info_t *this_record_map = tdg->record_map;
5517  kmp_int32 *this_root_tasks = tdg->root_tasks;
5518  kmp_int32 this_num_roots = tdg->num_roots;
5519  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5520 
5521  kmp_info_t *thread = __kmp_threads[gtid];
5522  kmp_taskdata_t *parent_task = thread->th.th_current_task;
5523 
5524  if (tdg->rec_taskred_data) {
5525  __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5526  }
5527 
5528  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5529  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5530 
5531  td->td_parent = parent_task;
5532  this_record_map[j].parent_task = parent_task;
5533 
5534  kmp_taskgroup_t *parent_taskgroup =
5535  this_record_map[j].parent_task->td_taskgroup;
5536 
5537  KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5538  this_record_map[j].npredecessors);
5539  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5540 
5541  if (parent_taskgroup) {
5542  KMP_ATOMIC_INC(&parent_taskgroup->count);
5543  // The taskgroup is different so we must update it
5544  td->td_taskgroup = parent_taskgroup;
5545  } else if (td->td_taskgroup != nullptr) {
5546  // If the parent doesn't have a taskgroup, remove it from the task
5547  td->td_taskgroup = nullptr;
5548  }
5549  if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5550  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5551  }
5552 
5553  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5554  __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5555  }
5556  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5557  tdg->tdg_id, tdg->num_roots));
5558 }
5559 
5560 // __kmp_start_record: set up a TDG structure and mark its
5561 // status as recording
5562 // gtid: Global Thread ID of the encountering thread
5563 // input_flags: Flags associated with the TDG
5564 // tdg_id: ID of the TDG to record
5565 static inline void __kmp_start_record(kmp_int32 gtid,
5566  kmp_taskgraph_flags_t *flags,
5567  kmp_int32 tdg_id) {
5568  kmp_tdg_info_t *tdg =
5569  (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5570  __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5571  // Initializing the TDG structure
5572  tdg->tdg_id = tdg_id;
5573  tdg->map_size = INIT_MAPSIZE;
5574  tdg->num_roots = -1;
5575  tdg->root_tasks = nullptr;
5576  tdg->tdg_status = KMP_TDG_RECORDING;
5577  tdg->rec_num_taskred = 0;
5578  tdg->rec_taskred_data = nullptr;
5579  KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5580 
5581  // Initializing the list of nodes in this TDG
5582  kmp_node_info_t *this_record_map =
5583  (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5584  for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5585  kmp_int32 *successorsList =
5586  (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5587  this_record_map[i].task = nullptr;
5588  this_record_map[i].successors = successorsList;
5589  this_record_map[i].nsuccessors = 0;
5590  this_record_map[i].npredecessors = 0;
5591  this_record_map[i].successors_size = __kmp_successors_size;
5592  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5593  }
5594 
5595  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5596 }
5597 
5598 // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5599 // the beginning of the recording process of a task region
5600 // loc_ref: Location of TDG, not used yet
5601 // gtid: Global Thread ID of the encountering thread
5602 // input_flags: Flags associated with the TDG
5603 // tdg_id: ID of the TDG to record; for now, an incremental integer
5604 // returns: 1 if we record, otherwise 0
5605 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5606  kmp_int32 input_flags, kmp_int32 tdg_id) {
5607 
5608  kmp_int32 res;
5609  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5610  KA_TRACE(10,
5611  ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5612  gtid, loc_ref, input_flags, tdg_id));
5613 
5614  if (__kmp_max_tdgs == 0) {
5615  KA_TRACE(
5616  10,
5617  ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5618  "__kmp_max_tdgs = 0\n",
5619  gtid, loc_ref, input_flags, tdg_id));
5620  return 1;
5621  }
5622 
5623  __kmpc_taskgroup(loc_ref, gtid);
5624  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5625  // TODO: use re_record flag
5626  __kmp_exec_tdg(gtid, tdg);
5627  res = 0;
5628  } else {
5629  __kmp_curr_tdg_idx = tdg_id;
5630  KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5631  __kmp_start_record(gtid, flags, tdg_id);
5632  __kmp_num_tdg++;
5633  res = 1;
5634  }
5635  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5636  gtid, tdg_id, res ? "record" : "execute"));
5637  return res;
5638 }
5639 
5640 // __kmp_end_record: finalize a TDG after recording it
5641 // gtid: Global thread ID
5642 // tdg: Pointer to the TDG
5643 void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5644  // Store roots
5645  kmp_node_info_t *this_record_map = tdg->record_map;
5646  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5647  kmp_int32 *this_root_tasks =
5648  (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5649  kmp_int32 this_map_size = tdg->map_size;
5650  kmp_int32 this_num_roots = 0;
5651  kmp_info_t *thread = __kmp_threads[gtid];
5652 
5653  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5654  if (this_record_map[i].npredecessors == 0) {
5655  this_root_tasks[this_num_roots++] = i;
5656  }
5657  }
5658 
5659  // Update with roots info and mapsize
5660  tdg->map_size = this_map_size;
5661  tdg->num_roots = this_num_roots;
5662  tdg->root_tasks = this_root_tasks;
5663  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5664  tdg->tdg_status = KMP_TDG_READY;
5665 
5666  if (thread->th.th_current_task->td_dephash) {
5667  __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5668  thread->th.th_current_task->td_dephash = NULL;
5669  }
5670 
5671  // Reset predecessor counter
5672  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5673  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5674  this_record_map[i].npredecessors);
5675  }
5676  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5677 
5678  if (__kmp_tdg_dot)
5679  __kmp_print_tdg_dot(tdg, gtid);
5680 }
5681 
5682 // __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5683 // the end of the recording phase
5684 //
5685 // loc_ref: Source location information
5686 // gtid: Global thread ID
5687 // input_flags: Flags attached to the graph
5688 // tdg_id: ID of the TDG just finished recording
5689 void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5690  kmp_int32 input_flags, kmp_int32 tdg_id) {
5691  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5692 
5693  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5694  " tdg=%d with flags=%d\n",
5695  gtid, loc_ref, tdg_id, input_flags));
5696  if (__kmp_max_tdgs) {
5697  // TODO: use input_flags->nowait
5698  __kmpc_end_taskgroup(loc_ref, gtid);
5699  if (__kmp_tdg_is_recording(tdg->tdg_status))
5700  __kmp_end_record(gtid, tdg);
5701  }
5702  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5703  " tdg=%d, its status is now READY\n",
5704  gtid, loc_ref, tdg_id));
5705 }
5706 #endif