LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #if ENABLE_LIBOMPTARGET
25 static void (*tgt_target_nowait_query)(void **);
26 
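// The query entry point is resolved lazily via KMP_DLSYM so that libomp does
// not carry a link-time dependency on libomptarget; if the symbol cannot be
// found, tgt_target_nowait_query simply stays NULL.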
27 void __kmp_init_target_task() {
28  *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
29 }
30 #endif
31 
32 /* forward declaration */
33 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
34  kmp_info_t *this_thr);
35 static void __kmp_alloc_task_deque(kmp_info_t *thread,
36  kmp_thread_data_t *thread_data);
37 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
38  kmp_task_team_t *task_team);
39 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
40 #if OMPX_TASKGRAPH
41 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
42 int __kmp_taskloop_task(int gtid, void *ptask);
43 #endif
44 
45 #ifdef BUILD_TIED_TASK_STACK
46 
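// The tied-task stack below is compiled only when BUILD_TIED_TASK_STACK is
// defined. It records suspended tied tasks in a linked list of fixed-size
// blocks (TASK_STACK_BLOCK_SIZE entries each); TASK_STACK_INDEX_MASK is used
// to detect when a push or pop crosses a block boundary.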
47 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
48 // from top to bottom
49 //
50 // gtid: global thread identifier for thread containing stack
51 // thread_data: thread data for task team thread containing stack
52 // threshold: value above which the trace statement triggers
53 // location: string identifying call site of this function (for trace)
54 static void __kmp_trace_task_stack(kmp_int32 gtid,
55  kmp_thread_data_t *thread_data,
56  int threshold, char *location) {
57  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
58  kmp_taskdata_t **stack_top = task_stack->ts_top;
59  kmp_int32 entries = task_stack->ts_entries;
60  kmp_taskdata_t *tied_task;
61 
62  KA_TRACE(
63  threshold,
64  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
65  "first_block = %p, stack_top = %p \n",
66  location, gtid, entries, task_stack->ts_first_block, stack_top));
67 
68  KMP_DEBUG_ASSERT(stack_top != NULL);
69  KMP_DEBUG_ASSERT(entries > 0);
70 
71  while (entries != 0) {
72  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
73  // fix up ts_top if we need to pop from previous block
74  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
75  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
76 
77  stack_block = stack_block->sb_prev;
78  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
79  }
80 
81  // finish bookkeeping
82  stack_top--;
83  entries--;
84 
85  tied_task = *stack_top;
86 
87  KMP_DEBUG_ASSERT(tied_task != NULL);
88  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
89 
90  KA_TRACE(threshold,
91  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
92  "stack_top=%p, tied_task=%p\n",
93  location, gtid, entries, stack_top, tied_task));
94  }
95  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
96 
97  KA_TRACE(threshold,
98  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
99  location, gtid));
100 }
101 
102 // __kmp_init_task_stack: initialize the task stack for the first time
103 // after a thread_data structure is created.
104 // It should not be necessary to do this again (assuming the stack works).
105 //
106 // gtid: global thread identifier of calling thread
107 // thread_data: thread data for task team thread containing stack
108 static void __kmp_init_task_stack(kmp_int32 gtid,
109  kmp_thread_data_t *thread_data) {
110  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
111  kmp_stack_block_t *first_block;
112 
113  // set up the first block of the stack
114  first_block = &task_stack->ts_first_block;
115  task_stack->ts_top = (kmp_taskdata_t **)first_block;
116  memset((void *)first_block, '\0',
117  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
118 
119  // initialize the stack to be empty
120  task_stack->ts_entries = TASK_STACK_EMPTY;
121  first_block->sb_next = NULL;
122  first_block->sb_prev = NULL;
123 }
124 
125 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
126 //
127 // gtid: global thread identifier for calling thread
128 // thread_data: thread info for thread containing stack
129 static void __kmp_free_task_stack(kmp_int32 gtid,
130  kmp_thread_data_t *thread_data) {
131  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
132  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
133 
134  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
135  // free from the second block of the stack
136  while (stack_block != NULL) {
137  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
138 
139  stack_block->sb_next = NULL;
140  stack_block->sb_prev = NULL;
141  if (stack_block != &task_stack->ts_first_block) {
142  __kmp_thread_free(__kmp_thread_from_gtid(gtid),
143  stack_block); // free the block, if not the first
144  }
145  stack_block = next_block;
146  }
147  // initialize the stack to be empty
148  task_stack->ts_entries = 0;
149  task_stack->ts_top = NULL;
150 }
151 
152 // __kmp_push_task_stack: Push the tied task onto the task stack.
153 // Grow the stack if necessary by allocating another block.
154 //
155 // gtid: global thread identifier for calling thread
156 // thread: thread info for thread containing stack
157 // tied_task: the task to push on the stack
158 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
159  kmp_taskdata_t *tied_task) {
160  // GEH - need to consider what to do if tt_threads_data not allocated yet
161  kmp_thread_data_t *thread_data =
162  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
163  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
164 
165  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
166  return; // Don't push anything on stack if team or team tasks are serialized
167  }
168 
169  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
170  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
171 
172  KA_TRACE(20,
173  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
174  gtid, thread, tied_task));
175  // Store entry
176  *(task_stack->ts_top) = tied_task;
177 
178  // Do bookkeeping for next push
179  task_stack->ts_top++;
180  task_stack->ts_entries++;
181 
182  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
183  // Find beginning of this task block
184  kmp_stack_block_t *stack_block =
185  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
186 
187  // Check if we already have a block
188  if (stack_block->sb_next !=
189  NULL) { // reset ts_top to beginning of next block
190  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
191  } else { // Alloc new block and link it up
192  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
193  thread, sizeof(kmp_stack_block_t));
194 
195  task_stack->ts_top = &new_block->sb_block[0];
196  stack_block->sb_next = new_block;
197  new_block->sb_prev = stack_block;
198  new_block->sb_next = NULL;
199 
200  KA_TRACE(
201  30,
202  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
203  gtid, tied_task, new_block));
204  }
205  }
206  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
207  tied_task));
208 }
209 
210 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
211 // the task, just check to make sure it matches the ending task passed in.
212 //
213 // gtid: global thread identifier for the calling thread
214 // thread: thread info structure containing stack
215 // ending_task: the task that is ending; it should match the tied task
216 // popped off the top of the stack
217 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
218  kmp_taskdata_t *ending_task) {
219  // GEH - need to consider what to do if tt_threads_data not allocated yet
220  kmp_thread_data_t *thread_data =
221  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
222  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
223  kmp_taskdata_t *tied_task;
224 
225  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
226  // Don't pop anything from stack if team or team tasks are serialized
227  return;
228  }
229 
230  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
231  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
232 
233  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
234  thread));
235 
236  // fix up ts_top if we need to pop from previous block
237  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
238  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
239 
240  stack_block = stack_block->sb_prev;
241  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
242  }
243 
244  // finish bookkeeping
245  task_stack->ts_top--;
246  task_stack->ts_entries--;
247 
248  tied_task = *(task_stack->ts_top);
249 
250  KMP_DEBUG_ASSERT(tied_task != NULL);
251  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
252  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
253 
254  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
255  tied_task));
256  return;
257 }
258 #endif /* BUILD_TIED_TASK_STACK */
259 
260 // returns 1 if new task is allowed to execute, 0 otherwise
261 // checks Task Scheduling constraint (if requested) and
262 // mutexinoutset dependencies if any
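// Illustrative TSC example: if taskcurr->td_last_tied is a deferred tied task
// L at td_level 3, the new tied task is allowed here only when walking its
// td_parent chain upward reaches L before the level drops to 3 or below, i.e.
// when it is a descendant of L; otherwise it is left in the deque for now.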
263 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
264  const kmp_taskdata_t *tasknew,
265  const kmp_taskdata_t *taskcurr) {
266  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
267  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
268  // only descendant of all deferred tied tasks can be scheduled, checking
269  // the last one is enough, as it in turn is the descendant of all others
270  kmp_taskdata_t *current = taskcurr->td_last_tied;
271  KMP_DEBUG_ASSERT(current != NULL);
272  // check if the task is not suspended on barrier
273  if (current->td_flags.tasktype == TASK_EXPLICIT ||
274  current->td_taskwait_thread > 0) { // <= 0 on barrier
275  kmp_int32 level = current->td_level;
276  kmp_taskdata_t *parent = tasknew->td_parent;
277  while (parent != current && parent->td_level > level) {
278  // check generation up to the level of the current task
279  parent = parent->td_parent;
280  KMP_DEBUG_ASSERT(parent != NULL);
281  }
282  if (parent != current)
283  return false;
284  }
285  }
286  // Check mutexinoutset dependencies, acquire locks
287  kmp_depnode_t *node = tasknew->td_depnode;
288 #if OMPX_TASKGRAPH
289  if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
290 #else
291  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
292 #endif
293  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
294  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
295  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
296  continue;
297  // could not get the lock, release previous locks
298  for (int j = i - 1; j >= 0; --j)
299  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
300  return false;
301  }
302  // negative num_locks means all locks acquired successfully
303  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
304  }
305  return true;
306 }
307 
308 // __kmp_realloc_task_deque:
309 // Re-allocates a task deque for a particular thread, copies the content from
310 // the old deque and adjusts the necessary data structures relating to the
311 // deque. This operation must be done with the deque_lock being held
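// For example, with old size 4, head = 2 and the deque full, the physical
// order [c d | a b] (a oldest) is copied as [a b c d] into the new deque of
// size 8, after which head = 0 and tail = 4 (the old size).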
312 static void __kmp_realloc_task_deque(kmp_info_t *thread,
313  kmp_thread_data_t *thread_data) {
314  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
315  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
316  kmp_int32 new_size = 2 * size;
317 
318  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
319  "%d] for thread_data %p\n",
320  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
321 
322  kmp_taskdata_t **new_deque =
323  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
324 
325  int i, j;
326  for (i = thread_data->td.td_deque_head, j = 0; j < size;
327  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
328  new_deque[j] = thread_data->td.td_deque[i];
329 
330  __kmp_free(thread_data->td.td_deque);
331 
332  thread_data->td.td_deque_head = 0;
333  thread_data->td.td_deque_tail = size;
334  thread_data->td.td_deque = new_deque;
335  thread_data->td.td_deque_size = new_size;
336 }
337 
338 static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
339  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
340  kmp_thread_data_t *thread_data = &l->td;
341  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
342  thread_data->td.td_deque_last_stolen = -1;
343  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
344  "for thread_data %p\n",
345  __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
346  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
347  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
348  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
349  return l;
350 }
351 
352 // The function finds the deque of priority tasks with given priority, or
353 // allocates a new deque and puts it into the sorted (high -> low) list of deques.
354 // Deques of non-default priority tasks are shared between all threads in the team,
355 // as opposed to per-thread deques of tasks with default priority.
356 // The function is called under the lock task_team->tt.tt_task_pri_lock.
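// E.g. if the list currently holds priorities {9, 5, 2} and pri == 7, a new
// deque is allocated and linked between 9 and 5 so the order stays 9,7,5,2;
// pri == 5 simply returns the existing deque for that priority.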
357 static kmp_thread_data_t *
358 __kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
359  kmp_thread_data_t *thread_data;
360  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
361  if (lst->priority == pri) {
362  // Found queue of tasks with given priority.
363  thread_data = &lst->td;
364  } else if (lst->priority < pri) {
365  // All current priority queues contain tasks with lower priority.
366  // Allocate new one for given priority tasks.
367  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
368  thread_data = &list->td;
369  list->priority = pri;
370  list->next = lst;
371  task_team->tt.tt_task_pri_list = list;
372  } else { // task_team->tt.tt_task_pri_list->priority > pri
373  kmp_task_pri_t *next_queue = lst->next;
374  while (next_queue && next_queue->priority > pri) {
375  lst = next_queue;
376  next_queue = lst->next;
377  }
378  // lst->priority > pri && (next == NULL || pri >= next->priority)
379  if (next_queue == NULL) {
380  // No queue with pri priority, need to allocate new one.
381  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
382  thread_data = &list->td;
383  list->priority = pri;
384  list->next = NULL;
385  lst->next = list;
386  } else if (next_queue->priority == pri) {
387  // Found queue of tasks with given priority.
388  thread_data = &next_queue->td;
389  } else { // lst->priority > pri > next->priority
390  // insert newly allocated between existed queues
391  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
392  thread_data = &list->td;
393  list->priority = pri;
394  list->next = next_queue;
395  lst->next = list;
396  }
397  }
398  return thread_data;
399 }
400 
401 // __kmp_push_priority_task: Add a task to the team's priority task deque
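// (Roughly, a construct such as "#pragma omp task priority(7)" reaches this
// routine from __kmp_push_task once the priority_specified flag is set and
// __kmp_max_task_priority, i.e. OMP_MAX_TASK_PRIORITY, is greater than zero.)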
402 static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
403  kmp_taskdata_t *taskdata,
404  kmp_task_team_t *task_team,
405  kmp_int32 pri) {
406  kmp_thread_data_t *thread_data = NULL;
407  KA_TRACE(20,
408  ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
409  gtid, taskdata, pri));
410 
411  // Find task queue specific to priority value
412  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
413  if (UNLIKELY(lst == NULL)) {
414  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
415  if (task_team->tt.tt_task_pri_list == NULL) {
416  // List of queues is still empty, allocate one.
417  kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
418  thread_data = &list->td;
419  list->priority = pri;
420  list->next = NULL;
421  task_team->tt.tt_task_pri_list = list;
422  } else {
423  // Another thread initialized a queue. Check if it fits and get thread_data.
424  thread_data = __kmp_get_priority_deque_data(task_team, pri);
425  }
426  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
427  } else {
428  if (lst->priority == pri) {
429  // Found queue of tasks with given priority.
430  thread_data = &lst->td;
431  } else {
432  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
433  thread_data = __kmp_get_priority_deque_data(task_team, pri);
434  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
435  }
436  }
437  KMP_DEBUG_ASSERT(thread_data);
438 
439  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
440  // Check if deque is full
441  if (TCR_4(thread_data->td.td_deque_ntasks) >=
442  TASK_DEQUE_SIZE(thread_data->td)) {
443  if (__kmp_enable_task_throttling &&
444  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
445  thread->th.th_current_task)) {
446  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
447  KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
448  "TASK_NOT_PUSHED for task %p\n",
449  gtid, taskdata));
450  return TASK_NOT_PUSHED;
451  } else {
452  // expand deque to push the task which is not allowed to execute
453  __kmp_realloc_task_deque(thread, thread_data);
454  }
455  }
456  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
457  TASK_DEQUE_SIZE(thread_data->td));
458  // Push taskdata.
459  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
460  // Wrap index.
461  thread_data->td.td_deque_tail =
462  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
463  TCW_4(thread_data->td.td_deque_ntasks,
464  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
465  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
466  KMP_FSYNC_RELEASING(taskdata); // releasing child
467  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
468  "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
469  gtid, taskdata, thread_data->td.td_deque_ntasks,
470  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
471  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
472  task_team->tt.tt_num_task_pri++; // atomic inc
473  return TASK_SUCCESSFULLY_PUSHED;
474 }
475 
476 // __kmp_push_task: Add a task to the thread's deque
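// Returns TASK_SUCCESSFULLY_PUSHED when the task has been deferred into a
// deque, or TASK_NOT_PUSHED when it could not be deferred (serialized team,
// or full deque with throttling enabled); in the latter case the caller,
// e.g. __kmp_omp_task, is expected to execute the task immediately instead.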
477 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
478  kmp_info_t *thread = __kmp_threads[gtid];
479  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
480 
481  // If we encounter a hidden helper task, and the current thread is not a
482  // hidden helper thread, we have to give the task to any hidden helper thread
483  // starting from its shadow one.
484  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
485  !KMP_HIDDEN_HELPER_THREAD(gtid))) {
486  kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
487  __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
488  // Signal the hidden helper threads.
489  __kmp_hidden_helper_worker_thread_signal();
490  return TASK_SUCCESSFULLY_PUSHED;
491  }
492 
493  kmp_task_team_t *task_team = thread->th.th_task_team;
494  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
495  kmp_thread_data_t *thread_data;
496 
497  KA_TRACE(20,
498  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
499 
500  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
501  // untied task needs to increment counter so that the task structure is not
502  // freed prematurely
503  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
504  KMP_DEBUG_USE_VAR(counter);
505  KA_TRACE(
506  20,
507  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
508  gtid, counter, taskdata));
509  }
510 
511  // The first check avoids building task_team thread data if serialized
512  if (UNLIKELY(taskdata->td_flags.task_serial)) {
513  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
514  "TASK_NOT_PUSHED for task %p\n",
515  gtid, taskdata));
516  return TASK_NOT_PUSHED;
517  }
518 
519  // Now that serialized tasks have returned, we can assume that we are not in
520  // immediate exec mode
521  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
522  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
523  __kmp_enable_tasking(task_team, thread);
524  }
525  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
526  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
527 
528  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
529  __kmp_max_task_priority > 0) {
530  int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
531  return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
532  }
533 
534  // Find tasking deque specific to encountering thread
535  thread_data = &task_team->tt.tt_threads_data[tid];
536 
537  // No lock needed since only owner can allocate. If the task is hidden_helper,
538  // we don't need it either because we have initialized the deque for hidden
539  // helper thread data.
540  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
541  __kmp_alloc_task_deque(thread, thread_data);
542  }
543 
544  int locked = 0;
545  // Check if deque is full
546  if (TCR_4(thread_data->td.td_deque_ntasks) >=
547  TASK_DEQUE_SIZE(thread_data->td)) {
548  if (__kmp_enable_task_throttling &&
549  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
550  thread->th.th_current_task)) {
551  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
552  "TASK_NOT_PUSHED for task %p\n",
553  gtid, taskdata));
554  return TASK_NOT_PUSHED;
555  } else {
556  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
557  locked = 1;
558  if (TCR_4(thread_data->td.td_deque_ntasks) >=
559  TASK_DEQUE_SIZE(thread_data->td)) {
560  // expand deque to push the task which is not allowed to execute
561  __kmp_realloc_task_deque(thread, thread_data);
562  }
563  }
564  }
565  // Lock the deque for the task push operation
566  if (!locked) {
567  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
568  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
569  if (TCR_4(thread_data->td.td_deque_ntasks) >=
570  TASK_DEQUE_SIZE(thread_data->td)) {
571  if (__kmp_enable_task_throttling &&
572  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
573  thread->th.th_current_task)) {
574  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
575  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
576  "returning TASK_NOT_PUSHED for task %p\n",
577  gtid, taskdata));
578  return TASK_NOT_PUSHED;
579  } else {
580  // expand deque to push the task which is not allowed to execute
581  __kmp_realloc_task_deque(thread, thread_data);
582  }
583  }
584  }
585  // Must have room since no thread other than the calling thread can add tasks
586  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
587  TASK_DEQUE_SIZE(thread_data->td));
588 
589  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
590  taskdata; // Push taskdata
591  // Wrap index.
592  thread_data->td.td_deque_tail =
593  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
594  TCW_4(thread_data->td.td_deque_ntasks,
595  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
596  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
597  KMP_FSYNC_RELEASING(taskdata); // releasing child
598  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
599  "task=%p ntasks=%d head=%u tail=%u\n",
600  gtid, taskdata, thread_data->td.td_deque_ntasks,
601  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
602 
603  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
604 
605  return TASK_SUCCESSFULLY_PUSHED;
606 }
607 
608 // __kmp_pop_current_task_from_thread: set up current task from called thread
609 // when team ends
610 //
611 // this_thr: thread structure to set current_task in.
612 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
613  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
614  "this_thread=%p, curtask=%p, "
615  "curtask_parent=%p\n",
616  0, this_thr, this_thr->th.th_current_task,
617  this_thr->th.th_current_task->td_parent));
618 
619  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
620 
621  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
622  "this_thread=%p, curtask=%p, "
623  "curtask_parent=%p\n",
624  0, this_thr, this_thr->th.th_current_task,
625  this_thr->th.th_current_task->td_parent));
626 }
627 
628 // __kmp_push_current_task_to_thread: set up current task in called thread for a
629 // new team
630 //
631 // this_thr: thread structure to set up
632 // team: team for implicit task data
633 // tid: thread within team to set up
634 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
635  int tid) {
636  // The current task of the thread becomes the parent of the newly created
637  // implicit tasks of the new team
638  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
639  "curtask=%p "
640  "parent_task=%p\n",
641  tid, this_thr, this_thr->th.th_current_task,
642  team->t.t_implicit_task_taskdata[tid].td_parent));
643 
644  KMP_DEBUG_ASSERT(this_thr != NULL);
645 
646  if (tid == 0) {
647  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
648  team->t.t_implicit_task_taskdata[0].td_parent =
649  this_thr->th.th_current_task;
650  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
651  }
652  } else {
653  team->t.t_implicit_task_taskdata[tid].td_parent =
654  team->t.t_implicit_task_taskdata[0].td_parent;
655  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
656  }
657 
658  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
659  "curtask=%p "
660  "parent_task=%p\n",
661  tid, this_thr, this_thr->th.th_current_task,
662  team->t.t_implicit_task_taskdata[tid].td_parent));
663 }
664 
665 // __kmp_task_start: bookkeeping for a task starting execution
666 //
667 // GTID: global thread id of calling thread
668 // task: task starting execution
669 // current_task: task suspending
670 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
671  kmp_taskdata_t *current_task) {
672  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
673  kmp_info_t *thread = __kmp_threads[gtid];
674 
675  KA_TRACE(10,
676  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
677  gtid, taskdata, current_task));
678 
679  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
680 
681  // mark currently executing task as suspended
682  // TODO: GEH - make sure root team implicit task is initialized properly.
683  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
684  current_task->td_flags.executing = 0;
685 
686 // Add task to stack if tied
687 #ifdef BUILD_TIED_TASK_STACK
688  if (taskdata->td_flags.tiedness == TASK_TIED) {
689  __kmp_push_task_stack(gtid, thread, taskdata);
690  }
691 #endif /* BUILD_TIED_TASK_STACK */
692 
693  // mark starting task as executing and as current task
694  thread->th.th_current_task = taskdata;
695 
696  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
697  taskdata->td_flags.tiedness == TASK_UNTIED);
698  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
699  taskdata->td_flags.tiedness == TASK_UNTIED);
700  taskdata->td_flags.started = 1;
701  taskdata->td_flags.executing = 1;
702  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
703  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
704 
705  // GEH TODO: shouldn't we pass some sort of location identifier here?
706  // APT: yes, we will pass location here.
707  // need to store current thread state (in a thread or taskdata structure)
708  // before setting work_state, otherwise wrong state is set after end of task
709 
710  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
711 
712  return;
713 }
714 
715 #if OMPT_SUPPORT
716 //------------------------------------------------------------------------------
717 
718 // __ompt_task_start:
719 // Build and trigger task-begin event
720 static inline void __ompt_task_start(kmp_task_t *task,
721  kmp_taskdata_t *current_task,
722  kmp_int32 gtid) {
723  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
724  ompt_task_status_t status = ompt_task_switch;
725  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
726  status = ompt_task_yield;
727  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
728  }
729  /* let OMPT know that we're about to run this task */
730  if (ompt_enabled.ompt_callback_task_schedule) {
731  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
732  &(current_task->ompt_task_info.task_data), status,
733  &(taskdata->ompt_task_info.task_data));
734  }
735  taskdata->ompt_task_info.scheduling_parent = current_task;
736 }
737 
738 // __ompt_task_finish:
739 // Build and trigger final task-schedule event
740 static inline void __ompt_task_finish(kmp_task_t *task,
741  kmp_taskdata_t *resumed_task,
742  ompt_task_status_t status) {
743  if (ompt_enabled.ompt_callback_task_schedule) {
744  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
745  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
746  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
747  status = ompt_task_cancel;
748  }
749 
750  /* let OMPT know that we're returning to the callee task */
751  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
752  &(taskdata->ompt_task_info.task_data), status,
753  (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
754  }
755 }
756 #endif
757 
758 template <bool ompt>
759 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
760  kmp_task_t *task,
761  void *frame_address,
762  void *return_address) {
763  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
764  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
765 
766  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
767  "current_task=%p\n",
768  gtid, loc_ref, taskdata, current_task));
769 
770  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
771  // untied task needs to increment counter so that the task structure is not
772  // freed prematurely
773  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
774  KMP_DEBUG_USE_VAR(counter);
775  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
776  "incremented for task %p\n",
777  gtid, counter, taskdata));
778  }
779 
780  taskdata->td_flags.task_serial =
781  1; // Execute this task immediately, not deferred.
782  __kmp_task_start(gtid, task, current_task);
783 
784 #if OMPT_SUPPORT
785  if (ompt) {
786  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
787  current_task->ompt_task_info.frame.enter_frame.ptr =
788  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
789  current_task->ompt_task_info.frame.enter_frame_flags =
790  taskdata->ompt_task_info.frame.exit_frame_flags =
791  OMPT_FRAME_FLAGS_APP;
792  }
793  if (ompt_enabled.ompt_callback_task_create) {
794  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
795  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
796  &(parent_info->task_data), &(parent_info->frame),
797  &(taskdata->ompt_task_info.task_data),
798  TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
799  }
800  __ompt_task_start(task, current_task, gtid);
801  }
802 #endif // OMPT_SUPPORT
803 
804  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
805  loc_ref, taskdata));
806 }
807 
808 #if OMPT_SUPPORT
809 OMPT_NOINLINE
810 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
811  kmp_task_t *task,
812  void *frame_address,
813  void *return_address) {
814  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
815  return_address);
816 }
817 #endif // OMPT_SUPPORT
818 
819 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
820 // execution
821 //
822 // loc_ref: source location information; points to beginning of task block.
823 // gtid: global thread number.
824 // task: task thunk for the started task.
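// Sketch of the expected compiler-generated sequence for an undeferred task
// such as "#pragma omp task if(0)" (names are illustrative):
//   kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, flags, sz, sz_shareds, fn);
//   __kmpc_omp_task_begin_if0(loc, gtid, t);
//   fn(gtid, t);                               // task body invoked inline
//   __kmpc_omp_task_complete_if0(loc, gtid, t);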
825 #ifdef __s390x__
826 // This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
827 // In order for it to work correctly, the caller also needs to be compiled with
828 // backchain. If a caller is compiled without backchain,
829 // OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
830 // crash.
831 __attribute__((target("backchain")))
832 #endif
833 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
834  kmp_task_t *task) {
835 #if OMPT_SUPPORT
836  if (UNLIKELY(ompt_enabled.enabled)) {
837  OMPT_STORE_RETURN_ADDRESS(gtid);
838  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
839  OMPT_GET_FRAME_ADDRESS(1),
840  OMPT_LOAD_RETURN_ADDRESS(gtid));
841  return;
842  }
843 #endif
844  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
845 }
846 
847 #ifdef TASK_UNUSED
848 // __kmpc_omp_task_begin: report that a given task has started execution
849 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
850 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
851  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
852 
853  KA_TRACE(
854  10,
855  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
856  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
857 
858  __kmp_task_start(gtid, task, current_task);
859 
860  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
861  loc_ref, KMP_TASK_TO_TASKDATA(task)));
862  return;
863 }
864 #endif // TASK_UNUSED
865 
866 // __kmp_free_task: free the current task space and the space for shareds
867 //
868 // gtid: Global thread ID of calling thread
869 // taskdata: task to free
870 // thread: thread data structure of caller
871 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
872  kmp_info_t *thread) {
873  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
874  taskdata));
875 
876  // Check to make sure all flags and counters have the correct values
877  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
878  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
879  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
880  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
881  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
882  taskdata->td_flags.task_serial == 1);
883  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
884  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
885  // Clear data to not be re-used later by mistake.
886  task->data1.destructors = NULL;
887  task->data2.priority = 0;
888 
889  taskdata->td_flags.freed = 1;
890 #if OMPX_TASKGRAPH
891  // do not free tasks in taskgraph
892  if (!taskdata->is_taskgraph) {
893 #endif
894 // deallocate the taskdata and shared variable blocks associated with this task
895 #if USE_FAST_MEMORY
896  __kmp_fast_free(thread, taskdata);
897 #else /* ! USE_FAST_MEMORY */
898  __kmp_thread_free(thread, taskdata);
899 #endif
900 #if OMPX_TASKGRAPH
901  } else {
902  taskdata->td_flags.complete = 0;
903  taskdata->td_flags.started = 0;
904  taskdata->td_flags.freed = 0;
905  taskdata->td_flags.executing = 0;
906  taskdata->td_flags.task_serial =
907  (taskdata->td_parent->td_flags.final ||
908  taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
909 
910  // taskdata->td_allow_completion_event.pending_events_count = 1;
911  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
912  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
913  // start at one because counts current task and children
914  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
915  }
916 #endif
917 
918  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
919 }
920 
921 // __kmp_free_task_and_ancestors: free the current task and ancestors without
922 // children
923 //
924 // gtid: Global thread ID of calling thread
925 // taskdata: task to free
926 // thread: thread data structure of caller
927 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
928  kmp_taskdata_t *taskdata,
929  kmp_info_t *thread) {
930  // Proxy tasks must always be allowed to free their parents
931  // because they can be run in background even in serial mode.
932  kmp_int32 team_serial =
933  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
934  !taskdata->td_flags.proxy;
935  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
936 
937  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
938  KMP_DEBUG_ASSERT(children >= 0);
939 
940  // Now, go up the ancestor tree to see if any ancestors can now be freed.
941  while (children == 0) {
942  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
943 
944  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
945  "and freeing itself\n",
946  gtid, taskdata));
947 
948  // --- Deallocate my ancestor task ---
949  __kmp_free_task(gtid, taskdata, thread);
950 
951  taskdata = parent_taskdata;
952 
953  if (team_serial)
954  return;
955  // Stop checking ancestors at implicit task instead of walking up ancestor
956  // tree to avoid premature deallocation of ancestors.
957  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
958  if (taskdata->td_dephash) { // do we need to cleanup dephash?
959  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
960  kmp_tasking_flags_t flags_old = taskdata->td_flags;
961  if (children == 0 && flags_old.complete == 1) {
962  kmp_tasking_flags_t flags_new = flags_old;
963  flags_new.complete = 0;
964  if (KMP_COMPARE_AND_STORE_ACQ32(
965  RCAST(kmp_int32 *, &taskdata->td_flags),
966  *RCAST(kmp_int32 *, &flags_old),
967  *RCAST(kmp_int32 *, &flags_new))) {
968  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
969  "dephash of implicit task %p\n",
970  gtid, taskdata));
971  // cleanup dephash of finished implicit task
972  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
973  }
974  }
975  }
976  return;
977  }
978  // Predecrement simulated by "- 1" calculation
979  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
980  KMP_DEBUG_ASSERT(children >= 0);
981  }
982 
983  KA_TRACE(
984  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
985  "not freeing it yet\n",
986  gtid, taskdata, children));
987 }
988 
989 // Only need to keep track of child task counts if any of the following:
990 // 1. team parallel and tasking not serialized;
991 // 2. it is a proxy or detachable or hidden helper task
992 // 3. the children counter of its parent task is greater than 0.
993 // The reason for the 3rd one is a serialized team that has encountered a
994 // detached or hidden helper task T. In this case, the execution of T is still
995 // deferred, and it is also possible that a regular task depends on T; if we
996 // don't track the children, task synchronization will be broken.
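// Callers use the result to decide whether the parent's
// td_incomplete_child_tasks counter and the enclosing taskgroup count must be
// maintained for this task.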
997 static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
998  kmp_tasking_flags_t flags = taskdata->td_flags;
999  bool ret = !(flags.team_serial || flags.tasking_ser);
1000  ret = ret || flags.proxy == TASK_PROXY ||
1001  flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
1002  ret = ret ||
1003  KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
1004 #if OMPX_TASKGRAPH
1005  if (taskdata->td_taskgroup && taskdata->is_taskgraph)
1006  ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
1007 #endif
1008  return ret;
1009 }
1010 
1011 // __kmp_task_finish: bookkeeping to do when a task finishes execution
1012 //
1013 // gtid: global thread ID for calling thread
1014 // task: task to be finished
1015 // resumed_task: task to be resumed. (may be NULL if task is serialized)
1016 //
1017 // template<ompt>: effectively ompt_enabled.enabled!=0
1018 // the version with ompt=false is inlined, allowing all OMPT code to be
1019 // optimized away in this case
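// Note on detachable tasks: if the task finishes executing before its
// allow-completion event has been fulfilled, it is converted into a proxy
// task below (td_flags.proxy = TASK_PROXY) and the completion bookkeeping is
// deferred until omp_fulfill_event / __kmp_fulfill_event runs.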
1020 template <bool ompt>
1021 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
1022  kmp_taskdata_t *resumed_task) {
1023  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1024  kmp_info_t *thread = __kmp_threads[gtid];
1025  kmp_task_team_t *task_team =
1026  thread->th.th_task_team; // might be NULL for serial teams...
1027 #if OMPX_TASKGRAPH
1028  // Cache is_taskgraph up front to avoid dereferencing taskdata after it may
1028  // have been freed (e.g. with vanilla taskloop).
1029  bool is_taskgraph;
1030 #endif
1031 #if KMP_DEBUG
1032  kmp_int32 children = 0;
1033 #endif
1034  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
1035  "task %p\n",
1036  gtid, taskdata, resumed_task));
1037 
1038  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
1039 
1040 #if OMPX_TASKGRAPH
1041  is_taskgraph = taskdata->is_taskgraph;
1042 #endif
1043 
1044 // Pop task from stack if tied
1045 #ifdef BUILD_TIED_TASK_STACK
1046  if (taskdata->td_flags.tiedness == TASK_TIED) {
1047  __kmp_pop_task_stack(gtid, thread, taskdata);
1048  }
1049 #endif /* BUILD_TIED_TASK_STACK */
1050 
1051  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
1052  // untied task needs to check the counter so that the task structure is not
1053  // freed prematurely
1054  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
1055  KA_TRACE(
1056  20,
1057  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
1058  gtid, counter, taskdata));
1059  if (counter > 0) {
1060  // untied task is not done, to be continued possibly by other thread, do
1061  // not free it now
1062  if (resumed_task == NULL) {
1063  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
1064  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1065  // task is the parent
1066  }
1067  thread->th.th_current_task = resumed_task; // restore current_task
1068  resumed_task->td_flags.executing = 1; // resume previous task
1069  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
1070  "resuming task %p\n",
1071  gtid, taskdata, resumed_task));
1072  return;
1073  }
1074  }
1075 
1076  // bookkeeping for resuming task:
1077  // GEH - note tasking_ser => task_serial
1078  KMP_DEBUG_ASSERT(
1079  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
1080  taskdata->td_flags.task_serial);
1081  if (taskdata->td_flags.task_serial) {
1082  if (resumed_task == NULL) {
1083  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
1084  // task is the parent
1085  }
1086  } else {
1087  KMP_DEBUG_ASSERT(resumed_task !=
1088  NULL); // verify that resumed task is passed as argument
1089  }
1090 
1091  /* If the task's destructor thunk flag has been set, we need to invoke the
1092  destructor thunk that has been generated by the compiler. The code is
1093  placed here, since at this point other tasks might have been released
1094  hence overlapping the destructor invocations with some other work in the
1095  released tasks. The OpenMP spec is not specific on when the destructors
1096  are invoked, so we should be free to choose. */
1097  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
1098  kmp_routine_entry_t destr_thunk = task->data1.destructors;
1099  KMP_ASSERT(destr_thunk);
1100  destr_thunk(gtid, task);
1101  }
1102 
1103  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
1104  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
1105  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
1106 
1107  bool completed = true;
1108  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
1109  if (taskdata->td_allow_completion_event.type ==
1110  KMP_EVENT_ALLOW_COMPLETION) {
1111  // event hasn't been fulfilled yet. Try to detach task.
1112  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1113  if (taskdata->td_allow_completion_event.type ==
1114  KMP_EVENT_ALLOW_COMPLETION) {
1115  // task finished execution
1116  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1117  taskdata->td_flags.executing = 0; // suspend the finishing task
1118 
1119 #if OMPT_SUPPORT
1120  // For a detached task that has not yet completed, report ompt_task_detach
1121  // here; the matching omp_fulfill_event call later signals completion.
1122  // Locking is necessary to avoid a race with ompt_task_late_fulfill.
1123  if (ompt)
1124  __ompt_task_finish(task, resumed_task, ompt_task_detach);
1125 #endif
1126 
1127  // no access to taskdata after this point!
1128  // __kmp_fulfill_event might free taskdata at any time from now
1129 
1130  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
1131  completed = false;
1132  }
1133  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
1134  }
1135  }
1136 
1137  // Tasks with valid target async handles must be re-enqueued.
1138  if (taskdata->td_target_data.async_handle != NULL) {
1139  // Note: no need to translate gtid to its shadow. If the current thread is a
1140  // hidden helper one, then the gtid is already correct. Otherwise, hidden
1141  // helper threads are disabled, and gtid refers to an OpenMP thread.
1142 #if OMPT_SUPPORT
1143  if (ompt) {
1144  __ompt_task_finish(task, resumed_task, ompt_task_switch);
1145  }
1146 #endif
1147  __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
1148  if (KMP_HIDDEN_HELPER_THREAD(gtid))
1149  __kmp_hidden_helper_worker_thread_signal();
1150  completed = false;
1151  }
1152 
1153  if (completed) {
1154  taskdata->td_flags.complete = 1; // mark the task as completed
1155 #if OMPX_TASKGRAPH
1156  taskdata->td_flags.onced = 1; // mark the task as ran once already
1157 #endif
1158 
1159 #if OMPT_SUPPORT
1160  // This is not a detached task, we are done here
1161  if (ompt)
1162  __ompt_task_finish(task, resumed_task, ompt_task_complete);
1163 #endif
1164  // TODO: What would be the balance between the conditions in the function
1165  // and an atomic operation?
1166  if (__kmp_track_children_task(taskdata)) {
1167  __kmp_release_deps(gtid, taskdata);
1168  // Predecrement simulated by "- 1" calculation
1169 #if KMP_DEBUG
1170  children = -1 +
1171 #endif
1172  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
1173  KMP_DEBUG_ASSERT(children >= 0);
1174 #if OMPX_TASKGRAPH
1175  if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
1176 #else
1177  if (taskdata->td_taskgroup)
1178 #endif
1179  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1180  } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
1181  task_team->tt.tt_hidden_helper_task_encountered)) {
1182  // if we found proxy or hidden helper tasks there could exist a dependency
1183  // chain with the proxy task as origin
1184  __kmp_release_deps(gtid, taskdata);
1185  }
1186  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
1187  // called. Otherwise, if a task is executed immediately from the
1188  // release_deps code, the flag will be reset to 1 again by this same
1189  // function
1190  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
1191  taskdata->td_flags.executing = 0; // suspend the finishing task
1192 
1193  // Decrement the counter of hidden helper tasks to be executed.
1194  if (taskdata->td_flags.hidden_helper) {
1195  // Hidden helper tasks can only be executed by hidden helper threads.
1196  KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
1197  KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
1198  }
1199  }
1200 
1201  KA_TRACE(
1202  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
1203  gtid, taskdata, children));
1204 
1205  // Free this task and then ancestor tasks if they have no children.
1206  // Restore th_current_task first as suggested by John:
1207  // johnmc: if an asynchronous inquiry peers into the runtime system
1208  // it doesn't see the freed task as the current task.
1209  thread->th.th_current_task = resumed_task;
1210  if (completed)
1211  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
1212 
1213  // TODO: GEH - make sure root team implicit task is initialized properly.
1214  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
1215  resumed_task->td_flags.executing = 1; // resume previous task
1216 
1217 #if OMPX_TASKGRAPH
1218  if (is_taskgraph && __kmp_track_children_task(taskdata) &&
1219  taskdata->td_taskgroup) {
1220  // TDG: we only release taskgroup barrier here because
1221  // free_task_and_ancestors will call
1222  // __kmp_free_task, which resets all task parameters such as
1223  // taskdata->started, etc. If we release the barrier earlier, these
1224  // parameters could be read before being reset. This is not an issue for
1225  // non-TDG implementation because we never reuse a task(data) structure
1226  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
1227  }
1228 #endif
1229 
1230  KA_TRACE(
1231  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
1232  gtid, taskdata, resumed_task));
1233 
1234  return;
1235 }
1236 
1237 template <bool ompt>
1238 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
1239  kmp_int32 gtid,
1240  kmp_task_t *task) {
1241  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
1242  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1243  KMP_DEBUG_ASSERT(gtid >= 0);
1244  // this routine will provide task to resume
1245  __kmp_task_finish<ompt>(gtid, task, NULL);
1246 
1247  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
1248  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
1249 
1250 #if OMPT_SUPPORT
1251  if (ompt) {
1252  ompt_frame_t *ompt_frame;
1253  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
1254  ompt_frame->enter_frame = ompt_data_none;
1255  ompt_frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1256  }
1257 #endif
1258 
1259  return;
1260 }
1261 
1262 #if OMPT_SUPPORT
1263 OMPT_NOINLINE
1264 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1265  kmp_task_t *task) {
1266  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1267 }
1268 #endif // OMPT_SUPPORT
1269 
1270 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1271 //
1272 // loc_ref: source location information; points to end of task block.
1273 // gtid: global thread number.
1274 // task: task thunk for the completed task.
1275 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1276  kmp_task_t *task) {
1277 #if OMPT_SUPPORT
1278  if (UNLIKELY(ompt_enabled.enabled)) {
1279  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1280  return;
1281  }
1282 #endif
1283  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1284 }
1285 
1286 #ifdef TASK_UNUSED
1287 // __kmpc_omp_task_complete: report that a task has completed execution
1288 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1289 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1290  kmp_task_t *task) {
1291  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1292  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1293 
1294  __kmp_task_finish<false>(gtid, task,
1295  NULL); // Not sure how to find task to resume
1296 
1297  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1298  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1299  return;
1300 }
1301 #endif // TASK_UNUSED
1302 
1303 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1304 // task for a given thread
1305 //
1306 // loc_ref: reference to source location of parallel region
1307 // this_thr: thread data structure corresponding to implicit task
1308 // team: team for this_thr
1309 // tid: thread id of given thread within team
1310 // set_curr_task: TRUE if need to push current task to thread
1311 // NOTE: Routine does not set up the implicit task ICVs. This is assumed to
1312 // have already been done elsewhere.
1313 // TODO: Get better loc_ref. Value passed in may be NULL
1314 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1315  kmp_team_t *team, int tid, int set_curr_task) {
1316  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1317 
1318  KF_TRACE(
1319  10,
1320  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1321  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1322 
1323  task->td_task_id = KMP_GEN_TASK_ID();
1324  task->td_team = team;
1325  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1326  // in debugger)
1327  task->td_ident = loc_ref;
1328  task->td_taskwait_ident = NULL;
1329  task->td_taskwait_counter = 0;
1330  task->td_taskwait_thread = 0;
1331 
1332  task->td_flags.tiedness = TASK_TIED;
1333  task->td_flags.tasktype = TASK_IMPLICIT;
1334  task->td_flags.proxy = TASK_FULL;
1335 
1336  // All implicit tasks are executed immediately, not deferred
1337  task->td_flags.task_serial = 1;
1338  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1339  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1340 
1341  task->td_flags.started = 1;
1342  task->td_flags.executing = 1;
1343  task->td_flags.complete = 0;
1344  task->td_flags.freed = 0;
1345 #if OMPX_TASKGRAPH
1346  task->td_flags.onced = 0;
1347 #endif
1348 
1349  task->td_depnode = NULL;
1350  task->td_last_tied = task;
1351  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1352 
1353  if (set_curr_task) { // only do this init first time thread is created
1354  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1355  // Not used: don't need to deallocate implicit task
1356  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1357  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1358  task->td_dephash = NULL;
1359  __kmp_push_current_task_to_thread(this_thr, team, tid);
1360  } else {
1361  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1362  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1363  }
1364 
1365 #if OMPT_SUPPORT
1366  if (UNLIKELY(ompt_enabled.enabled))
1367  __ompt_task_init(task, tid);
1368 #endif
1369 
1370  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1371  team, task));
1372 }
1373 
1374 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1375 // at the end of parallel regions. Some resources are kept for reuse in the next
1376 // parallel region.
1377 //
1378 // thread: thread data structure corresponding to implicit task
1379 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1380  kmp_taskdata_t *task = thread->th.th_current_task;
1381  if (task->td_dephash) {
1382  int children;
1383  task->td_flags.complete = 1;
1384 #if OMPX_TASKGRAPH
1385  task->td_flags.onced = 1;
1386 #endif
1387  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1388  kmp_tasking_flags_t flags_old = task->td_flags;
1389  if (children == 0 && flags_old.complete == 1) {
1390  kmp_tasking_flags_t flags_new = flags_old;
1391  flags_new.complete = 0;
1392  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1393  *RCAST(kmp_int32 *, &flags_old),
1394  *RCAST(kmp_int32 *, &flags_new))) {
1395  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1396  "dephash of implicit task %p\n",
1397  thread->th.th_info.ds.ds_gtid, task));
1398  __kmp_dephash_free_entries(thread, task->td_dephash);
1399  }
1400  }
1401  }
1402 }
1403 
1404 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1405 // when these tasks are destroyed
1406 //
1407 // thread: thread data structure corresponding to implicit task
1408 void __kmp_free_implicit_task(kmp_info_t *thread) {
1409  kmp_taskdata_t *task = thread->th.th_current_task;
1410  if (task && task->td_dephash) {
1411  __kmp_dephash_free(thread, task->td_dephash);
1412  task->td_dephash = NULL;
1413  }
1414 }
1415 
1416 // Round up a size to a power of two specified by val: Used to insert padding
1417 // between structures co-allocated using a single malloc() call
1418 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1419  if (size & (val - 1)) {
1420  size &= ~(val - 1);
1421  if (size <= KMP_SIZE_T_MAX - val) {
1422  size += val; // Round up if there is no overflow.
1423  }
1424  }
1425  return size;
1426 } // __kmp_round_up_to_val
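// Illustrative example (editorial, not part of the original source): with
// val == 8, a size that is not already a multiple of 8 is rounded up to the
// next multiple, e.g.
//   __kmp_round_up_to_val(20, 8) == 24   // (20 & ~7) + 8
//   __kmp_round_up_to_val(24, 8) == 24   // already aligned, returned unchanged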
1427 
1428 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1429 //
1430 // loc_ref: source location information
1431 // gtid: global thread number.
1432 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1433 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1434 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1435 // private vars accessed in task.
1436 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1437 // in task.
1438 // task_entry: Pointer to task code entry point generated by compiler.
1439 // returns: a pointer to the allocated kmp_task_t structure (task).
1440 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1441  kmp_tasking_flags_t *flags,
1442  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1443  kmp_routine_entry_t task_entry) {
1444  kmp_task_t *task;
1445  kmp_taskdata_t *taskdata;
1446  kmp_info_t *thread = __kmp_threads[gtid];
1447  kmp_team_t *team = thread->th.th_team;
1448  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1449  size_t shareds_offset;
1450 
1451  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1452  __kmp_middle_initialize();
1453 
1454  if (flags->hidden_helper) {
1455  if (__kmp_enable_hidden_helper) {
1456  if (!TCR_4(__kmp_init_hidden_helper))
1457  __kmp_hidden_helper_initialize();
1458  } else {
1459  // If the hidden helper task is not enabled, reset the flag to FALSE.
1460  flags->hidden_helper = FALSE;
1461  }
1462  }
1463 
1464  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1465  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1466  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1467  sizeof_shareds, task_entry));
1468 
1469  KMP_DEBUG_ASSERT(parent_task);
1470  if (parent_task->td_flags.final) {
1471  if (flags->merged_if0) {
1472  }
1473  flags->final = 1;
1474  }
1475 
1476  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1477  // An untied task encountered causes the TSC (task stealing constraint)
1478  // algorithm to check the entire deque of the victim thread. If no untied
1479  // task is encountered, then checking the head of the deque should be enough.
1480  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1481  }
1482 
1483  // Detachable tasks are not proxy tasks yet but could become proxy tasks
1484  // in the future. Doing the tasking setup
1485  // when that happens is too late.
1486  if (UNLIKELY(flags->proxy == TASK_PROXY ||
1487  flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1488  if (flags->proxy == TASK_PROXY) {
1489  flags->tiedness = TASK_UNTIED;
1490  flags->merged_if0 = 1;
1491  }
1492  /* Are we running in a serialized parallel region or in tskm_immediate_exec
1493  mode? Either way we need tasking support enabled. */
1494  if ((thread->th.th_task_team) == NULL) {
1495  /* This should only happen if the team is serialized
1496  setup a task team and propagate it to the thread */
1497  KMP_DEBUG_ASSERT(team->t.t_serialized);
1498  KA_TRACE(30,
1499  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1500  gtid));
1501  __kmp_task_team_setup(thread, team);
1502  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1503  }
1504  kmp_task_team_t *task_team = thread->th.th_task_team;
1505 
1506  /* tasking must be enabled now as the task might not be pushed */
1507  if (!KMP_TASKING_ENABLED(task_team)) {
1508  KA_TRACE(
1509  30,
1510  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1511  __kmp_enable_tasking(task_team, thread);
1512  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1513  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1514  // No lock needed since only owner can allocate
1515  if (thread_data->td.td_deque == NULL) {
1516  __kmp_alloc_task_deque(thread, thread_data);
1517  }
1518  }
1519 
1520  if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1521  task_team->tt.tt_found_proxy_tasks == FALSE)
1522  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1523  if (flags->hidden_helper &&
1524  task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1525  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1526  }
1527 
1528  // Calculate shared structure offset including padding after kmp_task_t struct
1529  // to align pointers in shared struct
1530  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1531  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1532 
1533  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1534  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1535  shareds_offset));
1536  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1537  sizeof_shareds));
1538 
1539  // Avoid double allocation here by combining shareds with taskdata
1540 #if USE_FAST_MEMORY
1541  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1542  sizeof_shareds);
1543 #else /* ! USE_FAST_MEMORY */
1544  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1545  sizeof_shareds);
1546 #endif /* USE_FAST_MEMORY */
1547 
1548  task = KMP_TASKDATA_TO_TASK(taskdata);
1549 
1550 // Make sure task & taskdata are aligned appropriately
1551 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
1552  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1553  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1554 #else
1555  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1556  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1557 #endif
1558  if (sizeof_shareds > 0) {
1559  // Avoid double allocation here by combining shareds with taskdata
1560  task->shareds = &((char *)taskdata)[shareds_offset];
1561  // Make sure shareds struct is aligned to pointer size
1562  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1563  0);
1564  } else {
1565  task->shareds = NULL;
1566  }
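  // Layout of the single combined allocation (editorial illustration):
  //   [ kmp_taskdata_t | kmp_task_t + private data | pad | shareds ]
  //   ^ taskdata        ^ task                            ^ task->shareds
  //   The padding comes from rounding shareds_offset up to sizeof(void *).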
1567  task->routine = task_entry;
1568  task->part_id = 0; // AC: Always start with 0 part id
1569 
1570  taskdata->td_task_id = KMP_GEN_TASK_ID();
1571  taskdata->td_team = thread->th.th_team;
1572  taskdata->td_alloc_thread = thread;
1573  taskdata->td_parent = parent_task;
1574  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1575  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1576  taskdata->td_ident = loc_ref;
1577  taskdata->td_taskwait_ident = NULL;
1578  taskdata->td_taskwait_counter = 0;
1579  taskdata->td_taskwait_thread = 0;
1580  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1581  // avoid copying icvs for proxy tasks
1582  if (flags->proxy == TASK_FULL)
1583  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1584 
1585  taskdata->td_flags = *flags;
1586  taskdata->td_task_team = thread->th.th_task_team;
1587  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1588  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1589  // If it is hidden helper task, we need to set the team and task team
1590  // correspondingly.
1591  if (flags->hidden_helper) {
1592  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1593  taskdata->td_team = shadow_thread->th.th_team;
1594  taskdata->td_task_team = shadow_thread->th.th_task_team;
1595  }
1596 
1597  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1598  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1599 
1600  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1601  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1602 
1603  // GEH - Note we serialize the task if the team is serialized to make sure
1604  // implicit parallel region tasks are not left until program termination to
1605  // execute. Also, it helps locality to execute immediately.
1606 
1607  taskdata->td_flags.task_serial =
1608  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1609  taskdata->td_flags.tasking_ser || flags->merged_if0);
1610 
1611  taskdata->td_flags.started = 0;
1612  taskdata->td_flags.executing = 0;
1613  taskdata->td_flags.complete = 0;
1614  taskdata->td_flags.freed = 0;
1615 #if OMPX_TASKGRAPH
1616  taskdata->td_flags.onced = 0;
1617  taskdata->is_taskgraph = 0;
1618  taskdata->tdg = nullptr;
1619 #endif
1620  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1621  // start at one because the count includes the current task and its children
1622  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1623  taskdata->td_taskgroup =
1624  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1625  taskdata->td_dephash = NULL;
1626  taskdata->td_depnode = NULL;
1627  taskdata->td_target_data.async_handle = NULL;
1628  if (flags->tiedness == TASK_UNTIED)
1629  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1630  else
1631  taskdata->td_last_tied = taskdata;
1632  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1633 #if OMPT_SUPPORT
1634  if (UNLIKELY(ompt_enabled.enabled))
1635  __ompt_task_init(taskdata, gtid);
1636 #endif
1637  // TODO: What would be the balance between the conditions in the function and
1638  // an atomic operation?
1639  if (__kmp_track_children_task(taskdata)) {
1640  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1641  if (parent_task->td_taskgroup)
1642  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1643  // Only need to keep track of allocated child tasks for explicit tasks since
1644  // implicit tasks are not deallocated
1645  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1646  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1647  }
1648  if (flags->hidden_helper) {
1649  taskdata->td_flags.task_serial = FALSE;
1650  // Increment the number of hidden helper tasks to be executed
1651  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1652  }
1653  }
1654 
1655 #if OMPX_TASKGRAPH
1656  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1657  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1658  (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1659  taskdata->is_taskgraph = 1;
1660  taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1661  taskdata->td_task_id = KMP_GEN_TASK_ID();
1662  taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1663  }
1664 #endif
1665  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1666  gtid, taskdata, taskdata->td_parent));
1667 
1668  return task;
1669 }
1670 
1671 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1672  kmp_int32 flags, size_t sizeof_kmp_task_t,
1673  size_t sizeof_shareds,
1674  kmp_routine_entry_t task_entry) {
1675  kmp_task_t *retval;
1676  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1677  __kmp_assert_valid_gtid(gtid);
1678  input_flags->native = FALSE;
1679  // __kmp_task_alloc() sets up all other runtime flags
1680  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1681  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1682  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1683  input_flags->proxy ? "proxy" : "",
1684  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1685  sizeof_shareds, task_entry));
1686 
1687  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1688  sizeof_shareds, task_entry);
1689 
1690  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1691 
1692  return retval;
1693 }
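// Editorial example (hedged sketch of compiler-generated code, not part of
// this file): for a simple "#pragma omp task", a compiler typically emits a
// task entry routine plus an alloc/submit pair using the entry points above:
//
//   kmp_int32 task_entry(kmp_int32 gtid, kmp_task_t *task) {
//     // task body; captured data is reached through task->shareds
//     return 0;
//   }
//   ...
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1 /* tied */,
//                                         sizeof_task_with_privates,
//                                         sizeof_shareds, &task_entry);
//   // fill *(void **)t->shareds with addresses of shared variables ...
//   __kmpc_omp_task(&loc, gtid, t);
//
// The flag encoding and the size arguments (sizeof_task_with_privates,
// sizeof_shareds) are compiler-specific; only the call sequence is shown here.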
1694 
1695 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1696  kmp_int32 flags,
1697  size_t sizeof_kmp_task_t,
1698  size_t sizeof_shareds,
1699  kmp_routine_entry_t task_entry,
1700  kmp_int64 device_id) {
1701  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1702  // A target task is untied, as defined in the specification
1703  input_flags.tiedness = TASK_UNTIED;
1704  input_flags.target = 1;
1705 
1706  if (__kmp_enable_hidden_helper)
1707  input_flags.hidden_helper = TRUE;
1708 
1709  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1710  sizeof_shareds, task_entry);
1711 }
1712 
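// __kmpc_omp_reg_task_with_affinity: register affinity information for a task.
// In this implementation the affinity list is not used; the call is a no-op
// and simply returns 0.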
1726 kmp_int32
1727 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1728  kmp_task_t *new_task, kmp_int32 naffins,
1729  kmp_task_affinity_info_t *affin_list) {
1730  return 0;
1731 }
1732 
1733 // __kmp_invoke_task: invoke the specified task
1734 //
1735 // gtid: global thread ID of caller
1736 // task: the task to invoke
1737 // current_task: the task to resume after task invocation
1738 #ifdef __s390x__
1739 __attribute__((target("backchain")))
1740 #endif
1741 static void
1742 __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1743  kmp_taskdata_t *current_task) {
1744  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1745  kmp_info_t *thread;
1746  int discard = 0 /* false */;
1747  KA_TRACE(
1748  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1749  gtid, taskdata, current_task));
1750  KMP_DEBUG_ASSERT(task);
1751  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1752  taskdata->td_flags.complete == 1)) {
1753  // This is a proxy task that was already completed but it needs to run
1754  // its bottom-half finish
1755  KA_TRACE(
1756  30,
1757  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1758  gtid, taskdata));
1759 
1760  __kmp_bottom_half_finish_proxy(gtid, task);
1761 
1762  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1763  "proxy task %p, resuming task %p\n",
1764  gtid, taskdata, current_task));
1765 
1766  return;
1767  }
1768 
1769 #if OMPT_SUPPORT
1770  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1771  // does not execute code.
1772  ompt_thread_info_t oldInfo;
1773  if (UNLIKELY(ompt_enabled.enabled)) {
1774  // Store the threads states and restore them after the task
1775  thread = __kmp_threads[gtid];
1776  oldInfo = thread->th.ompt_thread_info;
1777  thread->th.ompt_thread_info.wait_id = 0;
1778  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1779  ? ompt_state_work_serial
1780  : ompt_state_work_parallel;
1781  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1782  }
1783 #endif
1784 
1785  // Proxy tasks are not handled by the runtime
1786  if (taskdata->td_flags.proxy != TASK_PROXY) {
1787  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1788  }
1789 
1790  // TODO: cancel tasks if the parallel region has also been cancelled
1791  // TODO: check if this sequence can be hoisted above __kmp_task_start
1792  // if cancellation has been enabled for this run ...
1793  if (UNLIKELY(__kmp_omp_cancellation)) {
1794  thread = __kmp_threads[gtid];
1795  kmp_team_t *this_team = thread->th.th_team;
1796  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1797  if ((taskgroup && taskgroup->cancel_request) ||
1798  (this_team->t.t_cancel_request == cancel_parallel)) {
1799 #if OMPT_SUPPORT && OMPT_OPTIONAL
1800  ompt_data_t *task_data;
1801  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1802  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1803  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1804  task_data,
1805  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1806  : ompt_cancel_parallel) |
1807  ompt_cancel_discarded_task,
1808  NULL);
1809  }
1810 #endif
1811  KMP_COUNT_BLOCK(TASK_cancelled);
1812  // this task belongs to a task group and we need to cancel it
1813  discard = 1 /* true */;
1814  }
1815  }
1816 
1817  // Invoke the task routine and pass in relevant data.
1818  // Thunks generated by gcc take a different argument list.
1819  if (!discard) {
1820  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1821  taskdata->td_last_tied = current_task->td_last_tied;
1822  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1823  }
1824 #if KMP_STATS_ENABLED
1825  KMP_COUNT_BLOCK(TASK_executed);
1826  switch (KMP_GET_THREAD_STATE()) {
1827  case FORK_JOIN_BARRIER:
1828  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1829  break;
1830  case PLAIN_BARRIER:
1831  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1832  break;
1833  case TASKYIELD:
1834  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1835  break;
1836  case TASKWAIT:
1837  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1838  break;
1839  case TASKGROUP:
1840  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1841  break;
1842  default:
1843  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1844  break;
1845  }
1846 #endif // KMP_STATS_ENABLED
1847 
1848 // OMPT task begin
1849 #if OMPT_SUPPORT
1850  if (UNLIKELY(ompt_enabled.enabled))
1851  __ompt_task_start(task, current_task, gtid);
1852 #endif
1853 #if OMPT_SUPPORT && OMPT_OPTIONAL
1854  if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1855  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1856  ompt_data_t instance = ompt_data_none;
1857  instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1858  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1859  ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1860  &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1861  ompt_dispatch_taskloop_chunk, instance);
1862  taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1863  }
1864 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1865 
1866 #if OMPD_SUPPORT
1867  if (ompd_state & OMPD_ENABLE_BP)
1868  ompd_bp_task_begin();
1869 #endif
1870 
1871 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1872  kmp_uint64 cur_time;
1873  kmp_int32 kmp_itt_count_task =
1874  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1875  current_task->td_flags.tasktype == TASK_IMPLICIT;
1876  if (kmp_itt_count_task) {
1877  thread = __kmp_threads[gtid];
1878  // Time outer level explicit task on barrier for adjusting imbalance time
1879  if (thread->th.th_bar_arrive_time)
1880  cur_time = __itt_get_timestamp();
1881  else
1882  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1883  }
1884  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1885 #endif
1886 
1887 #if ENABLE_LIBOMPTARGET
1888  if (taskdata->td_target_data.async_handle != NULL) {
1889  // If we have a valid target async handle, that means that we have already
1890  // executed the task routine once. We must query for the handle completion
1891  // instead of re-executing the routine.
1892  KMP_ASSERT(tgt_target_nowait_query);
1893  tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1894  } else
1895 #endif
1896  if (task->routine != NULL) {
1897 #ifdef KMP_GOMP_COMPAT
1898  if (taskdata->td_flags.native) {
1899  ((void (*)(void *))(*(task->routine)))(task->shareds);
1900  } else
1901 #endif /* KMP_GOMP_COMPAT */
1902  {
1903  (*(task->routine))(gtid, task);
1904  }
1905  }
1906  KMP_POP_PARTITIONED_TIMER();
1907 
1908 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1909  if (kmp_itt_count_task) {
1910  // Barrier imbalance - adjust arrive time with the task duration
1911  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1912  }
1913  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1914  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1915 #endif
1916  }
1917 
1918 #if OMPD_SUPPORT
1919  if (ompd_state & OMPD_ENABLE_BP)
1920  ompd_bp_task_end();
1921 #endif
1922 
1923  // Proxy tasks are not handled by the runtime
1924  if (taskdata->td_flags.proxy != TASK_PROXY) {
1925 #if OMPT_SUPPORT
1926  if (UNLIKELY(ompt_enabled.enabled)) {
1927  thread->th.ompt_thread_info = oldInfo;
1928  if (taskdata->td_flags.tiedness == TASK_TIED) {
1929  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1930  }
1931  __kmp_task_finish<true>(gtid, task, current_task);
1932  } else
1933 #endif
1934  __kmp_task_finish<false>(gtid, task, current_task);
1935  }
1936 #if OMPT_SUPPORT
1937  else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1938  __ompt_task_finish(task, current_task, ompt_task_switch);
1939  }
1940 #endif
1941 
1942  KA_TRACE(
1943  30,
1944  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1945  gtid, taskdata, current_task));
1946  return;
1947 }
1948 
1949 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1950 //
1951 // loc_ref: location of original task pragma (ignored)
1952 // gtid: Global Thread ID of encountering thread
1953 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1954 // Returns:
1955 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1956 // be resumed later.
1957 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1958 // resumed later.
1959 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1960  kmp_task_t *new_task) {
1961  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1962 
1963  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1964  loc_ref, new_taskdata));
1965 
1966 #if OMPT_SUPPORT
1967  kmp_taskdata_t *parent;
1968  if (UNLIKELY(ompt_enabled.enabled)) {
1969  parent = new_taskdata->td_parent;
1970  if (ompt_enabled.ompt_callback_task_create) {
1971  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1972  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1973  &(new_taskdata->ompt_task_info.task_data),
1974  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1975  OMPT_GET_RETURN_ADDRESS(0));
1976  }
1977  }
1978 #endif
1979 
1980  /* Should we execute the new task or queue it? For now, let's just always try
1981  to queue it. If the queue fills up, then we'll execute it. */
1982 
1983  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1984  { // Execute this task immediately
1985  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1986  new_taskdata->td_flags.task_serial = 1;
1987  __kmp_invoke_task(gtid, new_task, current_task);
1988  }
1989 
1990  KA_TRACE(
1991  10,
1992  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1993  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1994  gtid, loc_ref, new_taskdata));
1995 
1996 #if OMPT_SUPPORT
1997  if (UNLIKELY(ompt_enabled.enabled)) {
1998  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1999  parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
2000  }
2001 #endif
2002  return TASK_CURRENT_NOT_QUEUED;
2003 }
2004 
2005 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
2006 //
2007 // gtid: Global Thread ID of encountering thread
2008 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
2009 // serialize_immediate: if TRUE then if the task is executed immediately its
2010 // execution will be serialized
2011 // Returns:
2012 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2013 // be resumed later.
2014 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2015 // resumed later.
2016 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
2017  bool serialize_immediate) {
2018  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2019 
2020 #if OMPX_TASKGRAPH
2021  if (new_taskdata->is_taskgraph &&
2022  __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
2023  kmp_tdg_info_t *tdg = new_taskdata->tdg;
2024  // extend the record_map if needed
2025  if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
2026  __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
2027  // map_size could have been updated by another thread if recursive
2028  // taskloop
2029  if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
2030  kmp_uint old_size = tdg->map_size;
2031  kmp_uint new_size = old_size * 2;
2032  kmp_node_info_t *old_record = tdg->record_map;
2033  kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
2034  new_size * sizeof(kmp_node_info_t));
2035 
2036  KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
2037  tdg->record_map = new_record;
2038 
2039  __kmp_free(old_record);
2040 
2041  for (kmp_int i = old_size; i < new_size; i++) {
2042  kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
2043  __kmp_successors_size * sizeof(kmp_int32));
2044  new_record[i].task = nullptr;
2045  new_record[i].successors = successorsList;
2046  new_record[i].nsuccessors = 0;
2047  new_record[i].npredecessors = 0;
2048  new_record[i].successors_size = __kmp_successors_size;
2049  KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
2050  }
2051  // update the size at the end, so that other threads do not use
2052  // old_record while map_size has already been updated
2053  tdg->map_size = new_size;
2054  }
2055  __kmp_release_bootstrap_lock(&tdg->graph_lock);
2056  }
2057  // record a task
2058  if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
2059  tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
2060  tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
2061  new_taskdata->td_parent;
2062  KMP_ATOMIC_INC(&tdg->num_tasks);
2063  }
2064  }
2065 #endif
2066 
2067  /* Should we execute the new task or queue it? For now, let's just always try
2068  to queue it. If the queue fills up, then we'll execute it. */
2069  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
2070  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
2071  { // Execute this task immediately
2072  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
2073  if (serialize_immediate)
2074  new_taskdata->td_flags.task_serial = 1;
2075  __kmp_invoke_task(gtid, new_task, current_task);
2076  } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
2077  __kmp_wpolicy_passive) {
2078  kmp_info_t *this_thr = __kmp_threads[gtid];
2079  kmp_team_t *team = this_thr->th.th_team;
2080  kmp_int32 nthreads = this_thr->th.th_team_nproc;
2081  for (int i = 0; i < nthreads; ++i) {
2082  kmp_info_t *thread = team->t.t_threads[i];
2083  if (thread == this_thr)
2084  continue;
2085  if (thread->th.th_sleep_loc != NULL) {
2086  __kmp_null_resume_wrapper(thread);
2087  break; // wake one thread at a time
2088  }
2089  }
2090  }
2091  return TASK_CURRENT_NOT_QUEUED;
2092 }
2093 
2094 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
2095 // non-thread-switchable task from the parent thread only!
2096 //
2097 // loc_ref: location of original task pragma (ignored)
2098 // gtid: Global Thread ID of encountering thread
2099 // new_task: non-thread-switchable task thunk allocated by
2100 // __kmp_omp_task_alloc()
2101 // Returns:
2102 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2103 // be resumed later.
2104 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2105 // resumed later.
2106 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
2107  kmp_task_t *new_task) {
2108  kmp_int32 res;
2109  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2110 
2111 #if KMP_DEBUG || OMPT_SUPPORT
2112  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2113 #endif
2114  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2115  new_taskdata));
2116  __kmp_assert_valid_gtid(gtid);
2117 
2118 #if OMPT_SUPPORT
2119  kmp_taskdata_t *parent = NULL;
2120  if (UNLIKELY(ompt_enabled.enabled)) {
2121  if (!new_taskdata->td_flags.started) {
2122  OMPT_STORE_RETURN_ADDRESS(gtid);
2123  parent = new_taskdata->td_parent;
2124  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
2125  parent->ompt_task_info.frame.enter_frame.ptr =
2126  OMPT_GET_FRAME_ADDRESS(0);
2127  }
2128  if (ompt_enabled.ompt_callback_task_create) {
2129  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2130  &(parent->ompt_task_info.task_data),
2131  &(parent->ompt_task_info.frame),
2132  &(new_taskdata->ompt_task_info.task_data),
2133  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
2134  OMPT_LOAD_RETURN_ADDRESS(gtid));
2135  }
2136  } else {
2137  // We are scheduling the continuation of an UNTIED task.
2138  // Scheduling back to the parent task.
2139  __ompt_task_finish(new_task,
2140  new_taskdata->ompt_task_info.scheduling_parent,
2141  ompt_task_switch);
2142  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
2143  }
2144  }
2145 #endif
2146 
2147  res = __kmp_omp_task(gtid, new_task, true);
2148 
2149  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
2150  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2151  gtid, loc_ref, new_taskdata));
2152 #if OMPT_SUPPORT
2153  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2154  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2155  }
2156 #endif
2157  return res;
2158 }
2159 
2160 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
2161 // a taskloop task with the correct OMPT return address
2162 //
2163 // loc_ref: location of original task pragma (ignored)
2164 // gtid: Global Thread ID of encountering thread
2165 // new_task: non-thread-switchable task thunk allocated by
2166 // __kmp_omp_task_alloc()
2167 // codeptr_ra: return address for OMPT callback
2168 // Returns:
2169 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
2170 // be resumed later.
2171 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
2172 // resumed later.
2173 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
2174  kmp_task_t *new_task, void *codeptr_ra) {
2175  kmp_int32 res;
2176  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
2177 
2178 #if KMP_DEBUG || OMPT_SUPPORT
2179  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
2180 #endif
2181  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
2182  new_taskdata));
2183 
2184 #if OMPT_SUPPORT
2185  kmp_taskdata_t *parent = NULL;
2186  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
2187  parent = new_taskdata->td_parent;
2188  if (!parent->ompt_task_info.frame.enter_frame.ptr)
2189  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
2190  if (ompt_enabled.ompt_callback_task_create) {
2191  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
2192  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
2193  &(new_taskdata->ompt_task_info.task_data),
2194  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
2195  }
2196  }
2197 #endif
2198 
2199  res = __kmp_omp_task(gtid, new_task, true);
2200 
2201  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
2202  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
2203  gtid, loc_ref, new_taskdata));
2204 #if OMPT_SUPPORT
2205  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
2206  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
2207  }
2208 #endif
2209  return res;
2210 }
2211 
2212 template <bool ompt>
2213 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2214  void *frame_address,
2215  void *return_address) {
2216  kmp_taskdata_t *taskdata = nullptr;
2217  kmp_info_t *thread;
2218  int thread_finished = FALSE;
2219  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2220 
2221  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2222  KMP_DEBUG_ASSERT(gtid >= 0);
2223 
2224  if (__kmp_tasking_mode != tskm_immediate_exec) {
2225  thread = __kmp_threads[gtid];
2226  taskdata = thread->th.th_current_task;
2227 
2228 #if OMPT_SUPPORT && OMPT_OPTIONAL
2229  ompt_data_t *my_task_data;
2230  ompt_data_t *my_parallel_data;
2231 
2232  if (ompt) {
2233  my_task_data = &(taskdata->ompt_task_info.task_data);
2234  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2235 
2236  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2237 
2238  if (ompt_enabled.ompt_callback_sync_region) {
2239  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2240  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2241  my_task_data, return_address);
2242  }
2243 
2244  if (ompt_enabled.ompt_callback_sync_region_wait) {
2245  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2246  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2247  my_task_data, return_address);
2248  }
2249  }
2250 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2251 
2252 // Debugger: The taskwait is active. Store the location and the thread that
2253 // encountered the taskwait.
2254 #if USE_ITT_BUILD
2255 // Note: These values are used by ITT events as well.
2256 #endif /* USE_ITT_BUILD */
2257  taskdata->td_taskwait_counter += 1;
2258  taskdata->td_taskwait_ident = loc_ref;
2259  taskdata->td_taskwait_thread = gtid + 1;
2260 
2261 #if USE_ITT_BUILD
2262  void *itt_sync_obj = NULL;
2263 #if USE_ITT_NOTIFY
2264  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2265 #endif /* USE_ITT_NOTIFY */
2266 #endif /* USE_ITT_BUILD */
2267 
2268  bool must_wait =
2269  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2270 
2271  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2272  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2273  // If hidden helper thread is encountered, we must enable wait here.
2274  must_wait =
2275  must_wait ||
2276  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2277  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2278 
2279  if (must_wait) {
2280  kmp_flag_32<false, false> flag(
2281  RCAST(std::atomic<kmp_uint32> *,
2282  &(taskdata->td_incomplete_child_tasks)),
2283  0U);
2284  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2285  flag.execute_tasks(thread, gtid, FALSE,
2286  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2287  __kmp_task_stealing_constraint);
2288  }
2289  }
2290 #if USE_ITT_BUILD
2291  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2292  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2293 #endif /* USE_ITT_BUILD */
2294 
2295  // Debugger: The taskwait is completed. Location remains, but thread is
2296  // negated.
2297  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2298 
2299 #if OMPT_SUPPORT && OMPT_OPTIONAL
2300  if (ompt) {
2301  if (ompt_enabled.ompt_callback_sync_region_wait) {
2302  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2303  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2304  my_task_data, return_address);
2305  }
2306  if (ompt_enabled.ompt_callback_sync_region) {
2307  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2308  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2309  my_task_data, return_address);
2310  }
2311  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2312  }
2313 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2314  }
2315 
2316  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2317  "returning TASK_CURRENT_NOT_QUEUED\n",
2318  gtid, taskdata));
2319 
2320  return TASK_CURRENT_NOT_QUEUED;
2321 }
2322 
2323 #if OMPT_SUPPORT && OMPT_OPTIONAL
2324 OMPT_NOINLINE
2325 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2326  void *frame_address,
2327  void *return_address) {
2328  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2329  return_address);
2330 }
2331 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2332 
2333 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2334 // complete
2335 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2336 #if OMPT_SUPPORT && OMPT_OPTIONAL
2337  if (UNLIKELY(ompt_enabled.enabled)) {
2338  OMPT_STORE_RETURN_ADDRESS(gtid);
2339  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2340  OMPT_LOAD_RETURN_ADDRESS(gtid));
2341  }
2342 #endif
2343  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2344 }
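// Editorial example (hedged sketch, not part of this file): "#pragma omp
// taskwait" in user code is lowered to a single runtime call such as
//
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
//
// which, as implemented above, may execute other tasks while waiting for all
// children of the current task to complete.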
2345 
2346 // __kmpc_omp_taskyield: switch to a different task
2347 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2348  kmp_taskdata_t *taskdata = NULL;
2349  kmp_info_t *thread;
2350  int thread_finished = FALSE;
2351 
2352  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2353  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2354 
2355  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2356  gtid, loc_ref, end_part));
2357  __kmp_assert_valid_gtid(gtid);
2358 
2359  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2360  thread = __kmp_threads[gtid];
2361  taskdata = thread->th.th_current_task;
2362 // Should we model this as a task wait or not?
2363 // Debugger: The taskwait is active. Store the location and the thread that
2364 // encountered the taskwait.
2365 #if USE_ITT_BUILD
2366 // Note: These values are used by ITT events as well.
2367 #endif /* USE_ITT_BUILD */
2368  taskdata->td_taskwait_counter += 1;
2369  taskdata->td_taskwait_ident = loc_ref;
2370  taskdata->td_taskwait_thread = gtid + 1;
2371 
2372 #if USE_ITT_BUILD
2373  void *itt_sync_obj = NULL;
2374 #if USE_ITT_NOTIFY
2375  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2376 #endif /* USE_ITT_NOTIFY */
2377 #endif /* USE_ITT_BUILD */
2378  if (!taskdata->td_flags.team_serial) {
2379  kmp_task_team_t *task_team = thread->th.th_task_team;
2380  if (task_team != NULL) {
2381  if (KMP_TASKING_ENABLED(task_team)) {
2382 #if OMPT_SUPPORT
2383  if (UNLIKELY(ompt_enabled.enabled))
2384  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2385 #endif
2386  __kmp_execute_tasks_32(
2387  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2388  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2389  __kmp_task_stealing_constraint);
2390 #if OMPT_SUPPORT
2391  if (UNLIKELY(ompt_enabled.enabled))
2392  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2393 #endif
2394  }
2395  }
2396  }
2397 #if USE_ITT_BUILD
2398  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2399 #endif /* USE_ITT_BUILD */
2400 
2401  // Debugger: The taskwait is completed. Location remains, but thread is
2402  // negated.
2403  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2404  }
2405 
2406  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2407  "returning TASK_CURRENT_NOT_QUEUED\n",
2408  gtid, taskdata));
2409 
2410  return TASK_CURRENT_NOT_QUEUED;
2411 }
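// Editorial example (hedged sketch, not part of this file): "#pragma omp
// taskyield" is lowered to a call such as
//
//   __kmpc_omp_taskyield(&loc, gtid, /*end_part=*/0);
//
// Within this function end_part only appears in the trace output; the thread
// simply tries to execute other available tasks before resuming.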
2412 
2413 // Task Reduction implementation
2414 //
2415 // Note: the initial implementation did not take into account the possibility
2416 // of specifying omp_orig for the initializer of a UDR (user-defined reduction).
2417 // The corrected implementation takes the omp_orig object into account.
2418 // The compiler is free to use the old implementation if omp_orig is not specified.
2419 
2428 typedef struct kmp_taskred_flags {
2430  unsigned lazy_priv : 1;
2431  unsigned reserved31 : 31;
2432 } kmp_taskred_flags_t;
2433 
2437 typedef struct kmp_task_red_input {
2438  void *reduce_shar;
2439  size_t reduce_size;
2440  // three compiler-generated routines (init, fini are optional):
2441  void *reduce_init;
2442  void *reduce_fini;
2443  void *reduce_comb;
2444  kmp_taskred_flags_t flags;
2445 } kmp_task_red_input_t;
2446 
2450 typedef struct kmp_taskred_data {
2451  void *reduce_shar;
2452  size_t reduce_size;
2453  kmp_taskred_flags_t flags;
2454  void *reduce_priv;
2455  void *reduce_pend;
2456  // three compiler-generated routines (init, fini are optional):
2457  void *reduce_comb;
2458  void *reduce_init;
2459  void *reduce_fini;
2460  void *reduce_orig;
2461 } kmp_taskred_data_t;
2462 
2468 typedef struct kmp_taskred_input {
2469  void *reduce_shar;
2470  void *reduce_orig;
2471  size_t reduce_size;
2472  // three compiler-generated routines (init, fini are optional):
2473  void *reduce_init;
2474  void *reduce_fini;
2475  void *reduce_comb;
2476  kmp_taskred_flags_t flags;
2477 } kmp_taskred_input_t;
2478 
2482 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2483 template <>
2484 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2485  kmp_task_red_input_t &src) {
2486  item.reduce_orig = NULL;
2487 }
2488 template <>
2489 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2490  kmp_taskred_input_t &src) {
2491  if (src.reduce_orig != NULL) {
2492  item.reduce_orig = src.reduce_orig;
2493  } else {
2494  item.reduce_orig = src.reduce_shar;
2495  } // non-NULL reduce_orig means new interface used
2496 }
2497 
2498 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2499 template <>
2500 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2501  size_t offset) {
2502  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2503 }
2504 template <>
2505 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2506  size_t offset) {
2507  ((void (*)(void *, void *))item.reduce_init)(
2508  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2509 }
2510 
2511 template <typename T>
2512 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2513  __kmp_assert_valid_gtid(gtid);
2514  kmp_info_t *thread = __kmp_threads[gtid];
2515  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2516  kmp_uint32 nth = thread->th.th_team_nproc;
2517  kmp_taskred_data_t *arr;
2518 
2519  // check input data just in case
2520  KMP_ASSERT(tg != NULL);
2521  KMP_ASSERT(data != NULL);
2522  KMP_ASSERT(num > 0);
2523  if (nth == 1 && !__kmp_enable_hidden_helper) {
2524  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2525  gtid, tg));
2526  return (void *)tg;
2527  }
2528  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2529  gtid, tg, num));
2530  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2531  thread, num * sizeof(kmp_taskred_data_t));
2532  for (int i = 0; i < num; ++i) {
2533  size_t size = data[i].reduce_size - 1;
2534  // round the size up to cache line per thread-specific item
2535  size += CACHE_LINE - size % CACHE_LINE;
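  // e.g. with CACHE_LINE == 64 (illustrative value): reduce_size 6 -> 64,
  // 64 -> 64, 65 -> 128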
2536  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2537  arr[i].reduce_shar = data[i].reduce_shar;
2538  arr[i].reduce_size = size;
2539  arr[i].flags = data[i].flags;
2540  arr[i].reduce_comb = data[i].reduce_comb;
2541  arr[i].reduce_init = data[i].reduce_init;
2542  arr[i].reduce_fini = data[i].reduce_fini;
2543  __kmp_assign_orig<T>(arr[i], data[i]);
2544  if (!arr[i].flags.lazy_priv) {
2545  // allocate cache-line aligned block and fill it with zeros
2546  arr[i].reduce_priv = __kmp_allocate(nth * size);
2547  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2548  if (arr[i].reduce_init != NULL) {
2549  // initialize all thread-specific items
2550  for (size_t j = 0; j < nth; ++j) {
2551  __kmp_call_init<T>(arr[i], j * size);
2552  }
2553  }
2554  } else {
2555  // only allocate space for pointers now,
2556  // objects will be lazily allocated/initialized if/when requested
2557  // note that __kmp_allocate zeroes the allocated memory
2558  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2559  }
2560  }
2561  tg->reduce_data = (void *)arr;
2562  tg->reduce_num_data = num;
2563  return (void *)tg;
2564 }
2565 
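// __kmpc_task_reduction_init: initialize task reduction for the enclosing
// taskgroup (old interface); data points to an array of num
// kmp_task_red_input_t items. Returns the taskgroup as an opaque handle.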
2580 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2581 #if OMPX_TASKGRAPH
2582  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2583  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2584  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2585  this_tdg->rec_taskred_data =
2586  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2587  this_tdg->rec_num_taskred = num;
2588  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2589  sizeof(kmp_task_red_input_t) * num);
2590  }
2591 #endif
2592  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2593 }
2594 
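// __kmpc_taskred_init: initialize task reduction for the enclosing taskgroup
// (new interface); data points to an array of num kmp_taskred_input_t items,
// which may carry the omp_orig object. Returns the taskgroup as an opaque
// handle.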
2607 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2608 #if OMPX_TASKGRAPH
2609  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2610  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2611  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2612  this_tdg->rec_taskred_data =
2613  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2614  this_tdg->rec_num_taskred = num;
2615  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2616  sizeof(kmp_task_red_input_t) * num);
2617  }
2618 #endif
2619  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2620 }
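// Editorial example (hedged sketch of compiler-generated code, not part of
// this file): for "#pragma omp taskgroup task_reduction(+: x)" on an int x,
// the compiler conceptually opens the taskgroup, fills one input item and
// calls the entry point above. The names red_init/red_comb are illustrative.
//
//   void red_init(void *priv, void *orig) { *(int *)priv = 0; }
//   void red_comb(void *shar, void *priv) { *(int *)shar += *(int *)priv; }
//   ...
//   __kmpc_taskgroup(&loc, gtid);
//   kmp_taskred_input_t in = {};
//   in.reduce_shar = &x;
//   in.reduce_orig = &x;
//   in.reduce_size = sizeof(int);
//   in.reduce_init = (void *)red_init;
//   in.reduce_comb = (void *)red_comb;   // reduce_fini stays NULL (optional)
//   void *tg = __kmpc_taskred_init(gtid, 1, &in);
//   ...                                  // tasks participating in the reduction
//   __kmpc_end_taskgroup(&loc, gtid);    // combines private copies into x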
2621 
2622 // Copy task reduction data (except for shared pointers).
2623 template <typename T>
2624 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2625  kmp_taskgroup_t *tg, void *reduce_data) {
2626  kmp_taskred_data_t *arr;
2627  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2628  " from data %p\n",
2629  thr, tg, reduce_data));
2630  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2631  thr, num * sizeof(kmp_taskred_data_t));
2632  // threads will share private copies, thunk routines, sizes, flags, etc.:
2633  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2634  for (int i = 0; i < num; ++i) {
2635  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2636  }
2637  tg->reduce_data = (void *)arr;
2638  tg->reduce_num_data = num;
2639 }
2640 
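// __kmpc_task_reduction_get_th_data: return the calling thread's private copy
// of the reduction item identified by data (either the shared address or an
// already known thread-specific address). The copy is lazily allocated and
// initialized when the lazy_priv flag is set. tskgrp may be NULL, in which
// case the current taskgroup is used.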
2650 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2651  __kmp_assert_valid_gtid(gtid);
2652  kmp_info_t *thread = __kmp_threads[gtid];
2653  kmp_int32 nth = thread->th.th_team_nproc;
2654  if (nth == 1)
2655  return data; // nothing to do
2656 
2657  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2658  if (tg == NULL)
2659  tg = thread->th.th_current_task->td_taskgroup;
2660  KMP_ASSERT(tg != NULL);
2661  kmp_taskred_data_t *arr;
2662  kmp_int32 num;
2663  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2664 
2665 #if OMPX_TASKGRAPH
2666  if ((thread->th.th_current_task->is_taskgraph) &&
2667  (!__kmp_tdg_is_recording(
2668  __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2669  tg = thread->th.th_current_task->td_taskgroup;
2670  KMP_ASSERT(tg != NULL);
2671  KMP_ASSERT(tg->reduce_data != NULL);
2672  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2673  num = tg->reduce_num_data;
2674  }
2675 #endif
2676 
2677  KMP_ASSERT(data != NULL);
2678  while (tg != NULL) {
2679  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2680  num = tg->reduce_num_data;
2681  for (int i = 0; i < num; ++i) {
2682  if (!arr[i].flags.lazy_priv) {
2683  if (data == arr[i].reduce_shar ||
2684  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2685  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2686  } else {
2687  // check shared location first
2688  void **p_priv = (void **)(arr[i].reduce_priv);
2689  if (data == arr[i].reduce_shar)
2690  goto found;
2691  // check if we got some thread-specific location as the parameter
2692  for (int j = 0; j < nth; ++j)
2693  if (data == p_priv[j])
2694  goto found;
2695  continue; // not found, continue search
2696  found:
2697  if (p_priv[tid] == NULL) {
2698  // allocate thread specific object lazily
2699  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2700  if (arr[i].reduce_init != NULL) {
2701  if (arr[i].reduce_orig != NULL) { // new interface
2702  ((void (*)(void *, void *))arr[i].reduce_init)(
2703  p_priv[tid], arr[i].reduce_orig);
2704  } else { // old interface (single parameter)
2705  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2706  }
2707  }
2708  }
2709  return p_priv[tid];
2710  }
2711  }
2712  KMP_ASSERT(tg->parent);
2713  tg = tg->parent;
2714  }
2715  KMP_ASSERT2(0, "Unknown task reduction item");
2716  return NULL; // ERROR, this line never executed
2717 }
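// Editorial example (hedged sketch, not part of this file): inside a task that
// participates in the reduction, compiler-generated code fetches the private
// copy before updating it, e.g. for the int reduction variable x used above:
//
//   int *x_priv = (int *)__kmpc_task_reduction_get_th_data(gtid, NULL, &x);
//   *x_priv += 1;   // the actual update comes from the user's task body
//
// Passing NULL for tskgrp makes the lookup start at the current taskgroup and
// walk up through parent taskgroups until the item is found.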
2718 
2719 // Finalize task reduction.
2720 // Called from __kmpc_end_taskgroup()
2721 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2722  kmp_int32 nth = th->th.th_team_nproc;
2723  KMP_DEBUG_ASSERT(
2724  nth > 1 ||
2725  __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2726  // are using hidden helper threads
2727  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2728  kmp_int32 num = tg->reduce_num_data;
2729  for (int i = 0; i < num; ++i) {
2730  void *sh_data = arr[i].reduce_shar;
2731  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2732  void (*f_comb)(void *, void *) =
2733  (void (*)(void *, void *))(arr[i].reduce_comb);
2734  if (!arr[i].flags.lazy_priv) {
2735  void *pr_data = arr[i].reduce_priv;
2736  size_t size = arr[i].reduce_size;
2737  for (int j = 0; j < nth; ++j) {
2738  void *priv_data = (char *)pr_data + j * size;
2739  f_comb(sh_data, priv_data); // combine results
2740  if (f_fini)
2741  f_fini(priv_data); // finalize if needed
2742  }
2743  } else {
2744  void **pr_data = (void **)(arr[i].reduce_priv);
2745  for (int j = 0; j < nth; ++j) {
2746  if (pr_data[j] != NULL) {
2747  f_comb(sh_data, pr_data[j]); // combine results
2748  if (f_fini)
2749  f_fini(pr_data[j]); // finalize if needed
2750  __kmp_free(pr_data[j]);
2751  }
2752  }
2753  }
2754  __kmp_free(arr[i].reduce_priv);
2755  }
2756  __kmp_thread_free(th, arr);
2757  tg->reduce_data = NULL;
2758  tg->reduce_num_data = 0;
2759 }
2760 
2761 // Cleanup task reduction data for parallel or worksharing,
2762 // do not touch task-private data that other threads are still working with.
2763 // Called from __kmpc_end_taskgroup()
2764 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2765  __kmp_thread_free(th, tg->reduce_data);
2766  tg->reduce_data = NULL;
2767  tg->reduce_num_data = 0;
2768 }
2769 
2770 template <typename T>
2771 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2772  int num, T *data) {
2773  __kmp_assert_valid_gtid(gtid);
2774  kmp_info_t *thr = __kmp_threads[gtid];
2775  kmp_int32 nth = thr->th.th_team_nproc;
2776  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2777  if (nth == 1) {
2778  KA_TRACE(10,
2779  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2780  gtid, thr->th.th_current_task->td_taskgroup));
2781  return (void *)thr->th.th_current_task->td_taskgroup;
2782  }
2783  kmp_team_t *team = thr->th.th_team;
2784  void *reduce_data;
2785  kmp_taskgroup_t *tg;
2786  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2787  if (reduce_data == NULL &&
2788  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2789  (void *)1)) {
2790  // single thread enters this block to initialize common reduction data
2791  KMP_DEBUG_ASSERT(reduce_data == NULL);
2792  // first initialize own data, then make a copy other threads can use
2793  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2794  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2795  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2796  // fini counters should be 0 at this point
2797  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2798  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2799  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2800  } else {
2801  while (
2802  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2803  (void *)1) { // wait for task reduction initialization
2804  KMP_CPU_PAUSE();
2805  }
2806  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2807  tg = thr->th.th_current_task->td_taskgroup;
2808  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2809  }
2810  return tg;
2811 }
2812 
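// __kmpc_task_reduction_modifier_init: initialize task reduction for a
// parallel or worksharing construct carrying the task reduction modifier
// (old input format, kmp_task_red_input_t); is_ws selects the worksharing (1)
// versus parallel (0) slot of the team's reduction data.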
2829 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2830  int num, void *data) {
2831  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2832  (kmp_task_red_input_t *)data);
2833 }
2834 
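// __kmpc_taskred_modifier_init: same as above for the new input format
// (kmp_taskred_input_t), which may carry the omp_orig object.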
2849 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2850  void *data) {
2851  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2852  (kmp_taskred_input_t *)data);
2853 }
2854 
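// __kmpc_task_reduction_modifier_fini: finish task reduction for the modifier
// by ending the taskgroup opened by the corresponding *_modifier_init call.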
2863 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2864  __kmpc_end_taskgroup(loc, gtid);
2865 }
2866 
2867 // __kmpc_taskgroup: Start a new taskgroup
2868 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2869  __kmp_assert_valid_gtid(gtid);
2870  kmp_info_t *thread = __kmp_threads[gtid];
2871  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2872  kmp_taskgroup_t *tg_new =
2873  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2874  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2875  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2876  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2877  tg_new->parent = taskdata->td_taskgroup;
2878  tg_new->reduce_data = NULL;
2879  tg_new->reduce_num_data = 0;
2880  tg_new->gomp_data = NULL;
2881  taskdata->td_taskgroup = tg_new;
2882 
2883 #if OMPT_SUPPORT && OMPT_OPTIONAL
2884  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2885  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2886  if (!codeptr)
2887  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2888  kmp_team_t *team = thread->th.th_team;
2889  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2890  // FIXME: I think this is wrong for lwt!
2891  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2892 
2893  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2894  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2895  &(my_task_data), codeptr);
2896  }
2897 #endif
2898 }
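// Editorial example (hedged sketch, not part of this file): "#pragma omp
// taskgroup" brackets the structured block with two runtime calls:
//
//   __kmpc_taskgroup(&loc, gtid);
//   ...                                 // tasks created here join the new taskgroup
//   __kmpc_end_taskgroup(&loc, gtid);   // waits for them and their descendants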
2899 
2900 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2901 // and its descendants are complete
2902 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2903  __kmp_assert_valid_gtid(gtid);
2904  kmp_info_t *thread = __kmp_threads[gtid];
2905  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2906  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2907  int thread_finished = FALSE;
2908 
2909 #if OMPT_SUPPORT && OMPT_OPTIONAL
2910  kmp_team_t *team;
2911  ompt_data_t my_task_data;
2912  ompt_data_t my_parallel_data;
2913  void *codeptr = nullptr;
2914  if (UNLIKELY(ompt_enabled.enabled)) {
2915  team = thread->th.th_team;
2916  my_task_data = taskdata->ompt_task_info.task_data;
2917  // FIXME: I think this is wrong for lwt!
2918  my_parallel_data = team->t.ompt_team_info.parallel_data;
2919  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2920  if (!codeptr)
2921  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2922  }
2923 #endif
2924 
2925  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2926  KMP_DEBUG_ASSERT(taskgroup != NULL);
2927  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2928 
2929  if (__kmp_tasking_mode != tskm_immediate_exec) {
2930  // mark task as waiting not on a barrier
2931  taskdata->td_taskwait_counter += 1;
2932  taskdata->td_taskwait_ident = loc;
2933  taskdata->td_taskwait_thread = gtid + 1;
2934 #if USE_ITT_BUILD
2935  // For ITT the taskgroup wait is similar to taskwait until we need to
2936  // distinguish them
2937  void *itt_sync_obj = NULL;
2938 #if USE_ITT_NOTIFY
2939  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2940 #endif /* USE_ITT_NOTIFY */
2941 #endif /* USE_ITT_BUILD */
2942 
2943 #if OMPT_SUPPORT && OMPT_OPTIONAL
2944  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2945  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2946  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2947  &(my_task_data), codeptr);
2948  }
2949 #endif
2950 
2951  if (!taskdata->td_flags.team_serial ||
2952  (thread->th.th_task_team != NULL &&
2953  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2954  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2955  kmp_flag_32<false, false> flag(
2956  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2957  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2958  flag.execute_tasks(thread, gtid, FALSE,
2959  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2960  __kmp_task_stealing_constraint);
2961  }
2962  }
2963  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2964 
2965 #if OMPT_SUPPORT && OMPT_OPTIONAL
2966  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2967  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2968  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2969  &(my_task_data), codeptr);
2970  }
2971 #endif
2972 
2973 #if USE_ITT_BUILD
2974  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2975  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2976 #endif /* USE_ITT_BUILD */
2977  }
2978  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2979 
2980  if (taskgroup->reduce_data != NULL &&
2981  !taskgroup->gomp_data) { // need to reduce?
2982  int cnt;
2983  void *reduce_data;
2984  kmp_team_t *t = thread->th.th_team;
2985  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2986  // check if <priv> data of the first reduction variable is shared for the team
2987  void *priv0 = arr[0].reduce_priv;
2988  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2989  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2990  // finishing task reduction on parallel
2991  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2992  if (cnt == thread->th.th_team_nproc - 1) {
2993  // we are the last thread passing __kmpc_reduction_modifier_fini()
2994  // finalize task reduction:
2995  __kmp_task_reduction_fini(thread, taskgroup);
2996  // cleanup fields in the team structure:
2997  // TODO: is relaxed store enough here (whole barrier should follow)?
2998  __kmp_thread_free(thread, reduce_data);
2999  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
3000  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
3001  } else {
3002  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3003  // so do not finalize reduction, just clean own copy of the data
3004  __kmp_task_reduction_clean(thread, taskgroup);
3005  }
3006  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
3007  NULL &&
3008  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
3009  // finishing task reduction on worksharing
3010  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
3011  if (cnt == thread->th.th_team_nproc - 1) {
3012  // we are the last thread passing __kmpc_reduction_modifier_fini()
3013  __kmp_task_reduction_fini(thread, taskgroup);
3014  // cleanup fields in team structure:
3015  // TODO: is relaxed store enough here (whole barrier should follow)?
3016  __kmp_thread_free(thread, reduce_data);
3017  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
3018  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
3019  } else {
3020  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
3021  // so do not finalize reduction, just clean own copy of the data
3022  __kmp_task_reduction_clean(thread, taskgroup);
3023  }
3024  } else {
3025  // finishing task reduction on taskgroup
3026  __kmp_task_reduction_fini(thread, taskgroup);
3027  }
3028  }
3029  // Restore parent taskgroup for the current task
3030  taskdata->td_taskgroup = taskgroup->parent;
3031  __kmp_thread_free(thread, taskgroup);
3032 
3033  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
3034  gtid, taskdata));
3035 
3036 #if OMPT_SUPPORT && OMPT_OPTIONAL
3037  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
3038  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
3039  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
3040  &(my_task_data), codeptr);
3041  }
3042 #endif
3043 }
3044 
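// Illustrative sketch (not part of the runtime): a user-level taskgroup such
// as
//
//   #pragma omp taskgroup
//   {
//     #pragma omp task
//     work();
//   }
//
// is lowered by the compiler into paired calls around the region body,
// roughly:
//
//   __kmpc_taskgroup(loc, gtid);      // push a new kmp_taskgroup_t
//   ...create and run tasks...        // tasks update taskgroup->count
//   __kmpc_end_taskgroup(loc, gtid);  // wait for count == 0, pop taskgroup
//
// The exact code a given compiler emits may differ; this only shows where the
// two entry points above fit.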
3045 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
3046  kmp_task_team_t *task_team,
3047  kmp_int32 is_constrained) {
3048  kmp_task_t *task = NULL;
3049  kmp_taskdata_t *taskdata;
3050  kmp_taskdata_t *current;
3051  kmp_thread_data_t *thread_data;
3052  int ntasks = task_team->tt.tt_num_task_pri;
3053  if (ntasks == 0) {
3054  KA_TRACE(
3055  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
3056  return NULL;
3057  }
3058  do {
3059  // decrement num_tasks to "reserve" one task for execution
3060  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
3061  ntasks - 1))
3062  break;
3063  ntasks = task_team->tt.tt_num_task_pri;
3064  } while (ntasks > 0);
3065  if (ntasks == 0) {
3066  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
3067  __kmp_get_gtid()));
3068  return NULL;
3069  }
3070  // We got a "ticket" to get a "reserved" priority task
3071  int deque_ntasks;
3072  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3073  do {
3074  KMP_ASSERT(list != NULL);
3075  thread_data = &list->td;
3076  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3077  deque_ntasks = thread_data->td.td_deque_ntasks;
3078  if (deque_ntasks == 0) {
3079  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3080  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
3081  __kmp_get_gtid(), thread_data));
3082  list = list->next;
3083  }
3084  } while (deque_ntasks == 0);
3085  KMP_DEBUG_ASSERT(deque_ntasks);
3086  int target = thread_data->td.td_deque_head;
3087  current = __kmp_threads[gtid]->th.th_current_task;
3088  taskdata = thread_data->td.td_deque[target];
3089  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3090  // Bump head pointer and Wrap.
3091  thread_data->td.td_deque_head =
3092  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3093  } else {
3094  if (!task_team->tt.tt_untied_task_encountered) {
3095  // The TSC does not allow stealing the victim task
3096  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3097  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
3098  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
3099  gtid, thread_data, task_team, deque_ntasks, target,
3100  thread_data->td.td_deque_tail));
3101  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3102  return NULL;
3103  }
3104  int i;
3105  // walk through the deque trying to steal any task
3106  taskdata = NULL;
3107  for (i = 1; i < deque_ntasks; ++i) {
3108  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3109  taskdata = thread_data->td.td_deque[target];
3110  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3111  break; // found task to execute
3112  } else {
3113  taskdata = NULL;
3114  }
3115  }
3116  if (taskdata == NULL) {
3117  // No appropriate candidate found to execute
3118  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3119  KA_TRACE(
3120  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
3121  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
3122  gtid, thread_data, task_team, deque_ntasks,
3123  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3124  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
3125  return NULL;
3126  }
3127  int prev = target;
3128  for (i = i + 1; i < deque_ntasks; ++i) {
3129  // shift remaining tasks in the deque left by 1
3130  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
3131  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
3132  prev = target;
3133  }
3134  KMP_DEBUG_ASSERT(
3135  thread_data->td.td_deque_tail ==
3136  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
3137  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
3138  }
3139  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
3140  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3141  task = KMP_TASKDATA_TO_TASK(taskdata);
3142  return task;
3143 }
3144 
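// The do/while at the top of __kmp_get_priority_task is a compare-and-swap
// "ticket" loop: one unit of tt_num_task_pri is reserved before any deque is
// touched, so concurrent callers cannot over-claim tasks. A minimal,
// standalone sketch of the same pattern (std::atomic; the names num_tasks and
// try_reserve_one are illustrative, not runtime API):
//
//   #include <atomic>
//
//   static std::atomic<int> num_tasks{0};
//
//   static bool try_reserve_one() { // true if one task was reserved
//     int n = num_tasks.load(std::memory_order_relaxed);
//     while (n > 0) {
//       if (num_tasks.compare_exchange_weak(n, n - 1))
//         return true; // reservation succeeded
//       // on failure n now holds the current value; retry while positive
//     }
//     return false; // nothing left to reserve
//   }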
3145 // __kmp_remove_my_task: remove a task from my own deque
3146 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
3147  kmp_task_team_t *task_team,
3148  kmp_int32 is_constrained) {
3149  kmp_task_t *task;
3150  kmp_taskdata_t *taskdata;
3151  kmp_thread_data_t *thread_data;
3152  kmp_uint32 tail;
3153 
3154  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3155  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
3156  NULL); // Caller should check this condition
3157 
3158  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
3159 
3160  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
3161  gtid, thread_data->td.td_deque_ntasks,
3162  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3163 
3164  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3165  KA_TRACE(10,
3166  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
3167  "ntasks=%d head=%u tail=%u\n",
3168  gtid, thread_data->td.td_deque_ntasks,
3169  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3170  return NULL;
3171  }
3172 
3173  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3174 
3175  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
3176  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3177  KA_TRACE(10,
3178  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
3179  "ntasks=%d head=%u tail=%u\n",
3180  gtid, thread_data->td.td_deque_ntasks,
3181  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3182  return NULL;
3183  }
3184 
3185  tail = (thread_data->td.td_deque_tail - 1) &
3186  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
3187  taskdata = thread_data->td.td_deque[tail];
3188 
3189  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
3190  thread->th.th_current_task)) {
3191  // The TSC does not allow taking the tail task
3192  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3193  KA_TRACE(10,
3194  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
3195  "ntasks=%d head=%u tail=%u\n",
3196  gtid, thread_data->td.td_deque_ntasks,
3197  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3198  return NULL;
3199  }
3200 
3201  thread_data->td.td_deque_tail = tail;
3202  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
3203 
3204  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3205 
3206  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3207  "ntasks=%d head=%u tail=%u\n",
3208  gtid, taskdata, thread_data->td.td_deque_ntasks,
3209  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3210 
3211  task = KMP_TASKDATA_TO_TASK(taskdata);
3212  return task;
3213 }
3214 
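// Note on the deque discipline (illustrative only, no additional runtime
// code): the owning thread removes work from the *tail* (LIFO, see
// __kmp_remove_my_task above), while thieves take from the *head* (FIFO, see
// __kmp_steal_task below). Both ends are protected by the same td_deque_lock
// bootstrap lock, so unlike a lock-free work-stealing deque no extra memory
// ordering is required here. Roughly:
//
//   owner:  tail = (tail - 1) & mask;  task = deque[tail];   // newest task
//   thief:  task = deque[head];  head = (head + 1) & mask;   // oldest task
//
// where the & mask wraps the index inside the circular buffer.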
3215 // __kmp_steal_task: remove a task from another thread's deque
3216 // Assumes that the calling thread has already checked the existence of
3217 // task_team thread_data before calling this routine.
3218 static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
3219  kmp_task_team_t *task_team,
3220  std::atomic<kmp_int32> *unfinished_threads,
3221  int *thread_finished,
3222  kmp_int32 is_constrained) {
3223  kmp_task_t *task;
3224  kmp_taskdata_t *taskdata;
3225  kmp_taskdata_t *current;
3226  kmp_thread_data_t *victim_td, *threads_data;
3227  kmp_int32 target;
3228  kmp_info_t *victim_thr;
3229 
3230  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3231 
3232  threads_data = task_team->tt.tt_threads_data;
3233  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3234  KMP_DEBUG_ASSERT(victim_tid >= 0);
3235  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
3236 
3237  victim_td = &threads_data[victim_tid];
3238  victim_thr = victim_td->td.td_thr;
3239  (void)victim_thr; // Use in TRACE messages which aren't always enabled.
3240 
3241  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3242  "task_team=%p ntasks=%d head=%u tail=%u\n",
3243  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3244  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3245  victim_td->td.td_deque_tail));
3246 
3247  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3248  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3249  "task_team=%p ntasks=%d head=%u tail=%u\n",
3250  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3251  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3252  victim_td->td.td_deque_tail));
3253  return NULL;
3254  }
3255 
3256  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3257 
3258  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3259  // Check again after we acquire the lock
3260  if (ntasks == 0) {
3261  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3262  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3263  "task_team=%p ntasks=%d head=%u tail=%u\n",
3264  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3265  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3266  return NULL;
3267  }
3268 
3269  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3270  current = __kmp_threads[gtid]->th.th_current_task;
3271  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3272  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3273  // Bump head pointer and Wrap.
3274  victim_td->td.td_deque_head =
3275  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3276  } else {
3277  if (!task_team->tt.tt_untied_task_encountered) {
3278  // The TSC does not allow stealing the victim task
3279  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3280  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3281  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3282  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3283  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3284  return NULL;
3285  }
3286  int i;
3287  // walk through victim's deque trying to steal any task
3288  target = victim_td->td.td_deque_head;
3289  taskdata = NULL;
3290  for (i = 1; i < ntasks; ++i) {
3291  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3292  taskdata = victim_td->td.td_deque[target];
3293  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3294  break; // found victim task
3295  } else {
3296  taskdata = NULL;
3297  }
3298  }
3299  if (taskdata == NULL) {
3300  // No appropriate candidate to steal found
3301  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3302  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3303  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3304  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3305  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3306  return NULL;
3307  }
3308  int prev = target;
3309  for (i = i + 1; i < ntasks; ++i) {
3310  // shift remaining tasks in the deque left by 1
3311  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3312  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3313  prev = target;
3314  }
3315  KMP_DEBUG_ASSERT(
3316  victim_td->td.td_deque_tail ==
3317  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3318  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3319  }
3320  if (*thread_finished) {
3321  // We need to un-mark this victim as a finished victim. This must be done
3322  // before releasing the lock, or else other threads (starting with the
3323  // primary thread victim) might be prematurely released from the barrier!!!
3324 #if KMP_DEBUG
3325  kmp_int32 count =
3326 #endif
3327  KMP_ATOMIC_INC(unfinished_threads);
3328  KA_TRACE(
3329  20,
3330  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3331  gtid, count + 1, task_team));
3332  *thread_finished = FALSE;
3333  }
3334  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3335 
3336  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3337 
3338  KMP_COUNT_BLOCK(TASK_stolen);
3339  KA_TRACE(10,
3340  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3341  "task_team=%p ntasks=%d head=%u tail=%u\n",
3342  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3343  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3344 
3345  task = KMP_TASKDATA_TO_TASK(taskdata);
3346  return task;
3347 }
3348 
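// Illustrative sketch of the hole-closing step shared by
// __kmp_get_priority_task and __kmp_steal_task: when the task scheduling
// constraint forces them to take an entry from the middle of the circular
// buffer, every later entry is shifted one slot toward the head and the tail
// retreats by one. Distilled from the loops above; variable names are
// illustrative.
//
//   int prev = target;                       // slot just emptied
//   for (int i = pos + 1; i < ntasks; ++i) { // pos = ordinal of removed entry
//     target = (target + 1) & mask;
//     deque[prev] = deque[target];           // slide entry toward the head
//     prev = target;
//   }
//   tail = target;                           // tail -= 1 (wrapped)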
3349 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3350 // condition is satisfied (return true) or there are none left (return false).
3351 //
3352 // final_spin is TRUE if this is the spin at the release barrier.
3353 // thread_finished indicates whether the thread is finished executing all
3354 // the tasks it has on its deque, and is at the release barrier.
3355 // spinner is the location on which to spin.
3356 // spinner == NULL means only execute a single task and return.
3357 // checker is the value to check to terminate the spin.
3358 template <class C>
3359 static inline int __kmp_execute_tasks_template(
3360  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3361  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3362  kmp_int32 is_constrained) {
3363  kmp_task_team_t *task_team = thread->th.th_task_team;
3364  kmp_thread_data_t *threads_data;
3365  kmp_task_t *task;
3366  kmp_info_t *other_thread;
3367  kmp_taskdata_t *current_task = thread->th.th_current_task;
3368  std::atomic<kmp_int32> *unfinished_threads;
3369  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3370  tid = thread->th.th_info.ds.ds_tid;
3371 
3372  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3373  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3374 
3375  if (task_team == NULL || current_task == NULL)
3376  return FALSE;
3377 
3378  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3379  "*thread_finished=%d\n",
3380  gtid, final_spin, *thread_finished));
3381 
3382  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3383  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3384 
3385  KMP_DEBUG_ASSERT(threads_data != NULL);
3386 
3387  nthreads = task_team->tt.tt_nproc;
3388  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3389  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3390 
3391  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3392  // getting tasks from target constructs
3393  while (1) { // Inner loop to find a task and execute it
3394  task = NULL;
3395  if (task_team->tt.tt_num_task_pri) { // get priority task first
3396  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3397  }
3398  if (task == NULL && use_own_tasks) { // check own queue next
3399  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3400  }
3401  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3402  int asleep = 1;
3403  use_own_tasks = 0;
3404  // Try to steal from the last place I stole from successfully.
3405  if (victim_tid == -2) { // haven't stolen anything yet
3406  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3407  if (victim_tid !=
3408  -1) // if we have a last stolen from victim, get the thread
3409  other_thread = threads_data[victim_tid].td.td_thr;
3410  }
3411  if (victim_tid != -1) { // found last victim
3412  asleep = 0;
3413  } else if (!new_victim) { // no recent steals and we haven't already
3414  // used a new victim; select a random thread
3415  do { // Find a different thread to steal work from.
3416  // Pick a random thread. Initial plan was to cycle through all the
3417  // threads, and only return if we tried to steal from every thread,
3418  // and failed. Arch says that's not such a great idea.
3419  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3420  if (victim_tid >= tid) {
3421  ++victim_tid; // Adjusts random distribution to exclude self
3422  }
3423  // Found a potential victim
3424  other_thread = threads_data[victim_tid].td.td_thr;
3425  // There is a slight chance that __kmp_enable_tasking() did not wake
3426  // up all threads waiting at the barrier. If victim is sleeping,
3427  // then wake it up. Since we were going to pay the cache miss
3428  // penalty for referencing another thread's kmp_info_t struct
3429  // anyway,
3430  // the check shouldn't cost too much performance at this point. In
3431  // extra barrier mode, tasks do not sleep at the separate tasking
3432  // barrier, so this isn't a problem.
3433  asleep = 0;
3434  if ((__kmp_tasking_mode == tskm_task_teams) &&
3435  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3436  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3437  NULL)) {
3438  asleep = 1;
3439  __kmp_null_resume_wrapper(other_thread);
3440  // A sleeping thread should not have any tasks on its queue.
3441  // There is a slight possibility that it resumes, steals a task
3442  // from another thread, which spawns more tasks, all in the time
3443  // that it takes this thread to check => don't write an assertion
3444  // that the victim's queue is empty. Try stealing from a
3445  // different thread.
3446  }
3447  } while (asleep);
3448  }
3449 
3450  if (!asleep) {
3451  // We have a victim to try to steal from
3452  task =
3453  __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3454  thread_finished, is_constrained);
3455  }
3456  if (task != NULL) { // set last stolen to victim
3457  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3458  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3459  // The pre-refactored code did not try more than 1 successful new
3460  // victim, unless the last one generated more local tasks;
3461  // new_victim keeps track of this
3462  new_victim = 1;
3463  }
3464  } else { // No tasks found; unset last_stolen
3465  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3466  victim_tid = -2; // no successful victim found
3467  }
3468  }
3469 
3470  if (task == NULL)
3471  break; // break out of tasking loop
3472 
3473 // Found a task; execute it
3474 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3475  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3476  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3477  // get the object reliably
3478  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3479  }
3480  __kmp_itt_task_starting(itt_sync_obj);
3481  }
3482 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3483  __kmp_invoke_task(gtid, task, current_task);
3484 #if USE_ITT_BUILD
3485  if (itt_sync_obj != NULL)
3486  __kmp_itt_task_finished(itt_sync_obj);
3487 #endif /* USE_ITT_BUILD */
3488  // If this thread is only partway through the barrier and the condition is
3489  // met, then return now, so that the barrier gather/release pattern can
3490  // proceed. If this thread is in the last spin loop in the barrier,
3491  // waiting to be released, we know that the termination condition will not
3492  // be satisfied, so don't waste any cycles checking it.
3493  if (flag == NULL || (!final_spin && flag->done_check())) {
3494  KA_TRACE(
3495  15,
3496  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3497  gtid));
3498  return TRUE;
3499  }
3500  if (thread->th.th_task_team == NULL) {
3501  break;
3502  }
3503  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3504  // If execution of a stolen task results in more tasks being placed on our
3505  // run queue, reset use_own_tasks
3506  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3507  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3508  "other tasks, restart\n",
3509  gtid));
3510  use_own_tasks = 1;
3511  new_victim = 0;
3512  }
3513  }
3514 
3515  // The task source has been exhausted. If in final spin loop of barrier,
3516  // check if termination condition is satisfied. The work queue may be empty
3517  // but there might be proxy tasks still executing.
3518  if (final_spin &&
3519  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3520  // First, decrement the #unfinished threads, if that has not already been
3521  // done. This decrement might be to the spin location, and result in the
3522  // termination condition being satisfied.
3523  if (!*thread_finished) {
3524 #if KMP_DEBUG
3525  kmp_int32 count = -1 +
3526 #endif
3527  KMP_ATOMIC_DEC(unfinished_threads);
3528  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3529  "unfinished_threads to %d task_team=%p\n",
3530  gtid, count, task_team));
3531  *thread_finished = TRUE;
3532  }
3533 
3534  // It is now unsafe to reference thread->th.th_team !!!
3535  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3536  // thread to pass through the barrier, where it might reset each thread's
3537  // th.th_team field for the next parallel region. If we can steal more
3538  // work, we know that this has not happened yet.
3539  if (flag != NULL && flag->done_check()) {
3540  KA_TRACE(
3541  15,
3542  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3543  gtid));
3544  return TRUE;
3545  }
3546  }
3547 
3548  // If this thread's task team is NULL, primary thread has recognized that
3549  // there are no more tasks; bail out
3550  if (thread->th.th_task_team == NULL) {
3551  KA_TRACE(15,
3552  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3553  return FALSE;
3554  }
3555 
3556  // Check the flag again to see if it is already done, so we are not trapped
3557  // in an infinite loop when an if0 task depends on a hidden helper task
3558  // outside any parallel region. Detached tasks are not impacted in this case
3559  // because the only thread executing this function has to execute the proxy
3560  // task, so it is in another code path that has the same check.
3561  if (flag == NULL || (!final_spin && flag->done_check())) {
3562  KA_TRACE(15,
3563  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3564  gtid));
3565  return TRUE;
3566  }
3567 
3568  // We could be getting tasks from target constructs; if this is the only
3569  // thread, keep trying to execute tasks from own queue
3570  if (nthreads == 1 &&
3571  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3572  use_own_tasks = 1;
3573  else {
3574  KA_TRACE(15,
3575  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3576  return FALSE;
3577  }
3578  }
3579 }
3580 
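// Illustrative sketch of the victim selection used in the stealing loop above:
// drawing a random index over nthreads - 1 slots and shifting it past the
// caller's own tid gives a uniform choice among the *other* threads.
// Standalone sketch; pick_victim and rng are illustrative names.
//
//   // Assumes nthreads > 1 and 0 <= tid < nthreads.
//   static int pick_victim(int tid, int nthreads, unsigned (*rng)(void)) {
//     int victim = (int)(rng() % (unsigned)(nthreads - 1));
//     if (victim >= tid)
//       ++victim; // skip self while keeping the distribution uniform
//     return victim;
//   }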
3581 template <bool C, bool S>
3582 int __kmp_execute_tasks_32(
3583  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3584  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3585  kmp_int32 is_constrained) {
3586  return __kmp_execute_tasks_template(
3587  thread, gtid, flag, final_spin,
3588  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3589 }
3590 
3591 template <bool C, bool S>
3592 int __kmp_execute_tasks_64(
3593  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3594  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3595  kmp_int32 is_constrained) {
3596  return __kmp_execute_tasks_template(
3597  thread, gtid, flag, final_spin,
3598  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3599 }
3600 
3601 template <bool C, bool S>
3602 int __kmp_atomic_execute_tasks_64(
3603  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3604  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3605  kmp_int32 is_constrained) {
3606  return __kmp_execute_tasks_template(
3607  thread, gtid, flag, final_spin,
3608  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3609 }
3610 
3611 int __kmp_execute_tasks_oncore(
3612  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3613  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3614  kmp_int32 is_constrained) {
3615  return __kmp_execute_tasks_template(
3616  thread, gtid, flag, final_spin,
3617  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3618 }
3619 
3620 template int
3621 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3622  kmp_flag_32<false, false> *, int,
3623  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3624 
3625 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3626  kmp_flag_64<false, true> *,
3627  int,
3628  int *USE_ITT_BUILD_ARG(void *),
3629  kmp_int32);
3630 
3631 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3632  kmp_flag_64<true, false> *,
3633  int,
3634  int *USE_ITT_BUILD_ARG(void *),
3635  kmp_int32);
3636 
3637 template int __kmp_atomic_execute_tasks_64<false, true>(
3638  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3639  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3640 
3641 template int __kmp_atomic_execute_tasks_64<true, false>(
3642  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3643  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3644 
3645 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3646 // next barrier so they can assist in executing enqueued tasks.
3647 // First thread in allocates the task team atomically.
3648 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3649  kmp_info_t *this_thr) {
3650  kmp_thread_data_t *threads_data;
3651  int nthreads, i, is_init_thread;
3652 
3653  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3654  __kmp_gtid_from_thread(this_thr)));
3655 
3656  KMP_DEBUG_ASSERT(task_team != NULL);
3657  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3658 
3659  nthreads = task_team->tt.tt_nproc;
3660  KMP_DEBUG_ASSERT(nthreads > 0);
3661  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3662 
3663  // Allocate or increase the size of threads_data if necessary
3664  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3665 
3666  if (!is_init_thread) {
3667  // Some other thread already set up the array.
3668  KA_TRACE(
3669  20,
3670  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3671  __kmp_gtid_from_thread(this_thr)));
3672  return;
3673  }
3674  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3675  KMP_DEBUG_ASSERT(threads_data != NULL);
3676 
3677  if (__kmp_tasking_mode == tskm_task_teams &&
3678  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3679  // Release any threads sleeping at the barrier, so that they can steal
3680  // tasks and execute them. In extra barrier mode, tasks do not sleep
3681  // at the separate tasking barrier, so this isn't a problem.
3682  for (i = 0; i < nthreads; i++) {
3683  void *sleep_loc;
3684  kmp_info_t *thread = threads_data[i].td.td_thr;
3685 
3686  if (i == this_thr->th.th_info.ds.ds_tid) {
3687  continue;
3688  }
3689  // Since we haven't locked the thread's suspend mutex at this point,
3690  // there is a small window where a thread might be putting itself to
3691  // sleep, but hasn't set the th_sleep_loc field yet.
3692  // To work around this, __kmp_execute_tasks_template() periodically checks
3693  // to see if other threads are sleeping (using the same random mechanism
3694  // that is used for task stealing) and awakens them if they are.
3695  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3696  NULL) {
3697  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3698  __kmp_gtid_from_thread(this_thr),
3699  __kmp_gtid_from_thread(thread)));
3700  __kmp_null_resume_wrapper(thread);
3701  } else {
3702  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3703  __kmp_gtid_from_thread(this_thr),
3704  __kmp_gtid_from_thread(thread)));
3705  }
3706  }
3707  }
3708 
3709  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3710  __kmp_gtid_from_thread(this_thr)));
3711 }
3712 
3713 /* // TODO: Check the comment consistency
3714  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3715  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3716  * After a child thread checks into a barrier and calls __kmp_release() from
3717  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3718  * longer assume that the kmp_team_t structure is intact (at any moment, the
3719  * primary thread may exit the barrier code and free the team data structure,
3720  * and return the threads to the thread pool).
3721  *
3722  * This does not work with the tasking code, as the thread is still
3723  * expected to participate in the execution of any tasks that may have been
3724  * spawned by a member of the team, and the thread still needs access to
3725  * each of the other threads in the team, so that it can steal work from them.
3726  *
3727  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3728  * counting mechanism, and is allocated by the primary thread before calling
3729  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3730  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3731  * of the kmp_task_team_t structs for consecutive barriers can overlap
3732  * (and will, unless the primary thread is the last thread to exit the barrier
3733  * release phase, which is not typical). The existence of such a struct is
3734  * useful outside the context of tasking.
3735  *
3736  * We currently use the existence of the threads array as an indicator that
3737  * tasks were spawned since the last barrier. If the structure is to be
3738  * useful outside the context of tasking, then this will have to change, but
3739  * not setting the field minimizes the performance impact of tasking on
3740  * barriers, when no explicit tasks were spawned (pushed, actually).
3741  */
3742 
3743 static kmp_task_team_t *__kmp_free_task_teams =
3744  NULL; // Free list for task_team data structures
3745 // Lock for task team data structures
3746 kmp_bootstrap_lock_t __kmp_task_team_lock =
3747  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3748 
3749 // __kmp_alloc_task_deque:
3750 // Allocates a task deque for a particular thread, and initializes the necessary
3751 // data structures relating to the deque. This only happens once per thread
3752 // per task team since task teams are recycled. No lock is needed during
3753 // allocation since each thread allocates its own deque.
3754 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3755  kmp_thread_data_t *thread_data) {
3756  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3757  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3758 
3759  // Initialize last stolen task field to "none"
3760  thread_data->td.td_deque_last_stolen = -1;
3761 
3762  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3763  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3764  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3765 
3766  KE_TRACE(
3767  10,
3768  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3769  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3770  // Allocate space for task deque, and zero the deque
3771  // Cannot use __kmp_thread_calloc() because threads not around for
3772  // kmp_reap_task_team( ).
3773  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3774  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3775  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3776 }
3777 
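// Note (illustrative): the deque is a circular buffer whose size is kept a
// power of two, so indices wrap with a cheap bitwise AND instead of a modulo.
// Assuming TASK_DEQUE_MASK(td) evaluates to the deque size minus one (an
// assumption about kmp.h, not restated here), the wrap used throughout this
// file is:
//
//   head = (head + 1) & mask;   // advance and wrap
//   tail = (tail - 1) & mask;   // retreat and wrap (works because size is 2^k)
//
// e.g. with a size of 256 the mask is 0xFF, so index 255 + 1 wraps to 0.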
3778 // __kmp_free_task_deque:
3779 // Deallocates a task deque for a particular thread. Happens at library
3780 // deallocation, so there is no need to reset all thread data fields.
3781 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3782  if (thread_data->td.td_deque != NULL) {
3783  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3784  TCW_4(thread_data->td.td_deque_ntasks, 0);
3785  __kmp_free(thread_data->td.td_deque);
3786  thread_data->td.td_deque = NULL;
3787  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3788  }
3789 
3790 #ifdef BUILD_TIED_TASK_STACK
3791  // GEH: Figure out what to do here for td_susp_tied_tasks
3792  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3793  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3794  }
3795 #endif // BUILD_TIED_TASK_STACK
3796 }
3797 
3798 // __kmp_realloc_task_threads_data:
3799 // Allocates a threads_data array for a task team, either by allocating an
3800 // initial array or enlarging an existing array. Only the first thread to get
3801 // the lock allocates or enlarges the array and re-initializes the array elements.
3802 // That thread returns "TRUE", the rest return "FALSE".
3803 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3804 // The current size is given by task_team -> tt.tt_max_threads.
3805 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3806  kmp_task_team_t *task_team) {
3807  kmp_thread_data_t **threads_data_p;
3808  kmp_int32 nthreads, maxthreads;
3809  int is_init_thread = FALSE;
3810 
3811  if (TCR_4(task_team->tt.tt_found_tasks)) {
3812  // Already reallocated and initialized.
3813  return FALSE;
3814  }
3815 
3816  threads_data_p = &task_team->tt.tt_threads_data;
3817  nthreads = task_team->tt.tt_nproc;
3818  maxthreads = task_team->tt.tt_max_threads;
3819 
3820  // All threads must lock when they encounter the first task of the implicit
3821  // task region to make sure threads_data fields are (re)initialized before
3822  // used.
3823  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3824 
3825  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3826  // first thread to enable tasking
3827  kmp_team_t *team = thread->th.th_team;
3828  int i;
3829 
3830  is_init_thread = TRUE;
3831  if (maxthreads < nthreads) {
3832 
3833  if (*threads_data_p != NULL) {
3834  kmp_thread_data_t *old_data = *threads_data_p;
3835  kmp_thread_data_t *new_data = NULL;
3836 
3837  KE_TRACE(
3838  10,
3839  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3840  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3841  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3842  // Reallocate threads_data to have more elements than current array
3843  // Cannot use __kmp_thread_realloc() because threads not around for
3844  // kmp_reap_task_team( ). Note all new array entries are initialized
3845  // to zero by __kmp_allocate().
3846  new_data = (kmp_thread_data_t *)__kmp_allocate(
3847  nthreads * sizeof(kmp_thread_data_t));
3848  // copy old data to new data
3849  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3850  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3851 
3852 #ifdef BUILD_TIED_TASK_STACK
3853  // GEH: Figure out if this is the right thing to do
3854  for (i = maxthreads; i < nthreads; i++) {
3855  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3856  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3857  }
3858 #endif // BUILD_TIED_TASK_STACK
3859  // Install the new data and free the old data
3860  (*threads_data_p) = new_data;
3861  __kmp_free(old_data);
3862  } else {
3863  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3864  "threads data for task_team %p, size = %d\n",
3865  __kmp_gtid_from_thread(thread), task_team, nthreads));
3866  // Make the initial allocate for threads_data array, and zero entries
3867  // Cannot use __kmp_thread_calloc() because threads not around for
3868  // kmp_reap_task_team( ).
3869  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3870  nthreads * sizeof(kmp_thread_data_t));
3871 #ifdef BUILD_TIED_TASK_STACK
3872  // GEH: Figure out if this is the right thing to do
3873  for (i = 0; i < nthreads; i++) {
3874  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3875  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3876  }
3877 #endif // BUILD_TIED_TASK_STACK
3878  }
3879  task_team->tt.tt_max_threads = nthreads;
3880  } else {
3881  // If array has (more than) enough elements, go ahead and use it
3882  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3883  }
3884 
3885  // initialize threads_data pointers back to thread_info structures
3886  for (i = 0; i < nthreads; i++) {
3887  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3888  thread_data->td.td_thr = team->t.t_threads[i];
3889 
3890  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3891  // The last stolen field survives across teams / barrier, and the number
3892  // of threads may have changed. It's possible (likely?) that a new
3893 // parallel region will exhibit the same behavior as the previous region.
3894  thread_data->td.td_deque_last_stolen = -1;
3895  }
3896  }
3897 
3898  KMP_MB();
3899  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3900  }
3901 
3902  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3903  return is_init_thread;
3904 }
3905 
3906 // __kmp_free_task_threads_data:
3907 // Deallocates a threads_data array for a task team, including any attached
3908 // tasking deques. Only occurs at library shutdown.
3909 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3910  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3911  if (task_team->tt.tt_threads_data != NULL) {
3912  int i;
3913  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3914  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3915  }
3916  __kmp_free(task_team->tt.tt_threads_data);
3917  task_team->tt.tt_threads_data = NULL;
3918  }
3919  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3920 }
3921 
3922 // __kmp_free_task_pri_list:
3923 // Deallocates tasking deques used for priority tasks.
3924 // Only occurs at library shutdown.
3925 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3926  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3927  if (task_team->tt.tt_task_pri_list != NULL) {
3928  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3929  while (list != NULL) {
3930  kmp_task_pri_t *next = list->next;
3931  __kmp_free_task_deque(&list->td);
3932  __kmp_free(list);
3933  list = next;
3934  }
3935  task_team->tt.tt_task_pri_list = NULL;
3936  }
3937  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3938 }
3939 
3940 static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3941  kmp_team_t *team) {
3942  int team_nth = team->t.t_nproc;
3943  // Only need to init if task team isn't active or team size changed
3944  if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3945  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3946  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3947  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3948  TCW_4(task_team->tt.tt_nproc, team_nth);
3949  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3950  TCW_4(task_team->tt.tt_active, TRUE);
3951  }
3952 }
3953 
3954 // __kmp_allocate_task_team:
3955 // Allocates a task team associated with a specific team, taking it from
3956 // the global task team free list if possible. Also initializes data
3957 // structures.
3958 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3959  kmp_team_t *team) {
3960  kmp_task_team_t *task_team = NULL;
3961 
3962  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3963  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3964 
3965  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3966  // Take a task team from the task team pool
3967  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3968  if (__kmp_free_task_teams != NULL) {
3969  task_team = __kmp_free_task_teams;
3970  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3971  task_team->tt.tt_next = NULL;
3972  }
3973  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3974  }
3975 
3976  if (task_team == NULL) {
3977  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3978  "task team for team %p\n",
3979  __kmp_gtid_from_thread(thread), team));
3980  // Allocate a new task team if one is not available. Cannot use
3981  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3982  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3983  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3984  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3985 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3986  // suppress race condition detection on synchronization flags in debug mode;
3987  // this helps to analyze library internals by eliminating false positives
3988  __itt_suppress_mark_range(
3989  __itt_suppress_range, __itt_suppress_threading_errors,
3990  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3991  __itt_suppress_mark_range(__itt_suppress_range,
3992  __itt_suppress_threading_errors,
3993  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3994  sizeof(task_team->tt.tt_active));
3995 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3996  // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3997  // task_team->tt.tt_threads_data = NULL;
3998  // task_team->tt.tt_max_threads = 0;
3999  // task_team->tt.tt_next = NULL;
4000  }
4001 
4002  __kmp_task_team_init(task_team, team);
4003 
4004  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
4005  "unfinished_threads init'd to %d\n",
4006  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
4007  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
4008  return task_team;
4009 }
4010 
4011 // __kmp_free_task_team:
4012 // Frees the task team associated with a specific thread, and adds it
4013 // to the global task team free list.
4014 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
4015  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
4016  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
4017 
4018  // Put task team back on free list
4019  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4020 
4021  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
4022  task_team->tt.tt_next = __kmp_free_task_teams;
4023  TCW_PTR(__kmp_free_task_teams, task_team);
4024 
4025  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4026 }
4027 
4028 // __kmp_reap_task_teams:
4029 // Free all the task teams on the task team free list.
4030 // Should only be done during library shutdown.
4031 // Cannot do anything that needs a thread structure or gtid since they are
4032 // already gone.
4033 void __kmp_reap_task_teams(void) {
4034  kmp_task_team_t *task_team;
4035 
4036  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
4037  // Free all task_teams on the free list
4038  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
4039  while ((task_team = __kmp_free_task_teams) != NULL) {
4040  __kmp_free_task_teams = task_team->tt.tt_next;
4041  task_team->tt.tt_next = NULL;
4042 
4043  // Free threads_data if necessary
4044  if (task_team->tt.tt_threads_data != NULL) {
4045  __kmp_free_task_threads_data(task_team);
4046  }
4047  if (task_team->tt.tt_task_pri_list != NULL) {
4048  __kmp_free_task_pri_list(task_team);
4049  }
4050  __kmp_free(task_team);
4051  }
4052  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
4053  }
4054 }
4055 
4056 // View the array of two task team pointers as a pair of pointers:
4057 // 1) a single task_team pointer
4058 // 2) next pointer for stack
4059 // Serial teams can create a stack of task teams for nested serial teams.
4060 void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
4061  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4062  kmp_task_team_list_t *current =
4063  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
4064  kmp_task_team_list_t *node =
4065  (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
4066  node->task_team = current->task_team;
4067  node->next = current->next;
4068  thread->th.th_task_team = current->task_team = NULL;
4069  current->next = node;
4070 }
4071 
4072 // Serial team pops a task team off the stack
4073 void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
4074  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4075  kmp_task_team_list_t *current =
4076  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
4077  if (current->task_team) {
4078  __kmp_free_task_team(thread, current->task_team);
4079  }
4080  kmp_task_team_list_t *next = current->next;
4081  if (next) {
4082  current->task_team = next->task_team;
4083  current->next = next->next;
4084  KMP_DEBUG_ASSERT(next != current);
4085  __kmp_free(next);
4086  thread->th.th_task_team = current->task_team;
4087  }
4088 }
4089 
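// Illustrative sketch of the trick above: for serial teams the two-element
// t_task_team array is reinterpreted as a list node { task_team, next }, so
// nested serial regions form a linked stack of task teams. A generic,
// standalone version of the same push/pop shape (node_t and the function
// names are illustrative only; the runtime also frees the popped task team):
//
//   struct node_t { void *payload; node_t *next; };
//
//   static void push(node_t *head) {         // head overlays the two pointers
//     node_t *n = new node_t{head->payload, head->next};
//     head->payload = nullptr;               // the new level starts empty
//     head->next = n;                        // the saved level sits below it
//   }
//
//   static void pop(node_t *head) {
//     node_t *n = head->next;
//     if (n) { head->payload = n->payload; head->next = n->next; delete n; }
//   }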
4090 // __kmp_wait_to_unref_task_teams:
4091 // Some threads could still be in the fork barrier release code, possibly
4092 // trying to steal tasks. Wait for each thread to unreference its task team.
4093 void __kmp_wait_to_unref_task_teams(void) {
4094  kmp_info_t *thread;
4095  kmp_uint32 spins;
4096  kmp_uint64 time;
4097  int done;
4098 
4099  KMP_INIT_YIELD(spins);
4100  KMP_INIT_BACKOFF(time);
4101 
4102  for (;;) {
4103  done = TRUE;
4104 
4105  // TODO: GEH - this may be wrong because some sync would be necessary
4106  // in case threads are added to the pool during the traversal. Need to
4107  // verify that lock for thread pool is held when calling this routine.
4108  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
4109  thread = thread->th.th_next_pool) {
4110 #if KMP_OS_WINDOWS
4111  DWORD exit_val;
4112 #endif
4113  if (TCR_PTR(thread->th.th_task_team) == NULL) {
4114  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
4115  __kmp_gtid_from_thread(thread)));
4116  continue;
4117  }
4118 #if KMP_OS_WINDOWS
4119  // TODO: GEH - add this check for Linux* OS / OS X* as well?
4120  if (!__kmp_is_thread_alive(thread, &exit_val)) {
4121  thread->th.th_task_team = NULL;
4122  continue;
4123  }
4124 #endif
4125 
4126  done = FALSE; // Because th_task_team pointer is not NULL for this thread
4127 
4128  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
4129  "unreference task_team\n",
4130  __kmp_gtid_from_thread(thread)));
4131 
4132  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
4133  void *sleep_loc;
4134  // If the thread is sleeping, awaken it.
4135  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
4136  NULL) {
4137  KA_TRACE(
4138  10,
4139  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
4140  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
4141  __kmp_null_resume_wrapper(thread);
4142  }
4143  }
4144  }
4145  if (done) {
4146  break;
4147  }
4148 
4149  // If oversubscribed or have waited a bit, yield.
4150  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
4151  }
4152 }
4153 
4154 // __kmp_task_team_setup: Create a task_team for the current team, but use
4155 // an already created, unused one if it already exists.
4156 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
4157  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4158 
4159  // For the serial and root teams, set up the first task team pointer to point
4160  // to task team. The other pointer is a stack of task teams from previous
4161  // serial levels.
4162  if (team == this_thr->th.th_serial_team ||
4163  team == this_thr->th.th_root->r.r_root_team) {
4164  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4165  if (team->t.t_task_team[0] == NULL) {
4166  team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
4167  KA_TRACE(
4168  20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4169  " for serial/root team %p\n",
4170  __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
4171 
4172  } else
4173  __kmp_task_team_init(team->t.t_task_team[0], team);
4174  return;
4175  }
4176 
4177  // If this task_team hasn't been created yet, allocate it. It will be used in
4178  // the region after the next.
4179  // If it exists, it is the current task team and shouldn't be touched yet as
4180  // it may still be in use.
4181  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
4182  team->t.t_task_team[this_thr->th.th_task_state] =
4183  __kmp_allocate_task_team(this_thr, team);
4184  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
4185  " for team %d at parity=%d\n",
4186  __kmp_gtid_from_thread(this_thr),
4187  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
4188  this_thr->th.th_task_state));
4189  }
4190 
4191  // After threads exit the release, they will call sync, and then point to this
4192  // other task_team; make sure it is allocated and properly initialized. As
4193  // threads spin in the barrier release phase, they will continue to use the
4194  // previous task_team struct(above), until they receive the signal to stop
4195  // checking for tasks (they can't safely reference the kmp_team_t struct,
4196  // which could be reallocated by the primary thread).
4197  int other_team = 1 - this_thr->th.th_task_state;
4198  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
4199  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
4200  team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
4201  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
4202  "task_team %p for team %d at parity=%d\n",
4203  __kmp_gtid_from_thread(this_thr),
4204  team->t.t_task_team[other_team], team->t.t_id, other_team));
4205  } else { // Leave the old task team struct in place for the upcoming region;
4206  // adjust as needed
4207  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
4208  __kmp_task_team_init(task_team, team);
4209  // if team size has changed, the first thread to enable tasking will
4210  // realloc threads_data if necessary
4211  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
4212  "%p for team %d at parity=%d\n",
4213  __kmp_gtid_from_thread(this_thr),
4214  team->t.t_task_team[other_team], team->t.t_id, other_team));
4215  }
4216 
4217  // For a regular thread, task enabling should be called when the task is
4218  // going to be pushed to a deque. However, for the hidden helper thread, we
4219  // need it ahead of time so that some operations can be performed without
4220  // race conditions.
4221  if (this_thr == __kmp_hidden_helper_main_thread) {
4222  for (int i = 0; i < 2; ++i) {
4223  kmp_task_team_t *task_team = team->t.t_task_team[i];
4224  if (KMP_TASKING_ENABLED(task_team)) {
4225  continue;
4226  }
4227  __kmp_enable_tasking(task_team, this_thr);
4228  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4229  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4230  if (thread_data->td.td_deque == NULL) {
4231  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
4232  }
4233  }
4234  }
4235  }
4236 }
4237 
4238 // __kmp_task_team_sync: Propagation of task team data from team to threads
4239 // which happens just after the release phase of a team barrier. This may be
4240 // called by any thread. This is not called for serial or root teams.
4241 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
4242  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4243  KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
4244  KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
4245 
4246  // Toggle the th_task_state field, to switch which task_team this thread
4247  // refers to
4248  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4249 
4250  // It is now safe to propagate the task team pointer from the team struct to
4251  // the current thread.
4252  TCW_PTR(this_thr->th.th_task_team,
4253  team->t.t_task_team[this_thr->th.th_task_state]);
4254  KA_TRACE(20,
4255  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4256  "%p from Team #%d (parity=%d)\n",
4257  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4258  team->t.t_id, this_thr->th.th_task_state));
4259 }
4260 
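// Illustrative sketch of the double buffering above: each team keeps two task
// teams, t_task_team[0] and t_task_team[1], and every thread carries a one-bit
// parity (th_task_state) selecting which of the two it currently uses. The
// parity is flipped at each barrier, so threads still draining the old task
// team never race with threads that have already adopted the new one. Roughly:
//
//   state ^= 1;                                 // flip parity at the barrier
//   my_task_team = team->t.t_task_team[state];  // adopt the other buffer
//
// while __kmp_task_team_setup above keeps the buffer for the upcoming region
// allocated ahead of time.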
4261 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4262 // barrier gather phase. Only called by the primary thread.
4263 //
4264 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4265 // by passing in 0 optionally as the last argument. When wait is zero, primary
4266 // thread does not wait for unfinished_threads to reach 0.
4267 void __kmp_task_team_wait(
4268  kmp_info_t *this_thr,
4269  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4270  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4271 
4272  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4273  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4274 
4275  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4276  if (wait) {
4277  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4278  "(for unfinished_threads to reach 0) on task_team = %p\n",
4279  __kmp_gtid_from_thread(this_thr), task_team));
4280  // Worker threads may have dropped through to release phase, but could
4281  // still be executing tasks. Wait here for tasks to complete. To avoid
4282  // memory contention, only primary thread checks termination condition.
4283  kmp_flag_32<false, false> flag(
4284  RCAST(std::atomic<kmp_uint32> *,
4285  &task_team->tt.tt_unfinished_threads),
4286  0U);
4287  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4288  }
4289  // Deactivate the old task team, so that the worker threads will stop
4290  // referencing it while spinning.
4291  KA_TRACE(
4292  20,
4293  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4294  "setting active to false, setting local and team's pointer to NULL\n",
4295  __kmp_gtid_from_thread(this_thr), task_team));
4296  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4297  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4298  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4299  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4300  KMP_MB();
4301 
4302  TCW_PTR(this_thr->th.th_task_team, NULL);
4303  }
4304 }
4305 
4306 // __kmp_tasking_barrier:
4307 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4308 // Internal function to execute all tasks prior to a regular barrier or a join
4309 // barrier. It is a full barrier itself, which unfortunately turns regular
4310 // barriers into double barriers and join barriers into 1 1/2 barriers.
4311 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4312  std::atomic<kmp_uint32> *spin = RCAST(
4313  std::atomic<kmp_uint32> *,
4314  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4315  int flag = FALSE;
4316  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4317 
4318 #if USE_ITT_BUILD
4319  KMP_FSYNC_SPIN_INIT(spin, NULL);
4320 #endif /* USE_ITT_BUILD */
4321  kmp_flag_32<false, false> spin_flag(spin, 0U);
4322  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4323  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4324 #if USE_ITT_BUILD
4325  // TODO: What about itt_sync_obj??
4326  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4327 #endif /* USE_ITT_BUILD */
4328 
4329  if (TCR_4(__kmp_global.g.g_done)) {
4330  if (__kmp_global.g.g_abort)
4331  __kmp_abort_thread();
4332  break;
4333  }
4334  KMP_YIELD(TRUE);
4335  }
4336 #if USE_ITT_BUILD
4337  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4338 #endif /* USE_ITT_BUILD */
4339 }
4340 
4341 // __kmp_give_task puts a task into a given thread queue if:
4342 // - the queue for that thread was created
4343 // - there's space in that queue
4344 // Because of this, __kmp_push_task needs to check if there's space after
4345 // getting the lock
4346 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4347  kmp_int32 pass) {
4348  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4349  kmp_task_team_t *task_team = taskdata->td_task_team;
4350 
4351  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4352  taskdata, tid));
4353 
 4354  // If task_team is NULL, something has gone badly wrong...
4355  KMP_DEBUG_ASSERT(task_team != NULL);
4356 
4357  bool result = false;
4358  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4359 
4360  if (thread_data->td.td_deque == NULL) {
4361  // There's no queue in this thread, go find another one
4362  // We're guaranteed that at least one thread has a queue
4363  KA_TRACE(30,
4364  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4365  tid, taskdata));
4366  return result;
4367  }
4368 
4369  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4370  TASK_DEQUE_SIZE(thread_data->td)) {
4371  KA_TRACE(
4372  30,
4373  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4374  taskdata, tid));
4375 
4376  // if this deque is bigger than the pass ratio give a chance to another
4377  // thread
4378  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4379  return result;
4380 
4381  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4382  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4383  TASK_DEQUE_SIZE(thread_data->td)) {
4384  // expand deque to push the task which is not allowed to execute
4385  __kmp_realloc_task_deque(thread, thread_data);
4386  }
4387 
4388  } else {
4389 
4390  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4391 
4392  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4393  TASK_DEQUE_SIZE(thread_data->td)) {
4394  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4395  "thread %d.\n",
4396  taskdata, tid));
4397 
4398  // if this deque is bigger than the pass ratio give a chance to another
4399  // thread
4400  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4401  goto release_and_exit;
4402 
4403  __kmp_realloc_task_deque(thread, thread_data);
4404  }
4405  }
4406 
4407  // lock is held here, and there is space in the deque
4408 
4409  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4410  // Wrap index.
4411  thread_data->td.td_deque_tail =
4412  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4413  TCW_4(thread_data->td.td_deque_ntasks,
4414  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4415 
4416  result = true;
4417  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4418  taskdata, tid));
4419 
4420 release_and_exit:
4421  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4422 
4423  return result;
4424 }
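// Worked example of the pass-ratio check above, assuming deques start at
// INITIAL_TASK_DEQUE_SIZE and double on each __kmp_realloc_task_deque():
// expansion is allowed only while TASK_DEQUE_SIZE / INITIAL_TASK_DEQUE_SIZE < pass.
//   pass == 1: every full deque is skipped (the ratio is always >= 1).
//   pass == 2: deques still at the initial size (ratio 1) may be doubled.
//   pass == 4: deques up to twice the initial size (ratio 1 or 2) may grow.
// Since __kmpc_give_task() doubles pass only after a full sweep of the team,
// one victim's deque is grown only after the other threads have been tried.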
4425 
4426 #define PROXY_TASK_FLAG 0x40000000
 4427 /* The finish of the proxy tasks is divided into two pieces:
 4428  - the top half is the one that can be done from a thread outside the team
 4429  - the bottom half must be run from a thread within the team
 4430 
 4431  In order to run the bottom half, the task gets queued back into one of the
 4432  threads of the team. Once the td_incomplete_child_tasks counter of the parent
 4433  is decremented, the threads can leave the barriers. So, the bottom half needs
 4434  to be queued before the counter is decremented. The top half is therefore
 4435  divided into two parts:
 4436  - things that can be run before queuing the bottom half
 4437  - things that must be run after queuing the bottom half
 4438 
 4439  This creates a second race as the bottom half can free the task before the
 4440  second top half is executed. To avoid this, we use the
 4441  td_incomplete_child_tasks of the proxy task to synchronize the top and bottom
 4442  half. */
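// The resulting sequence, sketched with the helpers defined below:
//   __kmp_first_top_half_finish_proxy(td);  // mark complete, set PROXY_TASK_FLAG
//   /* queue the bottom half, e.g. via __kmpc_give_task() */
//   __kmp_second_top_half_finish_proxy(td); // decrement parent, clear the flag
//   // ... the bottom half may already be running, but it spins on
//   // PROXY_TASK_FLAG before releasing dependences and freeing the task.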
4443 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4444  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4445  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4446  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4447  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4448 
4449  taskdata->td_flags.complete = 1; // mark the task as completed
4450 #if OMPX_TASKGRAPH
4451  taskdata->td_flags.onced = 1;
4452 #endif
4453 
4454  if (taskdata->td_taskgroup)
4455  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4456 
 4457  // Create an imaginary child for this task so the bottom half cannot
 4458  // release the task before we have completed the second top half
4459  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4460 }
4461 
4462 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4463 #if KMP_DEBUG
4464  kmp_int32 children = 0;
4465  // Predecrement simulated by "- 1" calculation
4466  children = -1 +
4467 #endif
4468  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4469  KMP_DEBUG_ASSERT(children >= 0);
4470 
 4471  // Remove the imaginary child
4472  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4473 }
4474 
4475 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4476  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4477  kmp_info_t *thread = __kmp_threads[gtid];
4478 
4479  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4480  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4481  1); // top half must run before bottom half
4482 
4483  // We need to wait to make sure the top half is finished
4484  // Spinning here should be ok as this should happen quickly
4485  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4486  PROXY_TASK_FLAG) > 0)
4487  ;
4488 
4489  __kmp_release_deps(gtid, taskdata);
4490  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4491 }
4492 
4501 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4502  KMP_DEBUG_ASSERT(ptask != NULL);
4503  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4504  KA_TRACE(
4505  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4506  gtid, taskdata));
4507  __kmp_assert_valid_gtid(gtid);
4508  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4509 
4510  __kmp_first_top_half_finish_proxy(taskdata);
4511  __kmp_second_top_half_finish_proxy(taskdata);
4512  __kmp_bottom_half_finish_proxy(gtid, ptask);
4513 
4514  KA_TRACE(10,
4515  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4516  gtid, taskdata));
4517 }
4518 
4519 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4520  KMP_DEBUG_ASSERT(ptask != NULL);
4521  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4522 
4523  // Enqueue task to complete bottom half completion from a thread within the
4524  // corresponding team
4525  kmp_team_t *team = taskdata->td_team;
4526  kmp_int32 nthreads = team->t.t_nproc;
4527  kmp_info_t *thread;
4528 
4529  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4530  // but we cannot use __kmp_get_random here
4531  kmp_int32 start_k = start % nthreads;
4532  kmp_int32 pass = 1;
4533  kmp_int32 k = start_k;
4534 
4535  do {
4536  // For now we're just linearly trying to find a thread
4537  thread = team->t.t_threads[k];
4538  k = (k + 1) % nthreads;
4539 
4540  // we did a full pass through all the threads
4541  if (k == start_k)
4542  pass = pass << 1;
4543 
4544  } while (!__kmp_give_task(thread, k, ptask, pass));
4545 
4546  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4547  // awake at least one thread to execute given task
4548  for (int i = 0; i < nthreads; ++i) {
4549  thread = team->t.t_threads[i];
4550  if (thread->th.th_sleep_loc != NULL) {
4551  __kmp_null_resume_wrapper(thread);
4552  break;
4553  }
4554  }
4555  }
4556 }
4557 
4565 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4566  KMP_DEBUG_ASSERT(ptask != NULL);
4567  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4568 
4569  KA_TRACE(
4570  10,
4571  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4572  taskdata));
4573 
4574  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4575 
4576  __kmp_first_top_half_finish_proxy(taskdata);
4577 
4578  __kmpc_give_task(ptask);
4579 
4580  __kmp_second_top_half_finish_proxy(taskdata);
4581 
4582  KA_TRACE(
4583  10,
4584  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4585  taskdata));
4586 }
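// A minimal usage sketch of the two completion entry points above. The names
// on_async_done() and caller_is_in_team are hypothetical; the point is only
// that the in-team variant needs a valid gtid, while the out-of-team variant
// hands the bottom half back to the team via __kmpc_give_task().
//
//   static void on_async_done(kmp_int32 gtid, kmp_task_t *ptask,
//                             bool caller_is_in_team) {
//     if (caller_is_in_team)
//       __kmpc_proxy_task_completed(gtid, ptask);
//     else
//       __kmpc_proxy_task_completed_ooo(ptask);
//   }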
4587 
4588 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4589  kmp_task_t *task) {
4590  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4591  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4592  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4593  td->td_allow_completion_event.ed.task = task;
4594  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4595  }
4596  return &td->td_allow_completion_event;
4597 }
4598 
4599 void __kmp_fulfill_event(kmp_event_t *event) {
4600  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4601  kmp_task_t *ptask = event->ed.task;
4602  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4603  bool detached = false;
4604  int gtid = __kmp_get_gtid();
4605 
4606  // The associated task might have completed or could be completing at this
4607  // point.
4608  // We need to take the lock to avoid races
4609  __kmp_acquire_tas_lock(&event->lock, gtid);
4610  if (taskdata->td_flags.proxy == TASK_PROXY) {
4611  detached = true;
4612  } else {
4613 #if OMPT_SUPPORT
4614  // The OMPT event must occur under mutual exclusion,
4615  // otherwise the tool might access ptask after free
4616  if (UNLIKELY(ompt_enabled.enabled))
4617  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4618 #endif
4619  }
4620  event->type = KMP_EVENT_UNINITIALIZED;
4621  __kmp_release_tas_lock(&event->lock, gtid);
4622 
4623  if (detached) {
4624 #if OMPT_SUPPORT
4625  // We free ptask afterwards and know the task is finished,
4626  // so locking is not necessary
4627  if (UNLIKELY(ompt_enabled.enabled))
4628  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4629 #endif
 4630  // If the task detached, complete the proxy task
4631  if (gtid >= 0) {
4632  kmp_team_t *team = taskdata->td_team;
4633  kmp_info_t *thread = __kmp_get_thread();
4634  if (thread->th.th_team == team) {
4635  __kmpc_proxy_task_completed(gtid, ptask);
4636  return;
4637  }
4638  }
4639 
 4640  // fallback
 4641  __kmpc_proxy_task_completed_ooo(ptask);
 4642  }
4643  }
4644 }
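// User-level sketch of the detach flow these two routines implement (assuming
// an OpenMP 5.0 compiler): detach(evt) is expected to be lowered to
// __kmpc_task_allow_completion_event(), and omp_fulfill_event() reaches
// __kmp_fulfill_event() above. async_start() is a hypothetical library call
// that invokes its callback from another thread when the work is done.
//
//   #include <omp.h>
//   extern void async_start(void (*done)(omp_event_handle_t),
//                           omp_event_handle_t evt);
//
//   void submit(void) {
//     omp_event_handle_t evt;
//   #pragma omp task detach(evt)
//     async_start(omp_fulfill_event, evt); // task completes once evt is fulfilled
//   #pragma omp taskwait                   // also waits for the fulfillment
//   }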
4645 
4646 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4647 // for taskloop
4648 //
4649 // thread: allocating thread
4650 // task_src: pointer to source task to be duplicated
4651 // taskloop_recur: used only when dealing with taskgraph,
4652 // indicating whether we need to update task->td_task_id
4653 // returns: a pointer to the allocated kmp_task_t structure (task).
4654 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4655 #if OMPX_TASKGRAPH
4656  , int taskloop_recur
4657 #endif
4658 ) {
4659  kmp_task_t *task;
4660  kmp_taskdata_t *taskdata;
4661  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4662  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4663  size_t shareds_offset;
4664  size_t task_size;
4665 
4666  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4667  task_src));
4668  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4669  TASK_FULL); // it should not be proxy task
4670  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4671  task_size = taskdata_src->td_size_alloc;
4672 
4673  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4674  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4675  task_size));
4676 #if USE_FAST_MEMORY
4677  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4678 #else
4679  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4680 #endif /* USE_FAST_MEMORY */
4681  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4682 
4683  task = KMP_TASKDATA_TO_TASK(taskdata);
4684 
4685  // Initialize new task (only specific fields not affected by memcpy)
4686 #if OMPX_TASKGRAPH
4687  if (taskdata->is_taskgraph && !taskloop_recur &&
4688  __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4689  taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4690 #endif
4691  taskdata->td_task_id = KMP_GEN_TASK_ID();
 4692  if (task->shareds != NULL) { // need to set up the shareds pointer
4693  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4694  task->shareds = &((char *)taskdata)[shareds_offset];
4695  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4696  0);
4697  }
4698  taskdata->td_alloc_thread = thread;
4699  taskdata->td_parent = parent_task;
4700  // task inherits the taskgroup from the parent task
4701  taskdata->td_taskgroup = parent_task->td_taskgroup;
4702  // tied task needs to initialize the td_last_tied at creation,
4703  // untied one does this when it is scheduled for execution
4704  if (taskdata->td_flags.tiedness == TASK_TIED)
4705  taskdata->td_last_tied = taskdata;
4706 
4707  // Only need to keep track of child task counts if team parallel and tasking
4708  // not serialized
4709  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4710  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4711  if (parent_task->td_taskgroup)
4712  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4713  // Only need to keep track of allocated child tasks for explicit tasks since
 4714  // implicit tasks are not deallocated
4715  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4716  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4717  }
4718 
4719  KA_TRACE(20,
4720  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4721  thread, taskdata, taskdata->td_parent));
4722 #if OMPT_SUPPORT
4723  if (UNLIKELY(ompt_enabled.enabled))
4724  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4725 #endif
4726  return task;
4727 }
4728 
4729 // Routine optionally generated by the compiler for setting the lastprivate flag
4730 // and calling needed constructors for private/firstprivate objects
4731 // (used to form taskloop tasks from pattern task)
4732 // Parameters: dest task, src task, lastprivate flag.
4733 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4734 
4735 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4736 
4737 // class to encapsulate manipulating loop bounds in a taskloop task.
4738 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4739 // the loop bound variables.
4740 class kmp_taskloop_bounds_t {
4741  kmp_task_t *task;
4742  const kmp_taskdata_t *taskdata;
4743  size_t lower_offset;
4744  size_t upper_offset;
4745 
4746 public:
4747  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4748  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4749  lower_offset((char *)lb - (char *)task),
4750  upper_offset((char *)ub - (char *)task) {
4751  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4752  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4753  }
4754  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4755  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4756  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4757  size_t get_lower_offset() const { return lower_offset; }
4758  size_t get_upper_offset() const { return upper_offset; }
4759  kmp_uint64 get_lb() const {
4760  kmp_int64 retval;
4761 #if defined(KMP_GOMP_COMPAT)
4762  // Intel task just returns the lower bound normally
4763  if (!taskdata->td_flags.native) {
4764  retval = *(kmp_int64 *)((char *)task + lower_offset);
4765  } else {
4766  // GOMP task has to take into account the sizeof(long)
4767  if (taskdata->td_size_loop_bounds == 4) {
4768  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4769  retval = (kmp_int64)*lb;
4770  } else {
4771  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4772  retval = (kmp_int64)*lb;
4773  }
4774  }
4775 #else
4776  (void)taskdata;
4777  retval = *(kmp_int64 *)((char *)task + lower_offset);
4778 #endif // defined(KMP_GOMP_COMPAT)
4779  return retval;
4780  }
4781  kmp_uint64 get_ub() const {
4782  kmp_int64 retval;
4783 #if defined(KMP_GOMP_COMPAT)
4784  // Intel task just returns the upper bound normally
4785  if (!taskdata->td_flags.native) {
4786  retval = *(kmp_int64 *)((char *)task + upper_offset);
4787  } else {
4788  // GOMP task has to take into account the sizeof(long)
4789  if (taskdata->td_size_loop_bounds == 4) {
4790  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4791  retval = (kmp_int64)*ub;
4792  } else {
4793  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4794  retval = (kmp_int64)*ub;
4795  }
4796  }
4797 #else
4798  retval = *(kmp_int64 *)((char *)task + upper_offset);
4799 #endif // defined(KMP_GOMP_COMPAT)
4800  return retval;
4801  }
4802  void set_lb(kmp_uint64 lb) {
4803 #if defined(KMP_GOMP_COMPAT)
4804  // Intel task just sets the lower bound normally
4805  if (!taskdata->td_flags.native) {
4806  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4807  } else {
4808  // GOMP task has to take into account the sizeof(long)
4809  if (taskdata->td_size_loop_bounds == 4) {
4810  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4811  *lower = (kmp_uint32)lb;
4812  } else {
4813  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4814  *lower = (kmp_uint64)lb;
4815  }
4816  }
4817 #else
4818  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4819 #endif // defined(KMP_GOMP_COMPAT)
4820  }
4821  void set_ub(kmp_uint64 ub) {
4822 #if defined(KMP_GOMP_COMPAT)
4823  // Intel task just sets the upper bound normally
4824  if (!taskdata->td_flags.native) {
4825  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4826  } else {
4827  // GOMP task has to take into account the sizeof(long)
4828  if (taskdata->td_size_loop_bounds == 4) {
4829  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4830  *upper = (kmp_uint32)ub;
4831  } else {
4832  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4833  *upper = (kmp_uint64)ub;
4834  }
4835  }
4836 #else
4837  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4838 #endif // defined(KMP_GOMP_COMPAT)
4839  }
4840 };
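// Data layouts the class above abstracts over, sketched:
//
//   Intel/LLVM pattern task (td_flags.native == 0):
//     lb: *(kmp_uint64 *)((char *)task + lower_offset)   // always 64-bit
//     ub: *(kmp_uint64 *)((char *)task + upper_offset)
//
//   GOMP pattern task (td_flags.native == 1):
//     lb: task->shareds[0], ub: task->shareds[1], each td_size_loop_bounds
//     bytes wide (4 or 8, i.e. sizeof(long) on the target).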
4841 
4842 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4843 //
4844 // loc Source location information
4845 // gtid Global thread ID
4846 // task Pattern task, exposes the loop iteration range
4847 // lb Pointer to loop lower bound in task structure
4848 // ub Pointer to loop upper bound in task structure
4849 // st Loop stride
4850 // ub_glob Global upper bound (used for lastprivate check)
4851 // num_tasks Number of tasks to execute
4852 // grainsize Number of loop iterations per task
4853 // extras Number of chunks with grainsize+1 iterations
4854 // last_chunk Reduction of grainsize for last task
4855 // tc Iterations count
4856 // task_dup Tasks duplication routine
4857 // codeptr_ra Return address for OMPT events
4858 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4859  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4860  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4861  kmp_uint64 grainsize, kmp_uint64 extras,
4862  kmp_int64 last_chunk, kmp_uint64 tc,
4863 #if OMPT_SUPPORT
4864  void *codeptr_ra,
4865 #endif
4866  void *task_dup) {
4867  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4868  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4869  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4870  // compiler provides global bounds here
4871  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4872  kmp_uint64 lower = task_bounds.get_lb();
4873  kmp_uint64 upper = task_bounds.get_ub();
4874  kmp_uint64 i;
4875  kmp_info_t *thread = __kmp_threads[gtid];
4876  kmp_taskdata_t *current_task = thread->th.th_current_task;
4877  kmp_task_t *next_task;
4878  kmp_int32 lastpriv = 0;
4879 
4880  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4881  (last_chunk < 0 ? last_chunk : extras));
4882  KMP_DEBUG_ASSERT(num_tasks > extras);
4883  KMP_DEBUG_ASSERT(num_tasks > 0);
4884  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4885  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4886  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4887  ub_glob, st, task_dup));
4888 
4889  // Launch num_tasks tasks, assign grainsize iterations each task
4890  for (i = 0; i < num_tasks; ++i) {
4891  kmp_uint64 chunk_minus_1;
4892  if (extras == 0) {
4893  chunk_minus_1 = grainsize - 1;
4894  } else {
4895  chunk_minus_1 = grainsize;
4896  --extras; // first extras iterations get bigger chunk (grainsize+1)
4897  }
4898  upper = lower + st * chunk_minus_1;
4899  if (upper > *ub) {
4900  upper = *ub;
4901  }
4902  if (i == num_tasks - 1) {
4903  // schedule the last task, set lastprivate flag if needed
4904  if (st == 1) { // most common case
4905  KMP_DEBUG_ASSERT(upper == *ub);
4906  if (upper == ub_glob)
4907  lastpriv = 1;
4908  } else if (st > 0) { // positive loop stride
4909  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4910  if ((kmp_uint64)st > ub_glob - upper)
4911  lastpriv = 1;
4912  } else { // negative loop stride
4913  KMP_DEBUG_ASSERT(upper + st < *ub);
4914  if (upper - ub_glob < (kmp_uint64)(-st))
4915  lastpriv = 1;
4916  }
4917  }
4918 
4919 #if OMPX_TASKGRAPH
4920  next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4921 #else
4922  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4923 #endif
4924 
4925  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4926  kmp_taskloop_bounds_t next_task_bounds =
4927  kmp_taskloop_bounds_t(next_task, task_bounds);
4928 
4929  // adjust task-specific bounds
4930  next_task_bounds.set_lb(lower);
4931  if (next_taskdata->td_flags.native) {
4932  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4933  } else {
4934  next_task_bounds.set_ub(upper);
4935  }
4936  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4937  // etc.
4938  ptask_dup(next_task, task, lastpriv);
4939  KA_TRACE(40,
4940  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4941  "upper %lld stride %lld, (offsets %p %p)\n",
4942  gtid, i, next_task, lower, upper, st,
4943  next_task_bounds.get_lower_offset(),
4944  next_task_bounds.get_upper_offset()));
4945 #if OMPT_SUPPORT
4946  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4947  codeptr_ra); // schedule new task
4948 #if OMPT_OPTIONAL
4949  if (ompt_enabled.ompt_callback_dispatch) {
4950  OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4951  lower, upper, st);
4952  }
4953 #endif // OMPT_OPTIONAL
4954 #else
4955  __kmp_omp_task(gtid, next_task, true); // schedule new task
4956 #endif
4957  lower = upper + st; // adjust lower bound for the next iteration
4958  }
4959  // free the pattern task and exit
 4960  __kmp_task_start(gtid, task, current_task); // do internal bookkeeping
4961  // do not execute the pattern task, just do internal bookkeeping
4962  __kmp_task_finish<false>(gtid, task, current_task);
4963 }
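// Worked example of the chunking loop above: tc = 10, num_tasks = 3,
// grainsize = 3, extras = 1, st = 1, lower = 0, last_chunk = 0.
//   task 0: extras consumed, chunk of grainsize+1 -> iterations [0, 3]
//   task 1: chunk of grainsize                    -> iterations [4, 6]
//   task 2: chunk of grainsize                    -> iterations [7, 9]
//           (lastprivate flag set when 9 == ub_glob)
// which matches the invariant tc == num_tasks * grainsize + extras (9 + 1).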
4964 
4965 // Structure to keep taskloop parameters for auxiliary task
4966 // kept in the shareds of the task structure.
4967 typedef struct __taskloop_params {
4968  kmp_task_t *task;
4969  kmp_uint64 *lb;
4970  kmp_uint64 *ub;
4971  void *task_dup;
4972  kmp_int64 st;
4973  kmp_uint64 ub_glob;
4974  kmp_uint64 num_tasks;
4975  kmp_uint64 grainsize;
4976  kmp_uint64 extras;
4977  kmp_int64 last_chunk;
4978  kmp_uint64 tc;
4979  kmp_uint64 num_t_min;
4980 #if OMPT_SUPPORT
4981  void *codeptr_ra;
4982 #endif
4983 } __taskloop_params_t;
4984 
4985 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4986  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4987  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4988  kmp_uint64,
4989 #if OMPT_SUPPORT
4990  void *,
4991 #endif
4992  void *);
4993 
4994 // Execute part of the taskloop submitted as a task.
4995 int __kmp_taskloop_task(int gtid, void *ptask) {
4996  __taskloop_params_t *p =
4997  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4998  kmp_task_t *task = p->task;
4999  kmp_uint64 *lb = p->lb;
5000  kmp_uint64 *ub = p->ub;
5001  void *task_dup = p->task_dup;
5002  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5003  kmp_int64 st = p->st;
5004  kmp_uint64 ub_glob = p->ub_glob;
5005  kmp_uint64 num_tasks = p->num_tasks;
5006  kmp_uint64 grainsize = p->grainsize;
5007  kmp_uint64 extras = p->extras;
5008  kmp_int64 last_chunk = p->last_chunk;
5009  kmp_uint64 tc = p->tc;
5010  kmp_uint64 num_t_min = p->num_t_min;
5011 #if OMPT_SUPPORT
5012  void *codeptr_ra = p->codeptr_ra;
5013 #endif
5014 #if KMP_DEBUG
5015  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5016  KMP_DEBUG_ASSERT(task != NULL);
5017  KA_TRACE(20,
5018  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
5019  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5020  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5021  st, task_dup));
5022 #endif
5023  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
5024  if (num_tasks > num_t_min)
5025  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5026  grainsize, extras, last_chunk, tc, num_t_min,
5027 #if OMPT_SUPPORT
5028  codeptr_ra,
5029 #endif
5030  task_dup);
5031  else
5032  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
5033  grainsize, extras, last_chunk, tc,
5034 #if OMPT_SUPPORT
5035  codeptr_ra,
5036 #endif
5037  task_dup);
5038 
5039  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
5040  return 0;
5041 }
5042 
5043 // Schedule part of the taskloop as a task,
5044 // execute the rest of the taskloop.
5045 //
5046 // loc Source location information
5047 // gtid Global thread ID
5048 // task Pattern task, exposes the loop iteration range
5049 // lb Pointer to loop lower bound in task structure
5050 // ub Pointer to loop upper bound in task structure
5051 // st Loop stride
5052 // ub_glob Global upper bound (used for lastprivate check)
5053 // num_tasks Number of tasks to execute
5054 // grainsize Number of loop iterations per task
5055 // extras Number of chunks with grainsize+1 iterations
5056 // last_chunk Reduction of grainsize for last task
5057 // tc Iterations count
5058 // num_t_min Threshold to launch tasks recursively
5059 // task_dup Tasks duplication routine
5060 // codeptr_ra Return address for OMPT events
5061 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
5062  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5063  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
5064  kmp_uint64 grainsize, kmp_uint64 extras,
5065  kmp_int64 last_chunk, kmp_uint64 tc,
5066  kmp_uint64 num_t_min,
5067 #if OMPT_SUPPORT
5068  void *codeptr_ra,
5069 #endif
5070  void *task_dup) {
5071  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5072  KMP_DEBUG_ASSERT(task != NULL);
5073  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
5074  KA_TRACE(20,
5075  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
5076  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
5077  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
5078  st, task_dup));
5079  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
5080  kmp_uint64 lower = *lb;
5081  kmp_info_t *thread = __kmp_threads[gtid];
5082  // kmp_taskdata_t *current_task = thread->th.th_current_task;
5083  kmp_task_t *next_task;
5084  size_t lower_offset =
5085  (char *)lb - (char *)task; // remember offset of lb in the task structure
5086  size_t upper_offset =
5087  (char *)ub - (char *)task; // remember offset of ub in the task structure
5088 
5089  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5090  (last_chunk < 0 ? last_chunk : extras));
5091  KMP_DEBUG_ASSERT(num_tasks > extras);
5092  KMP_DEBUG_ASSERT(num_tasks > 0);
5093 
5094  // split the loop in two halves
5095  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
5096  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
5097  kmp_uint64 gr_size0 = grainsize;
5098  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
5099  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
5100  if (last_chunk < 0) {
5101  ext0 = ext1 = 0;
5102  last_chunk1 = last_chunk;
5103  tc0 = grainsize * n_tsk0;
5104  tc1 = tc - tc0;
5105  } else if (n_tsk0 <= extras) {
5106  gr_size0++; // integrate extras into grainsize
5107  ext0 = 0; // no extra iters in 1st half
5108  ext1 = extras - n_tsk0; // remaining extras
5109  tc0 = gr_size0 * n_tsk0;
5110  tc1 = tc - tc0;
5111  } else { // n_tsk0 > extras
5112  ext1 = 0; // no extra iters in 2nd half
5113  ext0 = extras;
5114  tc1 = grainsize * n_tsk1;
5115  tc0 = tc - tc1;
5116  }
5117  ub0 = lower + st * (tc0 - 1);
5118  lb1 = ub0 + st;
5119 
5120  // create pattern task for 2nd half of the loop
5121 #if OMPX_TASKGRAPH
5122  next_task = __kmp_task_dup_alloc(thread, task,
5123  /* taskloop_recur */ 1);
5124 #else
5125  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
5126 #endif
5127  // adjust lower bound (upper bound is not changed) for the 2nd half
5128  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
5129  if (ptask_dup != NULL) // construct firstprivates, etc.
5130  ptask_dup(next_task, task, 0);
5131  *ub = ub0; // adjust upper bound for the 1st half
5132 
5133  // create auxiliary task for 2nd half of the loop
5134  // make sure new task has same parent task as the pattern task
5135  kmp_taskdata_t *current_task = thread->th.th_current_task;
5136  thread->th.th_current_task = taskdata->td_parent;
5137  kmp_task_t *new_task =
5138  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
5139  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
5140  // restore current task
5141  thread->th.th_current_task = current_task;
5142  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
5143  p->task = next_task;
5144  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
5145  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
5146  p->task_dup = task_dup;
5147  p->st = st;
5148  p->ub_glob = ub_glob;
5149  p->num_tasks = n_tsk1;
5150  p->grainsize = grainsize;
5151  p->extras = ext1;
5152  p->last_chunk = last_chunk1;
5153  p->tc = tc1;
5154  p->num_t_min = num_t_min;
5155 #if OMPT_SUPPORT
5156  p->codeptr_ra = codeptr_ra;
5157 #endif
5158 
5159 #if OMPX_TASKGRAPH
5160  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
5161  new_task_data->tdg = taskdata->tdg;
5162  new_task_data->is_taskgraph = 0;
5163 #endif
5164 
5165 #if OMPT_SUPPORT
5166  // schedule new task with correct return address for OMPT events
5167  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
5168 #else
5169  __kmp_omp_task(gtid, new_task, true); // schedule new task
5170 #endif
5171 
5172  // execute the 1st half of current subrange
5173  if (n_tsk0 > num_t_min)
5174  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
5175  ext0, last_chunk0, tc0, num_t_min,
5176 #if OMPT_SUPPORT
5177  codeptr_ra,
5178 #endif
5179  task_dup);
5180  else
5181  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
5182  gr_size0, ext0, last_chunk0, tc0,
5183 #if OMPT_SUPPORT
5184  codeptr_ra,
5185 #endif
5186  task_dup);
5187 
5188  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
5189 }
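// Worked example of the split above: num_tasks = 5, grainsize = 3, extras = 2,
// tc = 17, last_chunk = 0.
//   n_tsk0 = 2, n_tsk1 = 3; since n_tsk0 <= extras the first half absorbs the
//   extras: gr_size0 = 4, ext0 = 0, tc0 = 8; the second half keeps
//   grainsize = 3, ext1 = 0, tc1 = 9. Both halves preserve
//   tc == num_tasks * grainsize + extras (2*4 + 0 == 8 and 3*3 + 0 == 9).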
5190 
5191 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5192  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5193  int nogroup, int sched, kmp_uint64 grainsize,
5194  int modifier, void *task_dup) {
5195  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
5196  KMP_DEBUG_ASSERT(task != NULL);
5197  if (nogroup == 0) {
5198 #if OMPT_SUPPORT && OMPT_OPTIONAL
5199  OMPT_STORE_RETURN_ADDRESS(gtid);
5200 #endif
5201  __kmpc_taskgroup(loc, gtid);
5202  }
5203 
5204 #if OMPX_TASKGRAPH
5205  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
5206 #endif
5207  // =========================================================================
5208  // calculate loop parameters
5209  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
5210  kmp_uint64 tc;
5211  // compiler provides global bounds here
5212  kmp_uint64 lower = task_bounds.get_lb();
5213  kmp_uint64 upper = task_bounds.get_ub();
5214  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
5215  kmp_uint64 num_tasks = 0, extras = 0;
5216  kmp_int64 last_chunk =
5217  0; // reduce grainsize of last task by last_chunk in strict mode
5218  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
5219  kmp_info_t *thread = __kmp_threads[gtid];
5220  kmp_taskdata_t *current_task = thread->th.th_current_task;
5221 
5222  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5223  "grain %llu(%d, %d), dup %p\n",
5224  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5225  task_dup));
5226 
5227  // compute trip count
5228  if (st == 1) { // most common case
5229  tc = upper - lower + 1;
5230  } else if (st < 0) {
5231  tc = (lower - upper) / (-st) + 1;
5232  } else { // st > 0
5233  tc = (upper - lower) / st + 1;
5234  }
5235  if (tc == 0) {
5236  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5237  // free the pattern task and exit
5238  __kmp_task_start(gtid, task, current_task);
5239  // do not execute anything for zero-trip loop
5240  __kmp_task_finish<false>(gtid, task, current_task);
5241  return;
5242  }
5243 
5244 #if OMPT_SUPPORT && OMPT_OPTIONAL
5245  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
5246  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
5247  if (ompt_enabled.ompt_callback_work) {
5248  ompt_callbacks.ompt_callback(ompt_callback_work)(
5249  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5250  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5251  }
5252 #endif
5253 
5254  if (num_tasks_min == 0)
 5255  // TODO: can we choose a better default heuristic?
5256  num_tasks_min =
5257  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5258 
5259  // compute num_tasks/grainsize based on the input provided
5260  switch (sched) {
5261  case 0: // no schedule clause specified, we can choose the default
5262  // let's try to schedule (team_size*10) tasks
5263  grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
5264  KMP_FALLTHROUGH();
5265  case 2: // num_tasks provided
5266  if (grainsize > tc) {
5267  num_tasks = tc; // too big num_tasks requested, adjust values
5268  grainsize = 1;
5269  extras = 0;
5270  } else {
5271  num_tasks = grainsize;
5272  grainsize = tc / num_tasks;
5273  extras = tc % num_tasks;
5274  }
5275  break;
5276  case 1: // grainsize provided
5277  if (grainsize > tc) {
5278  num_tasks = 1;
5279  grainsize = tc; // too big grainsize requested, adjust values
5280  extras = 0;
5281  } else {
5282  if (modifier) {
5283  num_tasks = (tc + grainsize - 1) / grainsize;
5284  last_chunk = tc - (num_tasks * grainsize);
5285  extras = 0;
5286  } else {
5287  num_tasks = tc / grainsize;
5288  // adjust grainsize for balanced distribution of iterations
5289  grainsize = tc / num_tasks;
5290  extras = tc % num_tasks;
5291  }
5292  }
5293  break;
5294  default:
5295  KMP_ASSERT2(0, "unknown scheduling of taskloop");
5296  }
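  // Worked examples of the cases above for tc = 100:
  //   sched == 1, grainsize = 8, modifier == 0:
  //     num_tasks = 100 / 8 = 12, grainsize = 100 / 12 = 8, extras = 100 % 12 = 4
  //     (12 * 8 + 4 == 100)
  //   sched == 1, grainsize = 8, modifier != 0 (strict):
  //     num_tasks = (100 + 7) / 8 = 13, last_chunk = 100 - 13 * 8 = -4, extras = 0
  //     (13 * 8 + (-4) == 100)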
5297 
5298  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5299  (last_chunk < 0 ? last_chunk : extras));
5300  KMP_DEBUG_ASSERT(num_tasks > extras);
5301  KMP_DEBUG_ASSERT(num_tasks > 0);
5302  // =========================================================================
5303 
 5304  // check the if-clause value first
5305  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5306  if (if_val == 0) { // if(0) specified, mark task as serial
5307  taskdata->td_flags.task_serial = 1;
5308  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5309  // always start serial tasks linearly
5310  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5311  grainsize, extras, last_chunk, tc,
5312 #if OMPT_SUPPORT
5313  OMPT_GET_RETURN_ADDRESS(0),
5314 #endif
5315  task_dup);
5316  // !taskdata->td_flags.native => currently force linear spawning of tasks
5317  // for GOMP_taskloop
5318  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5319  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5320  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5321  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5322  last_chunk));
5323  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5324  grainsize, extras, last_chunk, tc, num_tasks_min,
5325 #if OMPT_SUPPORT
5326  OMPT_GET_RETURN_ADDRESS(0),
5327 #endif
5328  task_dup);
5329  } else {
5330  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5331  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5332  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5333  last_chunk));
5334  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5335  grainsize, extras, last_chunk, tc,
5336 #if OMPT_SUPPORT
5337  OMPT_GET_RETURN_ADDRESS(0),
5338 #endif
5339  task_dup);
5340  }
5341 
5342 #if OMPT_SUPPORT && OMPT_OPTIONAL
5343  if (ompt_enabled.ompt_callback_work) {
5344  ompt_callbacks.ompt_callback(ompt_callback_work)(
5345  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5346  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5347  }
5348 #endif
5349 
5350  if (nogroup == 0) {
5351 #if OMPT_SUPPORT && OMPT_OPTIONAL
5352  OMPT_STORE_RETURN_ADDRESS(gtid);
5353 #endif
5354  __kmpc_end_taskgroup(loc, gtid);
5355  }
5356  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5357 }
5358 
5375 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5376  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5377  int sched, kmp_uint64 grainsize, void *task_dup) {
5378  __kmp_assert_valid_gtid(gtid);
5379  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5380  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5381  0, task_dup);
5382  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5383 }
5384 
5402 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5403  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5404  int nogroup, int sched, kmp_uint64 grainsize,
5405  int modifier, void *task_dup) {
5406  __kmp_assert_valid_gtid(gtid);
5407  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5408  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5409  modifier, task_dup);
5410  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5411 }
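// User-level sketch (assuming a compiler that targets these entry points):
// num_tasks(n) and grainsize(g) are expected to arrive here as sched == 2 and
// sched == 1 respectively, and the OpenMP 5.1 strict modifier as a non-zero
// modifier argument of __kmpc_taskloop_5().
//
//   #pragma omp parallel
//   #pragma omp single
//   #pragma omp taskloop grainsize(strict: 64)
//   for (int i = 0; i < n; ++i)
//     work(i);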
5412 
 5421 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
 5422  if (gtid == KMP_GTID_DNE)
5423  return NULL;
5424 
5425  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5426  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5427 
5428  if (!taskdata)
5429  return NULL;
5430 
5431  return &taskdata->td_target_data.async_handle;
5432 }
5433 
5442 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5443  if (gtid == KMP_GTID_DNE)
5444  return FALSE;
5445 
5446  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5447  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5448 
5449  if (!taskdata)
5450  return FALSE;
5451 
5452  return taskdata->td_task_team != NULL;
5453 }
5454 
5455 #if OMPX_TASKGRAPH
5456 // __kmp_find_tdg: identify a TDG through its ID
5457 // tdg_id: ID of the TDG
 5458 // returns: If a TDG corresponding to this ID is found and is not in
 5459 // its initial state, return a pointer to it; otherwise return nullptr
5460 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5461  kmp_tdg_info_t *res = nullptr;
5462  if (__kmp_max_tdgs == 0)
5463  return res;
5464 
5465  if (__kmp_global_tdgs == NULL)
5466  __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5467  sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5468 
5469  if ((__kmp_global_tdgs[tdg_id]) &&
5470  (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5471  res = __kmp_global_tdgs[tdg_id];
5472  return res;
5473 }
5474 
5475 // __kmp_print_tdg_dot: prints the TDG to a dot file
5476 // tdg: ID of the TDG
5477 // gtid: Global Thread ID
5478 void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
5479  kmp_int32 tdg_id = tdg->tdg_id;
5480  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5481 
5482  char file_name[20];
5483  sprintf(file_name, "tdg_%d.dot", tdg_id);
5484  kmp_safe_raii_file_t tdg_file(file_name, "w");
5485 
5486  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5487  fprintf(tdg_file,
5488  "digraph TDG {\n"
5489  " compound=true\n"
5490  " subgraph cluster {\n"
5491  " label=TDG_%d\n",
5492  tdg_id);
5493  for (kmp_int32 i = 0; i < num_tasks; i++) {
5494  fprintf(tdg_file, " %d[style=bold]\n", i);
5495  }
5496  fprintf(tdg_file, " }\n");
5497  for (kmp_int32 i = 0; i < num_tasks; i++) {
5498  kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5499  kmp_int32 *successors = tdg->record_map[i].successors;
5500  if (nsuccessors > 0) {
5501  for (kmp_int32 j = 0; j < nsuccessors; j++)
5502  fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5503  }
5504  }
5505  fprintf(tdg_file, "}");
5506  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5507 }
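// Example of the file produced above for tdg_id == 0 with three tasks, where
// task 0 precedes tasks 1 and 2:
//
//   digraph TDG {
//    compound=true
//    subgraph cluster {
//    label=TDG_0
//    0[style=bold]
//    1[style=bold]
//    2[style=bold]
//    }
//    0 -> 1
//    0 -> 2
//   }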
5508 
5509 // __kmp_exec_tdg: launch the execution of a previous
5510 // recorded TDG
5511 // gtid: Global Thread ID
5512 // tdg: ID of the TDG
5513 void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5514  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5515  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5516  tdg->tdg_id, tdg->num_roots));
5517  kmp_node_info_t *this_record_map = tdg->record_map;
5518  kmp_int32 *this_root_tasks = tdg->root_tasks;
5519  kmp_int32 this_num_roots = tdg->num_roots;
5520  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5521 
5522  kmp_info_t *thread = __kmp_threads[gtid];
5523  kmp_taskdata_t *parent_task = thread->th.th_current_task;
5524 
5525  if (tdg->rec_taskred_data) {
5526  __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5527  }
5528 
5529  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5530  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5531 
5532  td->td_parent = parent_task;
5533  this_record_map[j].parent_task = parent_task;
5534 
5535  kmp_taskgroup_t *parent_taskgroup =
5536  this_record_map[j].parent_task->td_taskgroup;
5537 
5538  KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5539  this_record_map[j].npredecessors);
5540  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5541 
5542  if (parent_taskgroup) {
5543  KMP_ATOMIC_INC(&parent_taskgroup->count);
5544  // The taskgroup is different so we must update it
5545  td->td_taskgroup = parent_taskgroup;
5546  } else if (td->td_taskgroup != nullptr) {
 5547  // If the parent doesn't have a taskgroup, remove it from the task
5548  td->td_taskgroup = nullptr;
5549  }
5550  if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5551  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5552  }
5553 
5554  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5555  __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5556  }
5557  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5558  tdg->tdg_id, tdg->num_roots));
5559 }
5560 
5561 // __kmp_start_record: set up a TDG structure and turn the
5562 // recording flag to true
5563 // gtid: Global Thread ID of the encountering thread
5564 // input_flags: Flags associated with the TDG
5565 // tdg_id: ID of the TDG to record
5566 static inline void __kmp_start_record(kmp_int32 gtid,
5567  kmp_taskgraph_flags_t *flags,
5568  kmp_int32 tdg_id) {
5569  kmp_tdg_info_t *tdg =
5570  (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5571  __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5572  // Initializing the TDG structure
5573  tdg->tdg_id = tdg_id;
5574  tdg->map_size = INIT_MAPSIZE;
5575  tdg->num_roots = -1;
5576  tdg->root_tasks = nullptr;
5577  tdg->tdg_status = KMP_TDG_RECORDING;
5578  tdg->rec_num_taskred = 0;
5579  tdg->rec_taskred_data = nullptr;
5580  KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5581 
5582  // Initializing the list of nodes in this TDG
5583  kmp_node_info_t *this_record_map =
5584  (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5585  for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5586  kmp_int32 *successorsList =
5587  (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5588  this_record_map[i].task = nullptr;
5589  this_record_map[i].successors = successorsList;
5590  this_record_map[i].nsuccessors = 0;
5591  this_record_map[i].npredecessors = 0;
5592  this_record_map[i].successors_size = __kmp_successors_size;
5593  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5594  }
5595 
5596  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5597 }
5598 
5599 // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5600 // the beginning of the record process of a task region
5601 // loc_ref: Location of TDG, not used yet
5602 // gtid: Global Thread ID of the encountering thread
5603 // input_flags: Flags associated with the TDG
 5604 // tdg_id: ID of the TDG to record; for now, an incremental integer
5605 // returns: 1 if we record, otherwise, 0
5606 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5607  kmp_int32 input_flags, kmp_int32 tdg_id) {
5608 
5609  kmp_int32 res;
5610  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5611  KA_TRACE(10,
5612  ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5613  gtid, loc_ref, input_flags, tdg_id));
5614 
5615  if (__kmp_max_tdgs == 0) {
5616  KA_TRACE(
5617  10,
5618  ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5619  "__kmp_max_tdgs = 0\n",
5620  gtid, loc_ref, input_flags, tdg_id));
5621  return 1;
5622  }
5623 
5624  __kmpc_taskgroup(loc_ref, gtid);
5625  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5626  // TODO: use re_record flag
5627  __kmp_exec_tdg(gtid, tdg);
5628  res = 0;
5629  } else {
5630  __kmp_curr_tdg_idx = tdg_id;
5631  KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5632  __kmp_start_record(gtid, flags, tdg_id);
5633  __kmp_num_tdg++;
5634  res = 1;
5635  }
5636  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5637  gtid, tdg_id, res ? "record" : "execute"));
5638  return res;
5639 }
5640 
5641 // __kmp_end_record: set up a TDG after recording it
5642 // gtid: Global thread ID
5643 // tdg: Pointer to the TDG
5644 void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5645  // Store roots
5646  kmp_node_info_t *this_record_map = tdg->record_map;
5647  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5648  kmp_int32 *this_root_tasks =
5649  (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5650  kmp_int32 this_map_size = tdg->map_size;
5651  kmp_int32 this_num_roots = 0;
5652  kmp_info_t *thread = __kmp_threads[gtid];
5653 
5654  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5655  if (this_record_map[i].npredecessors == 0) {
5656  this_root_tasks[this_num_roots++] = i;
5657  }
5658  }
5659 
5660  // Update with roots info and mapsize
5661  tdg->map_size = this_map_size;
5662  tdg->num_roots = this_num_roots;
5663  tdg->root_tasks = this_root_tasks;
5664  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5665  tdg->tdg_status = KMP_TDG_READY;
5666 
5667  if (thread->th.th_current_task->td_dephash) {
5668  __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5669  thread->th.th_current_task->td_dephash = NULL;
5670  }
5671 
5672  // Reset predecessor counter
5673  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5674  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5675  this_record_map[i].npredecessors);
5676  }
5677  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5678 
5679  if (__kmp_tdg_dot)
5680  __kmp_print_tdg_dot(tdg, gtid);
5681 }
5682 
5683 // __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5684 // the end of recording phase
5685 //
5686 // loc_ref: Source location information
5687 // gtid: Global thread ID
5688 // input_flags: Flags attached to the graph
5689 // tdg_id: ID of the TDG just finished recording
5690 void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5691  kmp_int32 input_flags, kmp_int32 tdg_id) {
5692  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5693 
5694  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5695  " tdg=%d with flags=%d\n",
5696  gtid, loc_ref, tdg_id, input_flags));
5697  if (__kmp_max_tdgs) {
5698  // TODO: use input_flags->nowait
5699  __kmpc_end_taskgroup(loc_ref, gtid);
5700  if (__kmp_tdg_is_recording(tdg->tdg_status))
5701  __kmp_end_record(gtid, tdg);
5702  }
5703  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5704  " tdg=%d, its status is now READY\n",
5705  gtid, loc_ref, tdg_id));
5706 }
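// Plausible lowering sketch (an assumption, not taken from a compiler) of a
// record-and-replay region built on the two entry points above: the first
// encounter returns 1 and records the tasks created in the body; a later
// encounter with the same tdg_id returns 0 because __kmp_exec_tdg() has
// already replayed the recorded graph.
//
//   if (__kmpc_start_record_task(loc, gtid, /*input_flags=*/0, /*tdg_id=*/0)) {
//     // body: the region's task-creating calls, which get recorded
//   }
//   __kmpc_end_record_task(loc, gtid, /*input_flags=*/0, /*tdg_id=*/0);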
5707 #endif