LLVM OpenMP* Runtime Library
kmp_tasking.cpp
1 /*
2  * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_i18n.h"
15 #include "kmp_itt.h"
16 #include "kmp_stats.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_taskdeps.h"
19 
20 #if OMPT_SUPPORT
21 #include "ompt-specific.h"
22 #endif
23 
24 #include "tsan_annotations.h"
25 
26 /* forward declaration */
27 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
28  kmp_info_t *this_thr);
29 static void __kmp_alloc_task_deque(kmp_info_t *thread,
30  kmp_thread_data_t *thread_data);
31 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
32  kmp_task_team_t *task_team);
33 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
34 
35 #ifdef BUILD_TIED_TASK_STACK
36 
37 // __kmp_trace_task_stack: print the tied tasks from the task stack in order
38 // from top to bottom
39 //
40 // gtid: global thread identifier for thread containing stack
41 // thread_data: thread data for task team thread containing stack
42 // threshold: value above which the trace statement triggers
43 // location: string identifying call site of this function (for trace)
44 static void __kmp_trace_task_stack(kmp_int32 gtid,
45  kmp_thread_data_t *thread_data,
46  int threshold, char *location) {
47  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
48  kmp_taskdata_t **stack_top = task_stack->ts_top;
49  kmp_int32 entries = task_stack->ts_entries;
50  kmp_taskdata_t *tied_task;
51 
52  KA_TRACE(
53  threshold,
54  ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, "
55  "first_block = %p, stack_top = %p \n",
56  location, gtid, entries, task_stack->ts_first_block, stack_top));
57 
58  KMP_DEBUG_ASSERT(stack_top != NULL);
59  KMP_DEBUG_ASSERT(entries > 0);
60 
61  while (entries != 0) {
62  KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]);
63  // fix up ts_top if we need to pop from previous block
64  if ((entries & TASK_STACK_INDEX_MASK) == 0) {
65  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top);
66 
67  stack_block = stack_block->sb_prev;
68  stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
69  }
70 
71  // finish bookkeeping
72  stack_top--;
73  entries--;
74 
75  tied_task = *stack_top;
76 
77  KMP_DEBUG_ASSERT(tied_task != NULL);
78  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
79 
80  KA_TRACE(threshold,
81  ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, "
82  "stack_top=%p, tied_task=%p\n",
83  location, gtid, entries, stack_top, tied_task));
84  }
85  KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]);
86 
87  KA_TRACE(threshold,
88  ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n",
89  location, gtid));
90 }
91 
92 // __kmp_init_task_stack: initialize the task stack for the first time
93 // after a thread_data structure is created.
94 // It should not be necessary to do this again (assuming the stack works).
95 //
96 // gtid: global thread identifier of calling thread
97 // thread_data: thread data for task team thread containing stack
98 static void __kmp_init_task_stack(kmp_int32 gtid,
99  kmp_thread_data_t *thread_data) {
100  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
101  kmp_stack_block_t *first_block;
102 
103  // set up the first block of the stack
104  first_block = &task_stack->ts_first_block;
105  task_stack->ts_top = (kmp_taskdata_t **)first_block;
106  memset((void *)first_block, '\0',
107  TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *));
108 
109  // initialize the stack to be empty
110  task_stack->ts_entries = TASK_STACK_EMPTY;
111  first_block->sb_next = NULL;
112  first_block->sb_prev = NULL;
113 }
114 
115 // __kmp_free_task_stack: free the task stack when thread_data is destroyed.
116 //
117 // gtid: global thread identifier for calling thread
118 // thread_data: thread info for thread containing stack
119 static void __kmp_free_task_stack(kmp_int32 gtid,
120  kmp_thread_data_t *thread_data) {
121  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
122  kmp_stack_block_t *stack_block = &task_stack->ts_first_block;
123 
124  KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY);
125  // free from the second block of the stack
126  while (stack_block != NULL) {
127  kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL;
128 
129  stack_block->sb_next = NULL;
130  stack_block->sb_prev = NULL;
131  if (stack_block != &task_stack->ts_first_block) {
132  __kmp_thread_free(__kmp_thread_from_gtid(gtid),
133  stack_block); // free the block, if not the first
134  }
135  stack_block = next_block;
136  }
137  // initialize the stack to be empty
138  task_stack->ts_entries = 0;
139  task_stack->ts_top = NULL;
140 }
141 
142 // __kmp_push_task_stack: Push the tied task onto the task stack.
143 // Grow the stack if necessary by allocating another block.
144 //
145 // gtid: global thread identifier for calling thread
146 // thread: thread info for thread containing stack
147 // tied_task: the task to push on the stack
148 static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread,
149  kmp_taskdata_t *tied_task) {
150  // GEH - need to consider what to do if tt_threads_data not allocated yet
151  kmp_thread_data_t *thread_data =
152  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
153  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
154 
155  if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) {
156  return; // Don't push anything on stack if team or team tasks are serialized
157  }
158 
159  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
160  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
161 
162  KA_TRACE(20,
163  ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n",
164  gtid, thread, tied_task));
165  // Store entry
166  *(task_stack->ts_top) = tied_task;
167 
168  // Do bookkeeping for next push
169  task_stack->ts_top++;
170  task_stack->ts_entries++;
171 
172  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
173  // Find beginning of this task block
174  kmp_stack_block_t *stack_block =
175  (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE);
176 
177  // Check if we already have a block
178  if (stack_block->sb_next !=
179  NULL) { // reset ts_top to beginning of next block
180  task_stack->ts_top = &stack_block->sb_next->sb_block[0];
181  } else { // Alloc new block and link it up
182  kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc(
183  thread, sizeof(kmp_stack_block_t));
184 
185  task_stack->ts_top = &new_block->sb_block[0];
186  stack_block->sb_next = new_block;
187  new_block->sb_prev = stack_block;
188  new_block->sb_next = NULL;
189 
190  KA_TRACE(
191  30,
192  ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n",
193  gtid, tied_task, new_block));
194  }
195  }
196  KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
197  tied_task));
198 }
199 
200 // __kmp_pop_task_stack: Pop the tied task from the task stack. Don't return
201 // the task, just check to make sure it matches the ending task passed in.
202 //
203 // gtid: global thread identifier for the calling thread
204 // thread: thread info structure containing stack
205 // tied_task: the task popped off the stack
206 // ending_task: the task that is ending (should match popped task)
207 static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread,
208  kmp_taskdata_t *ending_task) {
209  // GEH - need to consider what to do if tt_threads_data not allocated yet
210  kmp_thread_data_t *thread_data =
211  &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
212  kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks;
213  kmp_taskdata_t *tied_task;
214 
215  if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) {
216  // Don't pop anything from stack if team or team tasks are serialized
217  return;
218  }
219 
220  KMP_DEBUG_ASSERT(task_stack->ts_top != NULL);
221  KMP_DEBUG_ASSERT(task_stack->ts_entries > 0);
222 
223  KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid,
224  thread));
225 
226  // fix up ts_top if we need to pop from previous block
227  if ((task_stack->ts_entries & TASK_STACK_INDEX_MASK) == 0) {
228  kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top);
229 
230  stack_block = stack_block->sb_prev;
231  task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE];
232  }
233 
234  // finish bookkeeping
235  task_stack->ts_top--;
236  task_stack->ts_entries--;
237 
238  tied_task = *(task_stack->ts_top);
239 
240  KMP_DEBUG_ASSERT(tied_task != NULL);
241  KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED);
242  KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly
243 
244  KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid,
245  tied_task));
246  return;
247 }
248 #endif /* BUILD_TIED_TASK_STACK */
249 
250 // returns 1 if new task is allowed to execute, 0 otherwise
251 // checks Task Scheduling constraint (if requested) and
252 // mutexinoutset dependencies if any
253 static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
254  const kmp_taskdata_t *tasknew,
255  const kmp_taskdata_t *taskcurr) {
256  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
257  // Check if the candidate obeys the Task Scheduling Constraints (TSC)
258  // only descendant of all deferred tied tasks can be scheduled, checking
259  // the last one is enough, as it in turn is the descendant of all others
260  kmp_taskdata_t *current = taskcurr->td_last_tied;
261  KMP_DEBUG_ASSERT(current != NULL);
262  // check if the task is not suspended on barrier
263  if (current->td_flags.tasktype == TASK_EXPLICIT ||
264  current->td_taskwait_thread > 0) { // <= 0 on barrier
265  kmp_int32 level = current->td_level;
266  kmp_taskdata_t *parent = tasknew->td_parent;
267  while (parent != current && parent->td_level > level) {
268  // check generation up to the level of the current task
269  parent = parent->td_parent;
270  KMP_DEBUG_ASSERT(parent != NULL);
271  }
272  if (parent != current)
273  return false;
274  }
275  }
276  // Check mutexinoutset dependencies, acquire locks
277  kmp_depnode_t *node = tasknew->td_depnode;
278  if (node && (node->dn.mtx_num_locks > 0)) {
279  for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
280  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
281  if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
282  continue;
283  // could not get the lock, release previous locks
284  for (int j = i - 1; j >= 0; --j)
285  __kmp_release_lock(node->dn.mtx_locks[j], gtid);
286  return false;
287  }
288  // negative num_locks means all locks acquired successfully
289  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
290  }
291  return true;
292 }
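
// --------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original file): the mtx_locks
// handling above backs the OpenMP 5.0 "mutexinoutset" dependence type, which
// lets a set of sibling tasks run in any order but never concurrently with
// one another. A minimal user-level example, assuming a compiler and runtime
// with OpenMP 5.0 task-dependence support:

#include <omp.h>
#include <stdio.h>

int example_mutexinoutset(void) {
  int counter = 0;
#pragma omp parallel
#pragma omp single
  {
    for (int i = 0; i < 8; ++i) {
      // These tasks are mutually exclusive but unordered among themselves.
#pragma omp task depend(mutexinoutset : counter)
      counter++;
    }
    // An "in" dependence on the same object waits for all of them.
#pragma omp task depend(in : counter)
    printf("counter = %d\n", counter);
  }
  return 0;
}
// --------------------------------------------------------------------------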
293 
294 // __kmp_realloc_task_deque:
295 // Re-allocates a task deque for a particular thread, copies the content from
296 // the old deque and adjusts the necessary data structures relating to the
297 // deque. This operation must be done with the deque_lock being held
298 static void __kmp_realloc_task_deque(kmp_info_t *thread,
299  kmp_thread_data_t *thread_data) {
300  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
301  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
302  kmp_int32 new_size = 2 * size;
303 
304  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
305  "%d] for thread_data %p\n",
306  __kmp_gtid_from_thread(thread), size, new_size, thread_data));
307 
308  kmp_taskdata_t **new_deque =
309  (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));
310 
311  int i, j;
312  for (i = thread_data->td.td_deque_head, j = 0; j < size;
313  i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
314  new_deque[j] = thread_data->td.td_deque[i];
315 
316  __kmp_free(thread_data->td.td_deque);
317 
318  thread_data->td.td_deque_head = 0;
319  thread_data->td.td_deque_tail = size;
320  thread_data->td.td_deque = new_deque;
321  thread_data->td.td_deque_size = new_size;
322 }
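
// Editor's worked example (not part of the original file): deque sizes are
// kept powers of two so that TASK_DEQUE_MASK can wrap indices with a single
// AND, as in the copy loop above; e.g. after doubling to new_size = 512 the
// mask is 511, and (tail + 1) & 511 wraps 511 back to 0.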
323 
324 // __kmp_push_task: Add a task to the thread's deque
325 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
326  kmp_info_t *thread = __kmp_threads[gtid];
327  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
328  kmp_task_team_t *task_team = thread->th.th_task_team;
329  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
330  kmp_thread_data_t *thread_data;
331 
332  KA_TRACE(20,
333  ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));
334 
335  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
336  // untied task needs to increment counter so that the task structure is not
337  // freed prematurely
338  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
339  KMP_DEBUG_USE_VAR(counter);
340  KA_TRACE(
341  20,
342  ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
343  gtid, counter, taskdata));
344  }
345 
346  // The first check avoids building task_team thread data if serialized
347  if (taskdata->td_flags.task_serial) {
348  KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
349  "TASK_NOT_PUSHED for task %p\n",
350  gtid, taskdata));
351  return TASK_NOT_PUSHED;
352  }
353 
354  // Now that serialized tasks have returned, we can assume that we are not in
355  // immediate exec mode
356  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
357  if (!KMP_TASKING_ENABLED(task_team)) {
358  __kmp_enable_tasking(task_team, thread);
359  }
360  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
361  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);
362 
363  // Find tasking deque specific to encountering thread
364  thread_data = &task_team->tt.tt_threads_data[tid];
365 
366  // No lock needed since only owner can allocate
367  if (thread_data->td.td_deque == NULL) {
368  __kmp_alloc_task_deque(thread, thread_data);
369  }
370 
371  int locked = 0;
372  // Check if deque is full
373  if (TCR_4(thread_data->td.td_deque_ntasks) >=
374  TASK_DEQUE_SIZE(thread_data->td)) {
375  if (__kmp_enable_task_throttling &&
376  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
377  thread->th.th_current_task)) {
378  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
379  "TASK_NOT_PUSHED for task %p\n",
380  gtid, taskdata));
381  return TASK_NOT_PUSHED;
382  } else {
383  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
384  locked = 1;
385  if (TCR_4(thread_data->td.td_deque_ntasks) >=
386  TASK_DEQUE_SIZE(thread_data->td)) {
387  // expand deque to push the task which is not allowed to execute
388  __kmp_realloc_task_deque(thread, thread_data);
389  }
390  }
391  }
392  // Lock the deque for the task push operation
393  if (!locked) {
394  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
395  // Need to recheck as we can get a proxy task from a thread outside of OpenMP
396  if (TCR_4(thread_data->td.td_deque_ntasks) >=
397  TASK_DEQUE_SIZE(thread_data->td)) {
398  if (__kmp_enable_task_throttling &&
399  __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
400  thread->th.th_current_task)) {
401  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
402  KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
403  "returning TASK_NOT_PUSHED for task %p\n",
404  gtid, taskdata));
405  return TASK_NOT_PUSHED;
406  } else {
407  // expand deque to push the task which is not allowed to execute
408  __kmp_realloc_task_deque(thread, thread_data);
409  }
410  }
411  }
412  // Must have room since no thread other than the calling thread can add tasks
413  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
414  TASK_DEQUE_SIZE(thread_data->td));
415 
416  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
417  taskdata; // Push taskdata
418  // Wrap index.
419  thread_data->td.td_deque_tail =
420  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
421  TCW_4(thread_data->td.td_deque_ntasks,
422  TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
423  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
424  KMP_FSYNC_RELEASING(taskdata); // releasing child
425  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
426  "task=%p ntasks=%d head=%u tail=%u\n",
427  gtid, taskdata, thread_data->td.td_deque_ntasks,
428  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
429 
430  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
431 
432  return TASK_SUCCESSFULLY_PUSHED;
433 }
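
// --------------------------------------------------------------------------
// Editor's note (sketch, not part of the original file): each thread owns a
// fixed-size deque that __kmp_push_task fills; when it is full the runtime
// either doubles it (__kmp_realloc_task_deque) or, with task throttling and a
// task that is allowed to execute, returns TASK_NOT_PUSHED so the caller runs
// the task immediately instead of deferring it. A user-level pattern that
// exercises this path is a single producer creating many tasks:

#include <omp.h>

extern void do_work(int i); // hypothetical work routine (assumption)

void example_single_producer(int n) {
#pragma omp parallel
#pragma omp single // one producer thread fills its own deque; others steal
  for (int i = 0; i < n; ++i) {
#pragma omp task firstprivate(i)
    do_work(i); // deferred, unless a full deque triggers immediate execution
  }
}
// --------------------------------------------------------------------------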
434 
435 // __kmp_pop_current_task_from_thread: set up current task from called thread
436 // when team ends
437 //
438 // this_thr: thread structure to set current_task in.
439 void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
440  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
441  "this_thread=%p, curtask=%p, "
442  "curtask_parent=%p\n",
443  0, this_thr, this_thr->th.th_current_task,
444  this_thr->th.th_current_task->td_parent));
445 
446  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;
447 
448  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
449  "this_thread=%p, curtask=%p, "
450  "curtask_parent=%p\n",
451  0, this_thr, this_thr->th.th_current_task,
452  this_thr->th.th_current_task->td_parent));
453 }
454 
455 // __kmp_push_current_task_to_thread: set up current task in called thread for a
456 // new team
457 //
458 // this_thr: thread structure to set up
459 // team: team for implicit task data
460 // tid: thread within team to set up
461 void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
462  int tid) {
463  // current task of the thread is a parent of the new just created implicit
464  // tasks of new team
465  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
466  "curtask=%p "
467  "parent_task=%p\n",
468  tid, this_thr, this_thr->th.th_current_task,
469  team->t.t_implicit_task_taskdata[tid].td_parent));
470 
471  KMP_DEBUG_ASSERT(this_thr != NULL);
472 
473  if (tid == 0) {
474  if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
475  team->t.t_implicit_task_taskdata[0].td_parent =
476  this_thr->th.th_current_task;
477  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
478  }
479  } else {
480  team->t.t_implicit_task_taskdata[tid].td_parent =
481  team->t.t_implicit_task_taskdata[0].td_parent;
482  this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
483  }
484 
485  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
486  "curtask=%p "
487  "parent_task=%p\n",
488  tid, this_thr, this_thr->th.th_current_task,
489  team->t.t_implicit_task_taskdata[tid].td_parent));
490 }
491 
492 // __kmp_task_start: bookkeeping for a task starting execution
493 //
494 // GTID: global thread id of calling thread
495 // task: task starting execution
496 // current_task: task suspending
497 static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
498  kmp_taskdata_t *current_task) {
499  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
500  kmp_info_t *thread = __kmp_threads[gtid];
501 
502  KA_TRACE(10,
503  ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
504  gtid, taskdata, current_task));
505 
506  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
507 
508  // mark currently executing task as suspended
509  // TODO: GEH - make sure root team implicit task is initialized properly.
510  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
511  current_task->td_flags.executing = 0;
512 
513 // Add task to stack if tied
514 #ifdef BUILD_TIED_TASK_STACK
515  if (taskdata->td_flags.tiedness == TASK_TIED) {
516  __kmp_push_task_stack(gtid, thread, taskdata);
517  }
518 #endif /* BUILD_TIED_TASK_STACK */
519 
520  // mark starting task as executing and as current task
521  thread->th.th_current_task = taskdata;
522 
523  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
524  taskdata->td_flags.tiedness == TASK_UNTIED);
525  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
526  taskdata->td_flags.tiedness == TASK_UNTIED);
527  taskdata->td_flags.started = 1;
528  taskdata->td_flags.executing = 1;
529  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
530  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
531 
532  // GEH TODO: shouldn't we pass some sort of location identifier here?
533  // APT: yes, we will pass location here.
534  // need to store current thread state (in a thread or taskdata structure)
535  // before setting work_state, otherwise wrong state is set after end of task
536 
537  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));
538 
539  return;
540 }
541 
542 #if OMPT_SUPPORT
543 //------------------------------------------------------------------------------
544 // __ompt_task_init:
545 // Initialize OMPT fields maintained by a task. This will only be called after
546 // ompt_start_tool, so we already know whether ompt is enabled or not.
547 
548 static inline void __ompt_task_init(kmp_taskdata_t *task, int tid) {
549  // The calls to __ompt_task_init already have the ompt_enabled condition.
550  task->ompt_task_info.task_data.value = 0;
551  task->ompt_task_info.frame.exit_frame = ompt_data_none;
552  task->ompt_task_info.frame.enter_frame = ompt_data_none;
553  task->ompt_task_info.frame.exit_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
554  task->ompt_task_info.frame.enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
555 }
556 
557 // __ompt_task_start:
558 // Build and trigger task-begin event
559 static inline void __ompt_task_start(kmp_task_t *task,
560  kmp_taskdata_t *current_task,
561  kmp_int32 gtid) {
562  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
563  ompt_task_status_t status = ompt_task_switch;
564  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
565  status = ompt_task_yield;
566  __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
567  }
568  /* let OMPT know that we're about to run this task */
569  if (ompt_enabled.ompt_callback_task_schedule) {
570  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
571  &(current_task->ompt_task_info.task_data), status,
572  &(taskdata->ompt_task_info.task_data));
573  }
574  taskdata->ompt_task_info.scheduling_parent = current_task;
575 }
576 
577 // __ompt_task_finish:
578 // Build and trigger final task-schedule event
579 static inline void __ompt_task_finish(kmp_task_t *task,
580  kmp_taskdata_t *resumed_task,
581  ompt_task_status_t status) {
582  if (ompt_enabled.ompt_callback_task_schedule) {
583  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
584  if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
585  taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
586  status = ompt_task_cancel;
587  }
588 
589  /* let OMPT know that we're returning to the callee task */
590  ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
591  &(taskdata->ompt_task_info.task_data), status,
592  (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
593  }
594 }
595 #endif
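
// --------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original file): the
// task-schedule events dispatched by __ompt_task_start/__ompt_task_finish
// above are delivered to a first-party OMPT tool. A minimal tool that
// registers ompt_callback_task_schedule might look like this (compile as C
// and load it, e.g., via OMP_TOOL_LIBRARIES; details are a sketch):

#include <omp-tools.h>
#include <stdio.h>

// Matches ompt_callback_task_schedule_t: prior task, its status, next task.
static void on_task_schedule(ompt_data_t *prior_task_data,
                             ompt_task_status_t prior_task_status,
                             ompt_data_t *next_task_data) {
  printf("task switch, prior status = %d\n", (int)prior_task_status);
}

static int tool_initialize(ompt_function_lookup_t lookup,
                           int initial_device_num, ompt_data_t *tool_data) {
  ompt_set_callback_t set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  set_callback(ompt_callback_task_schedule, (ompt_callback_t)on_task_schedule);
  return 1; // non-zero keeps the tool active
}

static void tool_finalize(ompt_data_t *tool_data) {}

// The runtime looks up this symbol at startup; a non-NULL result enables OMPT.
ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  static ompt_start_tool_result_t result = {&tool_initialize, &tool_finalize,
                                            {0}};
  return &result;
}
// --------------------------------------------------------------------------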
596 
597 template <bool ompt>
598 static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
599  kmp_task_t *task,
600  void *frame_address,
601  void *return_address) {
602  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
603  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
604 
605  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
606  "current_task=%p\n",
607  gtid, loc_ref, taskdata, current_task));
608 
609  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
610  // untied task needs to increment counter so that the task structure is not
611  // freed prematurely
612  kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
613  KMP_DEBUG_USE_VAR(counter);
614  KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
615  "incremented for task %p\n",
616  gtid, counter, taskdata));
617  }
618 
619  taskdata->td_flags.task_serial =
620  1; // Execute this task immediately, not deferred.
621  __kmp_task_start(gtid, task, current_task);
622 
623 #if OMPT_SUPPORT
624  if (ompt) {
625  if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
626  current_task->ompt_task_info.frame.enter_frame.ptr =
627  taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
628  current_task->ompt_task_info.frame.enter_frame_flags =
629  taskdata->ompt_task_info.frame.exit_frame_flags = ompt_frame_application | ompt_frame_framepointer;
630  }
631  if (ompt_enabled.ompt_callback_task_create) {
632  ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
633  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
634  &(parent_info->task_data), &(parent_info->frame),
635  &(taskdata->ompt_task_info.task_data),
636  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(taskdata), 0,
637  return_address);
638  }
639  __ompt_task_start(task, current_task, gtid);
640  }
641 #endif // OMPT_SUPPORT
642 
643  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
644  loc_ref, taskdata));
645 }
646 
647 #if OMPT_SUPPORT
648 OMPT_NOINLINE
649 static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
650  kmp_task_t *task,
651  void *frame_address,
652  void *return_address) {
653  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
654  return_address);
655 }
656 #endif // OMPT_SUPPORT
657 
658 // __kmpc_omp_task_begin_if0: report that a given serialized task has started
659 // execution
660 //
661 // loc_ref: source location information; points to beginning of task block.
662 // gtid: global thread number.
663 // task: task thunk for the started task.
664 void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
665  kmp_task_t *task) {
666 #if OMPT_SUPPORT
667  if (UNLIKELY(ompt_enabled.enabled)) {
668  OMPT_STORE_RETURN_ADDRESS(gtid);
669  __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
670  OMPT_GET_FRAME_ADDRESS(1),
671  OMPT_LOAD_RETURN_ADDRESS(gtid));
672  return;
673  }
674 #endif
675  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
676 }
677 
678 #ifdef TASK_UNUSED
679 // __kmpc_omp_task_begin: report that a given task has started execution
680 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
681 void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
682  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
683 
684  KA_TRACE(
685  10,
686  ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
687  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));
688 
689  __kmp_task_start(gtid, task, current_task);
690 
691  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
692  loc_ref, KMP_TASK_TO_TASKDATA(task)));
693  return;
694 }
695 #endif // TASK_UNUSED
696 
697 // __kmp_free_task: free the current task space and the space for shareds
698 //
699 // gtid: Global thread ID of calling thread
700 // taskdata: task to free
701 // thread: thread data structure of caller
702 static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
703  kmp_info_t *thread) {
704  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
705  taskdata));
706 
707  // Check to make sure all flags and counters have the correct values
708  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
709  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
710  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
711  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
712  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
713  taskdata->td_flags.task_serial == 1);
714  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
715 
716  taskdata->td_flags.freed = 1;
717  ANNOTATE_HAPPENS_BEFORE(taskdata);
718 // deallocate the taskdata and shared variable blocks associated with this task
719 #if USE_FAST_MEMORY
720  __kmp_fast_free(thread, taskdata);
721 #else /* ! USE_FAST_MEMORY */
722  __kmp_thread_free(thread, taskdata);
723 #endif
724 
725  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
726 }
727 
728 // __kmp_free_task_and_ancestors: free the current task and ancestors without
729 // children
730 //
731 // gtid: Global thread ID of calling thread
732 // taskdata: task to free
733 // thread: thread data structure of caller
734 static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
735  kmp_taskdata_t *taskdata,
736  kmp_info_t *thread) {
737  // Proxy tasks must always be allowed to free their parents
738  // because they can be run in background even in serial mode.
739  kmp_int32 team_serial =
740  (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
741  !taskdata->td_flags.proxy;
742  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
743 
744  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
745  KMP_DEBUG_ASSERT(children >= 0);
746 
747  // Now, go up the ancestor tree to see if any ancestors can now be freed.
748  while (children == 0) {
749  kmp_taskdata_t *parent_taskdata = taskdata->td_parent;
750 
751  KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
752  "and freeing itself\n",
753  gtid, taskdata));
754 
755  // --- Deallocate my ancestor task ---
756  __kmp_free_task(gtid, taskdata, thread);
757 
758  taskdata = parent_taskdata;
759 
760  if (team_serial)
761  return;
762  // Stop checking ancestors at implicit task instead of walking up ancestor
763  // tree to avoid premature deallocation of ancestors.
764  if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
765  if (taskdata->td_dephash) { // do we need to cleanup dephash?
766  int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
767  kmp_tasking_flags_t flags_old = taskdata->td_flags;
768  if (children == 0 && flags_old.complete == 1) {
769  kmp_tasking_flags_t flags_new = flags_old;
770  flags_new.complete = 0;
771  if (KMP_COMPARE_AND_STORE_ACQ32(
772  RCAST(kmp_int32 *, &taskdata->td_flags),
773  *RCAST(kmp_int32 *, &flags_old),
774  *RCAST(kmp_int32 *, &flags_new))) {
775  KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
776  "dephash of implicit task %p\n",
777  gtid, taskdata));
778  // cleanup dephash of finished implicit task
779  __kmp_dephash_free_entries(thread, taskdata->td_dephash);
780  }
781  }
782  }
783  return;
784  }
785  // Predecrement simulated by "- 1" calculation
786  children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
787  KMP_DEBUG_ASSERT(children >= 0);
788  }
789 
790  KA_TRACE(
791  20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
792  "not freeing it yet\n",
793  gtid, taskdata, children));
794 }
795 
796 // __kmp_task_finish: bookkeeping to do when a task finishes execution
797 //
798 // gtid: global thread ID for calling thread
799 // task: task to be finished
800 // resumed_task: task to be resumed. (may be NULL if task is serialized)
801 //
802 // template<ompt>: effectively ompt_enabled.enabled!=0
803 // the version with ompt=false is inlined, allowing the compiler to optimize
804 // away all OMPT code in this case
805 template <bool ompt>
806 static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
807  kmp_taskdata_t *resumed_task) {
808  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
809  kmp_info_t *thread = __kmp_threads[gtid];
810  kmp_task_team_t *task_team =
811  thread->th.th_task_team; // might be NULL for serial teams...
812  kmp_int32 children = 0;
813 
814  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
815  "task %p\n",
816  gtid, taskdata, resumed_task));
817 
818  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
819 
820 // Pop task from stack if tied
821 #ifdef BUILD_TIED_TASK_STACK
822  if (taskdata->td_flags.tiedness == TASK_TIED) {
823  __kmp_pop_task_stack(gtid, thread, taskdata);
824  }
825 #endif /* BUILD_TIED_TASK_STACK */
826 
827  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
828  // untied task needs to check the counter so that the task structure is not
829  // freed prematurely
830  kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
831  KA_TRACE(
832  20,
833  ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
834  gtid, counter, taskdata));
835  if (counter > 0) {
836  // untied task is not done; it may be continued by another thread, so do
837  // not free it now
838  if (resumed_task == NULL) {
839  KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
840  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
841  // task is the parent
842  }
843  thread->th.th_current_task = resumed_task; // restore current_task
844  resumed_task->td_flags.executing = 1; // resume previous task
845  KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
846  "resuming task %p\n",
847  gtid, taskdata, resumed_task));
848  return;
849  }
850  }
851 
852  // Check mutexinoutset dependencies, release locks
853  kmp_depnode_t *node = taskdata->td_depnode;
854  if (node && (node->dn.mtx_num_locks < 0)) {
855  // negative num_locks means all locks were acquired
856  node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
857  for (int i = node->dn.mtx_num_locks - 1; i >= 0; --i) {
858  KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
859  __kmp_release_lock(node->dn.mtx_locks[i], gtid);
860  }
861  }
862 
863  // bookkeeping for resuming task:
864  // GEH - note tasking_ser => task_serial
865  KMP_DEBUG_ASSERT(
866  (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
867  taskdata->td_flags.task_serial);
868  if (taskdata->td_flags.task_serial) {
869  if (resumed_task == NULL) {
870  resumed_task = taskdata->td_parent; // In a serialized task, the resumed
871  // task is the parent
872  }
873  } else {
874  KMP_DEBUG_ASSERT(resumed_task !=
875  NULL); // verify that resumed task is passed as argument
876  }
877 
878  /* If the tasks' destructor thunk flag has been set, we need to invoke the
879  destructor thunk that has been generated by the compiler. The code is
880  placed here, since at this point other tasks might have been released
881  hence overlapping the destructor invocations with some other work in the
882  released tasks. The OpenMP spec is not specific on when the destructors
883  are invoked, so we should be free to choose. */
884  if (taskdata->td_flags.destructors_thunk) {
885  kmp_routine_entry_t destr_thunk = task->data1.destructors;
886  KMP_ASSERT(destr_thunk);
887  destr_thunk(gtid, task);
888  }
889 
890  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
891  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
892  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
893 
894  bool detach = false;
895  if (taskdata->td_flags.detachable == TASK_DETACHABLE) {
896  if (taskdata->td_allow_completion_event.type ==
897  KMP_EVENT_ALLOW_COMPLETION) {
898  // event hasn't been fulfilled yet. Try to detach task.
899  __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
900  if (taskdata->td_allow_completion_event.type ==
901  KMP_EVENT_ALLOW_COMPLETION) {
902  // task finished execution
903  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
904  taskdata->td_flags.executing = 0; // suspend the finishing task
905 
906 #if OMPT_SUPPORT
907  // For a detached task that has not completed yet, report ompt_task_detach
908  // here; the later omp_fulfill_event call signals completion.
909  // Locking is necessary to avoid a race with ompt_task_late_fulfill.
910  if (ompt)
911  __ompt_task_finish(task, resumed_task, ompt_task_detach);
912 #endif
913 
914  // no access to taskdata after this point!
915  // __kmp_fulfill_event might free taskdata at any time from now
916 
917  taskdata->td_flags.proxy = TASK_PROXY; // proxify!
918  detach = true;
919  }
920  __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
921  }
922  }
923 
924  if (!detach) {
925  taskdata->td_flags.complete = 1; // mark the task as completed
926 
927 #if OMPT_SUPPORT
928  // This is not a detached task, we are done here
929  if (ompt)
930  __ompt_task_finish(task, resumed_task, ompt_task_complete);
931 #endif
932 
933  // Only need to keep track of count if team parallel and tasking not
934  // serialized, or task is detachable and event has already been fulfilled
935  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
936  taskdata->td_flags.detachable == TASK_DETACHABLE) {
937  // Predecrement simulated by "- 1" calculation
938  children =
939  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
940  KMP_DEBUG_ASSERT(children >= 0);
941  if (taskdata->td_taskgroup)
942  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
943  __kmp_release_deps(gtid, taskdata);
944  } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
945  // if we found proxy tasks there could exist a dependency chain
946  // with the proxy task as origin
947  __kmp_release_deps(gtid, taskdata);
948  }
949  // td_flags.executing must be marked as 0 after __kmp_release_deps has been
950  // called. Otherwise, if a task is executed immediately from the
951  // release_deps code, the flag will be reset to 1 again by this same
952  // function
953  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
954  taskdata->td_flags.executing = 0; // suspend the finishing task
955  }
956 
957 
958  KA_TRACE(
959  20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
960  gtid, taskdata, children));
961 
962  // Free this task and then ancestor tasks if they have no children.
963  // Restore th_current_task first as suggested by John:
964  // johnmc: if an asynchronous inquiry peers into the runtime system
965  // it doesn't see the freed task as the current task.
966  thread->th.th_current_task = resumed_task;
967  if (!detach)
968  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
969 
970  // TODO: GEH - make sure root team implicit task is initialized properly.
971  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
972  resumed_task->td_flags.executing = 1; // resume previous task
973 
974  KA_TRACE(
975  10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
976  gtid, taskdata, resumed_task));
977 
978  return;
979 }
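
// --------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original file): the detach
// handling above (KMP_EVENT_ALLOW_COMPLETION, "proxify") implements the
// OpenMP 5.0 detach clause: the task body may return while completion is
// deferred until omp_fulfill_event is called. Example, assuming an OpenMP 5.0
// compiler and this runtime:

#include <omp.h>
#include <stdio.h>

int example_detach(void) {
  omp_event_handle_t ev;
#pragma omp parallel
#pragma omp single
  {
    // The event handle is bound to this task when the task is created.
#pragma omp task detach(ev)
    printf("detached task body finished; completion still pending\n");

    // Some other code, here a sibling task, fulfills the event later; only
    // then does the detached task count as complete.
#pragma omp task shared(ev)
    omp_fulfill_event(ev);

#pragma omp taskwait // returns only after the event has been fulfilled
    printf("detached task fully completed\n");
  }
  return 0;
}
// --------------------------------------------------------------------------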
980 
981 template <bool ompt>
982 static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
983  kmp_int32 gtid,
984  kmp_task_t *task) {
985  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
986  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
987  __kmp_assert_valid_gtid(gtid);
988  // this routine will provide task to resume
989  __kmp_task_finish<ompt>(gtid, task, NULL);
990 
991  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
992  gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
993 
994 #if OMPT_SUPPORT
995  if (ompt) {
996  ompt_frame_t *ompt_frame;
997  __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
998  ompt_frame->enter_frame = ompt_data_none;
999  ompt_frame->enter_frame_flags = ompt_frame_runtime | ompt_frame_framepointer;
1000  }
1001 #endif
1002 
1003  return;
1004 }
1005 
1006 #if OMPT_SUPPORT
1007 OMPT_NOINLINE
1008 void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
1009  kmp_task_t *task) {
1010  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
1011 }
1012 #endif // OMPT_SUPPORT
1013 
1014 // __kmpc_omp_task_complete_if0: report that a task has completed execution
1015 //
1016 // loc_ref: source location information; points to end of task block.
1017 // gtid: global thread number.
1018 // task: task thunk for the completed task.
1019 void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
1020  kmp_task_t *task) {
1021 #if OMPT_SUPPORT
1022  if (UNLIKELY(ompt_enabled.enabled)) {
1023  __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
1024  return;
1025  }
1026 #endif
1027  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
1028 }
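
// --------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original file): for an
// undeferred task (for example one with an if(0) clause), compilers typically
// do not push anything onto a deque; they bracket the inlined task body with
// __kmpc_omp_task_begin_if0 / __kmpc_omp_task_complete_if0. A user-level
// example that takes this path:

#include <omp.h>
#include <stdio.h>

int example_if0(void) {
#pragma omp parallel
#pragma omp single
  {
    int x = 42;
    // if(0) makes the task undeferred: the encountering thread runs it now.
#pragma omp task if(0) firstprivate(x)
    printf("undeferred task sees x = %d\n", x);
  }
  return 0;
}
// --------------------------------------------------------------------------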
1029 
1030 #ifdef TASK_UNUSED
1031 // __kmpc_omp_task_complete: report that a task has completed execution
1032 // NEVER GENERATED BY COMPILER, DEPRECATED!!!
1033 void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
1034  kmp_task_t *task) {
1035  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
1036  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1037 
1038  __kmp_task_finish<false>(gtid, task,
1039  NULL); // Not sure how to find task to resume
1040 
1041  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
1042  loc_ref, KMP_TASK_TO_TASKDATA(task)));
1043  return;
1044 }
1045 #endif // TASK_UNUSED
1046 
1047 // __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
1048 // task for a given thread
1049 //
1050 // loc_ref: reference to source location of parallel region
1051 // this_thr: thread data structure corresponding to implicit task
1052 // team: team for this_thr
1053 // tid: thread id of given thread within team
1054 // set_curr_task: TRUE if need to push current task to thread
1055 // NOTE: Routine does not set up the implicit task ICVs. This is assumed to
1056 // have already been done elsewhere.
1057 // TODO: Get better loc_ref. Value passed in may be NULL
1058 void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
1059  kmp_team_t *team, int tid, int set_curr_task) {
1060  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];
1061 
1062  KF_TRACE(
1063  10,
1064  ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
1065  tid, team, task, set_curr_task ? "TRUE" : "FALSE"));
1066 
1067  task->td_task_id = KMP_GEN_TASK_ID();
1068  task->td_team = team;
1069  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
1070  // in debugger)
1071  task->td_ident = loc_ref;
1072  task->td_taskwait_ident = NULL;
1073  task->td_taskwait_counter = 0;
1074  task->td_taskwait_thread = 0;
1075 
1076  task->td_flags.tiedness = TASK_TIED;
1077  task->td_flags.tasktype = TASK_IMPLICIT;
1078  task->td_flags.proxy = TASK_FULL;
1079 
1080  // All implicit tasks are executed immediately, not deferred
1081  task->td_flags.task_serial = 1;
1082  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1083  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1084 
1085  task->td_flags.started = 1;
1086  task->td_flags.executing = 1;
1087  task->td_flags.complete = 0;
1088  task->td_flags.freed = 0;
1089 
1090  task->td_depnode = NULL;
1091  task->td_last_tied = task;
1092  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1093 
1094  if (set_curr_task) { // only do this init first time thread is created
1095  KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
1096  // Not used: don't need to deallocate implicit task
1097  KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
1098  task->td_taskgroup = NULL; // An implicit task does not have taskgroup
1099  task->td_dephash = NULL;
1100  __kmp_push_current_task_to_thread(this_thr, team, tid);
1101  } else {
1102  KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
1103  KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
1104  }
1105 
1106 #if OMPT_SUPPORT
1107  if (UNLIKELY(ompt_enabled.enabled))
1108  __ompt_task_init(task, tid);
1109 #endif
1110 
1111  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
1112  team, task));
1113 }
1114 
1115 // __kmp_finish_implicit_task: Release resources associated with implicit tasks
1116 // at the end of parallel regions. Some resources are kept for reuse in the next
1117 // parallel region.
1118 //
1119 // thread: thread data structure corresponding to implicit task
1120 void __kmp_finish_implicit_task(kmp_info_t *thread) {
1121  kmp_taskdata_t *task = thread->th.th_current_task;
1122  if (task->td_dephash) {
1123  int children;
1124  task->td_flags.complete = 1;
1125  children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
1126  kmp_tasking_flags_t flags_old = task->td_flags;
1127  if (children == 0 && flags_old.complete == 1) {
1128  kmp_tasking_flags_t flags_new = flags_old;
1129  flags_new.complete = 0;
1130  if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
1131  *RCAST(kmp_int32 *, &flags_old),
1132  *RCAST(kmp_int32 *, &flags_new))) {
1133  KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
1134  "dephash of implicit task %p\n",
1135  thread->th.th_info.ds.ds_gtid, task));
1136  __kmp_dephash_free_entries(thread, task->td_dephash);
1137  }
1138  }
1139  }
1140 }
1141 
1142 // __kmp_free_implicit_task: Release resources associated with implicit tasks
1143 // when these tasks are destroyed
1144 //
1145 // thread: thread data structure corresponding to implicit task
1146 void __kmp_free_implicit_task(kmp_info_t *thread) {
1147  kmp_taskdata_t *task = thread->th.th_current_task;
1148  if (task && task->td_dephash) {
1149  __kmp_dephash_free(thread, task->td_dephash);
1150  task->td_dephash = NULL;
1151  }
1152 }
1153 
1154 // Round up a size to a power of two specified by val: Used to insert padding
1155 // between structures co-allocated using a single malloc() call
1156 static size_t __kmp_round_up_to_val(size_t size, size_t val) {
1157  if (size & (val - 1)) {
1158  size &= ~(val - 1);
1159  if (size <= KMP_SIZE_T_MAX - val) {
1160  size += val; // Round up if there is no overflow.
1161  }
1162  }
1163  return size;
1164 } // __kmp_round_up_to_val
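
// Editor's worked example (not part of the original file): with val == 8,
//   __kmp_round_up_to_val(52, 8): 52 & 7 != 0, so 52 & ~7 = 48, then +8 -> 56
//   __kmp_round_up_to_val(56, 8): 56 & 7 == 0, so the size is returned as 56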
1165 
1166 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1167 //
1168 // loc_ref: source location information
1169 // gtid: global thread number.
1170 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1171 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1172 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1173 // private vars accessed in task.
1174 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1175 // in task.
1176 // task_entry: Pointer to task code entry point generated by compiler.
1177 // returns: a pointer to the allocated kmp_task_t structure (task).
1178 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1179  kmp_tasking_flags_t *flags,
1180  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1181  kmp_routine_entry_t task_entry) {
1182  kmp_task_t *task;
1183  kmp_taskdata_t *taskdata;
1184  kmp_info_t *thread = __kmp_threads[gtid];
1185  kmp_team_t *team = thread->th.th_team;
1186  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1187  size_t shareds_offset;
1188 
1189  if (!TCR_4(__kmp_init_middle))
1190  __kmp_middle_initialize();
1191 
1192  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1193  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1194  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1195  sizeof_shareds, task_entry));
1196 
1197  if (parent_task->td_flags.final) {
1198  if (flags->merged_if0) {
1199  }
1200  flags->final = 1;
1201  }
1202  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1203  // Untied task encountered causes the TSC algorithm to check entire deque of
1204  // the victim thread. If no untied task encountered, then checking the head
1205  // of the deque should be enough.
1206  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1207  }
1208 
1209  // Detachable tasks are not proxy tasks yet, but could become proxy tasks in
1210  // the future. Doing the tasking setup when that happens is too late, so the
1211  // setup is done here if needed.
1212  if (flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) {
1213  if (flags->proxy == TASK_PROXY) {
1214  flags->tiedness = TASK_UNTIED;
1215  flags->merged_if0 = 1;
1216  }
1217  /* are we running in a sequential parallel or tskm_immediate_exec... we need
1218  tasking support enabled */
1219  if ((thread->th.th_task_team) == NULL) {
1220  /* This should only happen if the team is serialized
1221  setup a task team and propagate it to the thread */
1222  KMP_DEBUG_ASSERT(team->t.t_serialized);
1223  KA_TRACE(30,
1224  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1225  gtid));
1226  __kmp_task_team_setup(
1227  thread, team,
1228  1); // 1 indicates setup the current team regardless of nthreads
1229  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1230  }
1231  kmp_task_team_t *task_team = thread->th.th_task_team;
1232 
1233  /* tasking must be enabled now as the task might not be pushed */
1234  if (!KMP_TASKING_ENABLED(task_team)) {
1235  KA_TRACE(
1236  30,
1237  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1238  __kmp_enable_tasking(task_team, thread);
1239  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1240  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1241  // No lock needed since only owner can allocate
1242  if (thread_data->td.td_deque == NULL) {
1243  __kmp_alloc_task_deque(thread, thread_data);
1244  }
1245  }
1246 
1247  if (task_team->tt.tt_found_proxy_tasks == FALSE)
1248  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1249  }
1250 
1251  // Calculate shared structure offset including padding after kmp_task_t struct
1252  // to align pointers in shared struct
1253  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1254  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(void *));
1255 
1256  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1257  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1258  shareds_offset));
1259  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1260  sizeof_shareds));
1261 
1262 // Avoid double allocation here by combining shareds with taskdata
1263 #if USE_FAST_MEMORY
1264  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1265  sizeof_shareds);
1266 #else /* ! USE_FAST_MEMORY */
1267  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1268  sizeof_shareds);
1269 #endif /* USE_FAST_MEMORY */
1270  ANNOTATE_HAPPENS_AFTER(taskdata);
1271 
1272  task = KMP_TASKDATA_TO_TASK(taskdata);
1273 
1274 // Make sure task & taskdata are aligned appropriately
1275 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || !KMP_HAVE_QUAD
1276  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1277  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1278 #else
1279  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1280  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1281 #endif
1282  if (sizeof_shareds > 0) {
1283  // Avoid double allocation here by combining shareds with taskdata
1284  task->shareds = &((char *)taskdata)[shareds_offset];
1285  // Make sure shareds struct is aligned to pointer size
1286  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1287  0);
1288  } else {
1289  task->shareds = NULL;
1290  }
1291  task->routine = task_entry;
1292  task->part_id = 0; // AC: Always start with 0 part id
1293 
1294  taskdata->td_task_id = KMP_GEN_TASK_ID();
1295  taskdata->td_team = team;
1296  taskdata->td_alloc_thread = thread;
1297  taskdata->td_parent = parent_task;
1298  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1299  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1300  taskdata->td_ident = loc_ref;
1301  taskdata->td_taskwait_ident = NULL;
1302  taskdata->td_taskwait_counter = 0;
1303  taskdata->td_taskwait_thread = 0;
1304  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1305  // avoid copying icvs for proxy tasks
1306  if (flags->proxy == TASK_FULL)
1307  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1308 
1309  taskdata->td_flags.tiedness = flags->tiedness;
1310  taskdata->td_flags.final = flags->final;
1311  taskdata->td_flags.merged_if0 = flags->merged_if0;
1312  taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
1313  taskdata->td_flags.proxy = flags->proxy;
1314  taskdata->td_flags.detachable = flags->detachable;
1315  taskdata->td_task_team = thread->th.th_task_team;
1316  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1317  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1318 
1319  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1320  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1321 
1322  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1323  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1324 
1325  // GEH - Note we serialize the task if the team is serialized to make sure
1326  // implicit parallel region tasks are not left until program termination to
1327  // execute. Also, it helps locality to execute immediately.
1328 
1329  taskdata->td_flags.task_serial =
1330  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1331  taskdata->td_flags.tasking_ser || flags->merged_if0);
1332 
1333  taskdata->td_flags.started = 0;
1334  taskdata->td_flags.executing = 0;
1335  taskdata->td_flags.complete = 0;
1336  taskdata->td_flags.freed = 0;
1337 
1338  taskdata->td_flags.native = flags->native;
1339 
1340  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1341  // start at one because counts current task and children
1342  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1343  taskdata->td_taskgroup =
1344  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1345  taskdata->td_dephash = NULL;
1346  taskdata->td_depnode = NULL;
1347  if (flags->tiedness == TASK_UNTIED)
1348  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1349  else
1350  taskdata->td_last_tied = taskdata;
1351  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1352 #if OMPT_SUPPORT
1353  if (UNLIKELY(ompt_enabled.enabled))
1354  __ompt_task_init(taskdata, gtid);
1355 #endif
1356 // Only need to keep track of child task counts if team parallel and tasking not
1357 // serialized or if it is a proxy or detachable task
1358  if (flags->proxy == TASK_PROXY ||
1359  flags->detachable == TASK_DETACHABLE ||
1360  !(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser))
1361  {
1362  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1363  if (parent_task->td_taskgroup)
1364  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1365  // Only need to keep track of allocated child tasks for explicit tasks since
1366  // implicit not deallocated
1367  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1368  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1369  }
1370  }
1371 
1372  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1373  gtid, taskdata, taskdata->td_parent));
1374  ANNOTATE_HAPPENS_BEFORE(task);
1375 
1376  return task;
1377 }
1378 
1379 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1380  kmp_int32 flags, size_t sizeof_kmp_task_t,
1381  size_t sizeof_shareds,
1382  kmp_routine_entry_t task_entry) {
1383  kmp_task_t *retval;
1384  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1385  __kmp_assert_valid_gtid(gtid);
1386  input_flags->native = FALSE;
1387 // __kmp_task_alloc() sets up all other runtime flags
1388  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1389  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1390  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1391  input_flags->proxy ? "proxy" : "",
1392  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1393  sizeof_shareds, task_entry));
1394 
1395  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1396  sizeof_shareds, task_entry);
1397 
1398  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1399 
1400  return retval;
1401 }
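
// --------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the original file): roughly how
// compiler-generated code drives this entry point for a deferred tied task.
// Names such as loc, outlined_task_entry, shared_var and the shareds layout
// are hypothetical; real lowering also packages firstprivates, destructor
// thunks, and so on.
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   kmp_task_t *t = __kmpc_omp_task_alloc(
//       &loc, gtid, /*flags=*/1 /* tied */, sizeof(kmp_task_t),
//       /*sizeof_shareds=*/sizeof(void *), &outlined_task_entry);
//   ((void **)t->shareds)[0] = &shared_var; // record address of a shared var
//   __kmpc_omp_task(&loc, gtid, t); // defer: push onto this thread's deque
// --------------------------------------------------------------------------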
1402 
1403 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1404  kmp_int32 flags,
1405  size_t sizeof_kmp_task_t,
1406  size_t sizeof_shareds,
1407  kmp_routine_entry_t task_entry,
1408  kmp_int64 device_id) {
1409  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1410  sizeof_shareds, task_entry);
1411 }
1412 
// __kmpc_omp_reg_task_with_affinity: register the affinity information
// attached to a task with the task thunk; the current implementation records
// nothing and simply reports success.
1426 kmp_int32
1427 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1428                                   kmp_task_t *new_task, kmp_int32 naffins,
1429                                   kmp_task_affinity_info_t *affin_list) {
1430   return 0;
1431 }
1432 
1433 // __kmp_invoke_task: invoke the specified task
1434 //
1435 // gtid: global thread ID of caller
1436 // task: the task to invoke
1437 // current_task: the task to resume after task invocation
1438 static void __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1439  kmp_taskdata_t *current_task) {
1440  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1441  kmp_info_t *thread;
1442  int discard = 0 /* false */;
1443  KA_TRACE(
1444  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1445  gtid, taskdata, current_task));
1446  KMP_DEBUG_ASSERT(task);
1447  if (taskdata->td_flags.proxy == TASK_PROXY &&
1448  taskdata->td_flags.complete == 1) {
1449  // This is a proxy task that was already completed but it needs to run
1450  // its bottom-half finish
1451  KA_TRACE(
1452  30,
1453  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1454  gtid, taskdata));
1455 
1456  __kmp_bottom_half_finish_proxy(gtid, task);
1457 
1458  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1459  "proxy task %p, resuming task %p\n",
1460  gtid, taskdata, current_task));
1461 
1462  return;
1463  }
1464 
1465 #if OMPT_SUPPORT
1466  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1467  // does not execute code.
1468  ompt_thread_info_t oldInfo;
1469  if (UNLIKELY(ompt_enabled.enabled)) {
1470  // Store the thread's state and restore it after the task
1471  thread = __kmp_threads[gtid];
1472  oldInfo = thread->th.ompt_thread_info;
1473  thread->th.ompt_thread_info.wait_id = 0;
1474  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1475  ? ompt_state_work_serial
1476  : ompt_state_work_parallel;
1477  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1478  }
1479 #endif
1480 
1481  // Proxy tasks are not handled by the runtime
1482  if (taskdata->td_flags.proxy != TASK_PROXY) {
1483  ANNOTATE_HAPPENS_AFTER(task);
1484  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1485  }
1486 
1487  // TODO: cancel tasks if the parallel region has also been cancelled
1488  // TODO: check if this sequence can be hoisted above __kmp_task_start
1489  // if cancellation has been enabled for this run ...
1490  if (__kmp_omp_cancellation) {
1491  thread = __kmp_threads[gtid];
1492  kmp_team_t *this_team = thread->th.th_team;
1493  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1494  if ((taskgroup && taskgroup->cancel_request) ||
1495  (this_team->t.t_cancel_request == cancel_parallel)) {
1496 #if OMPT_SUPPORT && OMPT_OPTIONAL
1497  ompt_data_t *task_data;
1498  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1499  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1500  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1501  task_data,
1502  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1503  : ompt_cancel_parallel) |
1504  ompt_cancel_discarded_task,
1505  NULL);
1506  }
1507 #endif
1508  KMP_COUNT_BLOCK(TASK_cancelled);
1509  // this task belongs to a task group and we need to cancel it
1510  discard = 1 /* true */;
1511  }
1512  }
1513 
1514  // Invoke the task routine and pass in relevant data.
1515  // Thunks generated by gcc take a different argument list.
1516  if (!discard) {
1517  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1518  taskdata->td_last_tied = current_task->td_last_tied;
1519  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1520  }
1521 #if KMP_STATS_ENABLED
1522  KMP_COUNT_BLOCK(TASK_executed);
1523  switch (KMP_GET_THREAD_STATE()) {
1524  case FORK_JOIN_BARRIER:
1525  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1526  break;
1527  case PLAIN_BARRIER:
1528  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1529  break;
1530  case TASKYIELD:
1531  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1532  break;
1533  case TASKWAIT:
1534  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1535  break;
1536  case TASKGROUP:
1537  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1538  break;
1539  default:
1540  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1541  break;
1542  }
1543 #endif // KMP_STATS_ENABLED
1544 
1545 // OMPT task begin
1546 #if OMPT_SUPPORT
1547  if (UNLIKELY(ompt_enabled.enabled))
1548  __ompt_task_start(task, current_task, gtid);
1549 #endif
1550 
1551 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1552  kmp_uint64 cur_time;
1553  kmp_int32 kmp_itt_count_task =
1554  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1555  current_task->td_flags.tasktype == TASK_IMPLICIT;
1556  if (kmp_itt_count_task) {
1557  thread = __kmp_threads[gtid];
1558  // Time outer level explicit task on barrier for adjusting imbalance time
1559  if (thread->th.th_bar_arrive_time)
1560  cur_time = __itt_get_timestamp();
1561  else
1562  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1563  }
1564  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1565 #endif
1566 
1567 #ifdef KMP_GOMP_COMPAT
1568  if (taskdata->td_flags.native) {
1569  ((void (*)(void *))(*(task->routine)))(task->shareds);
1570  } else
1571 #endif /* KMP_GOMP_COMPAT */
1572  {
1573  (*(task->routine))(gtid, task);
1574  }
1575  KMP_POP_PARTITIONED_TIMER();
1576 
1577 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1578  if (kmp_itt_count_task) {
1579  // Barrier imbalance - adjust arrive time with the task duration
1580  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1581  }
1582  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1583  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1584 #endif
1585 
1586  }
1587 
1588  // Proxy tasks are not handled by the runtime
1589  if (taskdata->td_flags.proxy != TASK_PROXY) {
1590  ANNOTATE_HAPPENS_BEFORE(taskdata->td_parent);
1591 #if OMPT_SUPPORT
1592  if (UNLIKELY(ompt_enabled.enabled)) {
1593  thread->th.ompt_thread_info = oldInfo;
1594  if (taskdata->td_flags.tiedness == TASK_TIED) {
1595  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1596  }
1597  __kmp_task_finish<true>(gtid, task, current_task);
1598  } else
1599 #endif
1600  __kmp_task_finish<false>(gtid, task, current_task);
1601  }
1602 
1603  KA_TRACE(
1604  30,
1605  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1606  gtid, taskdata, current_task));
1607  return;
1608 }
1609 
1610 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1611 //
1612 // loc_ref: location of original task pragma (ignored)
1613 // gtid: Global Thread ID of encountering thread
1614 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1615 // Returns:
1616 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1617 // be resumed later.
1618 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1619 // resumed later.
1620 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1621  kmp_task_t *new_task) {
1622  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1623 
1624  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1625  loc_ref, new_taskdata));
1626 
1627 #if OMPT_SUPPORT
1628  kmp_taskdata_t *parent;
1629  if (UNLIKELY(ompt_enabled.enabled)) {
1630  parent = new_taskdata->td_parent;
1631  if (ompt_enabled.ompt_callback_task_create) {
1632  ompt_data_t task_data = ompt_data_none;
1633  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1634  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1635  parent ? &(parent->ompt_task_info.frame) : NULL,
1636  &(new_taskdata->ompt_task_info.task_data), ompt_task_explicit, 0,
1637  OMPT_GET_RETURN_ADDRESS(0));
1638  }
1639  }
1640 #endif
1641 
1642  /* Should we execute the new task or queue it? For now, let's just always try
1643  to queue it. If the queue fills up, then we'll execute it. */
1644 
1645  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1646  { // Execute this task immediately
1647  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1648  new_taskdata->td_flags.task_serial = 1;
1649  __kmp_invoke_task(gtid, new_task, current_task);
1650  }
1651 
1652  KA_TRACE(
1653  10,
1654  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1655  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1656  gtid, loc_ref, new_taskdata));
1657 
1658  ANNOTATE_HAPPENS_BEFORE(new_task);
1659 #if OMPT_SUPPORT
1660  if (UNLIKELY(ompt_enabled.enabled)) {
1661  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1662  }
1663 #endif
1664  return TASK_CURRENT_NOT_QUEUED;
1665 }
1666 
1667 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1668 //
1669 // gtid: Global Thread ID of encountering thread
1670 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1671 // serialize_immediate: if TRUE then if the task is executed immediately its
1672 // execution will be serialized
1673 // Returns:
1674 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1675 // be resumed later.
1676 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1677 // resumed later.
1678 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1679  bool serialize_immediate) {
1680  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1681 
1682  /* Should we execute the new task or queue it? For now, let's just always try
1683  to queue it. If the queue fills up, then we'll execute it. */
1684  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1685  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1686  { // Execute this task immediately
1687  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1688  if (serialize_immediate)
1689  new_taskdata->td_flags.task_serial = 1;
1690  __kmp_invoke_task(gtid, new_task, current_task);
1691  }
1692 
1693  ANNOTATE_HAPPENS_BEFORE(new_task);
1694  return TASK_CURRENT_NOT_QUEUED;
1695 }
1696 
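// Illustrative sketch (not part of the runtime): roughly how a compiler may
// lower "#pragma omp task" onto __kmpc_omp_task_alloc() and __kmpc_omp_task()
// below. The outlined routine task_entry, the ident_t loc and the shareds
// layout are hypothetical; real lowerings differ in detail.
//
//   static kmp_int32 task_entry(kmp_int32 gtid, void *part) {
//     kmp_task_t *t = (kmp_task_t *)part;  // thunk allocated by the runtime
//     int *x = *(int **)t->shareds;        // fetch address of shared variable
//     *x += 1;                             // outlined task body
//     return 0;
//   }
//   ...
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /* tied */ 1,
//                                         sizeof(kmp_task_t), sizeof(int *),
//                                         task_entry);
//   *(int **)t->shareds = &x;              // publish the shared variable
//   __kmpc_omp_task(&loc, gtid, t);        // defer, or run immediately
//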
1697 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1698 // non-thread-switchable task from the parent thread only!
1699 //
1700 // loc_ref: location of original task pragma (ignored)
1701 // gtid: Global Thread ID of encountering thread
1702 // new_task: non-thread-switchable task thunk allocated by
1703 // __kmp_omp_task_alloc()
1704 // Returns:
1705 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1706 // be resumed later.
1707 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1708 // resumed later.
1709 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1710  kmp_task_t *new_task) {
1711  kmp_int32 res;
1712  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1713 
1714 #if KMP_DEBUG || OMPT_SUPPORT
1715  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1716 #endif
1717  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1718  new_taskdata));
1719  __kmp_assert_valid_gtid(gtid);
1720 
1721 #if OMPT_SUPPORT
1722  kmp_taskdata_t *parent = NULL;
1723  if (UNLIKELY(ompt_enabled.enabled)) {
1724  if (!new_taskdata->td_flags.started) {
1725  OMPT_STORE_RETURN_ADDRESS(gtid);
1726  parent = new_taskdata->td_parent;
1727  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1728  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1729  }
1730  if (ompt_enabled.ompt_callback_task_create) {
1731  ompt_data_t task_data = ompt_data_none;
1732  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1733  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1734  parent ? &(parent->ompt_task_info.frame) : NULL,
1735  &(new_taskdata->ompt_task_info.task_data),
1736  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1737  OMPT_LOAD_RETURN_ADDRESS(gtid));
1738  }
1739  } else {
1740  // We are scheduling the continuation of an UNTIED task.
1741  // Scheduling back to the parent task.
1742  __ompt_task_finish(new_task,
1743  new_taskdata->ompt_task_info.scheduling_parent,
1744  ompt_task_switch);
1745  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1746  }
1747  }
1748 #endif
1749 
1750  res = __kmp_omp_task(gtid, new_task, true);
1751 
1752  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1753  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1754  gtid, loc_ref, new_taskdata));
1755 #if OMPT_SUPPORT
1756  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1757  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1758  }
1759 #endif
1760  return res;
1761 }
1762 
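// Illustrative user-level sketch (a and n are assumed): the taskloop construct
// that is ultimately dispatched through the wrapper below. The compiler splits
// the iteration space into tasks and schedules each one via the tasking entry
// points with the proper OMPT return address.
//
//   #pragma omp taskloop grainsize(64)
//   for (int i = 0; i < n; ++i)
//     a[i] = 2 * a[i];
//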
1763 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1764 // a taskloop task with the correct OMPT return address
1765 //
1766 // loc_ref: location of original task pragma (ignored)
1767 // gtid: Global Thread ID of encountering thread
1768 // new_task: non-thread-switchable task thunk allocated by
1769 // __kmp_omp_task_alloc()
1770 // codeptr_ra: return address for OMPT callback
1771 // Returns:
1772 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1773 // be resumed later.
1774 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1775 // resumed later.
1776 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1777  kmp_task_t *new_task, void *codeptr_ra) {
1778  kmp_int32 res;
1779  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1780 
1781 #if KMP_DEBUG || OMPT_SUPPORT
1782  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1783 #endif
1784   KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n",
1785                 gtid, loc_ref, new_taskdata));
1786 
1787 #if OMPT_SUPPORT
1788  kmp_taskdata_t *parent = NULL;
1789  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1790  parent = new_taskdata->td_parent;
1791  if (!parent->ompt_task_info.frame.enter_frame.ptr)
1792  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1793  if (ompt_enabled.ompt_callback_task_create) {
1794  ompt_data_t task_data = ompt_data_none;
1795  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1796  parent ? &(parent->ompt_task_info.task_data) : &task_data,
1797  parent ? &(parent->ompt_task_info.frame) : NULL,
1798  &(new_taskdata->ompt_task_info.task_data),
1799  ompt_task_explicit | TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1800  codeptr_ra);
1801  }
1802  }
1803 #endif
1804 
1805  res = __kmp_omp_task(gtid, new_task, true);
1806 
1807   KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
1808                 "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1809                 gtid, loc_ref, new_taskdata));
1810 #if OMPT_SUPPORT
1811  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1812  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1813  }
1814 #endif
1815  return res;
1816 }
1817 
1818 template <bool ompt>
1819 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1820  void *frame_address,
1821  void *return_address) {
1822  kmp_taskdata_t *taskdata;
1823  kmp_info_t *thread;
1824  int thread_finished = FALSE;
1825  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1826 
1827  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1828  __kmp_assert_valid_gtid(gtid);
1829 
1830  if (__kmp_tasking_mode != tskm_immediate_exec) {
1831  thread = __kmp_threads[gtid];
1832  taskdata = thread->th.th_current_task;
1833 
1834 #if OMPT_SUPPORT && OMPT_OPTIONAL
1835  ompt_data_t *my_task_data;
1836  ompt_data_t *my_parallel_data;
1837 
1838  if (ompt) {
1839  my_task_data = &(taskdata->ompt_task_info.task_data);
1840  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
1841 
1842  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
1843 
1844  if (ompt_enabled.ompt_callback_sync_region) {
1845  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1846  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1847  my_task_data, return_address);
1848  }
1849 
1850  if (ompt_enabled.ompt_callback_sync_region_wait) {
1851  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1852  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
1853  my_task_data, return_address);
1854  }
1855  }
1856 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1857 
1858 // Debugger: The taskwait is active. Store the location and the thread that
1859 // encountered the taskwait.
1860 #if USE_ITT_BUILD
1861 // Note: These values are used by ITT events as well.
1862 #endif /* USE_ITT_BUILD */
1863  taskdata->td_taskwait_counter += 1;
1864  taskdata->td_taskwait_ident = loc_ref;
1865  taskdata->td_taskwait_thread = gtid + 1;
1866 
1867 #if USE_ITT_BUILD
1868  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1869  if (itt_sync_obj != NULL)
1870  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1871 #endif /* USE_ITT_BUILD */
1872 
1873  bool must_wait =
1874  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
1875 
1876  must_wait = must_wait || (thread->th.th_task_team != NULL &&
1877  thread->th.th_task_team->tt.tt_found_proxy_tasks);
1878  if (must_wait) {
1879  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
1880  &(taskdata->td_incomplete_child_tasks)),
1881  0U);
1882  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
1883  flag.execute_tasks(thread, gtid, FALSE,
1884  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1885  __kmp_task_stealing_constraint);
1886  }
1887  }
1888 #if USE_ITT_BUILD
1889  if (itt_sync_obj != NULL)
1890  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
1891  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
1892 #endif /* USE_ITT_BUILD */
1893 
1894  // Debugger: The taskwait is completed. Location remains, but thread is
1895  // negated.
1896  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
1897 
1898 #if OMPT_SUPPORT && OMPT_OPTIONAL
1899  if (ompt) {
1900  if (ompt_enabled.ompt_callback_sync_region_wait) {
1901  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1902  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1903  my_task_data, return_address);
1904  }
1905  if (ompt_enabled.ompt_callback_sync_region) {
1906  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1907  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
1908  my_task_data, return_address);
1909  }
1910  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
1911  }
1912 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1913 
1914  ANNOTATE_HAPPENS_AFTER(taskdata);
1915  }
1916 
1917  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
1918  "returning TASK_CURRENT_NOT_QUEUED\n",
1919  gtid, taskdata));
1920 
1921  return TASK_CURRENT_NOT_QUEUED;
1922 }
1923 
1924 #if OMPT_SUPPORT && OMPT_OPTIONAL
1925 OMPT_NOINLINE
1926 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
1927  void *frame_address,
1928  void *return_address) {
1929  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
1930  return_address);
1931 }
1932 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1933 
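// Illustrative user-level sketch (produce/consume are placeholders): taskwait
// suspends the encountering task until its child tasks complete; it does not
// wait for grandchildren (use taskgroup for that).
//
//   int x;
//   #pragma omp task shared(x)
//   x = produce();
//   #pragma omp taskwait   // wait for the child task above to finish
//   consume(x);
//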
1934 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
1935 // complete
1936 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
1937 #if OMPT_SUPPORT && OMPT_OPTIONAL
1938  if (UNLIKELY(ompt_enabled.enabled)) {
1939  OMPT_STORE_RETURN_ADDRESS(gtid);
1940  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
1941  OMPT_LOAD_RETURN_ADDRESS(gtid));
1942  }
1943 #endif
1944  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
1945 }
1946 
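// Illustrative user-level sketch (assuming an initialized omp_lock_t lock):
// taskyield lets the thread execute other queued tasks while the current task
// is logically waiting.
//
//   while (!omp_test_lock(&lock)) {
//     #pragma omp taskyield   // run other tasks instead of spinning idly
//   }
//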
1947 // __kmpc_omp_taskyield: switch to a different task
1948 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
1949  kmp_taskdata_t *taskdata;
1950  kmp_info_t *thread;
1951  int thread_finished = FALSE;
1952 
1953  KMP_COUNT_BLOCK(OMP_TASKYIELD);
1954  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
1955 
1956  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
1957  gtid, loc_ref, end_part));
1958  __kmp_assert_valid_gtid(gtid);
1959 
1960  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
1961  thread = __kmp_threads[gtid];
1962  taskdata = thread->th.th_current_task;
1963 // Should we model this as a task wait or not?
1964 // Debugger: The taskwait is active. Store the location and the thread that
1965 // encountered the taskwait.
1966 #if USE_ITT_BUILD
1967 // Note: These values are used by ITT events as well.
1968 #endif /* USE_ITT_BUILD */
1969  taskdata->td_taskwait_counter += 1;
1970  taskdata->td_taskwait_ident = loc_ref;
1971  taskdata->td_taskwait_thread = gtid + 1;
1972 
1973 #if USE_ITT_BUILD
1974  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
1975  if (itt_sync_obj != NULL)
1976  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
1977 #endif /* USE_ITT_BUILD */
1978  if (!taskdata->td_flags.team_serial) {
1979  kmp_task_team_t *task_team = thread->th.th_task_team;
1980  if (task_team != NULL) {
1981  if (KMP_TASKING_ENABLED(task_team)) {
1982 #if OMPT_SUPPORT
1983  if (UNLIKELY(ompt_enabled.enabled))
1984  thread->th.ompt_thread_info.ompt_task_yielded = 1;
1985 #endif
1986  __kmp_execute_tasks_32(
1987  thread, gtid, NULL, FALSE,
1988  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
1989  __kmp_task_stealing_constraint);
1990 #if OMPT_SUPPORT
1991  if (UNLIKELY(ompt_enabled.enabled))
1992  thread->th.ompt_thread_info.ompt_task_yielded = 0;
1993 #endif
1994  }
1995  }
1996  }
1997 #if USE_ITT_BUILD
1998  if (itt_sync_obj != NULL)
1999  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2000 #endif /* USE_ITT_BUILD */
2001 
2002  // Debugger: The taskwait is completed. Location remains, but thread is
2003  // negated.
2004  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2005  }
2006 
2007  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2008  "returning TASK_CURRENT_NOT_QUEUED\n",
2009  gtid, taskdata));
2010 
2011  return TASK_CURRENT_NOT_QUEUED;
2012 }
2013 
2014 // Task Reduction implementation
2015 //
2016 // Note: the initial implementation did not account for the possibility of
2017 // specifying omp_orig for the initializer of a UDR (user-defined reduction).
2018 // The corrected implementation takes the omp_orig object into account; the
2019 // compiler is free to use the old interface if omp_orig is not specified.
2020 
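// Illustrative user-level sketch of the feature implemented below (a and n are
// assumed, and the code runs inside a parallel/single region): each task
// accumulates into a thread-private copy obtained from the runtime, and the
// copies are combined when the taskgroup ends.
//
//   int sum = 0;
//   #pragma omp taskgroup task_reduction(+ : sum)
//   {
//     for (int i = 0; i < n; ++i) {
//       #pragma omp task in_reduction(+ : sum) firstprivate(i)
//       sum += a[i];
//     }
//   }
//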
2029 typedef struct kmp_taskred_flags { // flags for a task reduction item
2031   unsigned lazy_priv : 1; // 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads)
2032   unsigned reserved31 : 31;
2033 } kmp_taskred_flags_t;
2034 
2038 typedef struct kmp_task_red_input { // compiler descriptor, old interface
2039   void *reduce_shar; // shared item to reduce into
2040   size_t reduce_size; // size of the data item in bytes
2041   // three compiler-generated routines (init, fini are optional):
2042   void *reduce_init; // data initialization routine (single parameter)
2043   void *reduce_fini; // data finalization routine
2044   void *reduce_comb; // data combiner routine
2045   kmp_taskred_flags_t flags; // additional flags from the compiler
2046 } kmp_task_red_input_t;
2047 
2051 typedef struct kmp_taskred_data { // per-item info saved by the library
2052   void *reduce_shar; // shared item to reduce into
2053   size_t reduce_size; // size of the data item
2054   kmp_taskred_flags_t flags; // additional flags from the compiler
2055   void *reduce_priv; // array of thread-specific items
2056   void *reduce_pend; // end of the private data, for faster comparison
2057   // three compiler-generated routines (init, fini are optional):
2058   void *reduce_comb; // data combiner routine
2059   void *reduce_init; // data initialization routine
2060   void *reduce_fini; // data finalization routine
2061   void *reduce_orig; // original item (can be used in a UDR initializer)
2062 } kmp_taskred_data_t;
2063 
2069 typedef struct kmp_taskred_input { // compiler descriptor, new interface
2070   void *reduce_shar; // shared item to reduce into
2071   void *reduce_orig; // original reduction item used for initialization
2072   size_t reduce_size; // size of the data item
2073   // three compiler-generated routines (init, fini are optional):
2074   void *reduce_init; // data initialization routine (two parameters)
2075   void *reduce_fini; // data finalization routine
2076   void *reduce_comb; // data combiner routine
2077   kmp_taskred_flags_t flags; // additional flags from the compiler
2078 } kmp_taskred_input_t;
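// Illustrative sketch (not part of the runtime): how a compiler might describe
// one "+ : sum" item with the new-interface struct above and register it via
// __kmpc_taskred_init() right after __kmpc_taskgroup(). red_init, red_comb,
// gtid and sum are hypothetical/contextual names.
//
//   static void red_init(void *priv, void *orig) { *(int *)priv = 0; }
//   static void red_comb(void *out, void *in) { *(int *)out += *(int *)in; }
//   ...
//   kmp_taskred_input_t item;
//   memset(&item, 0, sizeof(item));   // flags cleared => eager allocation
//   item.reduce_shar = &sum;
//   item.reduce_orig = &sum;
//   item.reduce_size = sizeof(int);
//   item.reduce_init = (void *)red_init;
//   item.reduce_fini = NULL;          // no finalizer needed for a plain int
//   item.reduce_comb = (void *)red_comb;
//   void *tg = __kmpc_taskred_init(gtid, /* num items */ 1, &item);
//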
2083 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2084 template <>
2085 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2086  kmp_task_red_input_t &src) {
2087  item.reduce_orig = NULL;
2088 }
2089 template <>
2090 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2091  kmp_taskred_input_t &src) {
2092  if (src.reduce_orig != NULL) {
2093  item.reduce_orig = src.reduce_orig;
2094  } else {
2095  item.reduce_orig = src.reduce_shar;
2096  } // non-NULL reduce_orig means new interface used
2097 }
2098 
2099 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, int j);
2100 template <>
2101 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2102  int offset) {
2103  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2104 }
2105 template <>
2106 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2107  int offset) {
2108  ((void (*)(void *, void *))item.reduce_init)(
2109  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2110 }
2111 
2112 template <typename T>
2113 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2114  __kmp_assert_valid_gtid(gtid);
2115  kmp_info_t *thread = __kmp_threads[gtid];
2116  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2117  kmp_int32 nth = thread->th.th_team_nproc;
2118  kmp_taskred_data_t *arr;
2119 
2120  // check input data just in case
2121  KMP_ASSERT(tg != NULL);
2122  KMP_ASSERT(data != NULL);
2123  KMP_ASSERT(num > 0);
2124  if (nth == 1) {
2125  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2126  gtid, tg));
2127  return (void *)tg;
2128  }
2129  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2130  gtid, tg, num));
2131  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2132  thread, num * sizeof(kmp_taskred_data_t));
2133  for (int i = 0; i < num; ++i) {
2134  size_t size = data[i].reduce_size - 1;
2135  // round the size up to cache line per thread-specific item
2136  size += CACHE_LINE - size % CACHE_LINE;
2137  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2138  arr[i].reduce_shar = data[i].reduce_shar;
2139  arr[i].reduce_size = size;
2140  arr[i].flags = data[i].flags;
2141  arr[i].reduce_comb = data[i].reduce_comb;
2142  arr[i].reduce_init = data[i].reduce_init;
2143  arr[i].reduce_fini = data[i].reduce_fini;
2144  __kmp_assign_orig<T>(arr[i], data[i]);
2145  if (!arr[i].flags.lazy_priv) {
2146  // allocate cache-line aligned block and fill it with zeros
2147  arr[i].reduce_priv = __kmp_allocate(nth * size);
2148  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2149  if (arr[i].reduce_init != NULL) {
2150  // initialize all thread-specific items
2151  for (int j = 0; j < nth; ++j) {
2152  __kmp_call_init<T>(arr[i], j * size);
2153  }
2154  }
2155  } else {
2156  // only allocate space for pointers now,
2157  // objects will be lazily allocated/initialized if/when requested
2158  // note that __kmp_allocate zeroes the allocated memory
2159  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2160  }
2161  }
2162  tg->reduce_data = (void *)arr;
2163  tg->reduce_num_data = num;
2164  return (void *)tg;
2165 }
2166 
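// __kmpc_task_reduction_init: initialize task reduction for the current
// taskgroup from the old-interface descriptor (kmp_task_red_input_t, no
// omp_orig); returns the taskgroup pointer that serves as the reduction
// handle.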
2181 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2182  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2183 }
2184 
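// __kmpc_taskred_init: same as __kmpc_task_reduction_init, but takes the
// new-interface descriptor (kmp_taskred_input_t), which carries reduce_orig
// for UDR initializers.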
2197 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2198  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2199 }
2200 
2201 // Copy task reduction data (except for shared pointers).
2202 template <typename T>
2203 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2204  kmp_taskgroup_t *tg, void *reduce_data) {
2205  kmp_taskred_data_t *arr;
2206  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2207  " from data %p\n",
2208  thr, tg, reduce_data));
2209  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2210  thr, num * sizeof(kmp_taskred_data_t));
2211  // threads will share private copies, thunk routines, sizes, flags, etc.:
2212  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2213  for (int i = 0; i < num; ++i) {
2214  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2215  }
2216  tg->reduce_data = (void *)arr;
2217  tg->reduce_num_data = num;
2218 }
2219 
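// __kmpc_task_reduction_get_th_data: return the calling thread's private copy
// of a reduction item. 'data' may be the shared item itself or an address
// inside some thread-specific copy; 'tskgrp' selects the taskgroup (NULL means
// the current one). Lazily allocates and initializes the copy when lazy_priv
// is set.
//
// Illustrative sketch of the call a compiler emits inside a task body with
// in_reduction(+ : sum); gtid, sum, a and i are assumed from context:
//
//   int *my_sum = (int *)__kmpc_task_reduction_get_th_data(gtid, NULL, &sum);
//   *my_sum += a[i];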
2229 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2230  __kmp_assert_valid_gtid(gtid);
2231  kmp_info_t *thread = __kmp_threads[gtid];
2232  kmp_int32 nth = thread->th.th_team_nproc;
2233  if (nth == 1)
2234  return data; // nothing to do
2235 
2236  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2237  if (tg == NULL)
2238  tg = thread->th.th_current_task->td_taskgroup;
2239  KMP_ASSERT(tg != NULL);
2240  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)(tg->reduce_data);
2241  kmp_int32 num = tg->reduce_num_data;
2242  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2243 
2244  KMP_ASSERT(data != NULL);
2245  while (tg != NULL) {
2246  for (int i = 0; i < num; ++i) {
2247  if (!arr[i].flags.lazy_priv) {
2248  if (data == arr[i].reduce_shar ||
2249  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2250  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2251  } else {
2252  // check shared location first
2253  void **p_priv = (void **)(arr[i].reduce_priv);
2254  if (data == arr[i].reduce_shar)
2255  goto found;
2256  // check if we get some thread specific location as parameter
2257  for (int j = 0; j < nth; ++j)
2258  if (data == p_priv[j])
2259  goto found;
2260  continue; // not found, continue search
2261  found:
2262  if (p_priv[tid] == NULL) {
2263  // allocate thread specific object lazily
2264  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2265  if (arr[i].reduce_init != NULL) {
2266  if (arr[i].reduce_orig != NULL) { // new interface
2267  ((void (*)(void *, void *))arr[i].reduce_init)(
2268  p_priv[tid], arr[i].reduce_orig);
2269  } else { // old interface (single parameter)
2270  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2271  }
2272  }
2273  }
2274  return p_priv[tid];
2275  }
2276  }
2277  tg = tg->parent;
2278  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2279  num = tg->reduce_num_data;
2280  }
2281  KMP_ASSERT2(0, "Unknown task reduction item");
2282  return NULL; // ERROR, this line never executed
2283 }
2284 
2285 // Finalize task reduction.
2286 // Called from __kmpc_end_taskgroup()
2287 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2288  kmp_int32 nth = th->th.th_team_nproc;
2289  KMP_DEBUG_ASSERT(nth > 1); // should not be called if nth == 1
2290  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2291  kmp_int32 num = tg->reduce_num_data;
2292  for (int i = 0; i < num; ++i) {
2293  void *sh_data = arr[i].reduce_shar;
2294  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2295  void (*f_comb)(void *, void *) =
2296  (void (*)(void *, void *))(arr[i].reduce_comb);
2297  if (!arr[i].flags.lazy_priv) {
2298  void *pr_data = arr[i].reduce_priv;
2299  size_t size = arr[i].reduce_size;
2300  for (int j = 0; j < nth; ++j) {
2301  void *priv_data = (char *)pr_data + j * size;
2302  f_comb(sh_data, priv_data); // combine results
2303  if (f_fini)
2304  f_fini(priv_data); // finalize if needed
2305  }
2306  } else {
2307  void **pr_data = (void **)(arr[i].reduce_priv);
2308  for (int j = 0; j < nth; ++j) {
2309  if (pr_data[j] != NULL) {
2310  f_comb(sh_data, pr_data[j]); // combine results
2311  if (f_fini)
2312  f_fini(pr_data[j]); // finalize if needed
2313  __kmp_free(pr_data[j]);
2314  }
2315  }
2316  }
2317  __kmp_free(arr[i].reduce_priv);
2318  }
2319  __kmp_thread_free(th, arr);
2320  tg->reduce_data = NULL;
2321  tg->reduce_num_data = 0;
2322 }
2323 
2324 // Cleanup task reduction data for parallel or worksharing,
2325 // do not touch task private data other threads still working with.
2326 // Called from __kmpc_end_taskgroup()
2327 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2328  __kmp_thread_free(th, tg->reduce_data);
2329  tg->reduce_data = NULL;
2330  tg->reduce_num_data = 0;
2331 }
2332 
2333 template <typename T>
2334 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2335  int num, T *data) {
2336  __kmp_assert_valid_gtid(gtid);
2337  kmp_info_t *thr = __kmp_threads[gtid];
2338  kmp_int32 nth = thr->th.th_team_nproc;
2339  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2340  if (nth == 1) {
2341  KA_TRACE(10,
2342  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2343  gtid, thr->th.th_current_task->td_taskgroup));
2344  return (void *)thr->th.th_current_task->td_taskgroup;
2345  }
2346  kmp_team_t *team = thr->th.th_team;
2347  void *reduce_data;
2348  kmp_taskgroup_t *tg;
2349  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2350  if (reduce_data == NULL &&
2351  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2352  (void *)1)) {
2353  // single thread enters this block to initialize common reduction data
2354  KMP_DEBUG_ASSERT(reduce_data == NULL);
2355  // first initialize own data, then make a copy other threads can use
2356  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2357  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2358  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2359  // fini counters should be 0 at this point
2360  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2361  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2362  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2363  } else {
2364  while (
2365  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2366  (void *)1) { // wait for task reduction initialization
2367  KMP_CPU_PAUSE();
2368  }
2369  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2370  tg = thr->th.th_current_task->td_taskgroup;
2371  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2372  }
2373  return tg;
2374 }
2375 
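// __kmpc_task_reduction_modifier_init: initialize task reduction for a
// reduction clause with the task modifier on a parallel (is_ws == 0) or
// worksharing (is_ws == 1) construct, using the old-interface descriptor;
// forms an implicit taskgroup and returns it as the reduction handle.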
2392 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2393  int num, void *data) {
2394  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2395  (kmp_task_red_input_t *)data);
2396 }
2397 
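// __kmpc_taskred_modifier_init: same as __kmpc_task_reduction_modifier_init,
// but takes the new-interface descriptor (kmp_taskred_input_t).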
2412 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2413  void *data) {
2414  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2415  (kmp_taskred_input_t *)data);
2416 }
2417 
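// __kmpc_task_reduction_modifier_fini: finish a reduction with the task
// modifier by closing the implicit taskgroup opened by the matching
// *_modifier_init call.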
2426 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2427  __kmpc_end_taskgroup(loc, gtid);
2428 }
2429 
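// Illustrative user-level sketch (walk_tree/root are placeholders): unlike
// taskwait, a taskgroup waits for the tasks created inside it and for all of
// their descendants.
//
//   #pragma omp taskgroup
//   {
//     #pragma omp task
//     walk_tree(root);   // may recursively spawn nested tasks
//   }                    // waits here for the whole task subtree
//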
2430 // __kmpc_taskgroup: Start a new taskgroup
2431 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2432  __kmp_assert_valid_gtid(gtid);
2433  kmp_info_t *thread = __kmp_threads[gtid];
2434  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2435  kmp_taskgroup_t *tg_new =
2436  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2437  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2438  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2439  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2440  tg_new->parent = taskdata->td_taskgroup;
2441  tg_new->reduce_data = NULL;
2442  tg_new->reduce_num_data = 0;
2443  taskdata->td_taskgroup = tg_new;
2444 
2445 #if OMPT_SUPPORT && OMPT_OPTIONAL
2446  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2447  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2448  if (!codeptr)
2449  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2450  kmp_team_t *team = thread->th.th_team;
2451  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2452  // FIXME: I think this is wrong for lwt!
2453  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2454 
2455  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2456  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2457  &(my_task_data), codeptr);
2458  }
2459 #endif
2460 }
2461 
2462 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2463 // and its descendants are complete
2464 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2465  __kmp_assert_valid_gtid(gtid);
2466  kmp_info_t *thread = __kmp_threads[gtid];
2467  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2468  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2469  int thread_finished = FALSE;
2470 
2471 #if OMPT_SUPPORT && OMPT_OPTIONAL
2472  kmp_team_t *team;
2473  ompt_data_t my_task_data;
2474  ompt_data_t my_parallel_data;
2475  void *codeptr;
2476  if (UNLIKELY(ompt_enabled.enabled)) {
2477  team = thread->th.th_team;
2478  my_task_data = taskdata->ompt_task_info.task_data;
2479  // FIXME: I think this is wrong for lwt!
2480  my_parallel_data = team->t.ompt_team_info.parallel_data;
2481  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2482  if (!codeptr)
2483  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2484  }
2485 #endif
2486 
2487  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2488  KMP_DEBUG_ASSERT(taskgroup != NULL);
2489  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2490 
2491  if (__kmp_tasking_mode != tskm_immediate_exec) {
2492  // mark task as waiting not on a barrier
2493  taskdata->td_taskwait_counter += 1;
2494  taskdata->td_taskwait_ident = loc;
2495  taskdata->td_taskwait_thread = gtid + 1;
2496 #if USE_ITT_BUILD
2497  // For ITT the taskgroup wait is similar to taskwait until we need to
2498  // distinguish them
2499  void *itt_sync_obj = __kmp_itt_taskwait_object(gtid);
2500  if (itt_sync_obj != NULL)
2501  __kmp_itt_taskwait_starting(gtid, itt_sync_obj);
2502 #endif /* USE_ITT_BUILD */
2503 
2504 #if OMPT_SUPPORT && OMPT_OPTIONAL
2505  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2506  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2507  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2508  &(my_task_data), codeptr);
2509  }
2510 #endif
2511 
2512  if (!taskdata->td_flags.team_serial ||
2513  (thread->th.th_task_team != NULL &&
2514  thread->th.th_task_team->tt.tt_found_proxy_tasks)) {
2515  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)),
2516  0U);
2517  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2518  flag.execute_tasks(thread, gtid, FALSE,
2519  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2520  __kmp_task_stealing_constraint);
2521  }
2522  }
2523  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2524 
2525 #if OMPT_SUPPORT && OMPT_OPTIONAL
2526  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2527  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2528  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2529  &(my_task_data), codeptr);
2530  }
2531 #endif
2532 
2533 #if USE_ITT_BUILD
2534  if (itt_sync_obj != NULL)
2535  __kmp_itt_taskwait_finished(gtid, itt_sync_obj);
2536  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2537 #endif /* USE_ITT_BUILD */
2538  }
2539  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2540 
2541  if (taskgroup->reduce_data != NULL) { // need to reduce?
2542  int cnt;
2543  void *reduce_data;
2544  kmp_team_t *t = thread->th.th_team;
2545  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2546   // check if the <priv> data of the first reduction variable is shared for the team
2547  void *priv0 = arr[0].reduce_priv;
2548  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2549  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2550  // finishing task reduction on parallel
2551  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2552  if (cnt == thread->th.th_team_nproc - 1) {
2553  // we are the last thread passing __kmpc_reduction_modifier_fini()
2554  // finalize task reduction:
2555  __kmp_task_reduction_fini(thread, taskgroup);
2556  // cleanup fields in the team structure:
2557  // TODO: is relaxed store enough here (whole barrier should follow)?
2558  __kmp_thread_free(thread, reduce_data);
2559  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2560  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2561  } else {
2562  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2563  // so do not finalize reduction, just clean own copy of the data
2564  __kmp_task_reduction_clean(thread, taskgroup);
2565  }
2566  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2567  NULL &&
2568  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2569  // finishing task reduction on worksharing
2570  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2571  if (cnt == thread->th.th_team_nproc - 1) {
2572  // we are the last thread passing __kmpc_reduction_modifier_fini()
2573  __kmp_task_reduction_fini(thread, taskgroup);
2574  // cleanup fields in team structure:
2575  // TODO: is relaxed store enough here (whole barrier should follow)?
2576  __kmp_thread_free(thread, reduce_data);
2577  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2578  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2579  } else {
2580  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2581  // so do not finalize reduction, just clean own copy of the data
2582  __kmp_task_reduction_clean(thread, taskgroup);
2583  }
2584  } else {
2585  // finishing task reduction on taskgroup
2586  __kmp_task_reduction_fini(thread, taskgroup);
2587  }
2588  }
2589  // Restore parent taskgroup for the current task
2590  taskdata->td_taskgroup = taskgroup->parent;
2591  __kmp_thread_free(thread, taskgroup);
2592 
2593  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2594  gtid, taskdata));
2595  ANNOTATE_HAPPENS_AFTER(taskdata);
2596 
2597 #if OMPT_SUPPORT && OMPT_OPTIONAL
2598  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2599  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2600  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2601  &(my_task_data), codeptr);
2602  }
2603 #endif
2604 }
2605 
2606 // __kmp_remove_my_task: remove a task from my own deque
2607 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2608  kmp_task_team_t *task_team,
2609  kmp_int32 is_constrained) {
2610  kmp_task_t *task;
2611  kmp_taskdata_t *taskdata;
2612  kmp_thread_data_t *thread_data;
2613  kmp_uint32 tail;
2614 
2615  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2616  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2617  NULL); // Caller should check this condition
2618 
2619  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2620 
2621  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2622  gtid, thread_data->td.td_deque_ntasks,
2623  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2624 
2625  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2626  KA_TRACE(10,
2627  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2628  "ntasks=%d head=%u tail=%u\n",
2629  gtid, thread_data->td.td_deque_ntasks,
2630  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2631  return NULL;
2632  }
2633 
2634  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2635 
2636  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2637  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2638  KA_TRACE(10,
2639  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2640  "ntasks=%d head=%u tail=%u\n",
2641  gtid, thread_data->td.td_deque_ntasks,
2642  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2643  return NULL;
2644  }
2645 
2646  tail = (thread_data->td.td_deque_tail - 1) &
2647  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2648  taskdata = thread_data->td.td_deque[tail];
2649 
2650  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2651  thread->th.th_current_task)) {
2652   // The task scheduling constraint (TSC) does not allow stealing the tail task
2653  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2654  KA_TRACE(10,
2655  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2656  "ntasks=%d head=%u tail=%u\n",
2657  gtid, thread_data->td.td_deque_ntasks,
2658  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2659  return NULL;
2660  }
2661 
2662  thread_data->td.td_deque_tail = tail;
2663  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2664 
2665  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2666 
2667  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2668  "ntasks=%d head=%u tail=%u\n",
2669  gtid, taskdata, thread_data->td.td_deque_ntasks,
2670  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2671 
2672  task = KMP_TASKDATA_TO_TASK(taskdata);
2673  return task;
2674 }
2675 
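// Deque discipline shared by __kmp_remove_my_task above and __kmp_steal_task
// below: the owning thread takes work from the tail of its own deque (LIFO),
// while thieves take from the head (FIFO); both sides serialize on the
// per-deque bootstrap lock and wrap indices with TASK_DEQUE_MASK.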
2676 // __kmp_steal_task: remove a task from another thread's deque
2677 // Assume that calling thread has already checked existence of
2678 // task_team thread_data before calling this routine.
2679 static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
2680  kmp_task_team_t *task_team,
2681  std::atomic<kmp_int32> *unfinished_threads,
2682  int *thread_finished,
2683  kmp_int32 is_constrained) {
2684  kmp_task_t *task;
2685  kmp_taskdata_t *taskdata;
2686  kmp_taskdata_t *current;
2687  kmp_thread_data_t *victim_td, *threads_data;
2688  kmp_int32 target;
2689  kmp_int32 victim_tid;
2690 
2691  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2692 
2693  threads_data = task_team->tt.tt_threads_data;
2694  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
2695 
2696  victim_tid = victim_thr->th.th_info.ds.ds_tid;
2697  victim_td = &threads_data[victim_tid];
2698 
2699  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
2700  "task_team=%p ntasks=%d head=%u tail=%u\n",
2701  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2702  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2703  victim_td->td.td_deque_tail));
2704 
2705  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
2706  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
2707  "task_team=%p ntasks=%d head=%u tail=%u\n",
2708  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
2709  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
2710  victim_td->td.td_deque_tail));
2711  return NULL;
2712  }
2713 
2714  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
2715 
2716  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
2717  // Check again after we acquire the lock
2718  if (ntasks == 0) {
2719  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2720  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
2721  "task_team=%p ntasks=%d head=%u tail=%u\n",
2722  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2723  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2724  return NULL;
2725  }
2726 
2727  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
2728  current = __kmp_threads[gtid]->th.th_current_task;
2729  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
2730  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2731  // Bump head pointer and Wrap.
2732  victim_td->td.td_deque_head =
2733  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
2734  } else {
2735  if (!task_team->tt.tt_untied_task_encountered) {
2736       // The task scheduling constraint (TSC) does not allow stealing the victim task
2737  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2738  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
2739  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2740  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2741  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2742  return NULL;
2743  }
2744  int i;
2745  // walk through victim's deque trying to steal any task
2746  target = victim_td->td.td_deque_head;
2747  taskdata = NULL;
2748  for (i = 1; i < ntasks; ++i) {
2749  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2750  taskdata = victim_td->td.td_deque[target];
2751  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2752  break; // found victim task
2753  } else {
2754  taskdata = NULL;
2755  }
2756  }
2757  if (taskdata == NULL) {
2758  // No appropriate candidate to steal found
2759  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2760  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
2761  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
2762  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
2763  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2764  return NULL;
2765  }
2766  int prev = target;
2767  for (i = i + 1; i < ntasks; ++i) {
2768  // shift remaining tasks in the deque left by 1
2769  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
2770  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
2771  prev = target;
2772  }
2773  KMP_DEBUG_ASSERT(
2774  victim_td->td.td_deque_tail ==
2775  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
2776     victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
2777  }
2778  if (*thread_finished) {
2779  // We need to un-mark this victim as a finished victim. This must be done
2780  // before releasing the lock, or else other threads (starting with the
2781  // master victim) might be prematurely released from the barrier!!!
2782  kmp_int32 count;
2783 
2784  count = KMP_ATOMIC_INC(unfinished_threads);
2785 
2786  KA_TRACE(
2787  20,
2788  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
2789  gtid, count + 1, task_team));
2790 
2791  *thread_finished = FALSE;
2792  }
2793  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
2794 
2795  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
2796 
2797  KMP_COUNT_BLOCK(TASK_stolen);
2798  KA_TRACE(10,
2799  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
2800  "task_team=%p ntasks=%d head=%u tail=%u\n",
2801  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
2802  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
2803 
2804  task = KMP_TASKDATA_TO_TASK(taskdata);
2805  return task;
2806 }
2807 
2808 // __kmp_execute_tasks_template: Choose and execute tasks until either the
2809 // condition is satisfied (return true) or there are none left (return false).
2810 //
2811 // final_spin is TRUE if this is the spin at the release barrier.
2812 // thread_finished indicates whether the thread is finished executing all
2813 // the tasks it has on its deque, and is at the release barrier.
2814 // spinner is the location on which to spin.
2815 // spinner == NULL means only execute a single task and return.
2816 // checker is the value to check to terminate the spin.
2817 template <class C>
2818 static inline int __kmp_execute_tasks_template(
2819  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
2820  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
2821  kmp_int32 is_constrained) {
2822  kmp_task_team_t *task_team = thread->th.th_task_team;
2823  kmp_thread_data_t *threads_data;
2824  kmp_task_t *task;
2825  kmp_info_t *other_thread;
2826  kmp_taskdata_t *current_task = thread->th.th_current_task;
2827  std::atomic<kmp_int32> *unfinished_threads;
2828  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
2829  tid = thread->th.th_info.ds.ds_tid;
2830 
2831  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2832  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
2833 
2834  if (task_team == NULL || current_task == NULL)
2835  return FALSE;
2836 
2837  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
2838  "*thread_finished=%d\n",
2839  gtid, final_spin, *thread_finished));
2840 
2841  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
2842  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
2843  KMP_DEBUG_ASSERT(threads_data != NULL);
2844 
2845  nthreads = task_team->tt.tt_nproc;
2846  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
2847  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks);
2848  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
2849 
2850  while (1) { // Outer loop keeps trying to find tasks in case of single thread
2851  // getting tasks from target constructs
2852  while (1) { // Inner loop to find a task and execute it
2853  task = NULL;
2854  if (use_own_tasks) { // check on own queue first
2855  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
2856  }
2857  if ((task == NULL) && (nthreads > 1)) { // Steal a task
2858  int asleep = 1;
2859  use_own_tasks = 0;
2860  // Try to steal from the last place I stole from successfully.
2861  if (victim_tid == -2) { // haven't stolen anything yet
2862  victim_tid = threads_data[tid].td.td_deque_last_stolen;
2863  if (victim_tid !=
2864  -1) // if we have a last stolen from victim, get the thread
2865  other_thread = threads_data[victim_tid].td.td_thr;
2866  }
2867  if (victim_tid != -1) { // found last victim
2868  asleep = 0;
2869  } else if (!new_victim) { // no recent steals and we haven't already
2870  // used a new victim; select a random thread
2871  do { // Find a different thread to steal work from.
2872  // Pick a random thread. Initial plan was to cycle through all the
2873  // threads, and only return if we tried to steal from every thread,
2874  // and failed. Arch says that's not such a great idea.
2875  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
2876  if (victim_tid >= tid) {
2877  ++victim_tid; // Adjusts random distribution to exclude self
2878  }
2879  // Found a potential victim
2880  other_thread = threads_data[victim_tid].td.td_thr;
2881  // There is a slight chance that __kmp_enable_tasking() did not wake
2882  // up all threads waiting at the barrier. If victim is sleeping,
2883  // then wake it up. Since we were going to pay the cache miss
2884  // penalty for referencing another thread's kmp_info_t struct
2885  // anyway,
2886  // the check shouldn't cost too much performance at this point. In
2887  // extra barrier mode, tasks do not sleep at the separate tasking
2888  // barrier, so this isn't a problem.
2889  asleep = 0;
2890  if ((__kmp_tasking_mode == tskm_task_teams) &&
2891  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
2892  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
2893  NULL)) {
2894  asleep = 1;
2895  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread),
2896  other_thread->th.th_sleep_loc);
2897  // A sleeping thread should not have any tasks on its queue.
2898  // There is a slight possibility that it resumes, steals a task
2899  // from another thread, which spawns more tasks, all in the time
2900  // that it takes this thread to check => don't write an assertion
2901  // that the victim's queue is empty. Try stealing from a
2902  // different thread.
2903  }
2904  } while (asleep);
2905  }
2906 
2907  if (!asleep) {
2908  // We have a victim to try to steal from
2909  task = __kmp_steal_task(other_thread, gtid, task_team,
2910  unfinished_threads, thread_finished,
2911  is_constrained);
2912  }
2913  if (task != NULL) { // set last stolen to victim
2914  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
2915  threads_data[tid].td.td_deque_last_stolen = victim_tid;
2916  // The pre-refactored code did not try more than 1 successful new
2917  // victim, unless the last one generated more local tasks;
2918  // new_victim keeps track of this
2919  new_victim = 1;
2920  }
2921  } else { // No tasks found; unset last_stolen
2922  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
2923  victim_tid = -2; // no successful victim found
2924  }
2925  }
2926 
2927  if (task == NULL) // break out of tasking loop
2928  break;
2929 
2930 // Found a task; execute it
2931 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2932  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2933  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
2934  // get the object reliably
2935  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2936  }
2937  __kmp_itt_task_starting(itt_sync_obj);
2938  }
2939 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2940  __kmp_invoke_task(gtid, task, current_task);
2941 #if USE_ITT_BUILD
2942  if (itt_sync_obj != NULL)
2943  __kmp_itt_task_finished(itt_sync_obj);
2944 #endif /* USE_ITT_BUILD */
2945  // If this thread is only partway through the barrier and the condition is
2946  // met, then return now, so that the barrier gather/release pattern can
2947  // proceed. If this thread is in the last spin loop in the barrier,
2948  // waiting to be released, we know that the termination condition will not
2949  // be satisfied, so don't waste any cycles checking it.
2950  if (flag == NULL || (!final_spin && flag->done_check())) {
2951  KA_TRACE(
2952  15,
2953  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2954  gtid));
2955  return TRUE;
2956  }
2957  if (thread->th.th_task_team == NULL) {
2958  break;
2959  }
2960  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
2961  // If execution of a stolen task results in more tasks being placed on our
2962  // run queue, reset use_own_tasks
2963  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
2964  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
2965  "other tasks, restart\n",
2966  gtid));
2967  use_own_tasks = 1;
2968  new_victim = 0;
2969  }
2970  }
2971 
2972  // The task source has been exhausted. If in final spin loop of barrier,
2973  // check if termination condition is satisfied. The work queue may be empty
2974  // but there might be proxy tasks still executing.
2975  if (final_spin &&
2976  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
2977  // First, decrement the #unfinished threads, if that has not already been
2978  // done. This decrement might be to the spin location, and result in the
2979  // termination condition being satisfied.
2980  if (!*thread_finished) {
2981  kmp_int32 count;
2982 
2983  count = KMP_ATOMIC_DEC(unfinished_threads) - 1;
2984  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
2985  "unfinished_threads to %d task_team=%p\n",
2986  gtid, count, task_team));
2987  *thread_finished = TRUE;
2988  }
2989 
2990  // It is now unsafe to reference thread->th.th_team !!!
2991  // Decrementing task_team->tt.tt_unfinished_threads can allow the master
2992  // thread to pass through the barrier, where it might reset each thread's
2993  // th.th_team field for the next parallel region. If we can steal more
2994  // work, we know that this has not happened yet.
2995  if (flag != NULL && flag->done_check()) {
2996  KA_TRACE(
2997  15,
2998  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
2999  gtid));
3000  return TRUE;
3001  }
3002  }
3003 
3004  // If this thread's task team is NULL, master has recognized that there are
3005  // no more tasks; bail out
3006  if (thread->th.th_task_team == NULL) {
3007  KA_TRACE(15,
3008  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3009  return FALSE;
3010  }
3011 
3012  // We could be getting tasks from target constructs; if this is the only
3013  // thread, keep trying to execute tasks from own queue
3014  if (nthreads == 1)
3015  use_own_tasks = 1;
3016  else {
3017  KA_TRACE(15,
3018  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3019  return FALSE;
3020  }
3021  }
3022 }
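// A minimal standalone sketch (hypothetical helper, not part of the runtime)
// of the victim-selection trick used in __kmp_execute_tasks_template above:
// a random value in [0, nthreads-2] is remapped onto [0, nthreads-1] with the
// caller's own tid excluded, keeping the distribution over the other threads
// uniform. Assumes nthreads > 1, as the caller guarantees.
#if 0
static int __kmp_example_pick_victim(int tid, int nthreads, unsigned rand_val) {
  int victim = (int)(rand_val % (unsigned)(nthreads - 1)); // 0 .. nthreads-2
  if (victim >= tid)
    ++victim; // shift past self so this thread is never its own victim
  return victim;
}
#endif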
3023 
3024 int __kmp_execute_tasks_32(
3025  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
3026  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3027  kmp_int32 is_constrained) {
3028  return __kmp_execute_tasks_template(
3029  thread, gtid, flag, final_spin,
3030  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3031 }
3032 
3033 int __kmp_execute_tasks_64(
3034  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64 *flag, int final_spin,
3035  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3036  kmp_int32 is_constrained) {
3037  return __kmp_execute_tasks_template(
3038  thread, gtid, flag, final_spin,
3039  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3040 }
3041 
3042 int __kmp_execute_tasks_oncore(
3043  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3044  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3045  kmp_int32 is_constrained) {
3046  return __kmp_execute_tasks_template(
3047  thread, gtid, flag, final_spin,
3048  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3049 }
3050 
3051 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3052 // next barrier so they can assist in executing enqueued tasks.
3053 // First thread in allocates the task team atomically.
3054 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3055  kmp_info_t *this_thr) {
3056  kmp_thread_data_t *threads_data;
3057  int nthreads, i, is_init_thread;
3058 
3059  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3060  __kmp_gtid_from_thread(this_thr)));
3061 
3062  KMP_DEBUG_ASSERT(task_team != NULL);
3063  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3064 
3065  nthreads = task_team->tt.tt_nproc;
3066  KMP_DEBUG_ASSERT(nthreads > 0);
3067  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3068 
3069  // Allocate or increase the size of threads_data if necessary
3070  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3071 
3072  if (!is_init_thread) {
3073  // Some other thread already set up the array.
3074  KA_TRACE(
3075  20,
3076  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3077  __kmp_gtid_from_thread(this_thr)));
3078  return;
3079  }
3080  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3081  KMP_DEBUG_ASSERT(threads_data != NULL);
3082 
3083  if (__kmp_tasking_mode == tskm_task_teams &&
3084  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3085  // Release any threads sleeping at the barrier, so that they can steal
3086  // tasks and execute them. In extra barrier mode, tasks do not sleep
3087  // at the separate tasking barrier, so this isn't a problem.
3088  for (i = 0; i < nthreads; i++) {
3089  volatile void *sleep_loc;
3090  kmp_info_t *thread = threads_data[i].td.td_thr;
3091 
3092  if (i == this_thr->th.th_info.ds.ds_tid) {
3093  continue;
3094  }
3095  // Since we haven't locked the thread's suspend mutex lock at this
3096  // point, there is a small window where a thread might be putting
3097  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3098  // To work around this, __kmp_execute_tasks_template() periodically checks
3099  // see if other threads are sleeping (using the same random mechanism that
3100  // is used for task stealing) and awakens them if they are.
3101  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3102  NULL) {
3103  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3104  __kmp_gtid_from_thread(this_thr),
3105  __kmp_gtid_from_thread(thread)));
3106  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3107  } else {
3108  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3109  __kmp_gtid_from_thread(this_thr),
3110  __kmp_gtid_from_thread(thread)));
3111  }
3112  }
3113  }
3114 
3115  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3116  __kmp_gtid_from_thread(this_thr)));
3117 }
3118 
3119 /* // TODO: Check the comment consistency
3120  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3121  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3122  * After a child thread checks into a barrier and calls __kmp_release() from
3123  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3124  * longer assume that the kmp_team_t structure is intact (at any moment, the
3125  * master thread may exit the barrier code and free the team data structure,
3126  * and return the threads to the thread pool).
3127  *
3128  * This does not work with the tasking code, as the thread is still
3129  * expected to participate in the execution of any tasks that may have been
3130  * spawned by a member of the team, and the thread still needs access to
3131  * each thread in the team, so that it can steal work from it.
3132  *
3133  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3134  * counting mechanism, and is allocated by the master thread before calling
3135  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3136  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3137  * of the kmp_task_team_t structs for consecutive barriers can overlap
3138  * (and will, unless the master thread is the last thread to exit the barrier
3139  * release phase, which is not typical). The existence of such a struct
3140  * could also be useful outside the context of tasking.
3141  *
3142  * We currently use the existence of the threads array as an indicator that
3143  * tasks were spawned since the last barrier. If the structure is to be
3144  * useful outside the context of tasking, then this will have to change, but
3145  * not setting the field minimizes the performance impact of tasking on
3146  * barriers, when no explicit tasks were spawned (pushed, actually).
3147  */
3148 
3149 static kmp_task_team_t *__kmp_free_task_teams =
3150  NULL; // Free list for task_team data structures
3151 // Lock for task team data structures
3152 kmp_bootstrap_lock_t __kmp_task_team_lock =
3153  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3154 
3155 // __kmp_alloc_task_deque:
3156 // Allocates a task deque for a particular thread, and initializes the necessary
3157 // data structures relating to the deque. This only happens once per thread
3158 // per task team since task teams are recycled. No lock is needed during
3159 // allocation since each thread allocates its own deque.
3160 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3161  kmp_thread_data_t *thread_data) {
3162  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3163  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3164 
3165  // Initialize last stolen task field to "none"
3166  thread_data->td.td_deque_last_stolen = -1;
3167 
3168  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3169  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3170  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3171 
3172  KE_TRACE(
3173  10,
3174  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3175  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3176  // Allocate space for task deque, and zero the deque
3177  // Cannot use __kmp_thread_calloc() because threads not around for
3178  // kmp_reap_task_team( ).
3179  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3180  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3181  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3182 }
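// The deque just allocated is used as a ring buffer whose size stays a power
// of two, so head/tail wrap with a mask instead of a modulo. A minimal sketch
// of that indexing with illustrative names (not the runtime's own types):
#if 0
struct example_ring {
  kmp_taskdata_t **slots;
  kmp_uint32 size; // always a power of two
  kmp_uint32 head; // pop/steal side
  kmp_uint32 tail; // push side
};

static void example_ring_push(example_ring *r, kmp_taskdata_t *td) {
  r->slots[r->tail] = td;
  r->tail = (r->tail + 1) & (r->size - 1); // wrap without a division
}
#endif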
3183 
3184 // __kmp_free_task_deque:
3185 // Deallocates a task deque for a particular thread. Happens at library
3186 // deallocation so don't need to reset all thread data fields.
3187 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3188  if (thread_data->td.td_deque != NULL) {
3189  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3190  TCW_4(thread_data->td.td_deque_ntasks, 0);
3191  __kmp_free(thread_data->td.td_deque);
3192  thread_data->td.td_deque = NULL;
3193  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3194  }
3195 
3196 #ifdef BUILD_TIED_TASK_STACK
3197  // GEH: Figure out what to do here for td_susp_tied_tasks
3198  if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) {
3199  __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data);
3200  }
3201 #endif // BUILD_TIED_TASK_STACK
3202 }
3203 
3204 // __kmp_realloc_task_threads_data:
3205 // Allocates a threads_data array for a task team, either by allocating an
3206 // initial array or enlarging an existing array. Only the first thread to get
3207 // the lock allocs or enlarges the array and re-initializes the array elements.
3208 // That thread returns "TRUE", the rest return "FALSE".
3209 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3210 // The current size is given by task_team -> tt.tt_max_threads.
3211 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3212  kmp_task_team_t *task_team) {
3213  kmp_thread_data_t **threads_data_p;
3214  kmp_int32 nthreads, maxthreads;
3215  int is_init_thread = FALSE;
3216 
3217  if (TCR_4(task_team->tt.tt_found_tasks)) {
3218  // Already reallocated and initialized.
3219  return FALSE;
3220  }
3221 
3222  threads_data_p = &task_team->tt.tt_threads_data;
3223  nthreads = task_team->tt.tt_nproc;
3224  maxthreads = task_team->tt.tt_max_threads;
3225 
3226  // All threads must lock when they encounter the first task of the implicit
3227  // task region to make sure threads_data fields are (re)initialized before
3228  // used.
3229  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3230 
3231  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3232  // first thread to enable tasking
3233  kmp_team_t *team = thread->th.th_team;
3234  int i;
3235 
3236  is_init_thread = TRUE;
3237  if (maxthreads < nthreads) {
3238 
3239  if (*threads_data_p != NULL) {
3240  kmp_thread_data_t *old_data = *threads_data_p;
3241  kmp_thread_data_t *new_data = NULL;
3242 
3243  KE_TRACE(
3244  10,
3245  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3246  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3247  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3248  // Reallocate threads_data to have more elements than current array
3249  // Cannot use __kmp_thread_realloc() because threads not around for
3250  // kmp_reap_task_team( ). Note all new array entries are initialized
3251  // to zero by __kmp_allocate().
3252  new_data = (kmp_thread_data_t *)__kmp_allocate(
3253  nthreads * sizeof(kmp_thread_data_t));
3254  // copy old data to new data
3255  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3256  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3257 
3258 #ifdef BUILD_TIED_TASK_STACK
3259  // GEH: Figure out if this is the right thing to do
3260  for (i = maxthreads; i < nthreads; i++) {
3261  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3262  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3263  }
3264 #endif // BUILD_TIED_TASK_STACK
3265  // Install the new data and free the old data
3266  (*threads_data_p) = new_data;
3267  __kmp_free(old_data);
3268  } else {
3269  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3270  "threads data for task_team %p, size = %d\n",
3271  __kmp_gtid_from_thread(thread), task_team, nthreads));
3272  // Make the initial allocate for threads_data array, and zero entries
3273  // Cannot use __kmp_thread_calloc() because threads not around for
3274  // kmp_reap_task_team( ).
3275  ANNOTATE_IGNORE_WRITES_BEGIN();
3276  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3277  nthreads * sizeof(kmp_thread_data_t));
3278  ANNOTATE_IGNORE_WRITES_END();
3279 #ifdef BUILD_TIED_TASK_STACK
3280  // GEH: Figure out if this is the right thing to do
3281  for (i = 0; i < nthreads; i++) {
3282  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3283  __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data);
3284  }
3285 #endif // BUILD_TIED_TASK_STACK
3286  }
3287  task_team->tt.tt_max_threads = nthreads;
3288  } else {
3289  // If array has (more than) enough elements, go ahead and use it
3290  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3291  }
3292 
3293  // initialize threads_data pointers back to thread_info structures
3294  for (i = 0; i < nthreads; i++) {
3295  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3296  thread_data->td.td_thr = team->t.t_threads[i];
3297 
3298  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3299  // The last stolen field survives across teams / barrier, and the number
3300  // of threads may have changed. It's possible (likely?) that a new
3301  // parallel region will exhibit the same behavior as the previous region.
3302  thread_data->td.td_deque_last_stolen = -1;
3303  }
3304  }
3305 
3306  KMP_MB();
3307  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3308  }
3309 
3310  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3311  return is_init_thread;
3312 }
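// The routine above follows a check / lock / re-check pattern so that only the
// first thread performs the (re)allocation while later arrivals return
// quickly. A generic sketch of that pattern using std::atomic and std::mutex
// in place of the runtime's TCR/TCW macros and bootstrap lock; all names are
// illustrative:
#if 0
#include <atomic>
#include <mutex>

static std::atomic<bool> example_initialized(false);
static std::mutex example_init_lock;

static bool example_init_once(void (*do_init)(void)) {
  if (example_initialized.load(std::memory_order_acquire))
    return false; // already set up by another thread
  std::lock_guard<std::mutex> guard(example_init_lock);
  if (example_initialized.load(std::memory_order_relaxed))
    return false; // lost the race while waiting for the lock
  do_init();
  example_initialized.store(true, std::memory_order_release);
  return true; // this caller did the initialization
}
#endif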
3313 
3314 // __kmp_free_task_threads_data:
3315 // Deallocates a threads_data array for a task team, including any attached
3316 // tasking deques. Only occurs at library shutdown.
3317 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3318  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3319  if (task_team->tt.tt_threads_data != NULL) {
3320  int i;
3321  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3322  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3323  }
3324  __kmp_free(task_team->tt.tt_threads_data);
3325  task_team->tt.tt_threads_data = NULL;
3326  }
3327  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3328 }
3329 
3330 // __kmp_allocate_task_team:
3331 // Allocates a task team associated with a specific team, taking it from
3332 // the global task team free list if possible. Also initializes data
3333 // structures.
3334 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3335  kmp_team_t *team) {
3336  kmp_task_team_t *task_team = NULL;
3337  int nthreads;
3338 
3339  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3340  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3341 
3342  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3343  // Take a task team from the task team pool
3344  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3345  if (__kmp_free_task_teams != NULL) {
3346  task_team = __kmp_free_task_teams;
3347  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3348  task_team->tt.tt_next = NULL;
3349  }
3350  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3351  }
3352 
3353  if (task_team == NULL) {
3354  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3355  "task team for team %p\n",
3356  __kmp_gtid_from_thread(thread), team));
3357  // Allocate a new task team if one is not available. Cannot use
3358  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3359  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3360  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3361 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3362  // suppress race conditions detection on synchronization flags in debug mode
3363  // this helps to analyze library internals eliminating false positives
3364  __itt_suppress_mark_range(
3365  __itt_suppress_range, __itt_suppress_threading_errors,
3366  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3367  __itt_suppress_mark_range(__itt_suppress_range,
3368  __itt_suppress_threading_errors,
3369  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3370  sizeof(task_team->tt.tt_active));
3371 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3372  // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3373  // task_team->tt.tt_threads_data = NULL;
3374  // task_team->tt.tt_max_threads = 0;
3375  // task_team->tt.tt_next = NULL;
3376  }
3377 
3378  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3379  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3380  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
3381 
3382  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
3383  TCW_4(task_team->tt.tt_active, TRUE);
3384 
3385  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3386  "unfinished_threads init'd to %d\n",
3387  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3388  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3389  return task_team;
3390 }
3391 
3392 // __kmp_free_task_team:
3393 // Frees the task team associated with a specific thread, and adds it
3394 // to the global task team free list.
3395 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3396  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3397  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3398 
3399  // Put task team back on free list
3400  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3401 
3402  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3403  task_team->tt.tt_next = __kmp_free_task_teams;
3404  TCW_PTR(__kmp_free_task_teams, task_team);
3405 
3406  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3407 }
3408 
3409 // __kmp_reap_task_teams:
3410 // Free all the task teams on the task team free list.
3411 // Should only be done during library shutdown.
3412 // Cannot do anything that needs a thread structure or gtid since they are
3413 // already gone.
3414 void __kmp_reap_task_teams(void) {
3415  kmp_task_team_t *task_team;
3416 
3417  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3418  // Free all task_teams on the free list
3419  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3420  while ((task_team = __kmp_free_task_teams) != NULL) {
3421  __kmp_free_task_teams = task_team->tt.tt_next;
3422  task_team->tt.tt_next = NULL;
3423 
3424  // Free threads_data if necessary
3425  if (task_team->tt.tt_threads_data != NULL) {
3426  __kmp_free_task_threads_data(task_team);
3427  }
3428  __kmp_free(task_team);
3429  }
3430  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3431  }
3432 }
3433 
3434 // __kmp_wait_to_unref_task_teams:
3435 // Some threads could still be in the fork barrier release code, possibly
3436 // trying to steal tasks. Wait for each thread to unreference its task team.
3437 void __kmp_wait_to_unref_task_teams(void) {
3438  kmp_info_t *thread;
3439  kmp_uint32 spins;
3440  int done;
3441 
3442  KMP_INIT_YIELD(spins);
3443 
3444  for (;;) {
3445  done = TRUE;
3446 
3447  // TODO: GEH - this may be wrong because some sync would be necessary
3448  // in case threads are added to the pool during the traversal. Need to
3449  // verify that lock for thread pool is held when calling this routine.
3450  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3451  thread = thread->th.th_next_pool) {
3452 #if KMP_OS_WINDOWS
3453  DWORD exit_val;
3454 #endif
3455  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3456  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3457  __kmp_gtid_from_thread(thread)));
3458  continue;
3459  }
3460 #if KMP_OS_WINDOWS
3461  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3462  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3463  thread->th.th_task_team = NULL;
3464  continue;
3465  }
3466 #endif
3467 
3468  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3469 
3470  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3471  "unreference task_team\n",
3472  __kmp_gtid_from_thread(thread)));
3473 
3474  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3475  volatile void *sleep_loc;
3476  // If the thread is sleeping, awaken it.
3477  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3478  NULL) {
3479  KA_TRACE(
3480  10,
3481  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3482  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3483  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(thread), sleep_loc);
3484  }
3485  }
3486  }
3487  if (done) {
3488  break;
3489  }
3490 
3491  // If oversubscribed or have waited a bit, yield.
3492  KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
3493  }
3494 }
3495 
3496 // __kmp_task_team_setup: Create a task_team for the current team, but use
3497 // an already created, unused one if it exists.
3498 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
3499  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3500 
3501  // If this task_team hasn't been created yet, allocate it. It will be used in
3502  // the region after the next.
3503  // If it exists, it is the current task team and shouldn't be touched yet as
3504  // it may still be in use.
3505  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
3506  (always || team->t.t_nproc > 1)) {
3507  team->t.t_task_team[this_thr->th.th_task_state] =
3508  __kmp_allocate_task_team(this_thr, team);
3509  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created new task_team %p "
3510  "for team %d at parity=%d\n",
3511  __kmp_gtid_from_thread(this_thr),
3512  team->t.t_task_team[this_thr->th.th_task_state],
3513  ((team != NULL) ? team->t.t_id : -1),
3514  this_thr->th.th_task_state));
3515  }
3516 
3517  // After threads exit the release, they will call sync, and then point to this
3518  // other task_team; make sure it is allocated and properly initialized. As
3519  // threads spin in the barrier release phase, they will continue to use the
3520  // previous task_team struct(above), until they receive the signal to stop
3521  // checking for tasks (they can't safely reference the kmp_team_t struct,
3522  // which could be reallocated by the master thread). No task teams are formed
3523  // for serialized teams.
3524  if (team->t.t_nproc > 1) {
3525  int other_team = 1 - this_thr->th.th_task_state;
3526  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3527  team->t.t_task_team[other_team] =
3528  __kmp_allocate_task_team(this_thr, team);
3529  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d created second new "
3530  "task_team %p for team %d at parity=%d\n",
3531  __kmp_gtid_from_thread(this_thr),
3532  team->t.t_task_team[other_team],
3533  ((team != NULL) ? team->t.t_id : -1), other_team));
3534  } else { // Leave the old task team struct in place for the upcoming region;
3535  // adjust as needed
3536  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3537  if (!task_team->tt.tt_active ||
3538  team->t.t_nproc != task_team->tt.tt_nproc) {
3539  TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
3540  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3541  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3542  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
3543  team->t.t_nproc);
3544  TCW_4(task_team->tt.tt_active, TRUE);
3545  }
3546  // if team size has changed, the first thread to enable tasking will
3547  // realloc threads_data if necessary
3548  KA_TRACE(20, ("__kmp_task_team_setup: Master T#%d reset next task_team "
3549  "%p for team %d at parity=%d\n",
3550  __kmp_gtid_from_thread(this_thr),
3551  team->t.t_task_team[other_team],
3552  ((team != NULL) ? team->t.t_id : -1), other_team));
3553  }
3554  }
3555 }
3556 
3557 // __kmp_task_team_sync: Propagation of task team data from team to threads
3558 // which happens just after the release phase of a team barrier. This may be
3559 // called by any thread, but only for teams with # threads > 1.
3560 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3561  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3562 
3563  // Toggle the th_task_state field, to switch which task_team this thread
3564  // refers to
3565  this_thr->th.th_task_state = 1 - this_thr->th.th_task_state;
3566  // It is now safe to propagate the task team pointer from the team struct to
3567  // the current thread.
3568  TCW_PTR(this_thr->th.th_task_team,
3569  team->t.t_task_team[this_thr->th.th_task_state]);
3570  KA_TRACE(20,
3571  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
3572  "%p from Team #%d (parity=%d)\n",
3573  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
3574  ((team != NULL) ? team->t.t_id : -1), this_thr->th.th_task_state));
3575 }
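// A minimal sketch of the two-slot parity scheme used above: each thread keeps
// a 0/1 task state, flips it at the barrier, and indexes into a pair of
// task-team slots, so the next region's task team can be prepared while the
// current one drains. Names are illustrative only:
#if 0
struct example_team {
  void *task_team[2]; // one slot per parity
};

static void *example_task_team_sync(example_team *team, int *task_state) {
  *task_state = 1 - *task_state;       // toggle parity at the barrier
  return team->task_team[*task_state]; // adopt the other slot's task team
}
#endif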
3576 
3577 // __kmp_task_team_wait: Master thread waits for outstanding tasks after the
3578 // barrier gather phase. Only called by master thread if #threads in team > 1 or
3579 // if proxy tasks were created.
3580 //
3581 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
3582 // by optionally passing in 0 as the last argument. When wait is zero, the master
3583 // thread does not wait for unfinished_threads to reach 0.
3584 void __kmp_task_team_wait(
3585  kmp_info_t *this_thr,
3586  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
3587  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
3588 
3589  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3590  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
3591 
3592  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
3593  if (wait) {
3594  KA_TRACE(20, ("__kmp_task_team_wait: Master T#%d waiting for all tasks "
3595  "(for unfinished_threads to reach 0) on task_team = %p\n",
3596  __kmp_gtid_from_thread(this_thr), task_team));
3597  // Worker threads may have dropped through to release phase, but could
3598  // still be executing tasks. Wait here for tasks to complete. To avoid
3599  // memory contention, only master thread checks termination condition.
3600  kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
3601  &task_team->tt.tt_unfinished_threads),
3602  0U);
3603  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
3604  }
3605  // Deactivate the old task team, so that the worker threads will stop
3606  // referencing it while spinning.
3607  KA_TRACE(
3608  20,
3609  ("__kmp_task_team_wait: Master T#%d deactivating task_team %p: "
3610  "setting active to false, setting local and team's pointer to NULL\n",
3611  __kmp_gtid_from_thread(this_thr), task_team));
3612  KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
3613  task_team->tt.tt_found_proxy_tasks == TRUE);
3614  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3615  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
3616  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
3617  KMP_MB();
3618 
3619  TCW_PTR(this_thr->th.th_task_team, NULL);
3620  }
3621 }
3622 
3623 // __kmp_tasking_barrier:
3624 // This routine may only be called when __kmp_tasking_mode == tskm_extra_barrier.
3625 // Internal function to execute all tasks prior to a regular barrier or a join
3626 // barrier. It is a full barrier itself, which unfortunately turns regular
3627 // barriers into double barriers and join barriers into 1 1/2 barriers.
3628 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
3629  std::atomic<kmp_uint32> *spin = RCAST(
3630  std::atomic<kmp_uint32> *,
3631  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
3632  int flag = FALSE;
3633  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
3634 
3635 #if USE_ITT_BUILD
3636  KMP_FSYNC_SPIN_INIT(spin, NULL);
3637 #endif /* USE_ITT_BUILD */
3638  kmp_flag_32 spin_flag(spin, 0U);
3639  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
3640  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
3641 #if USE_ITT_BUILD
3642  // TODO: What about itt_sync_obj??
3643  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
3644 #endif /* USE_ITT_BUILD */
3645 
3646  if (TCR_4(__kmp_global.g.g_done)) {
3647  if (__kmp_global.g.g_abort)
3648  __kmp_abort_thread();
3649  break;
3650  }
3651  KMP_YIELD(TRUE);
3652  }
3653 #if USE_ITT_BUILD
3654  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
3655 #endif /* USE_ITT_BUILD */
3656 }
3657 
3658 // __kmp_give_task puts a task into a given thread queue if:
3659 // - the queue for that thread was created
3660 // - there's space in that queue
3661 // Because of this, __kmp_push_task needs to check if there's space after
3662 // getting the lock
3663 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
3664  kmp_int32 pass) {
3665  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
3666  kmp_task_team_t *task_team = taskdata->td_task_team;
3667 
3668  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
3669  taskdata, tid));
3670 
3671  // If task_team is NULL something went really bad...
3672  KMP_DEBUG_ASSERT(task_team != NULL);
3673 
3674  bool result = false;
3675  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
3676 
3677  if (thread_data->td.td_deque == NULL) {
3678  // There's no queue in this thread, go find another one
3679  // We're guaranteed that at least one thread has a queue
3680  KA_TRACE(30,
3681  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
3682  tid, taskdata));
3683  return result;
3684  }
3685 
3686  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3687  TASK_DEQUE_SIZE(thread_data->td)) {
3688  KA_TRACE(
3689  30,
3690  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
3691  taskdata, tid));
3692 
3693  // if this deque is bigger than the pass ratio give a chance to another
3694  // thread
3695  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3696  return result;
3697 
3698  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3699  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3700  TASK_DEQUE_SIZE(thread_data->td)) {
3701  // expand deque to push the task which is not allowed to execute
3702  __kmp_realloc_task_deque(thread, thread_data);
3703  }
3704 
3705  } else {
3706 
3707  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3708 
3709  if (TCR_4(thread_data->td.td_deque_ntasks) >=
3710  TASK_DEQUE_SIZE(thread_data->td)) {
3711  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
3712  "thread %d.\n",
3713  taskdata, tid));
3714 
3715  // if this deque is bigger than the pass ratio give a chance to another
3716  // thread
3717  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
3718  goto release_and_exit;
3719 
3720  __kmp_realloc_task_deque(thread, thread_data);
3721  }
3722  }
3723 
3724  // lock is held here, and there is space in the deque
3725 
3726  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
3727  // Wrap index.
3728  thread_data->td.td_deque_tail =
3729  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
3730  TCW_4(thread_data->td.td_deque_ntasks,
3731  TCR_4(thread_data->td.td_deque_ntasks) + 1);
3732 
3733  result = true;
3734  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
3735  taskdata, tid));
3736 
3737 release_and_exit:
3738  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3739 
3740  return result;
3741 }
3742 
3743 /* The finish of the proxy tasks is divided in two pieces:
3744  - the top half is the one that can be done from a thread outside the team
3745  - the bottom half must be run from a thread within the team
3746 
3747  In order to run the bottom half the task gets queued back into one of the
3748  threads of the team. Once the td_incomplete_child_tasks counter of the parent
3749  is decremented the threads can leave the barriers. So, the bottom half needs
3750  to be queued before the counter is decremented. The top half is therefore
3751  divided in two parts:
3752  - things that can be run before queuing the bottom half
3753  - things that must be run after queuing the bottom half
3754 
3755  This creates a second race as the bottom half can free the task before the
3756  second top half is executed. To avoid this we use the
3757  td_incomplete_child_tasks of the proxy task to synchronize the top and bottom
3758  half. */
3759 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3760  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
3761  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3762  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
3763  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
3764 
3765  taskdata->td_flags.complete = 1; // mark the task as completed
3766 
3767  if (taskdata->td_taskgroup)
3768  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
3769 
3770  // Create an imaginary child for this task so the bottom half cannot
3771  // release the task before we have completed the second top half
3772  KMP_ATOMIC_INC(&taskdata->td_incomplete_child_tasks);
3773 }
3774 
3775 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
3776  kmp_int32 children = 0;
3777 
3778  // Predecrement simulated by "- 1" calculation
3779  children =
3780  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks) - 1;
3781  KMP_DEBUG_ASSERT(children >= 0);
3782 
3783  // Remove the imaginary child
3784  KMP_ATOMIC_DEC(&taskdata->td_incomplete_child_tasks);
3785 }
3786 
3787 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
3788  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3789  kmp_info_t *thread = __kmp_threads[gtid];
3790 
3791  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3792  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
3793  1); // top half must run before bottom half
3794 
3795  // We need to wait to make sure the top half is finished
3796  // Spinning here should be ok as this should happen quickly
3797  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) > 0)
3798  ;
3799 
3800  __kmp_release_deps(gtid, taskdata);
3801  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
3802 }
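// A sketch of the top-half/bottom-half handshake implemented above: the first
// top half adds an extra "imaginary" reference, and the bottom half must not
// tear the task down until the second top half has dropped it. Illustrative
// names, plain std::atomic instead of the KMP_ATOMIC_* macros:
#if 0
#include <atomic>

struct example_proxy {
  std::atomic<int> incomplete_children;
};

static void example_first_top_half(example_proxy *p) {
  p->incomplete_children.fetch_add(1); // imaginary child keeps the task alive
}
static void example_second_top_half(example_proxy *p) {
  p->incomplete_children.fetch_sub(1); // drop the imaginary child
}
static void example_bottom_half(example_proxy *p) {
  while (p->incomplete_children.load(std::memory_order_acquire) > 0)
    ; // spin until the second top half has run
  // now it is safe to release dependences and free the task
}
#endif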
3803 
3812 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
3813  KMP_DEBUG_ASSERT(ptask != NULL);
3814  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3815  KA_TRACE(
3816  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
3817  gtid, taskdata));
3818  __kmp_assert_valid_gtid(gtid);
3819  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3820 
3821  __kmp_first_top_half_finish_proxy(taskdata);
3822  __kmp_second_top_half_finish_proxy(taskdata);
3823  __kmp_bottom_half_finish_proxy(gtid, ptask);
3824 
3825  KA_TRACE(10,
3826  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
3827  gtid, taskdata));
3828 }
3829 
3837 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
3838  KMP_DEBUG_ASSERT(ptask != NULL);
3839  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3840 
3841  KA_TRACE(
3842  10,
3843  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
3844  taskdata));
3845 
3846  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
3847 
3848  __kmp_first_top_half_finish_proxy(taskdata);
3849 
3850  // Enqueue task to complete bottom half completion from a thread within the
3851  // corresponding team
3852  kmp_team_t *team = taskdata->td_team;
3853  kmp_int32 nthreads = team->t.t_nproc;
3854  kmp_info_t *thread;
3855 
3856  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
3857  // but we cannot use __kmp_get_random here
3858  kmp_int32 start_k = 0;
3859  kmp_int32 pass = 1;
3860  kmp_int32 k = start_k;
3861 
3862  do {
3863  // For now we're just linearly trying to find a thread
3864  thread = team->t.t_threads[k];
3865  k = (k + 1) % nthreads;
3866 
3867  // we did a full pass through all the threads
3868  if (k == start_k)
3869  pass = pass << 1;
3870 
3871  } while (!__kmp_give_task(thread, k, ptask, pass));
3872 
3873  __kmp_second_top_half_finish_proxy(taskdata);
3874 
3875  KA_TRACE(
3876  10,
3877  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
3878  taskdata));
3879 }
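// The placement loop above in isolation: walk the threads round-robin and,
// after each full pass, double the "pass" value so __kmp_give_task accepts
// progressively fuller deques. A sketch with an illustrative callback standing
// in for __kmp_give_task:
#if 0
static void example_place_round_robin(int nthreads,
                                      bool (*give)(int tid, int pass)) {
  int start_k = 0, k = start_k, pass = 1;
  for (;;) {
    int tid = k;
    k = (k + 1) % nthreads;
    if (k == start_k)
      pass <<= 1; // completed a full pass; relax the acceptance threshold
    if (give(tid, pass))
      break;
  }
}
#endif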
3880 
3881 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
3882  kmp_task_t *task) {
3883  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
3884  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
3885  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
3886  td->td_allow_completion_event.ed.task = task;
3887  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
3888  }
3889  return &td->td_allow_completion_event;
3890 }
3891 
3892 void __kmp_fulfill_event(kmp_event_t *event) {
3893  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
3894  kmp_task_t *ptask = event->ed.task;
3895  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
3896  bool detached = false;
3897  int gtid = __kmp_get_gtid();
3898 
3899  // The associated task might have completed or could be completing at this
3900  // point.
3901  // We need to take the lock to avoid races
3902  __kmp_acquire_tas_lock(&event->lock, gtid);
3903  if (taskdata->td_flags.proxy == TASK_PROXY) {
3904  detached = true;
3905  } else {
3906 #if OMPT_SUPPORT
3907  // The OMPT event must occur under mutual exclusion,
3908  // otherwise the tool might access ptask after free
3909  if (UNLIKELY(ompt_enabled.enabled))
3910  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
3911 #endif
3912  }
3913  event->type = KMP_EVENT_UNINITIALIZED;
3914  __kmp_release_tas_lock(&event->lock, gtid);
3915 
3916  if (detached) {
3917 #if OMPT_SUPPORT
3918  // We free ptask afterwards and know the task is finished,
3919  // so locking is not necessary
3920  if (UNLIKELY(ompt_enabled.enabled))
3921  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
3922 #endif
3923  // If the task detached complete the proxy task
3924  if (gtid >= 0) {
3925  kmp_team_t *team = taskdata->td_team;
3926  kmp_info_t *thread = __kmp_get_thread();
3927  if (thread->th.th_team == team) {
3928  __kmpc_proxy_task_completed(gtid, ptask);
3929  return;
3930  }
3931  }
3932 
3933  // fallback
3934  __kmpc_proxy_task_completed_ooo(ptask);
3935  }
3936  }
3937 }
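// User-level view of what __kmpc_task_allow_completion_event and
// __kmp_fulfill_event implement: the OpenMP 5.0 'detach' clause plus
// omp_fulfill_event(). A minimal sketch; the second task merely stands in for
// an asynchronous completion callback:
#if 0
#include <omp.h>

void example_detached_task(void) {
#pragma omp parallel
#pragma omp single
  {
    omp_event_handle_t ev;
#pragma omp task detach(ev)
    {
      // The task body returns here, but the task only completes once ev is
      // fulfilled.
    }
#pragma omp task firstprivate(ev)
    { omp_fulfill_event(ev); } // stands in for an async completion callback
#pragma omp taskwait // returns only after both tasks have completed
  }
}
#endif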
3938 
3939 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
3940 // for taskloop
3941 //
3942 // thread: allocating thread
3943 // task_src: pointer to source task to be duplicated
3944 // returns: a pointer to the allocated kmp_task_t structure (task).
3945 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
3946  kmp_task_t *task;
3947  kmp_taskdata_t *taskdata;
3948  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
3949  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
3950  size_t shareds_offset;
3951  size_t task_size;
3952 
3953  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
3954  task_src));
3955  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
3956  TASK_FULL); // it should not be proxy task
3957  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
3958  task_size = taskdata_src->td_size_alloc;
3959 
3960  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
3961  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
3962  task_size));
3963 #if USE_FAST_MEMORY
3964  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
3965 #else
3966  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
3967 #endif /* USE_FAST_MEMORY */
3968  KMP_MEMCPY(taskdata, taskdata_src, task_size);
3969 
3970  task = KMP_TASKDATA_TO_TASK(taskdata);
3971 
3972  // Initialize new task (only specific fields not affected by memcpy)
3973  taskdata->td_task_id = KMP_GEN_TASK_ID();
3974  if (task->shareds != NULL) { // need setup shareds pointer
3975  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
3976  task->shareds = &((char *)taskdata)[shareds_offset];
3977  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
3978  0);
3979  }
3980  taskdata->td_alloc_thread = thread;
3981  taskdata->td_parent = parent_task;
3982  // task inherits the taskgroup from the parent task
3983  taskdata->td_taskgroup = parent_task->td_taskgroup;
3984  // tied task needs to initialize the td_last_tied at creation,
3985  // untied one does this when it is scheduled for execution
3986  if (taskdata->td_flags.tiedness == TASK_TIED)
3987  taskdata->td_last_tied = taskdata;
3988 
3989  // Only need to keep track of child task counts if team parallel and tasking
3990  // not serialized
3991  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
3992  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
3993  if (parent_task->td_taskgroup)
3994  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
3995  // Only need to keep track of allocated child tasks for explicit tasks since
3996  // implicit ones are not deallocated
3997  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
3998  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
3999  }
4000 
4001  KA_TRACE(20,
4002  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4003  thread, taskdata, taskdata->td_parent));
4004 #if OMPT_SUPPORT
4005  if (UNLIKELY(ompt_enabled.enabled))
4006  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4007 #endif
4008  return task;
4009 }
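// The shareds fix-up above in isolation: after byte-copying a block that
// contains a pointer into itself, recompute that pointer from its offset in
// the source block so it points into the copy. Illustrative names only:
#if 0
#include <string.h>

struct example_block {
  char *interior; // points somewhere inside this same allocation
  char payload[64];
};

static void example_dup(example_block *dst, const example_block *src) {
  memcpy(dst, src, sizeof(example_block));
  size_t offset = (size_t)(src->interior - (const char *)src);
  dst->interior = (char *)dst + offset; // re-base into the new block
}
#endif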
4010 
4011 // Routine optionally generated by the compiler for setting the lastprivate flag
4012 // and calling needed constructors for private/firstprivate objects
4013 // (used to form taskloop tasks from pattern task)
4014 // Parameters: dest task, src task, lastprivate flag.
4015 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4016 
4017 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4018 
4019 // class to encapsulate manipulating loop bounds in a taskloop task.
4020 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4021 // the loop bound variables.
4022 class kmp_taskloop_bounds_t {
4023  kmp_task_t *task;
4024  const kmp_taskdata_t *taskdata;
4025  size_t lower_offset;
4026  size_t upper_offset;
4027 
4028 public:
4029  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4030  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4031  lower_offset((char *)lb - (char *)task),
4032  upper_offset((char *)ub - (char *)task) {
4033  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4034  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4035  }
4036  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4037  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4038  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4039  size_t get_lower_offset() const { return lower_offset; }
4040  size_t get_upper_offset() const { return upper_offset; }
4041  kmp_uint64 get_lb() const {
4042  kmp_int64 retval;
4043 #if defined(KMP_GOMP_COMPAT)
4044  // Intel task just returns the lower bound normally
4045  if (!taskdata->td_flags.native) {
4046  retval = *(kmp_int64 *)((char *)task + lower_offset);
4047  } else {
4048  // GOMP task has to take into account the sizeof(long)
4049  if (taskdata->td_size_loop_bounds == 4) {
4050  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4051  retval = (kmp_int64)*lb;
4052  } else {
4053  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4054  retval = (kmp_int64)*lb;
4055  }
4056  }
4057 #else
4058  retval = *(kmp_int64 *)((char *)task + lower_offset);
4059 #endif // defined(KMP_GOMP_COMPAT)
4060  return retval;
4061  }
4062  kmp_uint64 get_ub() const {
4063  kmp_int64 retval;
4064 #if defined(KMP_GOMP_COMPAT)
4065  // Intel task just returns the upper bound normally
4066  if (!taskdata->td_flags.native) {
4067  retval = *(kmp_int64 *)((char *)task + upper_offset);
4068  } else {
4069  // GOMP task has to take into account the sizeof(long)
4070  if (taskdata->td_size_loop_bounds == 4) {
4071  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4072  retval = (kmp_int64)*ub;
4073  } else {
4074  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4075  retval = (kmp_int64)*ub;
4076  }
4077  }
4078 #else
4079  retval = *(kmp_int64 *)((char *)task + upper_offset);
4080 #endif // defined(KMP_GOMP_COMPAT)
4081  return retval;
4082  }
4083  void set_lb(kmp_uint64 lb) {
4084 #if defined(KMP_GOMP_COMPAT)
4085  // Intel task just sets the lower bound normally
4086  if (!taskdata->td_flags.native) {
4087  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4088  } else {
4089  // GOMP task has to take into account the sizeof(long)
4090  if (taskdata->td_size_loop_bounds == 4) {
4091  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4092  *lower = (kmp_uint32)lb;
4093  } else {
4094  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4095  *lower = (kmp_uint64)lb;
4096  }
4097  }
4098 #else
4099  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4100 #endif // defined(KMP_GOMP_COMPAT)
4101  }
4102  void set_ub(kmp_uint64 ub) {
4103 #if defined(KMP_GOMP_COMPAT)
4104  // Intel task just sets the upper bound normally
4105  if (!taskdata->td_flags.native) {
4106  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4107  } else {
4108  // GOMP task has to take into account the sizeof(long)
4109  if (taskdata->td_size_loop_bounds == 4) {
4110  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4111  *upper = (kmp_uint32)ub;
4112  } else {
4113  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4114  *upper = (kmp_uint64)ub;
4115  }
4116  }
4117 #else
4118  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4119 #endif // defined(KMP_GOMP_COMPAT)
4120  }
4121 };
4122 
4123 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4124 //
4125 // loc Source location information
4126 // gtid Global thread ID
4127 // task Pattern task, exposes the loop iteration range
4128 // lb Pointer to loop lower bound in task structure
4129 // ub Pointer to loop upper bound in task structure
4130 // st Loop stride
4131 // ub_glob Global upper bound (used for lastprivate check)
4132 // num_tasks Number of tasks to execute
4133 // grainsize Number of loop iterations per task
4134 // extras Number of chunks with grainsize+1 iterations
4135 // tc Iterations count
4136 // task_dup Tasks duplication routine
4137 // codeptr_ra Return address for OMPT events
4138 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4139  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4140  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4141  kmp_uint64 grainsize, kmp_uint64 extras,
4142  kmp_uint64 tc,
4143 #if OMPT_SUPPORT
4144  void *codeptr_ra,
4145 #endif
4146  void *task_dup) {
4147  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4148  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4149  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4150  // compiler provides global bounds here
4151  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4152  kmp_uint64 lower = task_bounds.get_lb();
4153  kmp_uint64 upper = task_bounds.get_ub();
4154  kmp_uint64 i;
4155  kmp_info_t *thread = __kmp_threads[gtid];
4156  kmp_taskdata_t *current_task = thread->th.th_current_task;
4157  kmp_task_t *next_task;
4158  kmp_int32 lastpriv = 0;
4159 
4160  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4161  KMP_DEBUG_ASSERT(num_tasks > extras);
4162  KMP_DEBUG_ASSERT(num_tasks > 0);
4163  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4164  "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4165  gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st,
4166  task_dup));
4167 
4168  // Launch num_tasks tasks, assign grainsize iterations each task
4169  for (i = 0; i < num_tasks; ++i) {
4170  kmp_uint64 chunk_minus_1;
4171  if (extras == 0) {
4172  chunk_minus_1 = grainsize - 1;
4173  } else {
4174  chunk_minus_1 = grainsize;
4175  --extras; // first extras iterations get bigger chunk (grainsize+1)
4176  }
4177  upper = lower + st * chunk_minus_1;
4178  if (i == num_tasks - 1) {
4179  // schedule the last task, set lastprivate flag if needed
4180  if (st == 1) { // most common case
4181  KMP_DEBUG_ASSERT(upper == *ub);
4182  if (upper == ub_glob)
4183  lastpriv = 1;
4184  } else if (st > 0) { // positive loop stride
4185  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4186  if ((kmp_uint64)st > ub_glob - upper)
4187  lastpriv = 1;
4188  } else { // negative loop stride
4189  KMP_DEBUG_ASSERT(upper + st < *ub);
4190  if (upper - ub_glob < (kmp_uint64)(-st))
4191  lastpriv = 1;
4192  }
4193  }
4194  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4195  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4196  kmp_taskloop_bounds_t next_task_bounds =
4197  kmp_taskloop_bounds_t(next_task, task_bounds);
4198 
4199  // adjust task-specific bounds
4200  next_task_bounds.set_lb(lower);
4201  if (next_taskdata->td_flags.native) {
4202  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4203  } else {
4204  next_task_bounds.set_ub(upper);
4205  }
4206  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4207  // etc.
4208  ptask_dup(next_task, task, lastpriv);
4209  KA_TRACE(40,
4210  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4211  "upper %lld stride %lld, (offsets %p %p)\n",
4212  gtid, i, next_task, lower, upper, st,
4213  next_task_bounds.get_lower_offset(),
4214  next_task_bounds.get_upper_offset()));
4215 #if OMPT_SUPPORT
4216  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4217  codeptr_ra); // schedule new task
4218 #else
4219  __kmp_omp_task(gtid, next_task, true); // schedule new task
4220 #endif
4221  lower = upper + st; // adjust lower bound for the next iteration
4222  }
4223  // free the pattern task and exit
4224  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4225  // do not execute the pattern task, just do internal bookkeeping
4226  __kmp_task_finish<false>(gtid, task, current_task);
4227 }
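// The chunking rule above in one place: tc iterations are split into num_tasks
// chunks, the first 'extras' chunks receiving grainsize+1 iterations and the
// rest grainsize, so tc == num_tasks * grainsize + extras. A standalone sketch
// with illustrative names:
#if 0
static void example_chunks(kmp_uint64 lower, kmp_int64 st, kmp_uint64 num_tasks,
                           kmp_uint64 grainsize, kmp_uint64 extras) {
  for (kmp_uint64 i = 0; i < num_tasks; ++i) {
    kmp_uint64 chunk = grainsize + (i < extras ? 1 : 0);
    kmp_uint64 upper = lower + st * (chunk - 1); // inclusive upper bound
    // task i covers [lower, upper] with stride st
    lower = upper + st;
  }
}
#endif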
4228 
4229 // Structure to keep taskloop parameters for auxiliary task
4230 // kept in the shareds of the task structure.
4231 typedef struct __taskloop_params {
4232  kmp_task_t *task;
4233  kmp_uint64 *lb;
4234  kmp_uint64 *ub;
4235  void *task_dup;
4236  kmp_int64 st;
4237  kmp_uint64 ub_glob;
4238  kmp_uint64 num_tasks;
4239  kmp_uint64 grainsize;
4240  kmp_uint64 extras;
4241  kmp_uint64 tc;
4242  kmp_uint64 num_t_min;
4243 #if OMPT_SUPPORT
4244  void *codeptr_ra;
4245 #endif
4246 } __taskloop_params_t;
4247 
4248 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4249  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4250  kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
4251 #if OMPT_SUPPORT
4252  void *,
4253 #endif
4254  void *);
4255 
4256 // Execute part of the taskloop submitted as a task.
4257 int __kmp_taskloop_task(int gtid, void *ptask) {
4258  __taskloop_params_t *p =
4259  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4260  kmp_task_t *task = p->task;
4261  kmp_uint64 *lb = p->lb;
4262  kmp_uint64 *ub = p->ub;
4263  void *task_dup = p->task_dup;
4264  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4265  kmp_int64 st = p->st;
4266  kmp_uint64 ub_glob = p->ub_glob;
4267  kmp_uint64 num_tasks = p->num_tasks;
4268  kmp_uint64 grainsize = p->grainsize;
4269  kmp_uint64 extras = p->extras;
4270  kmp_uint64 tc = p->tc;
4271  kmp_uint64 num_t_min = p->num_t_min;
4272 #if OMPT_SUPPORT
4273  void *codeptr_ra = p->codeptr_ra;
4274 #endif
4275 #if KMP_DEBUG
4276  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4277  KMP_DEBUG_ASSERT(task != NULL);
4278  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4279  " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4280  gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4281  task_dup));
4282 #endif
4283  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4284  if (num_tasks > num_t_min)
4285  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4286  grainsize, extras, tc, num_t_min,
4287 #if OMPT_SUPPORT
4288  codeptr_ra,
4289 #endif
4290  task_dup);
4291  else
4292  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4293  grainsize, extras, tc,
4294 #if OMPT_SUPPORT
4295  codeptr_ra,
4296 #endif
4297  task_dup);
4298 
4299  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4300  return 0;
4301 }
4302 
4303 // Schedule part of the taskloop as a task,
4304 // execute the rest of the taskloop.
4305 //
4306 // loc Source location information
4307 // gtid Global thread ID
4308 // task Pattern task, exposes the loop iteration range
4309 // lb Pointer to loop lower bound in task structure
4310 // ub Pointer to loop upper bound in task structure
4311 // st Loop stride
4312 // ub_glob Global upper bound (used for lastprivate check)
4313 // num_tasks Number of tasks to execute
4314 // grainsize Number of loop iterations per task
4315 // extras Number of chunks with grainsize+1 iterations
4316 // tc Iterations count
4317 // num_t_min Threshold to launch tasks recursively
4318 // task_dup Tasks duplication routine
4319 // codeptr_ra Return address for OMPT events
4320 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4321  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4322  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4323  kmp_uint64 grainsize, kmp_uint64 extras,
4324  kmp_uint64 tc, kmp_uint64 num_t_min,
4325 #if OMPT_SUPPORT
4326  void *codeptr_ra,
4327 #endif
4328  void *task_dup) {
4329  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4330  KMP_DEBUG_ASSERT(task != NULL);
4331  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4332  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4333  " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n",
4334  gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st,
4335  task_dup));
4336  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4337  kmp_uint64 lower = *lb;
4338  kmp_info_t *thread = __kmp_threads[gtid];
4339  // kmp_taskdata_t *current_task = thread->th.th_current_task;
4340  kmp_task_t *next_task;
4341  size_t lower_offset =
4342  (char *)lb - (char *)task; // remember offset of lb in the task structure
4343  size_t upper_offset =
4344  (char *)ub - (char *)task; // remember offset of ub in the task structure
4345 
4346  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4347  KMP_DEBUG_ASSERT(num_tasks > extras);
4348  KMP_DEBUG_ASSERT(num_tasks > 0);
4349 
4350  // split the loop in two halves
4351  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4352  kmp_uint64 gr_size0 = grainsize;
4353  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4354  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4355  if (n_tsk0 <= extras) {
4356  gr_size0++; // integrate extras into grainsize
4357  ext0 = 0; // no extra iters in 1st half
4358  ext1 = extras - n_tsk0; // remaining extras
4359  tc0 = gr_size0 * n_tsk0;
4360  tc1 = tc - tc0;
4361  } else { // n_tsk0 > extras
4362  ext1 = 0; // no extra iters in 2nd half
4363  ext0 = extras;
4364  tc1 = grainsize * n_tsk1;
4365  tc0 = tc - tc1;
4366  }
4367  ub0 = lower + st * (tc0 - 1);
4368  lb1 = ub0 + st;
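  // Illustrative example (values not from the source): num_tasks = 5,
  // grainsize = 4, extras = 2, tc = 22, lower = 0, st = 1.
  //   n_tsk0 = 2, n_tsk1 = 3; since n_tsk0 <= extras:
  //   gr_size0 = 5, ext0 = 0, ext1 = 0, tc0 = 10, tc1 = 12,
  //   ub0 = 0 + 1 * (10 - 1) = 9, lb1 = 10.
  // Both halves keep the invariant tc == num_tasks * grainsize + extras:
  //   first half 2 * 5 + 0 = 10, second half 3 * 4 + 0 = 12.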
4369 
4370  // create pattern task for 2nd half of the loop
4371  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4372  // adjust lower bound (upper bound is not changed) for the 2nd half
4373  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4374  if (ptask_dup != NULL) // construct firstprivates, etc.
4375  ptask_dup(next_task, task, 0);
4376  *ub = ub0; // adjust upper bound for the 1st half
4377 
4378  // create auxiliary task for 2nd half of the loop
4379  // make sure new task has same parent task as the pattern task
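  // (__kmpc_omp_task_alloc records the calling thread's th_current_task as
  // the parent of the new task, so temporarily point it at the pattern
  // task's parent.)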
4380  kmp_taskdata_t *current_task = thread->th.th_current_task;
4381  thread->th.th_current_task = taskdata->td_parent;
4382  kmp_task_t *new_task =
4383  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4384  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4385  // restore current task
4386  thread->th.th_current_task = current_task;
4387  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4388  p->task = next_task;
4389  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4390  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4391  p->task_dup = task_dup;
4392  p->st = st;
4393  p->ub_glob = ub_glob;
4394  p->num_tasks = n_tsk1;
4395  p->grainsize = grainsize;
4396  p->extras = ext1;
4397  p->tc = tc1;
4398  p->num_t_min = num_t_min;
4399 #if OMPT_SUPPORT
4400  p->codeptr_ra = codeptr_ra;
4401 #endif
4402 
4403 #if OMPT_SUPPORT
4404  // schedule new task with correct return address for OMPT events
4405  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4406 #else
4407  __kmp_omp_task(gtid, new_task, true); // schedule new task
4408 #endif
4409 
4410  // execute the 1st half of current subrange
4411  if (n_tsk0 > num_t_min)
4412  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4413  ext0, tc0, num_t_min,
4414 #if OMPT_SUPPORT
4415  codeptr_ra,
4416 #endif
4417  task_dup);
4418  else
4419  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4420  gr_size0, ext0, tc0,
4421 #if OMPT_SUPPORT
4422  codeptr_ra,
4423 #endif
4424  task_dup);
4425 
4426  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4427 }
4428 
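// Entry point for the taskloop construct.
//
// loc Source location information
// gtid Global thread ID
// task Pattern task, exposes the loop iteration range
// if_val Value of the if clause
// lb Pointer to loop lower bound in task structure
// ub Pointer to loop upper bound in task structure
// st Loop stride
// nogroup Flag, nonzero if the nogroup clause was specified
// sched Schedule kind: 0 none, 1 grainsize, 2 num_tasks
// grainsize Schedule value if specified (grainsize or num_tasks, per sched)
// task_dup Task duplication routine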
4445 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4446  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
4447  int sched, kmp_uint64 grainsize, void *task_dup) {
4448  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4449  KMP_DEBUG_ASSERT(task != NULL);
4450  __kmp_assert_valid_gtid(gtid);
4451  if (nogroup == 0) {
4452 #if OMPT_SUPPORT && OMPT_OPTIONAL
4453  OMPT_STORE_RETURN_ADDRESS(gtid);
4454 #endif
4455  __kmpc_taskgroup(loc, gtid);
4456  }
4457 
4458  // =========================================================================
4459  // calculate loop parameters
4460  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4461  kmp_uint64 tc;
4462  // compiler provides global bounds here
4463  kmp_uint64 lower = task_bounds.get_lb();
4464  kmp_uint64 upper = task_bounds.get_ub();
4465  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4466  kmp_uint64 num_tasks = 0, extras = 0;
4467  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4468  kmp_info_t *thread = __kmp_threads[gtid];
4469  kmp_taskdata_t *current_task = thread->th.th_current_task;
4470 
4471  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4472  "grain %llu(%d), dup %p\n",
4473  gtid, taskdata, lower, upper, st, grainsize, sched, task_dup));
4474 
4475  // compute trip count
4476  if (st == 1) { // most common case
4477  tc = upper - lower + 1;
4478  } else if (st < 0) {
4479  tc = (lower - upper) / (-st) + 1;
4480  } else { // st > 0
4481  tc = (upper - lower) / st + 1;
4482  }
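  // e.g. (illustrative values): lower = 16, upper = 1, st = -5
  //   => tc = (16 - 1) / 5 + 1 = 4  (iterations 16, 11, 6, 1)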
4483  if (tc == 0) {
4484  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid));
4485  // free the pattern task and exit
4486  __kmp_task_start(gtid, task, current_task);
4487  // do not execute anything for zero-trip loop
4488  __kmp_task_finish<false>(gtid, task, current_task);
4489  return;
4490  }
4491 
4492 #if OMPT_SUPPORT && OMPT_OPTIONAL
4493  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4494  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4495  if (ompt_enabled.ompt_callback_work) {
4496  ompt_callbacks.ompt_callback(ompt_callback_work)(
4497  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
4498  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4499  }
4500 #endif
4501 
4502  if (num_tasks_min == 0)
4503  // TODO: can we choose a better default heuristic?
4504  num_tasks_min =
4505  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
4506 
4507  // compute num_tasks/grainsize based on the input provided
4508  switch (sched) {
4509  case 0: // no schedule clause specified; we can choose the default
4510  // let's try to schedule (team_size*10) tasks
4511  grainsize = thread->th.th_team_nproc * 10;
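  // here grainsize temporarily holds the desired number of tasks; the
  // fallthrough into case 2 then interprets it as num_tasks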
4512  KMP_FALLTHROUGH();
4513  case 2: // num_tasks provided
4514  if (grainsize > tc) {
4515  num_tasks = tc; // too big num_tasks requested, adjust values
4516  grainsize = 1;
4517  extras = 0;
4518  } else {
4519  num_tasks = grainsize;
4520  grainsize = tc / num_tasks;
4521  extras = tc % num_tasks;
4522  }
4523  break;
4524  case 1: // grainsize provided
4525  if (grainsize > tc) {
4526  num_tasks = 1; // too big grainsize requested, adjust values
4527  grainsize = tc;
4528  extras = 0;
4529  } else {
4530  num_tasks = tc / grainsize;
4531  // adjust grainsize for balanced distribution of iterations
4532  grainsize = tc / num_tasks;
4533  extras = tc % num_tasks;
4534  }
4535  break;
4536  default:
4537  KMP_ASSERT2(0, "unknown scheduling of taskloop");
4538  }
4539  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
4540  KMP_DEBUG_ASSERT(num_tasks > extras);
4541  KMP_DEBUG_ASSERT(num_tasks > 0);
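  // Illustrative example (values not from the source): sched = 1, tc = 100,
  // grainsize = 30 => num_tasks = 100 / 30 = 3, grainsize rebalanced to
  // 100 / 3 = 33, extras = 1; the asserts hold: 3 * 33 + 1 == 100 and 3 > 1.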
4542  // =========================================================================
4543 
4544  // check the value of the if clause first;
4545  // also require GOMP_taskloop (taskdata->td_flags.native) to take the linear path
4546  if (if_val == 0) { // if(0) specified, mark task as serial
4547  taskdata->td_flags.task_serial = 1;
4548  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
4549  // always start serial tasks linearly
4550  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4551  grainsize, extras, tc,
4552 #if OMPT_SUPPORT
4553  OMPT_GET_RETURN_ADDRESS(0),
4554 #endif
4555  task_dup);
4556  // !taskdata->td_flags.native => currently force linear spawning of tasks
4557  // for GOMP_taskloop
4558  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
4559  KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
4560  "(%lld), grain %llu, extras %llu\n",
4561  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4562  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4563  grainsize, extras, tc, num_tasks_min,
4564 #if OMPT_SUPPORT
4565  OMPT_GET_RETURN_ADDRESS(0),
4566 #endif
4567  task_dup);
4568  } else {
4569  KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
4570  "(%lld), grain %llu, extras %llu\n",
4571  gtid, tc, num_tasks, num_tasks_min, grainsize, extras));
4572  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
4573  grainsize, extras, tc,
4574 #if OMPT_SUPPORT
4575  OMPT_GET_RETURN_ADDRESS(0),
4576 #endif
4577  task_dup);
4578  }
4579 
4580 #if OMPT_SUPPORT && OMPT_OPTIONAL
4581  if (ompt_enabled.ompt_callback_work) {
4582  ompt_callbacks.ompt_callback(ompt_callback_work)(
4583  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
4584  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
4585  }
4586 #endif
4587 
4588  if (nogroup == 0) {
4589 #if OMPT_SUPPORT && OMPT_OPTIONAL
4590  OMPT_STORE_RETURN_ADDRESS(gtid);
4591 #endif
4592  __kmpc_end_taskgroup(loc, gtid);
4593  }
4594  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
4595 }
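
// Illustrative sketch, not part of this file: a compiler lowering of
//   #pragma omp taskloop grainsize(4)
//   for (i = 0; i < 100; ++i) ...
// would roughly allocate a pattern task with __kmpc_omp_task_alloc(), store
// lb = 0, ub = 99, st = 1 into the task structure, and then invoke
//   __kmpc_taskloop(loc, gtid, task, /*if_val=*/1, &lb, &ub, /*st=*/1,
//                   /*nogroup=*/0, /*sched=*/1, /*grainsize=*/4, task_dup);
// i.e. sched == 1 selects the "grainsize provided" branch above. The exact
// outlining of the loop body is compiler specific and only sketched here.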