/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#if ENABLE_LIBOMPTARGET
static void (*tgt_target_nowait_query)(void **);

void __kmp_init_target_task() {
  *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
}
#endif

/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
#if OMPX_TASKGRAPH
static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
int __kmp_taskloop_task(int gtid, void *ptask);
#endif

// returns true if the new task is allowed to execute, false otherwise;
// checks the Task Scheduling Constraint (if requested) and
// mutexinoutset dependencies, if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check if the candidate obeys the Task Scheduling Constraints (TSC):
    // only a descendant of all deferred tied tasks can be scheduled; checking
    // the last one is enough, as it in turn is a descendant of all the others
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check that the task is not suspended on a barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
#if OMPX_TASKGRAPH
  if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
#else
  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
#endif
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previous locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks acquired successfully
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}
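
// Illustrative sketch (not part of the runtime): the TSC walk above can be
// pictured as climbing tasknew's ancestor chain until it either meets the
// last deferred tied task or passes its nesting level, e.g.:
//
//   current (td_last_tied) at level 2; tasknew->td_parent at level 4
//   walk: level 4 -> level 3 -> level 2
//   if the level-2 ancestor == current, tasknew may be scheduled;
//   otherwise scheduling it would violate the Task Scheduling Constraint.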

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}
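
// Worked example (illustrative): for a deque of size 4 with head == 2,
// tail == 2 and ntasks == 4, the tasks stored at indices 2, 3, 0, 1 are copied
// in that order to indices 0..3 of the new deque of size 8, after which
// head == 0 and tail == 4; td_deque_ntasks is left unchanged.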

static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
  kmp_thread_data_t *thread_data = &l->td;
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  thread_data->td.td_deque_last_stolen = -1;
  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
                "for thread_data %p\n",
                __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
  return l;
}

// The function finds the deque of priority tasks with the given priority, or
// allocates a new deque and puts it into the sorted (high -> low) list of
// deques. Deques of non-default priority tasks are shared between all threads
// in the team, as opposed to per-thread deques of tasks with default priority.
// The function is called under the lock task_team->tt.tt_task_pri_lock.
static kmp_thread_data_t *
__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
  kmp_thread_data_t *thread_data;
  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
  if (lst->priority == pri) {
    // Found queue of tasks with given priority.
    thread_data = &lst->td;
  } else if (lst->priority < pri) {
    // All current priority queues contain tasks with lower priority.
    // Allocate a new one for tasks with the given priority.
    kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
    thread_data = &list->td;
    list->priority = pri;
    list->next = lst;
    task_team->tt.tt_task_pri_list = list;
  } else { // task_team->tt.tt_task_pri_list->priority > pri
    kmp_task_pri_t *next_queue = lst->next;
    while (next_queue && next_queue->priority > pri) {
      lst = next_queue;
      next_queue = lst->next;
    }
    // lst->priority > pri && (next == NULL || pri >= next->priority)
    if (next_queue == NULL) {
      // No queue with priority pri; need to allocate a new one.
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = NULL;
      lst->next = list;
    } else if (next_queue->priority == pri) {
      // Found queue of tasks with given priority.
      thread_data = &next_queue->td;
    } else { // lst->priority > pri > next->priority
      // insert the newly allocated queue between the existing queues
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = next_queue;
      lst->next = list;
    }
  }
  return thread_data;
}
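
// Illustrative sketch (not part of the runtime): the priority list is kept
// sorted from high to low, so looking up priority 7 in a list 9 -> 5 -> 1
// stops at the 9 -> 5 link and splices a new node in between, giving
// 9 -> 7 -> 5 -> 1. Lookups and insertions are linear in the number of
// distinct priorities, which is expected to be small.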

// __kmp_push_priority_task: Add a task to the team's priority task deque
static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
                                          kmp_taskdata_t *taskdata,
                                          kmp_task_team_t *task_team,
                                          kmp_int32 pri) {
  kmp_thread_data_t *thread_data = NULL;
  KA_TRACE(20,
           ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
            gtid, taskdata, pri));

  // Find task queue specific to priority value
  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
  if (UNLIKELY(lst == NULL)) {
    __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
    if (task_team->tt.tt_task_pri_list == NULL) {
      // List of queues is still empty, allocate one.
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = NULL;
      task_team->tt.tt_task_pri_list = list;
    } else {
      // Another thread initialized a queue. Check if it fits and get
      // thread_data.
      thread_data = __kmp_get_priority_deque_data(task_team, pri);
    }
    __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
  } else {
    if (lst->priority == pri) {
      // Found queue of tasks with given priority.
      thread_data = &lst->td;
    } else {
      __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
      thread_data = __kmp_get_priority_deque_data(task_team, pri);
      __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
    }
  }
  KMP_DEBUG_ASSERT(thread_data);

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
      KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      // expand deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }
  }
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));
  // Push taskdata.
  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
                "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  task_team->tt.tt_num_task_pri++; // atomic inc
  return TASK_SUCCESSFULLY_PUSHED;
}
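
// Note (summary, not normative): like __kmp_push_task below, a full priority
// deque is either grown (when the task may not be executed directly because of
// the Task Scheduling Constraint or unavailable mutexinoutset locks) or the
// push is rejected with TASK_NOT_PUSHED so the caller executes the task
// immediately (task throttling).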

// __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  // If we encounter a hidden helper task, and the current thread is not a
  // hidden helper thread, we have to give the task to a hidden helper thread,
  // starting from its shadow one.
  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
               !KMP_HIDDEN_HELPER_THREAD(gtid))) {
    kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
    __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
    // Signal the hidden helper threads.
    __kmp_hidden_helper_worker_thread_signal();
    return TASK_SUCCESSFULLY_PUSHED;
  }

  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // an untied task needs to increment its counter so that the task structure
    // is not freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (UNLIKELY(taskdata->td_flags.task_serial)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
      __kmp_max_task_priority > 0) {
    int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
    return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
  }

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only the owner can allocate. If the task is
  // hidden_helper, we don't need it either because we have already initialized
  // the deque for hidden helper thread data.
  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from a thread outside of
    // OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread but the calling thread can add tasks
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}
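
// Illustrative call sequence (a sketch under the usual compiler lowering; the
// local names are hypothetical): a deferred "#pragma omp task" is typically
// emitted as
//
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, flags, sizeof_task,
//                                         sizeof_shareds, &task_entry);
//   /* copy firstprivates / fill in t->shareds here */
//   __kmpc_omp_task(&loc, gtid, t); // eventually reaches __kmp_push_task
//
// __kmpc_omp_task is defined later in this file.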

// __kmp_pop_current_task_from_thread: restore the current task in the given
// thread when the team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up the current task in the given
// thread for a new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // the current task of the thread is the parent of the newly created implicit
  // tasks of the new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}
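
// Note (summary): for tid 0 the previous current task becomes the parent of
// the team's implicit task 0, so the task chain survives nested parallel
// regions; the other implicit tasks simply share implicit task 0's parent.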

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
    if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
        taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
      status = ompt_task_cancel;
    }

    /* let OMPT know that we're returning to the callee task */
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
  }
}
#endif
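
// Illustrative sketch (not part of the runtime): an OMPT tool receives the
// task-schedule events built above through a callback it registered itself,
// roughly like the hypothetical tool code below:
//
//   static void my_task_schedule(ompt_data_t *prior,
//                                ompt_task_status_t status,
//                                ompt_data_t *next) { /* tool bookkeeping */ }
//   // during ompt_initialize, using the set_callback entry point obtained
//   // from the lookup function:
//   //   ompt_set_callback(ompt_callback_task_schedule,
//   //                     (ompt_callback_t)my_task_schedule);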

template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              OMPT_FRAME_FLAGS_APP;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
#ifdef __s390x__
// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
// In order for it to work correctly, the caller also needs to be compiled with
// backchain. If a caller is compiled without backchain,
// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
// crash.
__attribute__((target("backchain")))
#endif
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}
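
// Illustrative lowering sketch (hedged; exact codegen is up to the compiler):
// an undeferred task such as "#pragma omp task if(0)" is typically emitted as
//
//   __kmpc_omp_task_begin_if0(&loc, gtid, t);
//   task_entry(gtid, t);                  // run the task body inline
//   __kmpc_omp_task_complete_if0(&loc, gtid, t);
//
// where t was produced by __kmpc_omp_task_alloc.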

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
  // Clear data to not be re-used later by mistake.
  task->data1.destructors = NULL;
  task->data2.priority = 0;

  taskdata->td_flags.freed = 1;
#if OMPX_TASKGRAPH
  // do not free tasks in taskgraph
  if (!taskdata->is_taskgraph) {
#endif
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
    __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
    __kmp_thread_free(thread, taskdata);
#endif
#if OMPX_TASKGRAPH
  } else {
    taskdata->td_flags.complete = 0;
    taskdata->td_flags.started = 0;
    taskdata->td_flags.freed = 0;
    taskdata->td_flags.executing = 0;
    taskdata->td_flags.task_serial =
        (taskdata->td_parent->td_flags.final ||
         taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);

    // taskdata->td_allow_completion_event.pending_events_count = 1;
    KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
    KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
    // start at one because counts current task and children
    KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  }
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and its ancestors that
// have no children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}
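
// Note (summary): td_allocated_child_tasks starts at 1 for the task itself, so
// an explicit task is only freed once it has completed *and* all of its
// allocated children have been freed; the loop above then retries the same
// test on each parent, stopping at implicit tasks or in serialized teams.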

// Only need to keep track of child task counts if any of the following:
// 1. the team is parallel and tasking is not serialized;
// 2. it is a proxy, detachable, or hidden helper task;
// 3. the children counter of its parent task is greater than 0.
// The reason for the 3rd one is a serialized team that encountered a detached
// or hidden helper task T. In this case, the execution of T is still deferred,
// and it is also possible that a regular task depends on T. If we don't track
// the children in this case, task synchronization will be broken.
static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
  kmp_tasking_flags_t flags = taskdata->td_flags;
  bool ret = !(flags.team_serial || flags.tasking_ser);
  ret = ret || flags.proxy == TASK_PROXY ||
        flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
  ret = ret ||
        KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
#if OMPX_TASKGRAPH
  if (taskdata->td_taskgroup && taskdata->is_taskgraph)
    ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
#endif
  return ret;
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled != 0
// the version with ompt=false is inlined, allowing all OMPT code to be
// optimized away in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
#if OMPX_TASKGRAPH
  // to avoid a segfault when we need to access taskdata->td_flags after it has
  // been freed when using a vanilla taskloop
  bool is_taskgraph;
#endif
#if KMP_DEBUG
  kmp_int32 children = 0;
#endif
  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

#if OMPX_TASKGRAPH
  is_taskgraph = taskdata->is_taskgraph;
#endif

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the task's destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  bool completed = true;
  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task that has not completed, we report the task
        // switch here; the later omp_fulfill_event signals completion.
        // Locking is necessary to avoid a race with ompt_task_late_fulfill.
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        completed = false;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  // Tasks with valid target async handles must be re-enqueued.
  if (taskdata->td_target_data.async_handle != NULL) {
    // Note: no need to translate gtid to its shadow. If the current thread is a
    // hidden helper one, then the gtid is already correct. Otherwise, hidden
    // helper threads are disabled, and gtid refers to an OpenMP thread.
#if OMPT_SUPPORT
    if (ompt) {
      __ompt_task_finish(task, resumed_task, ompt_task_switch);
    }
#endif
    __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
    if (KMP_HIDDEN_HELPER_THREAD(gtid))
      __kmp_hidden_helper_worker_thread_signal();
    completed = false;
  }

  if (completed) {
    taskdata->td_flags.complete = 1; // mark the task as completed
#if OMPX_TASKGRAPH
    taskdata->td_flags.onced = 1; // mark the task as ran once already
#endif

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif
    // TODO: What would be the balance between the conditions in the function
    // and an atomic operation?
    if (__kmp_track_children_task(taskdata)) {
      __kmp_release_deps(gtid, taskdata);
      // Predecrement simulated by "- 1" calculation
#if KMP_DEBUG
      children = -1 +
#endif
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
      KMP_DEBUG_ASSERT(children >= 0);
#if OMPX_TASKGRAPH
      if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
#else
      if (taskdata->td_taskgroup)
#endif
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
                             task_team->tt.tt_hidden_helper_task_encountered)) {
      // if we found proxy or hidden helper tasks there could exist a dependency
      // chain with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task

    // Decrement the counter of hidden helper tasks to be executed.
    if (taskdata->td_flags.hidden_helper) {
      // Hidden helper tasks can only be executed by hidden helper threads.
      KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
      KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
    }
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (completed)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

#if OMPX_TASKGRAPH
  if (is_taskgraph && __kmp_track_children_task(taskdata) &&
      taskdata->td_taskgroup) {
    // TDG: we only release taskgroup barrier here because
    // free_task_and_ancestors will call
    // __kmp_free_task, which resets all task parameters such as
    // taskdata->started, etc. If we release the barrier earlier, these
    // parameters could be read before being reset. This is not an issue for
    // non-TDG implementation because we never reuse a task(data) structure
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
  }
#endif

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}
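
// Note (summary): __kmp_task_finish only frees the task when "completed" is
// still true, i.e. the task was neither detached (allow-completion event not
// yet fulfilled) nor re-enqueued because of a pending target async handle; in
// those cases ownership passes to omp_fulfill_event / the re-enqueued proxy
// path instead.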

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  KMP_DEBUG_ASSERT(gtid >= 0);
  // this routine will provide the task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
  }
#endif

  return;
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVs. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
  // in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;
#if OMPX_TASKGRAPH
  task->td_flags.onced = 0;
#endif

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}

// __kmp_finish_implicit_task: Release resources associated with implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the next
// parallel region.
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
#if OMPX_TASKGRAPH
    task->td_flags.onced = 1;
#endif
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when they are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}

// Round a size up to a multiple of val (which must be a power of two): used to
// insert padding between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val
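
// Worked example: __kmp_round_up_to_val(30, sizeof(kmp_uint64)) == 32, while
// __kmp_round_up_to_val(32, sizeof(kmp_uint64)) == 32 (already aligned).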

// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the 'new'
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
    __kmp_middle_initialize();

  if (flags->hidden_helper) {
    if (__kmp_enable_hidden_helper) {
      if (!TCR_4(__kmp_init_hidden_helper))
        __kmp_hidden_helper_initialize();
    } else {
      // If the hidden helper task is not enabled, reset the flag to FALSE.
      flags->hidden_helper = FALSE;
    }
  }

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  KMP_DEBUG_ASSERT(parent_task);
  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // Untied task encountered causes the TSC algorithm to check entire deque of
    // the victim thread. If no untied task encountered, then checking the head
    // of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup when that happens is too late.
  if (UNLIKELY(flags->proxy == TASK_PROXY ||
               flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
         setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(thread, team);
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
        task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
    if (flags->hidden_helper &&
        task_team->tt.tt_hidden_helper_task_encountered == FALSE)
      TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
  }
1298 
1299  // Calculate shared structure offset including padding after kmp_task_t struct
1300  // to align pointers in shared struct
1301  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1302  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));
1303 
1304  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1305  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1306  shareds_offset));
1307  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1308  sizeof_shareds));
1309 
1310  // Avoid double allocation here by combining shareds with taskdata
1311 #if USE_FAST_MEMORY
1312  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1313  sizeof_shareds);
1314 #else /* ! USE_FAST_MEMORY */
1315  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1316  sizeof_shareds);
1317 #endif /* USE_FAST_MEMORY */
1318 
1319  task = KMP_TASKDATA_TO_TASK(taskdata);
1320 
1321 // Make sure task & taskdata are aligned appropriately
1322 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
1323  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1324  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1325 #else
1326  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1327  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1328 #endif
1329  if (sizeof_shareds > 0) {
1330  // Avoid double allocation here by combining shareds with taskdata
1331  task->shareds = &((char *)taskdata)[shareds_offset];
1332  // Make sure shareds struct is aligned to pointer size
1333  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1334  0);
1335  } else {
1336  task->shareds = NULL;
1337  }
1338  task->routine = task_entry;
1339  task->part_id = 0; // AC: Always start with 0 part id
1340 
1341  taskdata->td_task_id = KMP_GEN_TASK_ID();
1342  taskdata->td_team = thread->th.th_team;
1343  taskdata->td_alloc_thread = thread;
1344  taskdata->td_parent = parent_task;
1345  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1346  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1347  taskdata->td_ident = loc_ref;
1348  taskdata->td_taskwait_ident = NULL;
1349  taskdata->td_taskwait_counter = 0;
1350  taskdata->td_taskwait_thread = 0;
1351  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1352  // avoid copying icvs for proxy tasks
1353  if (flags->proxy == TASK_FULL)
1354  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1355 
1356  taskdata->td_flags = *flags;
1357  taskdata->td_task_team = thread->th.th_task_team;
1358  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1359  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1360  // If it is hidden helper task, we need to set the team and task team
1361  // correspondingly.
1362  if (flags->hidden_helper) {
1363  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1364  taskdata->td_team = shadow_thread->th.th_team;
1365  taskdata->td_task_team = shadow_thread->th.th_task_team;
1366  }
1367 
1368  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1369  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1370 
1371  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1372  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1373 
1374  // GEH - Note we serialize the task if the team is serialized to make sure
1375  // implicit parallel region tasks are not left until program termination to
1376  // execute. Also, it helps locality to execute immediately.
1377 
1378  taskdata->td_flags.task_serial =
1379  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1380  taskdata->td_flags.tasking_ser || flags->merged_if0);
1381 
1382  taskdata->td_flags.started = 0;
1383  taskdata->td_flags.executing = 0;
1384  taskdata->td_flags.complete = 0;
1385  taskdata->td_flags.freed = 0;
1386 #if OMPX_TASKGRAPH
1387  taskdata->td_flags.onced = 0;
1388  taskdata->is_taskgraph = 0;
1389  taskdata->tdg = nullptr;
1390 #endif
1391  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1392  // start at one because the count includes the current task and its children
1393  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1394  taskdata->td_taskgroup =
1395  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1396  taskdata->td_dephash = NULL;
1397  taskdata->td_depnode = NULL;
1398  taskdata->td_target_data.async_handle = NULL;
1399  if (flags->tiedness == TASK_UNTIED)
1400  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1401  else
1402  taskdata->td_last_tied = taskdata;
1403  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1404 #if OMPT_SUPPORT
1405  if (UNLIKELY(ompt_enabled.enabled))
1406  __ompt_task_init(taskdata, gtid);
1407 #endif
1408  // TODO: What would be the balance between the conditions in the function and
1409  // an atomic operation?
1410  if (__kmp_track_children_task(taskdata)) {
1411  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1412  if (parent_task->td_taskgroup)
1413  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1414  // Only need to keep track of allocated child tasks for explicit tasks,
1415  // since implicit tasks are not deallocated
1416  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1417  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1418  }
1419  if (flags->hidden_helper) {
1420  taskdata->td_flags.task_serial = FALSE;
1421  // Increment the number of hidden helper tasks to be executed
1422  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1423  }
1424  }
1425 
1426 #if OMPX_TASKGRAPH
1427  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1428  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1429  (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1430  taskdata->is_taskgraph = 1;
1431  taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1432  taskdata->td_task_id = KMP_GEN_TASK_ID();
1433  taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1434  }
1435 #endif
1436  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1437  gtid, taskdata, taskdata->td_parent));
1438 
1439  return task;
1440 }
1441 
1442 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1443  kmp_int32 flags, size_t sizeof_kmp_task_t,
1444  size_t sizeof_shareds,
1445  kmp_routine_entry_t task_entry) {
1446  kmp_task_t *retval;
1447  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1448  __kmp_assert_valid_gtid(gtid);
1449  input_flags->native = FALSE;
1450  // __kmp_task_alloc() sets up all other runtime flags
1451  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1452  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1453  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1454  input_flags->proxy ? "proxy" : "",
1455  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1456  sizeof_shareds, task_entry));
1457 
1458  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1459  sizeof_shareds, task_entry);
1460 
1461  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1462 
1463  return retval;
1464 }
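
// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the runtime): roughly the
// call sequence a compiler emits for "#pragma omp task" over a shared int.
// The names my_shareds, my_task_entry and emit_task are hypothetical, and the
// exact shareds/privates layout is a compiler implementation detail.
#if 0
struct my_shareds { int *px; }; // hypothetical: pointer to the shared variable

static kmp_int32 my_task_entry(kmp_int32 gtid, void *part) {
  kmp_task_t *task = (kmp_task_t *)part; // the runtime passes the task thunk
  my_shareds *sh = (my_shareds *)task->shareds;
  *sh->px += 1; // hypothetical task body
  return 0;
}

static void emit_task(ident_t *loc, kmp_int32 gtid, int *px) {
  // flags bit 0 set => tied task (TASK_TIED); __kmp_task_alloc() fills in the
  // remaining runtime flags.
  kmp_task_t *t = __kmpc_omp_task_alloc(loc, gtid, /*flags=*/1,
                                        sizeof(kmp_task_t), sizeof(my_shareds),
                                        &my_task_entry);
  ((my_shareds *)t->shareds)->px = px;
  __kmpc_omp_task(loc, gtid, t); // defer if possible, else run immediately
}
#endif
// ---------------------------------------------------------------------------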
1465 
1466 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1467  kmp_int32 flags,
1468  size_t sizeof_kmp_task_t,
1469  size_t sizeof_shareds,
1470  kmp_routine_entry_t task_entry,
1471  kmp_int64 device_id) {
1472  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1473  // target tasks are untied, as defined in the OpenMP specification
1474  input_flags.tiedness = TASK_UNTIED;
1475  input_flags.target = 1;
1476 
1477  if (__kmp_enable_hidden_helper)
1478  input_flags.hidden_helper = TRUE;
1479 
1480  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1481  sizeof_shareds, task_entry);
1482 }
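
// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the runtime): how this
// target entry point is typically driven. A "#pragma omp target ... nowait"
// region is packaged as a deferred task whose entry routine performs the
// offload; the wrapper above forces the task untied and, when hidden helper
// threads are enabled, routes it to the hidden helper team. The name
// emit_target_nowait and the zero device_id below are hypothetical.
#if 0
static void emit_target_nowait(ident_t *loc, kmp_int32 gtid,
                               kmp_routine_entry_t offload_entry) {
  kmp_task_t *t = __kmpc_omp_target_task_alloc(loc, gtid, /*flags=*/0,
                                               sizeof(kmp_task_t),
                                               /*sizeof_shareds=*/0,
                                               offload_entry, /*device_id=*/0);
  __kmpc_omp_task(loc, gtid, t);
}
#endif
// ---------------------------------------------------------------------------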
1483 
1495 // __kmpc_omp_reg_task_with_affinity: register the affinity information
1496 // attached to a task with its task data structure (currently a no-op stub).
1497 kmp_int32
1498 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1499  kmp_task_t *new_task, kmp_int32 naffins,
1500  kmp_task_affinity_info_t *affin_list) {
1501  return 0;
1502 }
1503 
1504 // __kmp_invoke_task: invoke the specified task
1505 //
1506 // gtid: global thread ID of caller
1507 // task: the task to invoke
1508 // current_task: the task to resume after task invocation
1509 #ifdef __s390x__
1510 __attribute__((target("backchain")))
1511 #endif
1512 static void
1513 __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1514  kmp_taskdata_t *current_task) {
1515  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1516  kmp_info_t *thread;
1517  int discard = 0 /* false */;
1518  KA_TRACE(
1519  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1520  gtid, taskdata, current_task));
1521  KMP_DEBUG_ASSERT(task);
1522  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1523  taskdata->td_flags.complete == 1)) {
1524  // This is a proxy task that was already completed but it needs to run
1525  // its bottom-half finish
1526  KA_TRACE(
1527  30,
1528  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1529  gtid, taskdata));
1530 
1531  __kmp_bottom_half_finish_proxy(gtid, task);
1532 
1533  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1534  "proxy task %p, resuming task %p\n",
1535  gtid, taskdata, current_task));
1536 
1537  return;
1538  }
1539 
1540 #if OMPT_SUPPORT
1541  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1542  // does not execute code.
1543  ompt_thread_info_t oldInfo;
1544  if (UNLIKELY(ompt_enabled.enabled)) {
1545  // Store the thread's state and restore it after the task
1546  thread = __kmp_threads[gtid];
1547  oldInfo = thread->th.ompt_thread_info;
1548  thread->th.ompt_thread_info.wait_id = 0;
1549  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1550  ? ompt_state_work_serial
1551  : ompt_state_work_parallel;
1552  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1553  }
1554 #endif
1555 
1556  // Proxy tasks are not handled by the runtime
1557  if (taskdata->td_flags.proxy != TASK_PROXY) {
1558  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1559  }
1560 
1561  // TODO: cancel tasks if the parallel region has also been cancelled
1562  // TODO: check if this sequence can be hoisted above __kmp_task_start
1563  // if cancellation has been enabled for this run ...
1564  if (UNLIKELY(__kmp_omp_cancellation)) {
1565  thread = __kmp_threads[gtid];
1566  kmp_team_t *this_team = thread->th.th_team;
1567  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1568  if ((taskgroup && taskgroup->cancel_request) ||
1569  (this_team->t.t_cancel_request == cancel_parallel)) {
1570 #if OMPT_SUPPORT && OMPT_OPTIONAL
1571  ompt_data_t *task_data;
1572  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1573  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1574  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1575  task_data,
1576  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1577  : ompt_cancel_parallel) |
1578  ompt_cancel_discarded_task,
1579  NULL);
1580  }
1581 #endif
1582  KMP_COUNT_BLOCK(TASK_cancelled);
1583  // this task belongs to a task group and we need to cancel it
1584  discard = 1 /* true */;
1585  }
1586  }
1587 
1588  // Invoke the task routine and pass in relevant data.
1589  // Thunks generated by gcc take a different argument list.
1590  if (!discard) {
1591  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1592  taskdata->td_last_tied = current_task->td_last_tied;
1593  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1594  }
1595 #if KMP_STATS_ENABLED
1596  KMP_COUNT_BLOCK(TASK_executed);
1597  switch (KMP_GET_THREAD_STATE()) {
1598  case FORK_JOIN_BARRIER:
1599  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1600  break;
1601  case PLAIN_BARRIER:
1602  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1603  break;
1604  case TASKYIELD:
1605  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1606  break;
1607  case TASKWAIT:
1608  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1609  break;
1610  case TASKGROUP:
1611  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1612  break;
1613  default:
1614  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1615  break;
1616  }
1617 #endif // KMP_STATS_ENABLED
1618 
1619 // OMPT task begin
1620 #if OMPT_SUPPORT
1621  if (UNLIKELY(ompt_enabled.enabled))
1622  __ompt_task_start(task, current_task, gtid);
1623 #endif
1624 #if OMPT_SUPPORT && OMPT_OPTIONAL
1625  if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1626  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1627  ompt_data_t instance = ompt_data_none;
1628  instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1629  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1630  ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1631  &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1632  ompt_dispatch_taskloop_chunk, instance);
1633  taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1634  }
1635 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1636 
1637 #if OMPD_SUPPORT
1638  if (ompd_state & OMPD_ENABLE_BP)
1639  ompd_bp_task_begin();
1640 #endif
1641 
1642 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1643  kmp_uint64 cur_time;
1644  kmp_int32 kmp_itt_count_task =
1645  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1646  current_task->td_flags.tasktype == TASK_IMPLICIT;
1647  if (kmp_itt_count_task) {
1648  thread = __kmp_threads[gtid];
1649  // Time outer level explicit task on barrier for adjusting imbalance time
1650  if (thread->th.th_bar_arrive_time)
1651  cur_time = __itt_get_timestamp();
1652  else
1653  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1654  }
1655  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1656 #endif
1657 
1658 #if ENABLE_LIBOMPTARGET
1659  if (taskdata->td_target_data.async_handle != NULL) {
1660  // If we have a valid target async handle, that means that we have already
1661  // executed the task routine once. We must query for the handle completion
1662  // instead of re-executing the routine.
1663  KMP_ASSERT(tgt_target_nowait_query);
1664  tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1665  } else
1666 #endif
1667  if (task->routine != NULL) {
1668 #ifdef KMP_GOMP_COMPAT
1669  if (taskdata->td_flags.native) {
1670  ((void (*)(void *))(*(task->routine)))(task->shareds);
1671  } else
1672 #endif /* KMP_GOMP_COMPAT */
1673  {
1674  (*(task->routine))(gtid, task);
1675  }
1676  }
1677  KMP_POP_PARTITIONED_TIMER();
1678 
1679 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1680  if (kmp_itt_count_task) {
1681  // Barrier imbalance - adjust arrive time with the task duration
1682  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1683  }
1684  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1685  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1686 #endif
1687  }
1688 
1689 #if OMPD_SUPPORT
1690  if (ompd_state & OMPD_ENABLE_BP)
1691  ompd_bp_task_end();
1692 #endif
1693 
1694  // Proxy tasks are not handled by the runtime
1695  if (taskdata->td_flags.proxy != TASK_PROXY) {
1696 #if OMPT_SUPPORT
1697  if (UNLIKELY(ompt_enabled.enabled)) {
1698  thread->th.ompt_thread_info = oldInfo;
1699  if (taskdata->td_flags.tiedness == TASK_TIED) {
1700  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1701  }
1702  __kmp_task_finish<true>(gtid, task, current_task);
1703  } else
1704 #endif
1705  __kmp_task_finish<false>(gtid, task, current_task);
1706  }
1707 #if OMPT_SUPPORT
1708  else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1709  __ompt_task_finish(task, current_task, ompt_task_switch);
1710  }
1711 #endif
1712 
1713  KA_TRACE(
1714  30,
1715  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1716  gtid, taskdata, current_task));
1717  return;
1718 }
1719 
1720 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1721 //
1722 // loc_ref: location of original task pragma (ignored)
1723 // gtid: Global Thread ID of encountering thread
1724 // new_task: task thunk allocated by __kmpc_omp_task_alloc() for the ''new task''
1725 // Returns:
1726 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1727 // be resumed later.
1728 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1729 // resumed later.
1730 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1731  kmp_task_t *new_task) {
1732  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1733 
1734  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1735  loc_ref, new_taskdata));
1736 
1737 #if OMPT_SUPPORT
1738  kmp_taskdata_t *parent;
1739  if (UNLIKELY(ompt_enabled.enabled)) {
1740  parent = new_taskdata->td_parent;
1741  if (ompt_enabled.ompt_callback_task_create) {
1742  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1743  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1744  &(new_taskdata->ompt_task_info.task_data),
1745  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1746  OMPT_GET_RETURN_ADDRESS(0));
1747  }
1748  }
1749 #endif
1750 
1751  /* Should we execute the new task or queue it? For now, let's just always try
1752  to queue it. If the queue fills up, then we'll execute it. */
1753 
1754  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1755  { // Execute this task immediately
1756  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1757  new_taskdata->td_flags.task_serial = 1;
1758  __kmp_invoke_task(gtid, new_task, current_task);
1759  }
1760 
1761  KA_TRACE(
1762  10,
1763  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1764  "loc=%p task=%p, return: TASK_CURRENT_NOT_QUEUED\n",
1765  gtid, loc_ref, new_taskdata));
1766 
1767 #if OMPT_SUPPORT
1768  if (UNLIKELY(ompt_enabled.enabled)) {
1769  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1770  parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1771  }
1772 #endif
1773  return TASK_CURRENT_NOT_QUEUED;
1774 }
1775 
1776 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1777 //
1778 // gtid: Global Thread ID of encountering thread
1779 // new_task: non-thread-switchable task thunk allocated by __kmpc_omp_task_alloc()
1780 // serialize_immediate: if TRUE then if the task is executed immediately its
1781 // execution will be serialized
1782 // Returns:
1783 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1784 // be resumed later.
1785 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1786 // resumed later.
1787 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1788  bool serialize_immediate) {
1789  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1790 
1791 #if OMPX_TASKGRAPH
1792  if (new_taskdata->is_taskgraph &&
1793  __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
1794  kmp_tdg_info_t *tdg = new_taskdata->tdg;
1795  // extend the record_map if needed
1796  if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
1797  __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
1798  // map_size could have been updated by another thread in the case of a
1799  // recursive taskloop
1800  if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
1801  kmp_uint old_size = tdg->map_size;
1802  kmp_uint new_size = old_size * 2;
1803  kmp_node_info_t *old_record = tdg->record_map;
1804  kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
1805  new_size * sizeof(kmp_node_info_t));
1806 
1807  KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
1808  tdg->record_map = new_record;
1809 
1810  __kmp_free(old_record);
1811 
1812  for (kmp_int i = old_size; i < new_size; i++) {
1813  kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
1814  __kmp_successors_size * sizeof(kmp_int32));
1815  new_record[i].task = nullptr;
1816  new_record[i].successors = successorsList;
1817  new_record[i].nsuccessors = 0;
1818  new_record[i].npredecessors = 0;
1819  new_record[i].successors_size = __kmp_successors_size;
1820  KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
1821  }
1822  // update the size at the end, so that we avoid other threads using
1823  // old_record while map_size is already updated
1824  tdg->map_size = new_size;
1825  }
1826  __kmp_release_bootstrap_lock(&tdg->graph_lock);
1827  }
1828  // record a task
1829  if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
1830  tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
1831  tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
1832  new_taskdata->td_parent;
1833  KMP_ATOMIC_INC(&tdg->num_tasks);
1834  }
1835  }
1836 #endif
1837 
1838  /* Should we execute the new task or queue it? For now, let's just always try
1839  to queue it. If the queue fills up, then we'll execute it. */
1840  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1841  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1842  { // Execute this task immediately
1843  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1844  if (serialize_immediate)
1845  new_taskdata->td_flags.task_serial = 1;
1846  __kmp_invoke_task(gtid, new_task, current_task);
1847  } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1848  __kmp_wpolicy_passive) {
1849  kmp_info_t *this_thr = __kmp_threads[gtid];
1850  kmp_team_t *team = this_thr->th.th_team;
1851  kmp_int32 nthreads = this_thr->th.th_team_nproc;
1852  for (int i = 0; i < nthreads; ++i) {
1853  kmp_info_t *thread = team->t.t_threads[i];
1854  if (thread == this_thr)
1855  continue;
1856  if (thread->th.th_sleep_loc != NULL) {
1857  __kmp_null_resume_wrapper(thread);
1858  break; // awake one thread at a time
1859  }
1860  }
1861  }
1862  return TASK_CURRENT_NOT_QUEUED;
1863 }
1864 
1865 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1866 // non-thread-switchable task from the parent thread only!
1867 //
1868 // loc_ref: location of original task pragma (ignored)
1869 // gtid: Global Thread ID of encountering thread
1870 // new_task: non-thread-switchable task thunk allocated by
1871 // __kmpc_omp_task_alloc()
1872 // Returns:
1873 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1874 // be resumed later.
1875 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1876 // resumed later.
1877 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1878  kmp_task_t *new_task) {
1879  kmp_int32 res;
1880  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1881 
1882 #if KMP_DEBUG || OMPT_SUPPORT
1883  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1884 #endif
1885  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1886  new_taskdata));
1887  __kmp_assert_valid_gtid(gtid);
1888 
1889 #if OMPT_SUPPORT
1890  kmp_taskdata_t *parent = NULL;
1891  if (UNLIKELY(ompt_enabled.enabled)) {
1892  if (!new_taskdata->td_flags.started) {
1893  OMPT_STORE_RETURN_ADDRESS(gtid);
1894  parent = new_taskdata->td_parent;
1895  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1896  parent->ompt_task_info.frame.enter_frame.ptr =
1897  OMPT_GET_FRAME_ADDRESS(0);
1898  }
1899  if (ompt_enabled.ompt_callback_task_create) {
1900  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1901  &(parent->ompt_task_info.task_data),
1902  &(parent->ompt_task_info.frame),
1903  &(new_taskdata->ompt_task_info.task_data),
1904  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1905  OMPT_LOAD_RETURN_ADDRESS(gtid));
1906  }
1907  } else {
1908  // We are scheduling the continuation of an UNTIED task.
1909  // Scheduling back to the parent task.
1910  __ompt_task_finish(new_task,
1911  new_taskdata->ompt_task_info.scheduling_parent,
1912  ompt_task_switch);
1913  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1914  }
1915  }
1916 #endif
1917 
1918  res = __kmp_omp_task(gtid, new_task, true);
1919 
1920  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1921  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1922  gtid, loc_ref, new_taskdata));
1923 #if OMPT_SUPPORT
1924  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1925  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1926  }
1927 #endif
1928  return res;
1929 }
1930 
1931 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1932 // a taskloop task with the correct OMPT return address
1933 //
1934 // loc_ref: location of original task pragma (ignored)
1935 // gtid: Global Thread ID of encountering thread
1936 // new_task: non-thread-switchable task thunk allocated by
1937 // __kmpc_omp_task_alloc()
1938 // codeptr_ra: return address for OMPT callback
1939 // Returns:
1940 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1941 // be resumed later.
1942 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1943 // resumed later.
1944 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1945  kmp_task_t *new_task, void *codeptr_ra) {
1946  kmp_int32 res;
1947  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1948 
1949 #if KMP_DEBUG || OMPT_SUPPORT
1950  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1951 #endif
1952  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n",
1953  gtid, loc_ref, new_taskdata));
1954 
1955 #if OMPT_SUPPORT
1956  kmp_taskdata_t *parent = NULL;
1957  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1958  parent = new_taskdata->td_parent;
1959  if (!parent->ompt_task_info.frame.enter_frame.ptr)
1960  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1961  if (ompt_enabled.ompt_callback_task_create) {
1962  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1963  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1964  &(new_taskdata->ompt_task_info.task_data),
1965  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
1966  }
1967  }
1968 #endif
1969 
1970  res = __kmp_omp_task(gtid, new_task, true);
1971 
1972  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
1973  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1974  gtid, loc_ref, new_taskdata));
1975 #if OMPT_SUPPORT
1976  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1977  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1978  }
1979 #endif
1980  return res;
1981 }
1982 
1983 template <bool ompt>
1984 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1985  void *frame_address,
1986  void *return_address) {
1987  kmp_taskdata_t *taskdata = nullptr;
1988  kmp_info_t *thread;
1989  int thread_finished = FALSE;
1990  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1991 
1992  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
1993  KMP_DEBUG_ASSERT(gtid >= 0);
1994 
1995  if (__kmp_tasking_mode != tskm_immediate_exec) {
1996  thread = __kmp_threads[gtid];
1997  taskdata = thread->th.th_current_task;
1998 
1999 #if OMPT_SUPPORT && OMPT_OPTIONAL
2000  ompt_data_t *my_task_data;
2001  ompt_data_t *my_parallel_data;
2002 
2003  if (ompt) {
2004  my_task_data = &(taskdata->ompt_task_info.task_data);
2005  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2006 
2007  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2008 
2009  if (ompt_enabled.ompt_callback_sync_region) {
2010  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2011  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2012  my_task_data, return_address);
2013  }
2014 
2015  if (ompt_enabled.ompt_callback_sync_region_wait) {
2016  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2017  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2018  my_task_data, return_address);
2019  }
2020  }
2021 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2022 
2023 // Debugger: The taskwait is active. Store the location and the thread that
2024 // encountered the taskwait.
2025 #if USE_ITT_BUILD
2026 // Note: These values are used by ITT events as well.
2027 #endif /* USE_ITT_BUILD */
2028  taskdata->td_taskwait_counter += 1;
2029  taskdata->td_taskwait_ident = loc_ref;
2030  taskdata->td_taskwait_thread = gtid + 1;
2031 
2032 #if USE_ITT_BUILD
2033  void *itt_sync_obj = NULL;
2034 #if USE_ITT_NOTIFY
2035  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2036 #endif /* USE_ITT_NOTIFY */
2037 #endif /* USE_ITT_BUILD */
2038 
2039  bool must_wait =
2040  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2041 
2042  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2043  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2044  // If hidden helper tasks have been encountered, we must wait here.
2045  must_wait =
2046  must_wait ||
2047  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2048  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2049 
2050  if (must_wait) {
2051  kmp_flag_32<false, false> flag(
2052  RCAST(std::atomic<kmp_uint32> *,
2053  &(taskdata->td_incomplete_child_tasks)),
2054  0U);
2055  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2056  flag.execute_tasks(thread, gtid, FALSE,
2057  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2058  __kmp_task_stealing_constraint);
2059  }
2060  }
2061 #if USE_ITT_BUILD
2062  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2063  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2064 #endif /* USE_ITT_BUILD */
2065 
2066  // Debugger: The taskwait is completed. Location remains, but thread is
2067  // negated.
2068  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2069 
2070 #if OMPT_SUPPORT && OMPT_OPTIONAL
2071  if (ompt) {
2072  if (ompt_enabled.ompt_callback_sync_region_wait) {
2073  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2074  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2075  my_task_data, return_address);
2076  }
2077  if (ompt_enabled.ompt_callback_sync_region) {
2078  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2079  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2080  my_task_data, return_address);
2081  }
2082  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2083  }
2084 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2085  }
2086 
2087  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2088  "returning TASK_CURRENT_NOT_QUEUED\n",
2089  gtid, taskdata));
2090 
2091  return TASK_CURRENT_NOT_QUEUED;
2092 }
2093 
2094 #if OMPT_SUPPORT && OMPT_OPTIONAL
2095 OMPT_NOINLINE
2096 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2097  void *frame_address,
2098  void *return_address) {
2099  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2100  return_address);
2101 }
2102 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2103 
2104 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2105 // complete
2106 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2107 #if OMPT_SUPPORT && OMPT_OPTIONAL
2108  if (UNLIKELY(ompt_enabled.enabled)) {
2109  OMPT_STORE_RETURN_ADDRESS(gtid);
2110  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2111  OMPT_LOAD_RETURN_ADDRESS(gtid));
2112  }
2113 #endif
2114  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2115 }
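
// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the runtime): the source
// construct that lowers to the entry point above. "#pragma omp taskwait"
// becomes a single call of the form __kmpc_omp_taskwait(&loc, gtid); the
// encountering thread then executes other ready tasks (see the template
// above) until its direct children have completed. produce()/consume() are
// hypothetical.
#if 0
void produce();
void consume();

void taskwait_example() {
#pragma omp task      // -> __kmpc_omp_task_alloc + __kmpc_omp_task
  produce();
#pragma omp taskwait  // -> __kmpc_omp_taskwait(&loc, gtid)
  consume();          // safe: the child task above has completed
}
#endif
// ---------------------------------------------------------------------------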
2116 
2117 // __kmpc_omp_taskyield: switch to a different task
2118 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2119  kmp_taskdata_t *taskdata = NULL;
2120  kmp_info_t *thread;
2121  int thread_finished = FALSE;
2122 
2123  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2124  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2125 
2126  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2127  gtid, loc_ref, end_part));
2128  __kmp_assert_valid_gtid(gtid);
2129 
2130  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2131  thread = __kmp_threads[gtid];
2132  taskdata = thread->th.th_current_task;
2133 // Should we model this as a task wait or not?
2134 // Debugger: The taskwait is active. Store the location and the thread that
2135 // encountered the taskwait.
2136 #if USE_ITT_BUILD
2137 // Note: These values are used by ITT events as well.
2138 #endif /* USE_ITT_BUILD */
2139  taskdata->td_taskwait_counter += 1;
2140  taskdata->td_taskwait_ident = loc_ref;
2141  taskdata->td_taskwait_thread = gtid + 1;
2142 
2143 #if USE_ITT_BUILD
2144  void *itt_sync_obj = NULL;
2145 #if USE_ITT_NOTIFY
2146  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2147 #endif /* USE_ITT_NOTIFY */
2148 #endif /* USE_ITT_BUILD */
2149  if (!taskdata->td_flags.team_serial) {
2150  kmp_task_team_t *task_team = thread->th.th_task_team;
2151  if (task_team != NULL) {
2152  if (KMP_TASKING_ENABLED(task_team)) {
2153 #if OMPT_SUPPORT
2154  if (UNLIKELY(ompt_enabled.enabled))
2155  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2156 #endif
2157  __kmp_execute_tasks_32(
2158  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2159  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2160  __kmp_task_stealing_constraint);
2161 #if OMPT_SUPPORT
2162  if (UNLIKELY(ompt_enabled.enabled))
2163  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2164 #endif
2165  }
2166  }
2167  }
2168 #if USE_ITT_BUILD
2169  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2170 #endif /* USE_ITT_BUILD */
2171 
2172  // Debugger: The taskwait is completed. Location remains, but thread is
2173  // negated.
2174  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2175  }
2176 
2177  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2178  "returning TASK_CURRENT_NOT_QUEUED\n",
2179  gtid, taskdata));
2180 
2181  return TASK_CURRENT_NOT_QUEUED;
2182 }
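
// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the runtime): the
// "#pragma omp taskyield" directive lowers to a call of the form
// __kmpc_omp_taskyield(&loc, gtid, 0). A spin loop can use it so the thread
// picks up other ready tasks instead of busy-waiting:
#if 0
void wait_for(volatile int *flag) {
  while (*flag == 0) {
#pragma omp taskyield // give the runtime a chance to schedule other tasks
  }
}
#endif
// ---------------------------------------------------------------------------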
2183 
2184 // Task Reduction implementation
2185 //
2186 // Note: the initial implementation did not take into account the possibility
2187 // of specifying omp_orig for the initializer of a UDR (user-defined reduction).
2188 // The corrected implementation takes the omp_orig object into account.
2189 // The compiler is free to use the old implementation if omp_orig is not given.
2190 
2199 typedef struct kmp_taskred_flags {
2200  /*! 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads) */
2201  unsigned lazy_priv : 1;
2202  unsigned reserved31 : 31;
2203 } kmp_taskred_flags_t;
2204 
2208 typedef struct kmp_task_red_input {
2209  void *reduce_shar; /**< shared between tasks item to reduce into */
2210  size_t reduce_size; /**< size of data item in bytes */
2211  // three compiler-generated routines (init, fini are optional):
2212  void *reduce_init; /**< data initialization routine (single parameter) */
2213  void *reduce_fini; /**< data finalization routine */
2214  void *reduce_comb; /**< data combiner routine */
2215  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2216 } kmp_task_red_input_t;
2217 
2221 typedef struct kmp_taskred_data {
2222  void *reduce_shar; /**< shared between tasks item to reduce into */
2223  size_t reduce_size; /**< size of data item */
2224  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2225  void *reduce_priv; /**< array of thread specific items */
2226  void *reduce_pend; /**< end of private data for faster comparison op */
2227  // three compiler-generated routines (init, fini are optional):
2228  void *reduce_comb; /**< data combiner routine */
2229  void *reduce_init; /**< data initialization routine (two parameters) */
2230  void *reduce_fini; /**< data finalization routine */
2231  void *reduce_orig; /**< original item (can be used in UDR initializer) */
2232 } kmp_taskred_data_t;
2233 
2239 typedef struct kmp_taskred_input {
2240  void *reduce_shar; /**< shared between tasks item to reduce into */
2241  void *reduce_orig; /**< original reduction item used for initialization */
2242  size_t reduce_size; /**< size of data item */
2243  // three compiler-generated routines (init, fini are optional):
2244  void *reduce_init; /**< data initialization routine (two parameters) */
2245  void *reduce_fini; /**< data finalization routine */
2246  void *reduce_comb; /**< data combiner routine */
2247  kmp_taskred_flags_t flags; /**< flags for additional info from compiler */
2248 } kmp_taskred_input_t;
2249 
2253 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2254 template <>
2255 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2256  kmp_task_red_input_t &src) {
2257  item.reduce_orig = NULL;
2258 }
2259 template <>
2260 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2261  kmp_taskred_input_t &src) {
2262  if (src.reduce_orig != NULL) {
2263  item.reduce_orig = src.reduce_orig;
2264  } else {
2265  item.reduce_orig = src.reduce_shar;
2266  } // non-NULL reduce_orig means new interface used
2267 }
2268 
2269 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2270 template <>
2271 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2272  size_t offset) {
2273  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2274 }
2275 template <>
2276 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2277  size_t offset) {
2278  ((void (*)(void *, void *))item.reduce_init)(
2279  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2280 }
2281 
2282 template <typename T>
2283 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2284  __kmp_assert_valid_gtid(gtid);
2285  kmp_info_t *thread = __kmp_threads[gtid];
2286  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2287  kmp_uint32 nth = thread->th.th_team_nproc;
2288  kmp_taskred_data_t *arr;
2289 
2290  // check input data just in case
2291  KMP_ASSERT(tg != NULL);
2292  KMP_ASSERT(data != NULL);
2293  KMP_ASSERT(num > 0);
2294  if (nth == 1 && !__kmp_enable_hidden_helper) {
2295  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2296  gtid, tg));
2297  return (void *)tg;
2298  }
2299  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2300  gtid, tg, num));
2301  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2302  thread, num * sizeof(kmp_taskred_data_t));
2303  for (int i = 0; i < num; ++i) {
2304  size_t size = data[i].reduce_size - 1;
2305  // round the size up to cache line per thread-specific item
2306  size += CACHE_LINE - size % CACHE_LINE;
2307  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2308  arr[i].reduce_shar = data[i].reduce_shar;
2309  arr[i].reduce_size = size;
2310  arr[i].flags = data[i].flags;
2311  arr[i].reduce_comb = data[i].reduce_comb;
2312  arr[i].reduce_init = data[i].reduce_init;
2313  arr[i].reduce_fini = data[i].reduce_fini;
2314  __kmp_assign_orig<T>(arr[i], data[i]);
2315  if (!arr[i].flags.lazy_priv) {
2316  // allocate cache-line aligned block and fill it with zeros
2317  arr[i].reduce_priv = __kmp_allocate(nth * size);
2318  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2319  if (arr[i].reduce_init != NULL) {
2320  // initialize all thread-specific items
2321  for (size_t j = 0; j < nth; ++j) {
2322  __kmp_call_init<T>(arr[i], j * size);
2323  }
2324  }
2325  } else {
2326  // only allocate space for pointers now,
2327  // objects will be lazily allocated/initialized if/when requested
2328  // note that __kmp_allocate zeroes the allocated memory
2329  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2330  }
2331  }
2332  tg->reduce_data = (void *)arr;
2333  tg->reduce_num_data = num;
2334  return (void *)tg;
2335 }
2336 
2349 // __kmpc_task_reduction_init: set up task reduction for the current taskgroup
2350 // (old interface, no omp_orig recorded); returns the taskgroup handle.
2351 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2352 #if OMPX_TASKGRAPH
2353  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2354  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2355  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2356  this_tdg->rec_taskred_data =
2357  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2358  this_tdg->rec_num_taskred = num;
2359  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2360  sizeof(kmp_task_red_input_t) * num);
2361  }
2362 #endif
2363  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2364 }
2365 
2376 // __kmpc_taskred_init: set up task reduction for the current taskgroup
2377 // (new interface that also records omp_orig); returns the taskgroup handle.
2378 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2379 #if OMPX_TASKGRAPH
2380  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2381  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2382  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2383  this_tdg->rec_taskred_data =
2384  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2385  this_tdg->rec_num_taskred = num;
2386  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2387  sizeof(kmp_task_red_input_t) * num);
2388  }
2389 #endif
2390  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2391 }
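
// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the runtime): the
// source-level pattern these entry points serve. For the taskgroup below the
// compiler builds a one-element array of reduction-item descriptors (&sum,
// sizeof(int), init/comb thunks) and passes it to one of the *_init entry
// points above (for the new interface, a call like
// __kmpc_taskred_init(gtid, 1, items)); each participating task later fetches
// its thread-private copy via __kmpc_task_reduction_get_th_data().
#if 0
void taskred_example() {
  int sum = 0;
#pragma omp parallel
#pragma omp single
#pragma omp taskgroup task_reduction(+ : sum)
  {
    for (int i = 0; i < 100; ++i)
#pragma omp task in_reduction(+ : sum)
      sum += i;
  }
}
#endif
// ---------------------------------------------------------------------------
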
2392 
2393 // Copy task reduction data (except for shared pointers).
2394 template <typename T>
2395 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2396  kmp_taskgroup_t *tg, void *reduce_data) {
2397  kmp_taskred_data_t *arr;
2398  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2399  " from data %p\n",
2400  thr, tg, reduce_data));
2401  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2402  thr, num * sizeof(kmp_taskred_data_t));
2403  // threads will share private copies, thunk routines, sizes, flags, etc.:
2404  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2405  for (int i = 0; i < num; ++i) {
2406  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2407  }
2408  tg->reduce_data = (void *)arr;
2409  tg->reduce_num_data = num;
2410 }
2411 
2419 // __kmpc_task_reduction_get_th_data: return the calling thread's private copy
2420 // of the reduction item identified by its shared (or private) address.
2421 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2422  __kmp_assert_valid_gtid(gtid);
2423  kmp_info_t *thread = __kmp_threads[gtid];
2424  kmp_int32 nth = thread->th.th_team_nproc;
2425  if (nth == 1)
2426  return data; // nothing to do
2427 
2428  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2429  if (tg == NULL)
2430  tg = thread->th.th_current_task->td_taskgroup;
2431  KMP_ASSERT(tg != NULL);
2432  kmp_taskred_data_t *arr;
2433  kmp_int32 num;
2434  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2435 
2436 #if OMPX_TASKGRAPH
2437  if ((thread->th.th_current_task->is_taskgraph) &&
2438  (!__kmp_tdg_is_recording(
2439  __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2440  tg = thread->th.th_current_task->td_taskgroup;
2441  KMP_ASSERT(tg != NULL);
2442  KMP_ASSERT(tg->reduce_data != NULL);
2443  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2444  num = tg->reduce_num_data;
2445  }
2446 #endif
2447 
2448  KMP_ASSERT(data != NULL);
2449  while (tg != NULL) {
2450  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2451  num = tg->reduce_num_data;
2452  for (int i = 0; i < num; ++i) {
2453  if (!arr[i].flags.lazy_priv) {
2454  if (data == arr[i].reduce_shar ||
2455  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2456  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2457  } else {
2458  // check shared location first
2459  void **p_priv = (void **)(arr[i].reduce_priv);
2460  if (data == arr[i].reduce_shar)
2461  goto found;
2462  // check if we get some thread specific location as parameter
2463  for (int j = 0; j < nth; ++j)
2464  if (data == p_priv[j])
2465  goto found;
2466  continue; // not found, continue search
2467  found:
2468  if (p_priv[tid] == NULL) {
2469  // allocate thread specific object lazily
2470  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2471  if (arr[i].reduce_init != NULL) {
2472  if (arr[i].reduce_orig != NULL) { // new interface
2473  ((void (*)(void *, void *))arr[i].reduce_init)(
2474  p_priv[tid], arr[i].reduce_orig);
2475  } else { // old interface (single parameter)
2476  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2477  }
2478  }
2479  }
2480  return p_priv[tid];
2481  }
2482  }
2483  KMP_ASSERT(tg->parent);
2484  tg = tg->parent;
2485  }
2486  KMP_ASSERT2(0, "Unknown task reduction item");
2487  return NULL; // ERROR, this line never executed
2488 }
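
// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the runtime): what a
// compiler-generated in_reduction task body does with the routine above.
// "tg" is the handle returned by one of the *_init entry points; the names
// task_body_fragment and shared_sum are hypothetical.
#if 0
static void task_body_fragment(int gtid, void *tg, int *shared_sum, int i) {
  int *priv =
      (int *)__kmpc_task_reduction_get_th_data(gtid, tg, (void *)shared_sum);
  *priv += i; // accumulate into the thread-private copy; the copies are
              // combined by __kmp_task_reduction_fini() at the end of the
              // taskgroup
}
#endif
// ---------------------------------------------------------------------------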
2489 
2490 // Finalize task reduction.
2491 // Called from __kmpc_end_taskgroup()
2492 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2493  kmp_int32 nth = th->th.th_team_nproc;
2494  KMP_DEBUG_ASSERT(
2495  nth > 1 ||
2496  __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2497  // are using hidden helper threads
2498  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2499  kmp_int32 num = tg->reduce_num_data;
2500  for (int i = 0; i < num; ++i) {
2501  void *sh_data = arr[i].reduce_shar;
2502  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2503  void (*f_comb)(void *, void *) =
2504  (void (*)(void *, void *))(arr[i].reduce_comb);
2505  if (!arr[i].flags.lazy_priv) {
2506  void *pr_data = arr[i].reduce_priv;
2507  size_t size = arr[i].reduce_size;
2508  for (int j = 0; j < nth; ++j) {
2509  void *priv_data = (char *)pr_data + j * size;
2510  f_comb(sh_data, priv_data); // combine results
2511  if (f_fini)
2512  f_fini(priv_data); // finalize if needed
2513  }
2514  } else {
2515  void **pr_data = (void **)(arr[i].reduce_priv);
2516  for (int j = 0; j < nth; ++j) {
2517  if (pr_data[j] != NULL) {
2518  f_comb(sh_data, pr_data[j]); // combine results
2519  if (f_fini)
2520  f_fini(pr_data[j]); // finalize if needed
2521  __kmp_free(pr_data[j]);
2522  }
2523  }
2524  }
2525  __kmp_free(arr[i].reduce_priv);
2526  }
2527  __kmp_thread_free(th, arr);
2528  tg->reduce_data = NULL;
2529  tg->reduce_num_data = 0;
2530 }
2531 
2532 // Cleanup task reduction data for parallel or worksharing,
2533 // do not touch task private data other threads still working with.
2534 // Called from __kmpc_end_taskgroup()
2535 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2536  __kmp_thread_free(th, tg->reduce_data);
2537  tg->reduce_data = NULL;
2538  tg->reduce_num_data = 0;
2539 }
2540 
2541 template <typename T>
2542 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2543  int num, T *data) {
2544  __kmp_assert_valid_gtid(gtid);
2545  kmp_info_t *thr = __kmp_threads[gtid];
2546  kmp_int32 nth = thr->th.th_team_nproc;
2547  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2548  if (nth == 1) {
2549  KA_TRACE(10,
2550  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2551  gtid, thr->th.th_current_task->td_taskgroup));
2552  return (void *)thr->th.th_current_task->td_taskgroup;
2553  }
2554  kmp_team_t *team = thr->th.th_team;
2555  void *reduce_data;
2556  kmp_taskgroup_t *tg;
2557  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2558  if (reduce_data == NULL &&
2559  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2560  (void *)1)) {
2561  // single thread enters this block to initialize common reduction data
2562  KMP_DEBUG_ASSERT(reduce_data == NULL);
2563  // first initialize own data, then make a copy other threads can use
2564  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2565  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2566  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2567  // fini counters should be 0 at this point
2568  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2569  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2570  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2571  } else {
2572  while (
2573  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2574  (void *)1) { // wait for task reduction initialization
2575  KMP_CPU_PAUSE();
2576  }
2577  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2578  tg = thr->th.th_current_task->td_taskgroup;
2579  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2580  }
2581  return tg;
2582 }
2583 
2598 // __kmpc_task_reduction_modifier_init: set up reduction with the task modifier
2599 // for a parallel (is_ws=0) or worksharing (is_ws=1) region (old interface).
2600 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2601  int num, void *data) {
2602  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2603  (kmp_task_red_input_t *)data);
2604 }
2605 
2618 // __kmpc_taskred_modifier_init: set up reduction with the task modifier for a
2619 // parallel (is_ws=0) or worksharing (is_ws=1) region (new interface).
2620 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2621  void *data) {
2622  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2623  (kmp_taskred_input_t *)data);
2624 }
2625 
2632 // __kmpc_task_reduction_modifier_fini: finish task reduction with the task
2633 // modifier by ending the implicit taskgroup formed by the matching *_init call.
2634 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2635  __kmpc_end_taskgroup(loc, gtid);
2636 }
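
// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the runtime): the reduction
// "task" modifier is what drives the *_modifier_* entry points. For the loop
// below the compiler brackets the worksharing region with calls of the form
// __kmpc_taskred_modifier_init(&loc, gtid, /*is_ws=*/1, 1, items) and
// __kmpc_task_reduction_modifier_fini(&loc, gtid, 1), which form and end the
// implicit taskgroup used above.
#if 0
void modifier_example(int n) {
  int sum = 0;
#pragma omp parallel
#pragma omp for reduction(task, + : sum)
  for (int i = 0; i < n; ++i) {
#pragma omp task in_reduction(+ : sum)
    sum += i;
  }
}
#endif
// ---------------------------------------------------------------------------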
2637 
2638 // __kmpc_taskgroup: Start a new taskgroup
2639 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2640  __kmp_assert_valid_gtid(gtid);
2641  kmp_info_t *thread = __kmp_threads[gtid];
2642  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2643  kmp_taskgroup_t *tg_new =
2644  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2645  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2646  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2647  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2648  tg_new->parent = taskdata->td_taskgroup;
2649  tg_new->reduce_data = NULL;
2650  tg_new->reduce_num_data = 0;
2651  tg_new->gomp_data = NULL;
2652  taskdata->td_taskgroup = tg_new;
2653 
2654 #if OMPT_SUPPORT && OMPT_OPTIONAL
2655  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2656  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2657  if (!codeptr)
2658  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2659  kmp_team_t *team = thread->th.th_team;
2660  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2661  // FIXME: I think this is wrong for lwt!
2662  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2663 
2664  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2665  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2666  &(my_task_data), codeptr);
2667  }
2668 #endif
2669 }
2670 
2671 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2672 // and its descendants are complete
2673 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2674  __kmp_assert_valid_gtid(gtid);
2675  kmp_info_t *thread = __kmp_threads[gtid];
2676  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2677  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2678  int thread_finished = FALSE;
2679 
2680 #if OMPT_SUPPORT && OMPT_OPTIONAL
2681  kmp_team_t *team;
2682  ompt_data_t my_task_data;
2683  ompt_data_t my_parallel_data;
2684  void *codeptr = nullptr;
2685  if (UNLIKELY(ompt_enabled.enabled)) {
2686  team = thread->th.th_team;
2687  my_task_data = taskdata->ompt_task_info.task_data;
2688  // FIXME: I think this is wrong for lwt!
2689  my_parallel_data = team->t.ompt_team_info.parallel_data;
2690  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2691  if (!codeptr)
2692  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2693  }
2694 #endif
2695 
2696  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2697  KMP_DEBUG_ASSERT(taskgroup != NULL);
2698  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2699 
2700  if (__kmp_tasking_mode != tskm_immediate_exec) {
2701  // mark task as waiting not on a barrier
2702  taskdata->td_taskwait_counter += 1;
2703  taskdata->td_taskwait_ident = loc;
2704  taskdata->td_taskwait_thread = gtid + 1;
2705 #if USE_ITT_BUILD
2706  // For ITT the taskgroup wait is similar to taskwait until we need to
2707  // distinguish them
2708  void *itt_sync_obj = NULL;
2709 #if USE_ITT_NOTIFY
2710  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2711 #endif /* USE_ITT_NOTIFY */
2712 #endif /* USE_ITT_BUILD */
2713 
2714 #if OMPT_SUPPORT && OMPT_OPTIONAL
2715  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2716  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2717  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2718  &(my_task_data), codeptr);
2719  }
2720 #endif
2721 
2722  if (!taskdata->td_flags.team_serial ||
2723  (thread->th.th_task_team != NULL &&
2724  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2725  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2726  kmp_flag_32<false, false> flag(
2727  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2728  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2729  flag.execute_tasks(thread, gtid, FALSE,
2730  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2731  __kmp_task_stealing_constraint);
2732  }
2733  }
2734  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2735 
2736 #if OMPT_SUPPORT && OMPT_OPTIONAL
2737  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2738  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2739  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2740  &(my_task_data), codeptr);
2741  }
2742 #endif
2743 
2744 #if USE_ITT_BUILD
2745  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2746  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2747 #endif /* USE_ITT_BUILD */
2748  }
2749  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2750 
2751  if (taskgroup->reduce_data != NULL &&
2752  !taskgroup->gomp_data) { // need to reduce?
2753  int cnt;
2754  void *reduce_data;
2755  kmp_team_t *t = thread->th.th_team;
2756  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2757  // check if the <priv> data of the first reduction variable is shared for the team
2758  void *priv0 = arr[0].reduce_priv;
2759  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2760  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2761  // finishing task reduction on parallel
2762  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2763  if (cnt == thread->th.th_team_nproc - 1) {
2764  // we are the last thread passing __kmpc_reduction_modifier_fini()
2765  // finalize task reduction:
2766  __kmp_task_reduction_fini(thread, taskgroup);
2767  // cleanup fields in the team structure:
2768  // TODO: is relaxed store enough here (whole barrier should follow)?
2769  __kmp_thread_free(thread, reduce_data);
2770  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2771  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2772  } else {
2773  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2774  // so do not finalize reduction, just clean own copy of the data
2775  __kmp_task_reduction_clean(thread, taskgroup);
2776  }
2777  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2778  NULL &&
2779  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2780  // finishing task reduction on worksharing
2781  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2782  if (cnt == thread->th.th_team_nproc - 1) {
2783  // we are the last thread passing __kmpc_reduction_modifier_fini()
2784  __kmp_task_reduction_fini(thread, taskgroup);
2785  // cleanup fields in team structure:
2786  // TODO: is relaxed store enough here (whole barrier should follow)?
2787  __kmp_thread_free(thread, reduce_data);
2788  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2789  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2790  } else {
2791  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2792  // so do not finalize reduction, just clean own copy of the data
2793  __kmp_task_reduction_clean(thread, taskgroup);
2794  }
2795  } else {
2796  // finishing task reduction on taskgroup
2797  __kmp_task_reduction_fini(thread, taskgroup);
2798  }
2799  }
2800  // Restore parent taskgroup for the current task
2801  taskdata->td_taskgroup = taskgroup->parent;
2802  __kmp_thread_free(thread, taskgroup);
2803 
2804  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2805  gtid, taskdata));
2806 
2807 #if OMPT_SUPPORT && OMPT_OPTIONAL
2808  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2809  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2810  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2811  &(my_task_data), codeptr);
2812  }
2813 #endif
2814 }
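
// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the runtime):
// "#pragma omp taskgroup" brackets its block with __kmpc_taskgroup(&loc, gtid)
// and __kmpc_end_taskgroup(&loc, gtid). Unlike taskwait, the end of the
// taskgroup waits for all descendant tasks, not just direct children.
// leaf_work() is hypothetical.
#if 0
void leaf_work();

void taskgroup_example() {
#pragma omp taskgroup // -> __kmpc_taskgroup
  {
#pragma omp task      // child task
    {
#pragma omp task      // grandchild: also awaited by the enclosing taskgroup
      leaf_work();
    }
  }                   // -> __kmpc_end_taskgroup
}
#endif
// ---------------------------------------------------------------------------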
2815 
2816 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
2817  kmp_task_team_t *task_team,
2818  kmp_int32 is_constrained) {
2819  kmp_task_t *task = NULL;
2820  kmp_taskdata_t *taskdata;
2821  kmp_taskdata_t *current;
2822  kmp_thread_data_t *thread_data;
2823  int ntasks = task_team->tt.tt_num_task_pri;
2824  if (ntasks == 0) {
2825  KA_TRACE(
2826  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
2827  return NULL;
2828  }
2829  do {
2830  // decrement num_tasks to "reserve" one task for us to execute
2831  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
2832  ntasks - 1))
2833  break;
2834  ntasks = task_team->tt.tt_num_task_pri;
2835  } while (ntasks > 0);
2836  if (ntasks == 0) {
2837  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
2838  __kmp_get_gtid()));
2839  return NULL;
2840  }
2841  // We got a "ticket" to get a "reserved" priority task
2842  int deque_ntasks;
2843  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
2844  do {
2845  KMP_ASSERT(list != NULL);
2846  thread_data = &list->td;
2847  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2848  deque_ntasks = thread_data->td.td_deque_ntasks;
2849  if (deque_ntasks == 0) {
2850  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2851  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
2852  __kmp_get_gtid(), thread_data));
2853  list = list->next;
2854  }
2855  } while (deque_ntasks == 0);
2856  KMP_DEBUG_ASSERT(deque_ntasks);
2857  int target = thread_data->td.td_deque_head;
2858  current = __kmp_threads[gtid]->th.th_current_task;
2859  taskdata = thread_data->td.td_deque[target];
2860  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2861  // Bump head pointer and Wrap.
2862  thread_data->td.td_deque_head =
2863  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2864  } else {
2865  if (!task_team->tt.tt_untied_task_encountered) {
2866  // The TSC does not allow stealing the victim task
2867  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2868  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
2869  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
2870  gtid, thread_data, task_team, deque_ntasks, target,
2871  thread_data->td.td_deque_tail));
2872  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2873  return NULL;
2874  }
2875  int i;
2876  // walk through the deque trying to steal any task
2877  taskdata = NULL;
2878  for (i = 1; i < deque_ntasks; ++i) {
2879  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2880  taskdata = thread_data->td.td_deque[target];
2881  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2882  break; // found task to execute
2883  } else {
2884  taskdata = NULL;
2885  }
2886  }
2887  if (taskdata == NULL) {
2888  // No appropriate candidate found to execute
2889  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2890  KA_TRACE(
2891  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
2892  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
2893  gtid, thread_data, task_team, deque_ntasks,
2894  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2895  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2896  return NULL;
2897  }
2898  int prev = target;
2899  for (i = i + 1; i < deque_ntasks; ++i) {
2900  // shift remaining tasks in the deque left by 1
2901  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2902  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
2903  prev = target;
2904  }
2905  KMP_DEBUG_ASSERT(
2906  thread_data->td.td_deque_tail ==
2907  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
2908  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
2909  }
2910  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
2911  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2912  task = KMP_TASKDATA_TO_TASK(taskdata);
2913  return task;
2914 }
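
// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the runtime): tasks that
// carry a priority clause are pushed to the shared priority deques that
// __kmp_get_priority_task() services ahead of the per-thread deques and of
// stealing (subject to the OMP_MAX_TASK_PRIORITY limit). The functions below
// are hypothetical.
#if 0
void important_work();
void background_work();

void priority_example() {
#pragma omp task priority(10)
  important_work(); // preferred by __kmp_get_priority_task()
#pragma omp task
  background_work();
}
#endif
// ---------------------------------------------------------------------------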
2915 
2916 // __kmp_remove_my_task: remove a task from my own deque
2917 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2918  kmp_task_team_t *task_team,
2919  kmp_int32 is_constrained) {
2920  kmp_task_t *task;
2921  kmp_taskdata_t *taskdata;
2922  kmp_thread_data_t *thread_data;
2923  kmp_uint32 tail;
2924 
2925  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2926  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2927  NULL); // Caller should check this condition
2928 
2929  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2930 
2931  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2932  gtid, thread_data->td.td_deque_ntasks,
2933  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2934 
2935  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2936  KA_TRACE(10,
2937  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2938  "ntasks=%d head=%u tail=%u\n",
2939  gtid, thread_data->td.td_deque_ntasks,
2940  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2941  return NULL;
2942  }
2943 
2944  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2945 
2946  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2947  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2948  KA_TRACE(10,
2949  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2950  "ntasks=%d head=%u tail=%u\n",
2951  gtid, thread_data->td.td_deque_ntasks,
2952  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2953  return NULL;
2954  }
2955 
2956  tail = (thread_data->td.td_deque_tail - 1) &
2957  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2958  taskdata = thread_data->td.td_deque[tail];
2959 
2960  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2961  thread->th.th_current_task)) {
2962  // The TSC does not allow removal of the tail task
2963  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2964  KA_TRACE(10,
2965  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2966  "ntasks=%d head=%u tail=%u\n",
2967  gtid, thread_data->td.td_deque_ntasks,
2968  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2969  return NULL;
2970  }
2971 
2972  thread_data->td.td_deque_tail = tail;
2973  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2974 
2975  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2976 
2977  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
2978  "ntasks=%d head=%u tail=%u\n",
2979  gtid, taskdata, thread_data->td.td_deque_ntasks,
2980  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2981 
2982  task = KMP_TASKDATA_TO_TASK(taskdata);
2983  return task;
2984 }
2985 
2986 // __kmp_steal_task: remove a task from another thread's deque
2987  // Assumes that the calling thread has already checked that the
2988  // task_team's thread_data exists before calling this routine.
2989 static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
2990  kmp_task_team_t *task_team,
2991  std::atomic<kmp_int32> *unfinished_threads,
2992  int *thread_finished,
2993  kmp_int32 is_constrained) {
2994  kmp_task_t *task;
2995  kmp_taskdata_t *taskdata;
2996  kmp_taskdata_t *current;
2997  kmp_thread_data_t *victim_td, *threads_data;
2998  kmp_int32 target;
2999  kmp_info_t *victim_thr;
3000 
3001  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3002 
3003  threads_data = task_team->tt.tt_threads_data;
3004  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3005  KMP_DEBUG_ASSERT(victim_tid >= 0);
3006  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
3007 
3008  victim_td = &threads_data[victim_tid];
3009  victim_thr = victim_td->td.td_thr;
3010  (void)victim_thr; // Use in TRACE messages which aren't always enabled.
3011 
3012  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3013  "task_team=%p ntasks=%d head=%u tail=%u\n",
3014  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3015  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3016  victim_td->td.td_deque_tail));
3017 
3018  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3019  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3020  "task_team=%p ntasks=%d head=%u tail=%u\n",
3021  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3022  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3023  victim_td->td.td_deque_tail));
3024  return NULL;
3025  }
3026 
3027  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3028 
3029  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3030  // Check again after we acquire the lock
3031  if (ntasks == 0) {
3032  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3033  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3034  "task_team=%p ntasks=%d head=%u tail=%u\n",
3035  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3036  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3037  return NULL;
3038  }
3039 
3040  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3041  current = __kmp_threads[gtid]->th.th_current_task;
3042  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3043  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3044  // Bump head pointer and Wrap.
3045  victim_td->td.td_deque_head =
3046  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3047  } else {
3048  if (!task_team->tt.tt_untied_task_encountered) {
3049  // The TSC does not allow stealing the victim's task
3050  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3051  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3052  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3053  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3054  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3055  return NULL;
3056  }
3057  int i;
3058  // walk through victim's deque trying to steal any task
3059  target = victim_td->td.td_deque_head;
3060  taskdata = NULL;
3061  for (i = 1; i < ntasks; ++i) {
3062  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3063  taskdata = victim_td->td.td_deque[target];
3064  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3065  break; // found victim task
3066  } else {
3067  taskdata = NULL;
3068  }
3069  }
3070  if (taskdata == NULL) {
3071  // No appropriate candidate to steal found
3072  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3073  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3074  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3075  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3076  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3077  return NULL;
3078  }
3079  int prev = target;
3080  for (i = i + 1; i < ntasks; ++i) {
3081  // shift remaining tasks in the deque left by 1
3082  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3083  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3084  prev = target;
3085  }
3086  KMP_DEBUG_ASSERT(
3087  victim_td->td.td_deque_tail ==
3088  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3089  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3090  }
3091  if (*thread_finished) {
3092  // We need to un-mark this victim as a finished victim. This must be done
3093  // before releasing the lock, or else other threads (starting with the
3094  // primary thread victim) might be prematurely released from the barrier!!!
3095 #if KMP_DEBUG
3096  kmp_int32 count =
3097 #endif
3098  KMP_ATOMIC_INC(unfinished_threads);
3099  KA_TRACE(
3100  20,
3101  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3102  gtid, count + 1, task_team));
3103  *thread_finished = FALSE;
3104  }
3105  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3106 
3107  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3108 
3109  KMP_COUNT_BLOCK(TASK_stolen);
3110  KA_TRACE(10,
3111  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3112  "task_team=%p ntasks=%d head=%u tail=%u\n",
3113  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3114  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3115 
3116  task = KMP_TASKDATA_TO_TASK(taskdata);
3117  return task;
3118 }
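
// Note on the deque discipline used above (illustrative): the owner removes
// work from the tail of its own deque (__kmp_remove_my_task), while thieves
// take work from the victim's head (unless the TSC forces the deque walk):
//
//   head -> [ oldest ............ newest ] <- tail
//            ^ thieves steal here          ^ owner pops here
//
// This is the conventional work-stealing split: thieves tend to get older
// (often coarser-grained) tasks, while the owner keeps working on recently
// created, cache-warm ones.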
3119 
3120 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3121  // condition is satisfied (return true) or there are none left (return false).
3122 //
3123 // final_spin is TRUE if this is the spin at the release barrier.
3124 // thread_finished indicates whether the thread is finished executing all
3125 // the tasks it has on its deque, and is at the release barrier.
3126 // spinner is the location on which to spin.
3127 // spinner == NULL means only execute a single task and return.
3128 // checker is the value to check to terminate the spin.
3129 template <class C>
3130 static inline int __kmp_execute_tasks_template(
3131  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3132  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3133  kmp_int32 is_constrained) {
3134  kmp_task_team_t *task_team = thread->th.th_task_team;
3135  kmp_thread_data_t *threads_data;
3136  kmp_task_t *task;
3137  kmp_info_t *other_thread;
3138  kmp_taskdata_t *current_task = thread->th.th_current_task;
3139  std::atomic<kmp_int32> *unfinished_threads;
3140  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3141  tid = thread->th.th_info.ds.ds_tid;
3142 
3143  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3144  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3145 
3146  if (task_team == NULL || current_task == NULL)
3147  return FALSE;
3148 
3149  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3150  "*thread_finished=%d\n",
3151  gtid, final_spin, *thread_finished));
3152 
3153  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3154  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3155 
3156  KMP_DEBUG_ASSERT(threads_data != NULL);
3157 
3158  nthreads = task_team->tt.tt_nproc;
3159  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3160  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3161 
3162  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3163  // getting tasks from target constructs
3164  while (1) { // Inner loop to find a task and execute it
3165  task = NULL;
3166  if (task_team->tt.tt_num_task_pri) { // get priority task first
3167  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3168  }
3169  if (task == NULL && use_own_tasks) { // check own queue next
3170  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3171  }
3172  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3173  int asleep = 1;
3174  use_own_tasks = 0;
3175  // Try to steal from the last place I stole from successfully.
3176  if (victim_tid == -2) { // haven't stolen anything yet
3177  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3178  if (victim_tid !=
3179  -1) // if we have a last stolen from victim, get the thread
3180  other_thread = threads_data[victim_tid].td.td_thr;
3181  }
3182  if (victim_tid != -1) { // found last victim
3183  asleep = 0;
3184  } else if (!new_victim) { // no recent steals and we haven't already
3185  // used a new victim; select a random thread
3186  do { // Find a different thread to steal work from.
3187  // Pick a random thread. Initial plan was to cycle through all the
3188  // threads, and only return if we tried to steal from every thread,
3189  // and failed. Arch says that's not such a great idea.
3190  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3191  if (victim_tid >= tid) {
3192  ++victim_tid; // Adjusts random distribution to exclude self
3193  }
3194  // Found a potential victim
3195  other_thread = threads_data[victim_tid].td.td_thr;
3196  // There is a slight chance that __kmp_enable_tasking() did not wake
3197  // up all threads waiting at the barrier. If victim is sleeping,
3198  // then wake it up. Since we were going to pay the cache miss
3199  // penalty for referencing another thread's kmp_info_t struct
3200  // anyway,
3201  // the check shouldn't cost too much performance at this point. In
3202  // extra barrier mode, threads do not sleep at the separate tasking
3203  // barrier, so this isn't a problem.
3204  asleep = 0;
3205  if ((__kmp_tasking_mode == tskm_task_teams) &&
3206  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3207  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3208  NULL)) {
3209  asleep = 1;
3210  __kmp_null_resume_wrapper(other_thread);
3211  // A sleeping thread should not have any tasks on its queue.
3212  // There is a slight possibility that it resumes, steals a task
3213  // from another thread, which spawns more tasks, all in the time
3214  // that it takes this thread to check => don't write an assertion
3215  // that the victim's queue is empty. Try stealing from a
3216  // different thread.
3217  }
3218  } while (asleep);
3219  }
3220 
3221  if (!asleep) {
3222  // We have a victim to try to steal from
3223  task =
3224  __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3225  thread_finished, is_constrained);
3226  }
3227  if (task != NULL) { // set last stolen to victim
3228  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3229  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3230  // The pre-refactored code did not try more than 1 successful new
3231  // victim, unless the last one generated more local tasks;
3232  // new_victim keeps track of this
3233  new_victim = 1;
3234  }
3235  } else { // No tasks found; unset last_stolen
3236  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3237  victim_tid = -2; // no successful victim found
3238  }
3239  }
3240 
3241  if (task == NULL)
3242  break; // break out of tasking loop
3243 
3244 // Found a task; execute it
3245 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3246  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3247  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3248  // get the object reliably
3249  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3250  }
3251  __kmp_itt_task_starting(itt_sync_obj);
3252  }
3253 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3254  __kmp_invoke_task(gtid, task, current_task);
3255 #if USE_ITT_BUILD
3256  if (itt_sync_obj != NULL)
3257  __kmp_itt_task_finished(itt_sync_obj);
3258 #endif /* USE_ITT_BUILD */
3259  // If this thread is only partway through the barrier and the condition is
3260  // met, then return now, so that the barrier gather/release pattern can
3261  // proceed. If this thread is in the last spin loop in the barrier,
3262  // waiting to be released, we know that the termination condition will not
3263  // be satisfied, so don't waste any cycles checking it.
3264  if (flag == NULL || (!final_spin && flag->done_check())) {
3265  KA_TRACE(
3266  15,
3267  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3268  gtid));
3269  return TRUE;
3270  }
3271  if (thread->th.th_task_team == NULL) {
3272  break;
3273  }
3274  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3275  // If execution of a stolen task results in more tasks being placed on our
3276  // run queue, reset use_own_tasks
3277  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3278  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3279  "other tasks, restart\n",
3280  gtid));
3281  use_own_tasks = 1;
3282  new_victim = 0;
3283  }
3284  }
3285 
3286  // The task source has been exhausted. If in final spin loop of barrier,
3287  // check if termination condition is satisfied. The work queue may be empty
3288  // but there might be proxy tasks still executing.
3289  if (final_spin &&
3290  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3291  // First, decrement the #unfinished threads, if that has not already been
3292  // done. This decrement might be to the spin location, and result in the
3293  // termination condition being satisfied.
3294  if (!*thread_finished) {
3295 #if KMP_DEBUG
3296  kmp_int32 count = -1 +
3297 #endif
3298  KMP_ATOMIC_DEC(unfinished_threads);
3299  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3300  "unfinished_threads to %d task_team=%p\n",
3301  gtid, count, task_team));
3302  *thread_finished = TRUE;
3303  }
3304 
3305  // It is now unsafe to reference thread->th.th_team !!!
3306  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3307  // thread to pass through the barrier, where it might reset each thread's
3308  // th.th_team field for the next parallel region. If we can steal more
3309  // work, we know that this has not happened yet.
3310  if (flag != NULL && flag->done_check()) {
3311  KA_TRACE(
3312  15,
3313  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3314  gtid));
3315  return TRUE;
3316  }
3317  }
3318 
3319  // If this thread's task team is NULL, primary thread has recognized that
3320  // there are no more tasks; bail out
3321  if (thread->th.th_task_team == NULL) {
3322  KA_TRACE(15,
3323  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3324  return FALSE;
3325  }
3326 
3327  // Check the flag again to see if it is already done, to avoid being trapped
3328  // in an infinite loop when an if0 task depends on a hidden helper task
3329  // outside any parallel region. Detached tasks are not impacted in this case
3330  // because the only thread executing this function has to execute the proxy
3331  // task, so it is in another code path that has the same check.
3332  if (flag == NULL || (!final_spin && flag->done_check())) {
3333  KA_TRACE(15,
3334  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3335  gtid));
3336  return TRUE;
3337  }
3338 
3339  // We could be getting tasks from target constructs; if this is the only
3340  // thread, keep trying to execute tasks from own queue
3341  if (nthreads == 1 &&
3342  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3343  use_own_tasks = 1;
3344  else {
3345  KA_TRACE(15,
3346  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3347  return FALSE;
3348  }
3349  }
3350 }
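
// An illustrative sketch of how a caller typically drives the template above
// through a flag object; this mirrors the pattern used by
// __kmp_tasking_barrier() later in this file rather than introducing any new
// API. The variables 'task_team', 'thread' and 'gtid' are assumed to be in
// scope, as they are in the real callers.
#if 0
// Spin on a counter, executing tasks until it drops to zero.
std::atomic<kmp_uint32> *spin =
    RCAST(std::atomic<kmp_uint32> *, &task_team->tt.tt_unfinished_threads);
kmp_flag_32<false, false> spin_flag(spin, 0U); // 0U is the "checker" value
int dummy_finished = FALSE;
while (!spin_flag.execute_tasks(thread, gtid, /*final_spin=*/TRUE,
                                &dummy_finished USE_ITT_BUILD_ARG(NULL),
                                /*is_constrained=*/0)) {
  KMP_YIELD(TRUE); // nothing to run yet and the condition is not satisfied
}
#endif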
3351 
3352 template <bool C, bool S>
3353 int __kmp_execute_tasks_32(
3354  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3355  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3356  kmp_int32 is_constrained) {
3357  return __kmp_execute_tasks_template(
3358  thread, gtid, flag, final_spin,
3359  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3360 }
3361 
3362 template <bool C, bool S>
3363 int __kmp_execute_tasks_64(
3364  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3365  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3366  kmp_int32 is_constrained) {
3367  return __kmp_execute_tasks_template(
3368  thread, gtid, flag, final_spin,
3369  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3370 }
3371 
3372 template <bool C, bool S>
3373 int __kmp_atomic_execute_tasks_64(
3374  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3375  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3376  kmp_int32 is_constrained) {
3377  return __kmp_execute_tasks_template(
3378  thread, gtid, flag, final_spin,
3379  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3380 }
3381 
3382 int __kmp_execute_tasks_oncore(
3383  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3384  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3385  kmp_int32 is_constrained) {
3386  return __kmp_execute_tasks_template(
3387  thread, gtid, flag, final_spin,
3388  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3389 }
3390 
3391 template int
3392 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3393  kmp_flag_32<false, false> *, int,
3394  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3395 
3396 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3397  kmp_flag_64<false, true> *,
3398  int,
3399  int *USE_ITT_BUILD_ARG(void *),
3400  kmp_int32);
3401 
3402 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3403  kmp_flag_64<true, false> *,
3404  int,
3405  int *USE_ITT_BUILD_ARG(void *),
3406  kmp_int32);
3407 
3408 template int __kmp_atomic_execute_tasks_64<false, true>(
3409  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3410  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3411 
3412 template int __kmp_atomic_execute_tasks_64<true, false>(
3413  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3414  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3415 
3416 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3417 // next barrier so they can assist in executing enqueued tasks.
3418 // First thread in allocates the task team atomically.
3419 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3420  kmp_info_t *this_thr) {
3421  kmp_thread_data_t *threads_data;
3422  int nthreads, i, is_init_thread;
3423 
3424  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3425  __kmp_gtid_from_thread(this_thr)));
3426 
3427  KMP_DEBUG_ASSERT(task_team != NULL);
3428  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3429 
3430  nthreads = task_team->tt.tt_nproc;
3431  KMP_DEBUG_ASSERT(nthreads > 0);
3432  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3433 
3434  // Allocate or increase the size of threads_data if necessary
3435  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3436 
3437  if (!is_init_thread) {
3438  // Some other thread already set up the array.
3439  KA_TRACE(
3440  20,
3441  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3442  __kmp_gtid_from_thread(this_thr)));
3443  return;
3444  }
3445  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3446  KMP_DEBUG_ASSERT(threads_data != NULL);
3447 
3448  if (__kmp_tasking_mode == tskm_task_teams &&
3449  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3450  // Release any threads sleeping at the barrier, so that they can steal
3451  // tasks and execute them. In extra barrier mode, threads do not sleep
3452  // at the separate tasking barrier, so this isn't a problem.
3453  for (i = 0; i < nthreads; i++) {
3454  void *sleep_loc;
3455  kmp_info_t *thread = threads_data[i].td.td_thr;
3456 
3457  if (i == this_thr->th.th_info.ds.ds_tid) {
3458  continue;
3459  }
3460  // Since we haven't locked the thread's suspend mutex at this
3461  // point, there is a small window where a thread might be putting
3462  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3463  // To work around this, __kmp_execute_tasks_template() periodically checks
3464  // to see if other threads are sleeping (using the same random mechanism
3465  // that is used for task stealing) and awakens them if they are.
3466  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3467  NULL) {
3468  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3469  __kmp_gtid_from_thread(this_thr),
3470  __kmp_gtid_from_thread(thread)));
3471  __kmp_null_resume_wrapper(thread);
3472  } else {
3473  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3474  __kmp_gtid_from_thread(this_thr),
3475  __kmp_gtid_from_thread(thread)));
3476  }
3477  }
3478  }
3479 
3480  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3481  __kmp_gtid_from_thread(this_thr)));
3482 }
3483 
3484 /* // TODO: Check the comment consistency
3485  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3486  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3487  * After a child thread checks into a barrier and calls __kmp_release() from
3488  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3489  * longer assume that the kmp_team_t structure is intact (at any moment, the
3490  * primary thread may exit the barrier code and free the team data structure,
3491  * and return the threads to the thread pool).
3492  *
3493  * This does not work with the tasking code, as the thread is still
3494  * expected to participate in the execution of any tasks that may have been
3495  * spawned by a member of the team, and the thread still needs access to
3496  * each of the other threads in the team, so that it can steal work from them.
3497  *
3498  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3499  * counting mechanism, and is allocated by the primary thread before calling
3500  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3501  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3502  * of the kmp_task_team_t structs for consecutive barriers can overlap
3503  * (and will, unless the primary thread is the last thread to exit the barrier
3504  * release phase, which is not typical). The existence of such a struct is
3505  * useful outside the context of tasking.
3506  *
3507  * We currently use the existence of the threads array as an indicator that
3508  * tasks were spawned since the last barrier. If the structure is to be
3509  * useful outside the context of tasking, then this will have to change, but
3510  * not setting the field minimizes the performance impact of tasking on
3511  * barriers, when no explicit tasks were spawned (pushed, actually).
3512  */
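
/* Condensed, illustrative view of the double-buffering implemented by
 * __kmp_task_team_setup() and __kmp_task_team_sync() below: each kmp_team_t
 * carries two task team slots, and every thread remembers which slot it is
 * currently using via th_task_state (0 or 1).
 *
 *     t_task_team[0]            t_task_team[1]
 *           ^                         ^
 *     threads with                threads with
 *     th_task_state == 0          th_task_state == 1
 *
 * The primary thread keeps the "other" slot allocated and (re)initialized for
 * the upcoming region, and after each barrier release every thread flips its
 * parity and re-points th_task_team at the newly current slot, e.g.:
 *
 *   this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
 *   TCW_PTR(this_thr->th.th_task_team,
 *           team->t.t_task_team[this_thr->th.th_task_state]);
 */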
3513 
3514 static kmp_task_team_t *__kmp_free_task_teams =
3515  NULL; // Free list for task_team data structures
3516 // Lock for task team data structures
3517 kmp_bootstrap_lock_t __kmp_task_team_lock =
3518  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3519 
3520 // __kmp_alloc_task_deque:
3521  // Allocates a task deque for a particular thread, and initializes the necessary
3522 // data structures relating to the deque. This only happens once per thread
3523 // per task team since task teams are recycled. No lock is needed during
3524 // allocation since each thread allocates its own deque.
3525 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3526  kmp_thread_data_t *thread_data) {
3527  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3528  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3529 
3530  // Initialize last stolen task field to "none"
3531  thread_data->td.td_deque_last_stolen = -1;
3532 
3533  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3534  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3535  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3536 
3537  KE_TRACE(
3538  10,
3539  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3540  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3541  // Allocate space for task deque, and zero the deque
3542  // Cannot use __kmp_thread_calloc() because threads not around for
3543  // kmp_reap_task_team( ).
3544  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3545  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3546  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3547 }
3548 
3549 // __kmp_free_task_deque:
3550 // Deallocates a task deque for a particular thread. Happens at library
3551 // deallocation so don't need to reset all thread data fields.
3552 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3553  if (thread_data->td.td_deque != NULL) {
3554  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3555  TCW_4(thread_data->td.td_deque_ntasks, 0);
3556  __kmp_free(thread_data->td.td_deque);
3557  thread_data->td.td_deque = NULL;
3558  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3559  }
3560 }
3561 
3562 // __kmp_realloc_task_threads_data:
3563 // Allocates a threads_data array for a task team, either by allocating an
3564 // initial array or enlarging an existing array. Only the first thread to get
3565  // the lock allocates or enlarges the array and re-initializes the array elements.
3566 // That thread returns "TRUE", the rest return "FALSE".
3567 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3568 // The current size is given by task_team -> tt.tt_max_threads.
3569 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3570  kmp_task_team_t *task_team) {
3571  kmp_thread_data_t **threads_data_p;
3572  kmp_int32 nthreads, maxthreads;
3573  int is_init_thread = FALSE;
3574 
3575  if (TCR_4(task_team->tt.tt_found_tasks)) {
3576  // Already reallocated and initialized.
3577  return FALSE;
3578  }
3579 
3580  threads_data_p = &task_team->tt.tt_threads_data;
3581  nthreads = task_team->tt.tt_nproc;
3582  maxthreads = task_team->tt.tt_max_threads;
3583 
3584  // All threads must lock when they encounter the first task of the implicit
3585  // task region to make sure threads_data fields are (re)initialized before
3586  // used.
3587  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3588 
3589  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3590  // first thread to enable tasking
3591  kmp_team_t *team = thread->th.th_team;
3592  int i;
3593 
3594  is_init_thread = TRUE;
3595  if (maxthreads < nthreads) {
3596 
3597  if (*threads_data_p != NULL) {
3598  kmp_thread_data_t *old_data = *threads_data_p;
3599  kmp_thread_data_t *new_data = NULL;
3600 
3601  KE_TRACE(
3602  10,
3603  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3604  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3605  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3606  // Reallocate threads_data to have more elements than current array
3607  // Cannot use __kmp_thread_realloc() because threads not around for
3608  // kmp_reap_task_team( ). Note all new array entries are initialized
3609  // to zero by __kmp_allocate().
3610  new_data = (kmp_thread_data_t *)__kmp_allocate(
3611  nthreads * sizeof(kmp_thread_data_t));
3612  // copy old data to new data
3613  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3614  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3615 
3616  // Install the new data and free the old data
3617  (*threads_data_p) = new_data;
3618  __kmp_free(old_data);
3619  } else {
3620  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3621  "threads data for task_team %p, size = %d\n",
3622  __kmp_gtid_from_thread(thread), task_team, nthreads));
3623  // Make the initial allocate for threads_data array, and zero entries
3624  // Cannot use __kmp_thread_calloc() because threads not around for
3625  // kmp_reap_task_team( ).
3626  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3627  nthreads * sizeof(kmp_thread_data_t));
3628  }
3629  task_team->tt.tt_max_threads = nthreads;
3630  } else {
3631  // If array has (more than) enough elements, go ahead and use it
3632  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3633  }
3634 
3635  // initialize threads_data pointers back to thread_info structures
3636  for (i = 0; i < nthreads; i++) {
3637  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3638  thread_data->td.td_thr = team->t.t_threads[i];
3639 
3640  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3641  // The last stolen field survives across teams / barrier, and the number
3642  // of threads may have changed. It's possible (likely?) that a new
3643  // parallel region will exhibit the same behavior as the previous region.
3644  thread_data->td.td_deque_last_stolen = -1;
3645  }
3646  }
3647 
3648  KMP_MB();
3649  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3650  }
3651 
3652  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3653  return is_init_thread;
3654 }
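
// The function above follows the same check / lock / re-check idiom as the
// deque removal paths earlier in this file: a cheap unsynchronized read keeps
// the common case lock-free, and the state is re-checked under the lock before
// doing the one-time work. A generic sketch of the idiom (illustrative only;
// 'init_flag', 'init_lock' and 'do_one_time_init' are hypothetical names):
#if 0
if (TCR_4(init_flag)) // fast path: already initialized, no lock taken
  return FALSE;
__kmp_acquire_bootstrap_lock(&init_lock);
if (!TCR_4(init_flag)) { // re-check now that the lock is held
  do_one_time_init();
  KMP_MB();                    // publish the initialized data first,
  TCW_SYNC_4(init_flag, TRUE); // then make the flag visible
}
__kmp_release_bootstrap_lock(&init_lock);
#endif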
3655 
3656 // __kmp_free_task_threads_data:
3657 // Deallocates a threads_data array for a task team, including any attached
3658 // tasking deques. Only occurs at library shutdown.
3659 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3660  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3661  if (task_team->tt.tt_threads_data != NULL) {
3662  int i;
3663  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3664  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3665  }
3666  __kmp_free(task_team->tt.tt_threads_data);
3667  task_team->tt.tt_threads_data = NULL;
3668  }
3669  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3670 }
3671 
3672 // __kmp_free_task_pri_list:
3673 // Deallocates tasking deques used for priority tasks.
3674 // Only occurs at library shutdown.
3675 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3676  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3677  if (task_team->tt.tt_task_pri_list != NULL) {
3678  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3679  while (list != NULL) {
3680  kmp_task_pri_t *next = list->next;
3681  __kmp_free_task_deque(&list->td);
3682  __kmp_free(list);
3683  list = next;
3684  }
3685  task_team->tt.tt_task_pri_list = NULL;
3686  }
3687  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3688 }
3689 
3690 static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3691  kmp_team_t *team) {
3692  int team_nth = team->t.t_nproc;
3693  // Only need to init if the task team isn't active or the team size changed
3694  if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3695  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3696  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3697  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3698  TCW_4(task_team->tt.tt_nproc, team_nth);
3699  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3700  TCW_4(task_team->tt.tt_active, TRUE);
3701  }
3702 }
3703 
3704 // __kmp_allocate_task_team:
3705 // Allocates a task team associated with a specific team, taking it from
3706 // the global task team free list if possible. Also initializes data
3707 // structures.
3708 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3709  kmp_team_t *team) {
3710  kmp_task_team_t *task_team = NULL;
3711 
3712  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3713  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3714 
3715  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3716  // Take a task team from the task team pool
3717  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3718  if (__kmp_free_task_teams != NULL) {
3719  task_team = __kmp_free_task_teams;
3720  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3721  task_team->tt.tt_next = NULL;
3722  }
3723  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3724  }
3725 
3726  if (task_team == NULL) {
3727  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3728  "task team for team %p\n",
3729  __kmp_gtid_from_thread(thread), team));
3730  // Allocate a new task team if one is not available. Cannot use
3731  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3732  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3733  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3734  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3735 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3736  // suppress race condition detection on synchronization flags in debug mode;
3737  // this helps to analyze library internals by eliminating false positives
3738  __itt_suppress_mark_range(
3739  __itt_suppress_range, __itt_suppress_threading_errors,
3740  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3741  __itt_suppress_mark_range(__itt_suppress_range,
3742  __itt_suppress_threading_errors,
3743  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3744  sizeof(task_team->tt.tt_active));
3745 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3746  // Note: __kmp_allocate zeroes the returned memory, otherwise we would need:
3747  // task_team->tt.tt_threads_data = NULL;
3748  // task_team->tt.tt_max_threads = 0;
3749  // task_team->tt.tt_next = NULL;
3750  }
3751 
3752  __kmp_task_team_init(task_team, team);
3753 
3754  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3755  "unfinished_threads init'd to %d\n",
3756  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3757  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3758  return task_team;
3759 }
3760 
3761 // __kmp_free_task_team:
3762 // Frees the task team associated with a specific thread, and adds it
3763 // to the global task team free list.
3764 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3765  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3766  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3767 
3768  // Put task team back on free list
3769  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3770 
3771  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3772  task_team->tt.tt_next = __kmp_free_task_teams;
3773  TCW_PTR(__kmp_free_task_teams, task_team);
3774 
3775  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3776 }
3777 
3778 // __kmp_reap_task_teams:
3779 // Free all the task teams on the task team free list.
3780 // Should only be done during library shutdown.
3781 // Cannot do anything that needs a thread structure or gtid since they are
3782 // already gone.
3783 void __kmp_reap_task_teams(void) {
3784  kmp_task_team_t *task_team;
3785 
3786  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3787  // Free all task_teams on the free list
3788  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3789  while ((task_team = __kmp_free_task_teams) != NULL) {
3790  __kmp_free_task_teams = task_team->tt.tt_next;
3791  task_team->tt.tt_next = NULL;
3792 
3793  // Free threads_data if necessary
3794  if (task_team->tt.tt_threads_data != NULL) {
3795  __kmp_free_task_threads_data(task_team);
3796  }
3797  if (task_team->tt.tt_task_pri_list != NULL) {
3798  __kmp_free_task_pri_list(task_team);
3799  }
3800  __kmp_free(task_team);
3801  }
3802  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3803  }
3804 }
3805 
3806 // View the array of two task team pointers as a pair of pointers:
3807 // 1) a single task_team pointer
3808 // 2) next pointer for stack
3809 // Serial teams can create a stack of task teams for nested serial teams.
3810 void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
3811  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3812  kmp_task_team_list_t *current =
3813  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3814  kmp_task_team_list_t *node =
3815  (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
3816  node->task_team = current->task_team;
3817  node->next = current->next;
3818  thread->th.th_task_team = current->task_team = NULL;
3819  current->next = node;
3820 }
3821 
3822 // Serial team pops a task team off the stack
3823 void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
3824  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3825  kmp_task_team_list_t *current =
3826  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3827  if (current->task_team) {
3828  __kmp_free_task_team(thread, current->task_team);
3829  }
3830  kmp_task_team_list_t *next = current->next;
3831  if (next) {
3832  current->task_team = next->task_team;
3833  current->next = next->next;
3834  KMP_DEBUG_ASSERT(next != current);
3835  __kmp_free(next);
3836  thread->th.th_task_team = current->task_team;
3837  }
3838 }
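
// Illustrative picture of the stack managed by the two routines above (when
// exactly they are invoked is determined by the serialized-parallel code
// outside this file): the pair of task team pointers in a serial team is
// reinterpreted as {task_team, next}, so pushing a level saves the current
// task team in a heap node and clears the slot for the nested serial level,
// and popping frees the nested level's task team and restores the saved one.
//
//   t_task_team viewed as kmp_task_team_list_t:
//
//     current = { task_team: innermost level, next }
//                                               |
//                                               v
//                    node { task_team: enclosing level, next } -> ...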
3839 
3840 // __kmp_wait_to_unref_task_teams:
3841 // Some threads could still be in the fork barrier release code, possibly
3842 // trying to steal tasks. Wait for each thread to unreference its task team.
3843 void __kmp_wait_to_unref_task_teams(void) {
3844  kmp_info_t *thread;
3845  kmp_uint32 spins;
3846  kmp_uint64 time;
3847  int done;
3848 
3849  KMP_INIT_YIELD(spins);
3850  KMP_INIT_BACKOFF(time);
3851 
3852  for (;;) {
3853  done = TRUE;
3854 
3855  // TODO: GEH - this may be wrong because some sync would be necessary
3856  // in case threads are added to the pool during the traversal. Need to
3857  // verify that lock for thread pool is held when calling this routine.
3858  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3859  thread = thread->th.th_next_pool) {
3860 #if KMP_OS_WINDOWS
3861  DWORD exit_val;
3862 #endif
3863  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3864  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3865  __kmp_gtid_from_thread(thread)));
3866  continue;
3867  }
3868 #if KMP_OS_WINDOWS
3869  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3870  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3871  thread->th.th_task_team = NULL;
3872  continue;
3873  }
3874 #endif
3875 
3876  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3877 
3878  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3879  "unreference task_team\n",
3880  __kmp_gtid_from_thread(thread)));
3881 
3882  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3883  void *sleep_loc;
3884  // If the thread is sleeping, awaken it.
3885  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3886  NULL) {
3887  KA_TRACE(
3888  10,
3889  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3890  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3891  __kmp_null_resume_wrapper(thread);
3892  }
3893  }
3894  }
3895  if (done) {
3896  break;
3897  }
3898 
3899  // If oversubscribed or have waited a bit, yield.
3900  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3901  }
3902 }
3903 
3904 // __kmp_task_team_setup: Create a task_team for the current team, but use
3905 // an already created, unused one if it already exists.
3906 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
3907  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3908 
3909  // For the serial and root teams, setup the first task team pointer to point
3910  // to task team. The other pointer is a stack of task teams from previous
3911  // serial levels.
3912  if (team == this_thr->th.th_serial_team ||
3913  team == this_thr->th.th_root->r.r_root_team) {
3914  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3915  if (team->t.t_task_team[0] == NULL) {
3916  team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
3917  KA_TRACE(
3918  20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3919  " for serial/root team %p\n",
3920  __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
3921 
3922  } else
3923  __kmp_task_team_init(team->t.t_task_team[0], team);
3924  return;
3925  }
3926 
3927  // If this task_team hasn't been created yet, allocate it. It will be used in
3928  // the region after the next.
3929  // If it exists, it is the current task team and shouldn't be touched yet as
3930  // it may still be in use.
3931  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
3932  team->t.t_task_team[this_thr->th.th_task_state] =
3933  __kmp_allocate_task_team(this_thr, team);
3934  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3935  " for team %d at parity=%d\n",
3936  __kmp_gtid_from_thread(this_thr),
3937  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3938  this_thr->th.th_task_state));
3939  }
3940 
3941  // After threads exit the release, they will call sync, and then point to this
3942  // other task_team; make sure it is allocated and properly initialized. As
3943  // threads spin in the barrier release phase, they will continue to use the
3944  // previous task_team struct(above), until they receive the signal to stop
3945  // checking for tasks (they can't safely reference the kmp_team_t struct,
3946  // which could be reallocated by the primary thread).
3947  int other_team = 1 - this_thr->th.th_task_state;
3948  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3949  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3950  team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
3951  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3952  "task_team %p for team %d at parity=%d\n",
3953  __kmp_gtid_from_thread(this_thr),
3954  team->t.t_task_team[other_team], team->t.t_id, other_team));
3955  } else { // Leave the old task team struct in place for the upcoming region;
3956  // adjust as needed
3957  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3958  __kmp_task_team_init(task_team, team);
3959  // if team size has changed, the first thread to enable tasking will
3960  // realloc threads_data if necessary
3961  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
3962  "%p for team %d at parity=%d\n",
3963  __kmp_gtid_from_thread(this_thr),
3964  team->t.t_task_team[other_team], team->t.t_id, other_team));
3965  }
3966 
3967  // For a regular thread, task enabling should be called when the task is going
3968  // to be pushed to a deque. However, for the hidden helper thread, we need
3969  // it ahead of time so that some operations can be performed without race
3970  // conditions.
3971  if (this_thr == __kmp_hidden_helper_main_thread) {
3972  for (int i = 0; i < 2; ++i) {
3973  kmp_task_team_t *task_team = team->t.t_task_team[i];
3974  if (KMP_TASKING_ENABLED(task_team)) {
3975  continue;
3976  }
3977  __kmp_enable_tasking(task_team, this_thr);
3978  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
3979  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
3980  if (thread_data->td.td_deque == NULL) {
3981  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
3982  }
3983  }
3984  }
3985  }
3986 }
3987 
3988 // __kmp_task_team_sync: Propagation of task team data from team to threads
3989 // which happens just after the release phase of a team barrier. This may be
3990 // called by any thread. This is not called for serial or root teams.
3991 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
3992  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3993  KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
3994  KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
3995 
3996  // Toggle the th_task_state field, to switch which task_team this thread
3997  // refers to
3998  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
3999 
4000  // It is now safe to propagate the task team pointer from the team struct to
4001  // the current thread.
4002  TCW_PTR(this_thr->th.th_task_team,
4003  team->t.t_task_team[this_thr->th.th_task_state]);
4004  KA_TRACE(20,
4005  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4006  "%p from Team #%d (parity=%d)\n",
4007  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4008  team->t.t_id, this_thr->th.th_task_state));
4009 }
4010 
4011 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4012 // barrier gather phase. Only called by the primary thread.
4013 //
4014 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4015 // by passing in 0 optionally as the last argument. When wait is zero, primary
4016 // thread does not wait for unfinished_threads to reach 0.
4017 void __kmp_task_team_wait(
4018  kmp_info_t *this_thr,
4019  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4020  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4021 
4022  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4023  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4024 
4025  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4026  if (wait) {
4027  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4028  "(for unfinished_threads to reach 0) on task_team = %p\n",
4029  __kmp_gtid_from_thread(this_thr), task_team));
4030  // Worker threads may have dropped through to release phase, but could
4031  // still be executing tasks. Wait here for tasks to complete. To avoid
4032  // memory contention, only primary thread checks termination condition.
4033  kmp_flag_32<false, false> flag(
4034  RCAST(std::atomic<kmp_uint32> *,
4035  &task_team->tt.tt_unfinished_threads),
4036  0U);
4037  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4038  }
4039  // Deactivate the old task team, so that the worker threads will stop
4040  // referencing it while spinning.
4041  KA_TRACE(
4042  20,
4043  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4044  "setting active to false, setting local and team's pointer to NULL\n",
4045  __kmp_gtid_from_thread(this_thr), task_team));
4046  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4047  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4048  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4049  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4050  KMP_MB();
4051 
4052  TCW_PTR(this_thr->th.th_task_team, NULL);
4053  }
4054 }
4055 
4056 // __kmp_tasking_barrier:
4057 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4058 // Internal function to execute all tasks prior to a regular barrier or a join
4059 // barrier. It is a full barrier itself, which unfortunately turns regular
4060 // barriers into double barriers and join barriers into 1 1/2 barriers.
4061 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4062  std::atomic<kmp_uint32> *spin = RCAST(
4063  std::atomic<kmp_uint32> *,
4064  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4065  int flag = FALSE;
4066  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4067 
4068 #if USE_ITT_BUILD
4069  KMP_FSYNC_SPIN_INIT(spin, NULL);
4070 #endif /* USE_ITT_BUILD */
4071  kmp_flag_32<false, false> spin_flag(spin, 0U);
4072  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4073  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4074 #if USE_ITT_BUILD
4075  // TODO: What about itt_sync_obj??
4076  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4077 #endif /* USE_ITT_BUILD */
4078 
4079  if (TCR_4(__kmp_global.g.g_done)) {
4080  if (__kmp_global.g.g_abort)
4081  __kmp_abort_thread();
4082  break;
4083  }
4084  KMP_YIELD(TRUE);
4085  }
4086 #if USE_ITT_BUILD
4087  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4088 #endif /* USE_ITT_BUILD */
4089 }
4090 
4091 // __kmp_give_task puts a task into a given thread queue if:
4092 // - the queue for that thread was created
4093 // - there's space in that queue
4094 // Because of this, __kmp_push_task needs to check if there's space after
4095 // getting the lock
4096 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4097  kmp_int32 pass) {
4098  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4099  kmp_task_team_t *task_team = taskdata->td_task_team;
4100 
4101  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4102  taskdata, tid));
4103 
4104  // If task_team is NULL something went really bad...
4105  KMP_DEBUG_ASSERT(task_team != NULL);
4106 
4107  bool result = false;
4108  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4109 
4110  if (thread_data->td.td_deque == NULL) {
4111  // There's no queue in this thread, go find another one
4112  // We're guaranteed that at least one thread has a queue
4113  KA_TRACE(30,
4114  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4115  tid, taskdata));
4116  return result;
4117  }
4118 
4119  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4120  TASK_DEQUE_SIZE(thread_data->td)) {
4121  KA_TRACE(
4122  30,
4123  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4124  taskdata, tid));
4125 
4126  // if this deque is bigger than the pass ratio give a chance to another
4127  // thread
4128  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4129  return result;
4130 
4131  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4132  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4133  TASK_DEQUE_SIZE(thread_data->td)) {
4134  // expand deque to push the task which is not allowed to execute
4135  __kmp_realloc_task_deque(thread, thread_data);
4136  }
4137 
4138  } else {
4139 
4140  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4141 
4142  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4143  TASK_DEQUE_SIZE(thread_data->td)) {
4144  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4145  "thread %d.\n",
4146  taskdata, tid));
4147 
4148  // if this deque is bigger than the pass ratio give a chance to another
4149  // thread
4150  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4151  goto release_and_exit;
4152 
4153  __kmp_realloc_task_deque(thread, thread_data);
4154  }
4155  }
4156 
4157  // lock is held here, and there is space in the deque
4158 
4159  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4160  // Wrap index.
4161  thread_data->td.td_deque_tail =
4162  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4163  TCW_4(thread_data->td.td_deque_ntasks,
4164  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4165 
4166  result = true;
4167  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4168  taskdata, tid));
4169 
4170 release_and_exit:
4171  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4172 
4173  return result;
4174 }
4175 
4176 #define PROXY_TASK_FLAG 0x40000000
4177 /* The finish of the proxy tasks is divided into two pieces:
4178  - the top half is the one that can be done from a thread outside the team
4179  - the bottom half must be run from a thread within the team
4180 
4181  In order to run the bottom half the task gets queued back into one of the
4182  threads of the team. Once the td_incomplete_child_tasks counter of the parent
4183  is decremented, the threads can leave the barriers. So, the bottom half needs
4184  to be queued before the counter is decremented. The top half is therefore
4185  divided into two parts:
4186  - things that can be run before queuing the bottom half
4187  - things that must be run after queuing the bottom half
4188 
4189  This creates a second race as the bottom half can free the task before the
4190  second top half is executed. To avoid this, we use the
4191  td_incomplete_child_tasks counter of the proxy task to synchronize the top and
4192  bottom half. */
4193 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4194  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4195  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4196  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4197  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4198 
4199  taskdata->td_flags.complete = 1; // mark the task as completed
4200 #if OMPX_TASKGRAPH
4201  taskdata->td_flags.onced = 1;
4202 #endif
4203 
4204  if (taskdata->td_taskgroup)
4205  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4206 
4207  // Create an imaginary child for this task so the bottom half cannot
4208  // release the task before we have completed the second top half
4209  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4210 }
4211 
4212 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4213 #if KMP_DEBUG
4214  kmp_int32 children = 0;
4215  // Predecrement simulated by "- 1" calculation
4216  children = -1 +
4217 #endif
4218  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4219  KMP_DEBUG_ASSERT(children >= 0);
4220 
4221  // Remove the imaginary child
4222  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4223 }
4224 
4225 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4226  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4227  kmp_info_t *thread = __kmp_threads[gtid];
4228 
4229  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4230  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4231  1); // top half must run before bottom half
4232 
4233  // We need to wait to make sure the top half is finished
4234  // Spinning here should be ok as this should happen quickly
4235  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4236  PROXY_TASK_FLAG) > 0)
4237  ;
4238 
4239  __kmp_release_deps(gtid, taskdata);
4240  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4241 }
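
// How the three pieces above are sequenced by the two completion entry points
// that follow (illustrative summary of the protocol described before
// __kmp_first_top_half_finish_proxy):
//
//   __kmpc_proxy_task_completed      (caller runs inside the team):
//       first top half -> second top half -> bottom half (run directly)
//
//   __kmpc_proxy_task_completed_ooo  (caller runs outside the team):
//       first top half -> queue the bottom half via __kmpc_give_task()
//                      -> second top half
//
// In the out-of-team case the second top half may only run after the bottom
// half has been queued, and PROXY_TASK_FLAG keeps the queued bottom half from
// freeing the task before the second top half has finished.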
4242 
4251 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4252  KMP_DEBUG_ASSERT(ptask != NULL);
4253  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4254  KA_TRACE(
4255  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4256  gtid, taskdata));
4257  __kmp_assert_valid_gtid(gtid);
4258  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4259 
4260  __kmp_first_top_half_finish_proxy(taskdata);
4261  __kmp_second_top_half_finish_proxy(taskdata);
4262  __kmp_bottom_half_finish_proxy(gtid, ptask);
4263 
4264  KA_TRACE(10,
4265  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4266  gtid, taskdata));
4267 }
4268 
4269 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4270  KMP_DEBUG_ASSERT(ptask != NULL);
4271  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4272 
4273  // Enqueue the task so that its bottom-half completion runs on a thread within
4274  // the corresponding team
4275  kmp_team_t *team = taskdata->td_team;
4276  kmp_int32 nthreads = team->t.t_nproc;
4277  kmp_info_t *thread;
4278 
4279  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4280  // but we cannot use __kmp_get_random here
4281  kmp_int32 start_k = start % nthreads;
4282  kmp_int32 pass = 1;
4283  kmp_int32 k = start_k;
4284 
4285  do {
4286  // For now we're just linearly trying to find a thread
4287  thread = team->t.t_threads[k];
4288  k = (k + 1) % nthreads;
4289 
4290  // we did a full pass through all the threads
4291  if (k == start_k)
4292  pass = pass << 1;
4293 
4294  } while (!__kmp_give_task(thread, k, ptask, pass));
4295 
4296  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4297  // wake at least one thread to execute the given task
4298  for (int i = 0; i < nthreads; ++i) {
4299  thread = team->t.t_threads[i];
4300  if (thread->th.th_sleep_loc != NULL) {
4301  __kmp_null_resume_wrapper(thread);
4302  break;
4303  }
4304  }
4305  }
4306 }
4307 
4315 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4316  KMP_DEBUG_ASSERT(ptask != NULL);
4317  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4318 
4319  KA_TRACE(
4320  10,
4321  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4322  taskdata));
4323 
4324  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4325 
4326  __kmp_first_top_half_finish_proxy(taskdata);
4327 
4328  __kmpc_give_task(ptask);
4329 
4330  __kmp_second_top_half_finish_proxy(taskdata);
4331 
4332  KA_TRACE(
4333  10,
4334  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4335  taskdata));
4336 }
4337 
4338 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4339  kmp_task_t *task) {
4340  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4341  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4342  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4343  td->td_allow_completion_event.ed.task = task;
4344  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4345  }
4346  return &td->td_allow_completion_event;
4347 }
4348 
4349 void __kmp_fulfill_event(kmp_event_t *event) {
4350  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4351  kmp_task_t *ptask = event->ed.task;
4352  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4353  bool detached = false;
4354  int gtid = __kmp_get_gtid();
4355 
4356  // The associated task might have completed or could be completing at this
4357  // point.
4358  // We need to take the lock to avoid races
4359  __kmp_acquire_tas_lock(&event->lock, gtid);
4360  if (taskdata->td_flags.proxy == TASK_PROXY) {
4361  detached = true;
4362  } else {
4363 #if OMPT_SUPPORT
4364  // The OMPT event must occur under mutual exclusion,
4365  // otherwise the tool might access ptask after free
4366  if (UNLIKELY(ompt_enabled.enabled))
4367  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4368 #endif
4369  }
4370  event->type = KMP_EVENT_UNINITIALIZED;
4371  __kmp_release_tas_lock(&event->lock, gtid);
4372 
4373  if (detached) {
4374 #if OMPT_SUPPORT
4375  // We free ptask afterwards and know the task is finished,
4376  // so locking is not necessary
4377  if (UNLIKELY(ompt_enabled.enabled))
4378  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4379 #endif
4380  // If the task detached complete the proxy task
4381  if (gtid >= 0) {
4382  kmp_team_t *team = taskdata->td_team;
4383  kmp_info_t *thread = __kmp_get_thread();
4384  if (thread->th.th_team == team) {
4385  __kmpc_proxy_task_completed(gtid, ptask);
4386  return;
4387  }
4388  }
4389 
4390  // fallback
4391  __kmpc_proxy_task_completed_ooo(ptask);
4392  }
4393  }
4394 }
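// [Editorial illustration, not part of the runtime source.] __kmp_fulfill_event
// is reached from the user-level omp_fulfill_event() call on an event obtained
// for a task with the detach clause. A hedged user-level sketch (the
// start_async_io() helper and 'buffer' are hypothetical):
//
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)
//   { start_async_io(buffer, ev); }   // hand ev to some completion callback
//   ...
//   // later, possibly from a non-OpenMP thread inside that callback:
//   omp_fulfill_event(ev);
//
// If the task body has already finished (the task detached and became a proxy
// task), the call above takes the proxy-completion path; otherwise it is an
// early fulfill and only the event is reset (plus the OMPT callback) here.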
4395 
4396 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4397 // for taskloop
4398 //
4399 // thread: allocating thread
4400 // task_src: pointer to source task to be duplicated
4401  // taskloop_recur: used only when dealing with taskgraph,
4402  // indicating whether we need to update the taskgraph task id (td_tdg_task_id)
4403 // returns: a pointer to the allocated kmp_task_t structure (task).
4404 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4405 #if OMPX_TASKGRAPH
4406  , int taskloop_recur
4407 #endif
4408 ) {
4409  kmp_task_t *task;
4410  kmp_taskdata_t *taskdata;
4411  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4412  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4413  size_t shareds_offset;
4414  size_t task_size;
4415 
4416  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4417  task_src));
4418  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4419  TASK_FULL); // it should not be proxy task
4420  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4421  task_size = taskdata_src->td_size_alloc;
4422 
4423  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4424  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4425  task_size));
4426 #if USE_FAST_MEMORY
4427  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4428 #else
4429  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4430 #endif /* USE_FAST_MEMORY */
4431  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4432 
4433  task = KMP_TASKDATA_TO_TASK(taskdata);
4434 
4435  // Initialize new task (only specific fields not affected by memcpy)
4436 #if OMPX_TASKGRAPH
4437  if (taskdata->is_taskgraph && !taskloop_recur &&
4438  __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4439  taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4440 #endif
4441  taskdata->td_task_id = KMP_GEN_TASK_ID();
4442  if (task->shareds != NULL) { // need to set up shareds pointer
4443  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4444  task->shareds = &((char *)taskdata)[shareds_offset];
4445  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4446  0);
4447  }
4448  taskdata->td_alloc_thread = thread;
4449  taskdata->td_parent = parent_task;
4450  // task inherits the taskgroup from the parent task
4451  taskdata->td_taskgroup = parent_task->td_taskgroup;
4452  // tied task needs to initialize the td_last_tied at creation,
4453  // untied one does this when it is scheduled for execution
4454  if (taskdata->td_flags.tiedness == TASK_TIED)
4455  taskdata->td_last_tied = taskdata;
4456 
4457  // Only need to keep track of child task counts if team parallel and tasking
4458  // not serialized
4459  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4460  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4461  if (parent_task->td_taskgroup)
4462  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4463  // Only need to keep track of allocated child tasks for explicit tasks,
4464  // since implicit tasks are not deallocated
4465  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4466  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4467  }
4468 
4469  KA_TRACE(20,
4470  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4471  thread, taskdata, taskdata->td_parent));
4472 #if OMPT_SUPPORT
4473  if (UNLIKELY(ompt_enabled.enabled))
4474  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4475 #endif
4476  return task;
4477 }
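// [Editorial illustration, not part of the runtime source.] After the memcpy
// above, task->shareds in the copy still points into the *source* block, so it
// is rebased by re-applying its byte offset relative to the new taskdata. A
// standalone sketch of that rebasing idiom (struct and names are hypothetical):
//
//   #include <cstdlib>
//   #include <cstring>
//   struct blob { char *cursor; char data[64]; };      // cursor points into data
//   blob *dup_blob(const blob *src) {
//     blob *dst = (blob *)malloc(sizeof(blob));
//     memcpy(dst, src, sizeof(blob));                   // dst->cursor still aims at src
//     size_t off = (size_t)(src->cursor - (const char *)src); // offset inside src
//     dst->cursor = (char *)dst + off;                  // rebase, as done for shareds
//     return dst;
//   }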
4478 
4479 // Routine optionally generated by the compiler for setting the lastprivate flag
4480 // and calling needed constructors for private/firstprivate objects
4481 // (used to form taskloop tasks from pattern task)
4482 // Parameters: dest task, src task, lastprivate flag.
4483 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4484 
4485 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4486 
4487 // class to encapsulate manipulating loop bounds in a taskloop task.
4488 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4489 // the loop bound variables.
4490 class kmp_taskloop_bounds_t {
4491  kmp_task_t *task;
4492  const kmp_taskdata_t *taskdata;
4493  size_t lower_offset;
4494  size_t upper_offset;
4495 
4496 public:
4497  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4498  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4499  lower_offset((char *)lb - (char *)task),
4500  upper_offset((char *)ub - (char *)task) {
4501  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4502  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4503  }
4504  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4505  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4506  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4507  size_t get_lower_offset() const { return lower_offset; }
4508  size_t get_upper_offset() const { return upper_offset; }
4509  kmp_uint64 get_lb() const {
4510  kmp_int64 retval;
4511 #if defined(KMP_GOMP_COMPAT)
4512  // Intel task just returns the lower bound normally
4513  if (!taskdata->td_flags.native) {
4514  retval = *(kmp_int64 *)((char *)task + lower_offset);
4515  } else {
4516  // GOMP task has to take into account the sizeof(long)
4517  if (taskdata->td_size_loop_bounds == 4) {
4518  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4519  retval = (kmp_int64)*lb;
4520  } else {
4521  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4522  retval = (kmp_int64)*lb;
4523  }
4524  }
4525 #else
4526  (void)taskdata;
4527  retval = *(kmp_int64 *)((char *)task + lower_offset);
4528 #endif // defined(KMP_GOMP_COMPAT)
4529  return retval;
4530  }
4531  kmp_uint64 get_ub() const {
4532  kmp_int64 retval;
4533 #if defined(KMP_GOMP_COMPAT)
4534  // Intel task just returns the upper bound normally
4535  if (!taskdata->td_flags.native) {
4536  retval = *(kmp_int64 *)((char *)task + upper_offset);
4537  } else {
4538  // GOMP task has to take into account the sizeof(long)
4539  if (taskdata->td_size_loop_bounds == 4) {
4540  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4541  retval = (kmp_int64)*ub;
4542  } else {
4543  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4544  retval = (kmp_int64)*ub;
4545  }
4546  }
4547 #else
4548  retval = *(kmp_int64 *)((char *)task + upper_offset);
4549 #endif // defined(KMP_GOMP_COMPAT)
4550  return retval;
4551  }
4552  void set_lb(kmp_uint64 lb) {
4553 #if defined(KMP_GOMP_COMPAT)
4554  // Intel task just sets the lower bound normally
4555  if (!taskdata->td_flags.native) {
4556  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4557  } else {
4558  // GOMP task has to take into account the sizeof(long)
4559  if (taskdata->td_size_loop_bounds == 4) {
4560  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4561  *lower = (kmp_uint32)lb;
4562  } else {
4563  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4564  *lower = (kmp_uint64)lb;
4565  }
4566  }
4567 #else
4568  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4569 #endif // defined(KMP_GOMP_COMPAT)
4570  }
4571  void set_ub(kmp_uint64 ub) {
4572 #if defined(KMP_GOMP_COMPAT)
4573  // Intel task just sets the upper bound normally
4574  if (!taskdata->td_flags.native) {
4575  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4576  } else {
4577  // GOMP task has to take into account the sizeof(long)
4578  if (taskdata->td_size_loop_bounds == 4) {
4579  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4580  *upper = (kmp_uint32)ub;
4581  } else {
4582  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4583  *upper = (kmp_uint64)ub;
4584  }
4585  }
4586 #else
4587  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4588 #endif // defined(KMP_GOMP_COMPAT)
4589  }
4590 };
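// [Editorial note.] The class above hides two different layouts: for
// Intel/LLVM-compiled tasks (td_flags.native == 0) the bounds live at fixed
// byte offsets inside the task, i.e. where the lb/ub pointers passed to the
// taskloop entry point refer to; for GOMP (native) tasks they are the first
// two elements of task->shareds, sized like a C 'long'. Roughly:
//
//   // native == 0 (Intel/LLVM layout):
//   lb = *(kmp_uint64 *)((char *)task + lower_offset);
//   ub = *(kmp_uint64 *)((char *)task + upper_offset);
//   // native == 1 (GOMP layout, 64-bit long):
//   lb = ((kmp_uint64 *)task->shareds)[0];
//   ub = ((kmp_uint64 *)task->shareds)[1];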
4591 
4592 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4593 //
4594 // loc Source location information
4595 // gtid Global thread ID
4596 // task Pattern task, exposes the loop iteration range
4597 // lb Pointer to loop lower bound in task structure
4598 // ub Pointer to loop upper bound in task structure
4599 // st Loop stride
4600 // ub_glob Global upper bound (used for lastprivate check)
4601 // num_tasks Number of tasks to execute
4602 // grainsize Number of loop iterations per task
4603 // extras Number of chunks with grainsize+1 iterations
4604 // last_chunk Reduction of grainsize for last task
4605 // tc Iterations count
4606 // task_dup Tasks duplication routine
4607 // codeptr_ra Return address for OMPT events
4608 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4609  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4610  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4611  kmp_uint64 grainsize, kmp_uint64 extras,
4612  kmp_int64 last_chunk, kmp_uint64 tc,
4613 #if OMPT_SUPPORT
4614  void *codeptr_ra,
4615 #endif
4616  void *task_dup) {
4617  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4618  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4619  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4620  // compiler provides global bounds here
4621  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4622  kmp_uint64 lower = task_bounds.get_lb();
4623  kmp_uint64 upper = task_bounds.get_ub();
4624  kmp_uint64 i;
4625  kmp_info_t *thread = __kmp_threads[gtid];
4626  kmp_taskdata_t *current_task = thread->th.th_current_task;
4627  kmp_task_t *next_task;
4628  kmp_int32 lastpriv = 0;
4629 
4630  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4631  (last_chunk < 0 ? last_chunk : extras));
4632  KMP_DEBUG_ASSERT(num_tasks > extras);
4633  KMP_DEBUG_ASSERT(num_tasks > 0);
4634  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4635  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4636  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4637  ub_glob, st, task_dup));
4638 
4639  // Launch num_tasks tasks, assign grainsize iterations each task
4640  for (i = 0; i < num_tasks; ++i) {
4641  kmp_uint64 chunk_minus_1;
4642  if (extras == 0) {
4643  chunk_minus_1 = grainsize - 1;
4644  } else {
4645  chunk_minus_1 = grainsize;
4646  --extras; // first extras iterations get bigger chunk (grainsize+1)
4647  }
4648  upper = lower + st * chunk_minus_1;
4649  if (upper > *ub) {
4650  upper = *ub;
4651  }
4652  if (i == num_tasks - 1) {
4653  // schedule the last task, set lastprivate flag if needed
4654  if (st == 1) { // most common case
4655  KMP_DEBUG_ASSERT(upper == *ub);
4656  if (upper == ub_glob)
4657  lastpriv = 1;
4658  } else if (st > 0) { // positive loop stride
4659  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4660  if ((kmp_uint64)st > ub_glob - upper)
4661  lastpriv = 1;
4662  } else { // negative loop stride
4663  KMP_DEBUG_ASSERT(upper + st < *ub);
4664  if (upper - ub_glob < (kmp_uint64)(-st))
4665  lastpriv = 1;
4666  }
4667  }
4668 
4669 #if OMPX_TASKGRAPH
4670  next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4671 #else
4672  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4673 #endif
4674 
4675  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4676  kmp_taskloop_bounds_t next_task_bounds =
4677  kmp_taskloop_bounds_t(next_task, task_bounds);
4678 
4679  // adjust task-specific bounds
4680  next_task_bounds.set_lb(lower);
4681  if (next_taskdata->td_flags.native) {
4682  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4683  } else {
4684  next_task_bounds.set_ub(upper);
4685  }
4686  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4687  // etc.
4688  ptask_dup(next_task, task, lastpriv);
4689  KA_TRACE(40,
4690  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4691  "upper %lld stride %lld, (offsets %p %p)\n",
4692  gtid, i, next_task, lower, upper, st,
4693  next_task_bounds.get_lower_offset(),
4694  next_task_bounds.get_upper_offset()));
4695 #if OMPT_SUPPORT
4696  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4697  codeptr_ra); // schedule new task
4698 #if OMPT_OPTIONAL
4699  if (ompt_enabled.ompt_callback_dispatch) {
4700  OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4701  lower, upper, st);
4702  }
4703 #endif // OMPT_OPTIONAL
4704 #else
4705  __kmp_omp_task(gtid, next_task, true); // schedule new task
4706 #endif
4707  lower = upper + st; // adjust lower bound for the next iteration
4708  }
4709  // free the pattern task and exit
4710  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4711  // do not execute the pattern task, just do internal bookkeeping
4712  __kmp_task_finish<false>(gtid, task, current_task);
4713 }
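// [Editorial worked example.] In the loop above, the first 'extras' tasks
// receive grainsize+1 iterations and the rest receive grainsize. For instance
// (hypothetical values), tc = 10 iterations split into num_tasks = 3 gives
// grainsize = 3, extras = 1, so the chunk sizes handed out are 4, 3, 3:
//
//   i = 0: extras != 0 -> chunk_minus_1 = 3, extras-- (4 iterations)
//   i = 1: extras == 0 -> chunk_minus_1 = 2            (3 iterations)
//   i = 2: extras == 0 -> chunk_minus_1 = 2            (3 iterations)
//
// In strict mode (grainsize modifier) extras is 0 and a negative last_chunk
// instead shortens the final task via the clamp to *ub, e.g. grainsize = 4,
// tc = 10 -> three tasks of 4, 4 and 2 iterations (last_chunk = -2).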
4714 
4715 // Structure to keep taskloop parameters for auxiliary task
4716 // kept in the shareds of the task structure.
4717 typedef struct __taskloop_params {
4718  kmp_task_t *task;
4719  kmp_uint64 *lb;
4720  kmp_uint64 *ub;
4721  void *task_dup;
4722  kmp_int64 st;
4723  kmp_uint64 ub_glob;
4724  kmp_uint64 num_tasks;
4725  kmp_uint64 grainsize;
4726  kmp_uint64 extras;
4727  kmp_int64 last_chunk;
4728  kmp_uint64 tc;
4729  kmp_uint64 num_t_min;
4730 #if OMPT_SUPPORT
4731  void *codeptr_ra;
4732 #endif
4733 } __taskloop_params_t;
4734 
4735 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4736  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4737  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4738  kmp_uint64,
4739 #if OMPT_SUPPORT
4740  void *,
4741 #endif
4742  void *);
4743 
4744 // Execute part of the taskloop submitted as a task.
4745 int __kmp_taskloop_task(int gtid, void *ptask) {
4746  __taskloop_params_t *p =
4747  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4748  kmp_task_t *task = p->task;
4749  kmp_uint64 *lb = p->lb;
4750  kmp_uint64 *ub = p->ub;
4751  void *task_dup = p->task_dup;
4752  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4753  kmp_int64 st = p->st;
4754  kmp_uint64 ub_glob = p->ub_glob;
4755  kmp_uint64 num_tasks = p->num_tasks;
4756  kmp_uint64 grainsize = p->grainsize;
4757  kmp_uint64 extras = p->extras;
4758  kmp_int64 last_chunk = p->last_chunk;
4759  kmp_uint64 tc = p->tc;
4760  kmp_uint64 num_t_min = p->num_t_min;
4761 #if OMPT_SUPPORT
4762  void *codeptr_ra = p->codeptr_ra;
4763 #endif
4764 #if KMP_DEBUG
4765  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4766  KMP_DEBUG_ASSERT(task != NULL);
4767  KA_TRACE(20,
4768  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4769  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4770  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4771  st, task_dup));
4772 #endif
4773  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4774  if (num_tasks > num_t_min)
4775  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4776  grainsize, extras, last_chunk, tc, num_t_min,
4777 #if OMPT_SUPPORT
4778  codeptr_ra,
4779 #endif
4780  task_dup);
4781  else
4782  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4783  grainsize, extras, last_chunk, tc,
4784 #if OMPT_SUPPORT
4785  codeptr_ra,
4786 #endif
4787  task_dup);
4788 
4789  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4790  return 0;
4791 }
4792 
4793 // Schedule part of the taskloop as a task,
4794 // execute the rest of the taskloop.
4795 //
4796 // loc Source location information
4797 // gtid Global thread ID
4798 // task Pattern task, exposes the loop iteration range
4799 // lb Pointer to loop lower bound in task structure
4800 // ub Pointer to loop upper bound in task structure
4801 // st Loop stride
4802 // ub_glob Global upper bound (used for lastprivate check)
4803 // num_tasks Number of tasks to execute
4804 // grainsize Number of loop iterations per task
4805 // extras Number of chunks with grainsize+1 iterations
4806 // last_chunk Reduction of grainsize for last task
4807 // tc Iterations count
4808 // num_t_min Threshold to launch tasks recursively
4809 // task_dup Tasks duplication routine
4810 // codeptr_ra Return address for OMPT events
4811 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4812  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4813  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4814  kmp_uint64 grainsize, kmp_uint64 extras,
4815  kmp_int64 last_chunk, kmp_uint64 tc,
4816  kmp_uint64 num_t_min,
4817 #if OMPT_SUPPORT
4818  void *codeptr_ra,
4819 #endif
4820  void *task_dup) {
4821  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4822  KMP_DEBUG_ASSERT(task != NULL);
4823  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4824  KA_TRACE(20,
4825  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4826  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4827  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4828  st, task_dup));
4829  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4830  kmp_uint64 lower = *lb;
4831  kmp_info_t *thread = __kmp_threads[gtid];
4832  // kmp_taskdata_t *current_task = thread->th.th_current_task;
4833  kmp_task_t *next_task;
4834  size_t lower_offset =
4835  (char *)lb - (char *)task; // remember offset of lb in the task structure
4836  size_t upper_offset =
4837  (char *)ub - (char *)task; // remember offset of ub in the task structure
4838 
4839  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4840  (last_chunk < 0 ? last_chunk : extras));
4841  KMP_DEBUG_ASSERT(num_tasks > extras);
4842  KMP_DEBUG_ASSERT(num_tasks > 0);
4843 
4844  // split the loop in two halves
4845  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4846  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4847  kmp_uint64 gr_size0 = grainsize;
4848  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4849  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4850  if (last_chunk < 0) {
4851  ext0 = ext1 = 0;
4852  last_chunk1 = last_chunk;
4853  tc0 = grainsize * n_tsk0;
4854  tc1 = tc - tc0;
4855  } else if (n_tsk0 <= extras) {
4856  gr_size0++; // integrate extras into grainsize
4857  ext0 = 0; // no extra iters in 1st half
4858  ext1 = extras - n_tsk0; // remaining extras
4859  tc0 = gr_size0 * n_tsk0;
4860  tc1 = tc - tc0;
4861  } else { // n_tsk0 > extras
4862  ext1 = 0; // no extra iters in 2nd half
4863  ext0 = extras;
4864  tc1 = grainsize * n_tsk1;
4865  tc0 = tc - tc1;
4866  }
4867  ub0 = lower + st * (tc0 - 1);
4868  lb1 = ub0 + st;
4869 
4870  // create pattern task for 2nd half of the loop
4871 #if OMPX_TASKGRAPH
4872  next_task = __kmp_task_dup_alloc(thread, task,
4873  /* taskloop_recur */ 1);
4874 #else
4875  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4876 #endif
4877  // adjust lower bound (upper bound is not changed) for the 2nd half
4878  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4879  if (ptask_dup != NULL) // construct firstprivates, etc.
4880  ptask_dup(next_task, task, 0);
4881  *ub = ub0; // adjust upper bound for the 1st half
4882 
4883  // create auxiliary task for 2nd half of the loop
4884  // make sure new task has same parent task as the pattern task
4885  kmp_taskdata_t *current_task = thread->th.th_current_task;
4886  thread->th.th_current_task = taskdata->td_parent;
4887  kmp_task_t *new_task =
4888  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4889  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4890  // restore current task
4891  thread->th.th_current_task = current_task;
4892  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4893  p->task = next_task;
4894  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4895  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4896  p->task_dup = task_dup;
4897  p->st = st;
4898  p->ub_glob = ub_glob;
4899  p->num_tasks = n_tsk1;
4900  p->grainsize = grainsize;
4901  p->extras = ext1;
4902  p->last_chunk = last_chunk1;
4903  p->tc = tc1;
4904  p->num_t_min = num_t_min;
4905 #if OMPT_SUPPORT
4906  p->codeptr_ra = codeptr_ra;
4907 #endif
4908 
4909 #if OMPX_TASKGRAPH
4910  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
4911  new_task_data->tdg = taskdata->tdg;
4912  new_task_data->is_taskgraph = 0;
4913 #endif
4914 
4915 #if OMPT_SUPPORT
4916  // schedule new task with correct return address for OMPT events
4917  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4918 #else
4919  __kmp_omp_task(gtid, new_task, true); // schedule new task
4920 #endif
4921 
4922  // execute the 1st half of current subrange
4923  if (n_tsk0 > num_t_min)
4924  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4925  ext0, last_chunk0, tc0, num_t_min,
4926 #if OMPT_SUPPORT
4927  codeptr_ra,
4928 #endif
4929  task_dup);
4930  else
4931  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4932  gr_size0, ext0, last_chunk0, tc0,
4933 #if OMPT_SUPPORT
4934  codeptr_ra,
4935 #endif
4936  task_dup);
4937 
4938  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4939 }
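// [Editorial worked example.] The split above halves the remaining task count
// and packages the second half as an auxiliary task. With hypothetical values
// num_tasks = 7, grainsize = 3, extras = 2, tc = 23, num_t_min = 4:
//
//   n_tsk0 = 3, n_tsk1 = 4                  // 7 >> 1 and the remainder
//   n_tsk0 > extras, so: ext0 = 2, ext1 = 0
//   tc1 = grainsize * n_tsk1 = 12, tc0 = tc - tc1 = 11   // 3*3 + 2 extras
//   ub0 = lower + st * (tc0 - 1), lb1 = ub0 + st
//
// The 4-task second half [lb1, ub] is handed to __kmp_taskloop_task via the
// auxiliary task, while the 3-task first half [lower, ub0] is executed here;
// since n_tsk0 <= num_t_min it goes through __kmp_taskloop_linear.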
4940 
4941 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4942  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4943  int nogroup, int sched, kmp_uint64 grainsize,
4944  int modifier, void *task_dup) {
4945  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4946  KMP_DEBUG_ASSERT(task != NULL);
4947  if (nogroup == 0) {
4948 #if OMPT_SUPPORT && OMPT_OPTIONAL
4949  OMPT_STORE_RETURN_ADDRESS(gtid);
4950 #endif
4951  __kmpc_taskgroup(loc, gtid);
4952  }
4953 
4954 #if OMPX_TASKGRAPH
4955  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
4956 #endif
4957  // =========================================================================
4958  // calculate loop parameters
4959  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4960  kmp_uint64 tc;
4961  // compiler provides global bounds here
4962  kmp_uint64 lower = task_bounds.get_lb();
4963  kmp_uint64 upper = task_bounds.get_ub();
4964  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4965  kmp_uint64 num_tasks = 0, extras = 0;
4966  kmp_int64 last_chunk =
4967  0; // reduce grainsize of last task by last_chunk in strict mode
4968  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4969  kmp_info_t *thread = __kmp_threads[gtid];
4970  kmp_taskdata_t *current_task = thread->th.th_current_task;
4971 
4972  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
4973  "grain %llu(%d, %d), dup %p\n",
4974  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
4975  task_dup));
4976 
4977  // compute trip count
4978  if (st == 1) { // most common case
4979  tc = upper - lower + 1;
4980  } else if (st < 0) {
4981  tc = (lower - upper) / (-st) + 1;
4982  } else { // st > 0
4983  tc = (upper - lower) / st + 1;
4984  }
4985  if (tc == 0) {
4986  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
4987  // free the pattern task and exit
4988  __kmp_task_start(gtid, task, current_task);
4989  // do not execute anything for zero-trip loop
4990  __kmp_task_finish<false>(gtid, task, current_task);
4991  return;
4992  }
4993 
4994 #if OMPT_SUPPORT && OMPT_OPTIONAL
4995  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
4996  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
4997  if (ompt_enabled.ompt_callback_work) {
4998  ompt_callbacks.ompt_callback(ompt_callback_work)(
4999  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5000  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5001  }
5002 #endif
5003 
5004  if (num_tasks_min == 0)
5005  // TODO: can we choose a better default heuristic?
5006  num_tasks_min =
5007  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5008 
5009  // compute num_tasks/grainsize based on the input provided
5010  switch (sched) {
5011  case 0: // no schedule clause specified, we can choose the default
5012  // let's try to schedule (team_size*10) tasks
5013  grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
5014  KMP_FALLTHROUGH();
5015  case 2: // num_tasks provided
5016  if (grainsize > tc) {
5017  num_tasks = tc; // too big num_tasks requested, adjust values
5018  grainsize = 1;
5019  extras = 0;
5020  } else {
5021  num_tasks = grainsize;
5022  grainsize = tc / num_tasks;
5023  extras = tc % num_tasks;
5024  }
5025  break;
5026  case 1: // grainsize provided
5027  if (grainsize > tc) {
5028  num_tasks = 1;
5029  grainsize = tc; // too big grainsize requested, adjust values
5030  extras = 0;
5031  } else {
5032  if (modifier) {
5033  num_tasks = (tc + grainsize - 1) / grainsize;
5034  last_chunk = tc - (num_tasks * grainsize);
5035  extras = 0;
5036  } else {
5037  num_tasks = tc / grainsize;
5038  // adjust grainsize for balanced distribution of iterations
5039  grainsize = tc / num_tasks;
5040  extras = tc % num_tasks;
5041  }
5042  }
5043  break;
5044  default:
5045  KMP_ASSERT2(0, "unknown scheduling of taskloop");
5046  }
5047 
5048  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5049  (last_chunk < 0 ? last_chunk : extras));
5050  KMP_DEBUG_ASSERT(num_tasks > extras);
5051  KMP_DEBUG_ASSERT(num_tasks > 0);
5052  // =========================================================================
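// [Editorial worked example.] How the switch above maps the clauses
// (hypothetical values, tc = 10 iterations in every case):
//   num_tasks(4)         -> num_tasks = 4, grainsize = 2, extras = 2
//                           (chunks 3, 3, 2, 2)
//   grainsize(4)         -> num_tasks = 2, grainsize = 5, extras = 0
//                           (grainsize rebalanced; chunks 5, 5)
//   grainsize(4, strict) -> num_tasks = 3, grainsize = 4, last_chunk = -2
//                           (chunks 4, 4, 2)
//   no schedule clause   -> behaves like num_tasks(nthreads * 10), capped at
//                           one task per iteration when that exceeds tc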
5053 
5054  // check the if-clause value first
5055  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5056  if (if_val == 0) { // if(0) specified, mark task as serial
5057  taskdata->td_flags.task_serial = 1;
5058  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5059  // always start serial tasks linearly
5060  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5061  grainsize, extras, last_chunk, tc,
5062 #if OMPT_SUPPORT
5063  OMPT_GET_RETURN_ADDRESS(0),
5064 #endif
5065  task_dup);
5066  // !taskdata->td_flags.native => currently force linear spawning of tasks
5067  // for GOMP_taskloop
5068  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5069  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5070  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5071  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5072  last_chunk));
5073  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5074  grainsize, extras, last_chunk, tc, num_tasks_min,
5075 #if OMPT_SUPPORT
5076  OMPT_GET_RETURN_ADDRESS(0),
5077 #endif
5078  task_dup);
5079  } else {
5080  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5081  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5082  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5083  last_chunk));
5084  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5085  grainsize, extras, last_chunk, tc,
5086 #if OMPT_SUPPORT
5087  OMPT_GET_RETURN_ADDRESS(0),
5088 #endif
5089  task_dup);
5090  }
5091 
5092 #if OMPT_SUPPORT && OMPT_OPTIONAL
5093  if (ompt_enabled.ompt_callback_work) {
5094  ompt_callbacks.ompt_callback(ompt_callback_work)(
5095  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5096  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5097  }
5098 #endif
5099 
5100  if (nogroup == 0) {
5101 #if OMPT_SUPPORT && OMPT_OPTIONAL
5102  OMPT_STORE_RETURN_ADDRESS(gtid);
5103 #endif
5104  __kmpc_end_taskgroup(loc, gtid);
5105  }
5106  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5107 }
5108 
5125 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5126  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5127  int sched, kmp_uint64 grainsize, void *task_dup) {
5128  __kmp_assert_valid_gtid(gtid);
5129  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5130  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5131  0, task_dup);
5132  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5133 }
5134 
5152 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5153  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5154  int nogroup, int sched, kmp_uint64 grainsize,
5155  int modifier, void *task_dup) {
5156  __kmp_assert_valid_gtid(gtid);
5157  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5158  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5159  modifier, task_dup);
5160  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5161 }
5162 
5171 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5172  if (gtid == KMP_GTID_DNE)
5173  return NULL;
5174 
5175  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5176  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5177 
5178  if (!taskdata)
5179  return NULL;
5180 
5181  return &taskdata->td_target_data.async_handle;
5182 }
5183 
5192 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5193  if (gtid == KMP_GTID_DNE)
5194  return FALSE;
5195 
5196  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5197  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5198 
5199  if (!taskdata)
5200  return FALSE;
5201 
5202  return taskdata->td_task_team != NULL;
5203 }
5204 
5205 #if OMPX_TASKGRAPH
5206 // __kmp_find_tdg: identify a TDG through its ID
5207 // tdg_id: ID of the TDG
5208  // returns: If a TDG corresponding to this ID is found and it is not in
5209  // its initial state, return a pointer to it; otherwise return nullptr
5210 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5211  kmp_tdg_info_t *res = nullptr;
5212  if (__kmp_max_tdgs == 0)
5213  return res;
5214 
5215  if (__kmp_global_tdgs == NULL)
5216  __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5217  sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5218 
5219  if ((__kmp_global_tdgs[tdg_id]) &&
5220  (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5221  res = __kmp_global_tdgs[tdg_id];
5222  return res;
5223 }
5224 
5225 // __kmp_print_tdg_dot: prints the TDG to a dot file
5226  // tdg: Pointer to the TDG to print
5227 // gtid: Global Thread ID
5228 void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
5229  kmp_int32 tdg_id = tdg->tdg_id;
5230  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5231 
5232  char file_name[32]; // room for "tdg_" + any 32-bit id + ".dot"
5233  snprintf(file_name, sizeof(file_name), "tdg_%d.dot", tdg_id);
5234  kmp_safe_raii_file_t tdg_file(file_name, "w");
5235 
5236  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5237  fprintf(tdg_file,
5238  "digraph TDG {\n"
5239  " compound=true\n"
5240  " subgraph cluster {\n"
5241  " label=TDG_%d\n",
5242  tdg_id);
5243  for (kmp_int32 i = 0; i < num_tasks; i++) {
5244  fprintf(tdg_file, " %d[style=bold]\n", i);
5245  }
5246  fprintf(tdg_file, " }\n");
5247  for (kmp_int32 i = 0; i < num_tasks; i++) {
5248  kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5249  kmp_int32 *successors = tdg->record_map[i].successors;
5250  if (nsuccessors > 0) {
5251  for (kmp_int32 j = 0; j < nsuccessors; j++)
5252  fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5253  }
5254  }
5255  fprintf(tdg_file, "}");
5256  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5257 }
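// [Editorial illustration.] For a small recorded TDG (id 0, three tasks, edges
// 0 -> 1 and 0 -> 2) the fprintf calls above would emit roughly the following
// (modulo exact whitespace):
//
//   digraph TDG {
//     compound=true
//     subgraph cluster {
//       label=TDG_0
//       0[style=bold]
//       1[style=bold]
//       2[style=bold]
//     }
//     0 -> 1
//     0 -> 2
//   }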
5258 
5259  // __kmp_exec_tdg: launch the execution of a previously
5260  // recorded TDG
5261 // gtid: Global Thread ID
5262  // tdg: Pointer to the TDG to execute
5263 void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5264  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5265  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5266  tdg->tdg_id, tdg->num_roots));
5267  kmp_node_info_t *this_record_map = tdg->record_map;
5268  kmp_int32 *this_root_tasks = tdg->root_tasks;
5269  kmp_int32 this_num_roots = tdg->num_roots;
5270  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5271 
5272  kmp_info_t *thread = __kmp_threads[gtid];
5273  kmp_taskdata_t *parent_task = thread->th.th_current_task;
5274 
5275  if (tdg->rec_taskred_data) {
5276  __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5277  }
5278 
5279  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5280  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5281 
5282  td->td_parent = parent_task;
5283  this_record_map[j].parent_task = parent_task;
5284 
5285  kmp_taskgroup_t *parent_taskgroup =
5286  this_record_map[j].parent_task->td_taskgroup;
5287 
5288  KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5289  this_record_map[j].npredecessors);
5290  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5291 
5292  if (parent_taskgroup) {
5293  KMP_ATOMIC_INC(&parent_taskgroup->count);
5294  // The taskgroup is different so we must update it
5295  td->td_taskgroup = parent_taskgroup;
5296  } else if (td->td_taskgroup != nullptr) {
5297  // If the parent doesn't have a taskgroup, remove it from the task
5298  td->td_taskgroup = nullptr;
5299  }
5300  if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5301  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5302  }
5303 
5304  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5305  __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5306  }
5307  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5308  tdg->tdg_id, tdg->num_roots));
5309 }
5310 
5311  // __kmp_start_record: set up a TDG structure and set the
5312  // recording flag to true
5313 // gtid: Global Thread ID of the encountering thread
5314 // input_flags: Flags associated with the TDG
5315 // tdg_id: ID of the TDG to record
5316 static inline void __kmp_start_record(kmp_int32 gtid,
5317  kmp_taskgraph_flags_t *flags,
5318  kmp_int32 tdg_id) {
5319  kmp_tdg_info_t *tdg =
5320  (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5321  __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5322  // Initializing the TDG structure
5323  tdg->tdg_id = tdg_id;
5324  tdg->map_size = INIT_MAPSIZE;
5325  tdg->num_roots = -1;
5326  tdg->root_tasks = nullptr;
5327  tdg->tdg_status = KMP_TDG_RECORDING;
5328  tdg->rec_num_taskred = 0;
5329  tdg->rec_taskred_data = nullptr;
5330  KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5331 
5332  // Initializing the list of nodes in this TDG
5333  kmp_node_info_t *this_record_map =
5334  (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5335  for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5336  kmp_int32 *successorsList =
5337  (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5338  this_record_map[i].task = nullptr;
5339  this_record_map[i].successors = successorsList;
5340  this_record_map[i].nsuccessors = 0;
5341  this_record_map[i].npredecessors = 0;
5342  this_record_map[i].successors_size = __kmp_successors_size;
5343  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5344  }
5345 
5346  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5347 }
5348 
5349 // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5350 // the beginning of the record process of a task region
5351 // loc_ref: Location of TDG, not used yet
5352 // gtid: Global Thread ID of the encountering thread
5353 // input_flags: Flags associated with the TDG
5354  // tdg_id: ID of the TDG to record; for now, an incremental integer
5355  // returns: 1 if we record; otherwise 0
5356 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5357  kmp_int32 input_flags, kmp_int32 tdg_id) {
5358 
5359  kmp_int32 res;
5360  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5361  KA_TRACE(10,
5362  ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5363  gtid, loc_ref, input_flags, tdg_id));
5364 
5365  if (__kmp_max_tdgs == 0) {
5366  KA_TRACE(
5367  10,
5368  ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5369  "__kmp_max_tdgs = 0\n",
5370  gtid, loc_ref, input_flags, tdg_id));
5371  return 1;
5372  }
5373 
5374  __kmpc_taskgroup(loc_ref, gtid);
5375  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5376  // TODO: use re_record flag
5377  __kmp_exec_tdg(gtid, tdg);
5378  res = 0;
5379  } else {
5380  __kmp_curr_tdg_idx = tdg_id;
5381  KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5382  __kmp_start_record(gtid, flags, tdg_id);
5383  __kmp_num_tdg++;
5384  res = 1;
5385  }
5386  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5387  gtid, tdg_id, res ? "record" : "execute"));
5388  return res;
5389 }
5390 
5391 // __kmp_end_record: set up a TDG after recording it
5392 // gtid: Global thread ID
5393 // tdg: Pointer to the TDG
5394 void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5395  // Store roots
5396  kmp_node_info_t *this_record_map = tdg->record_map;
5397  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5398  kmp_int32 *this_root_tasks =
5399  (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5400  kmp_int32 this_map_size = tdg->map_size;
5401  kmp_int32 this_num_roots = 0;
5402  kmp_info_t *thread = __kmp_threads[gtid];
5403 
5404  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5405  if (this_record_map[i].npredecessors == 0) {
5406  this_root_tasks[this_num_roots++] = i;
5407  }
5408  }
5409 
5410  // Update with roots info and mapsize
5411  tdg->map_size = this_map_size;
5412  tdg->num_roots = this_num_roots;
5413  tdg->root_tasks = this_root_tasks;
5414  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5415  tdg->tdg_status = KMP_TDG_READY;
5416 
5417  if (thread->th.th_current_task->td_dephash) {
5418  __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5419  thread->th.th_current_task->td_dephash = NULL;
5420  }
5421 
5422  // Reset predecessor counter
5423  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5424  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5425  this_record_map[i].npredecessors);
5426  }
5427  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5428 
5429  if (__kmp_tdg_dot)
5430  __kmp_print_tdg_dot(tdg, gtid);
5431 }
5432 
5433 // __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5434 // the end of recording phase
5435 //
5436 // loc_ref: Source location information
5437 // gtid: Global thread ID
5438 // input_flags: Flags attached to the graph
5439 // tdg_id: ID of the TDG just finished recording
5440 void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5441  kmp_int32 input_flags, kmp_int32 tdg_id) {
5442  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5443 
5444  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5445  " tdg=%d with flags=%d\n",
5446  gtid, loc_ref, tdg_id, input_flags));
5447  if (__kmp_max_tdgs) {
5448  // TODO: use input_flags->nowait
5449  __kmpc_end_taskgroup(loc_ref, gtid);
5450  if (__kmp_tdg_is_recording(tdg->tdg_status))
5451  __kmp_end_record(gtid, tdg);
5452  }
5453  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5454  " tdg=%d, its status is now READY\n",
5455  gtid, loc_ref, tdg_id));
5456 }
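// [Editorial illustration, not taken from the source.] These two entry points
// are intended to be emitted by the compiler around a task-graph region. A
// plausible emission pattern (flags and tdg_id values are placeholders):
//
//   kmp_int32 gtid = __kmpc_global_thread_num(&loc);
//   if (__kmpc_start_record_task(&loc, gtid, /*input_flags*/ 0, /*tdg_id*/ 0)) {
//     // first encounter (or TDGs disabled): run the region body; the tasks
//     // created inside get recorded into the TDG
//   }
//   // else: a previously recorded TDG with this id was replayed by the runtime
//   __kmpc_end_record_task(&loc, gtid, /*input_flags*/ 0, /*tdg_id*/ 0);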
5457 #endif