/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#if ENABLE_LIBOMPTARGET
static void (*tgt_target_nowait_query)(void **);

void __kmp_init_target_task() {
  *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
}
#endif

/* forward declaration */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
#if OMPX_TASKGRAPH
static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
int __kmp_taskloop_task(int gtid, void *ptask);
#endif

// returns 1 if new task is allowed to execute, 0 otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check if the candidate obeys the Task Scheduling Constraints (TSC):
    // only a descendant of all deferred tied tasks can be scheduled; checking
    // the last one is enough, as it is in turn a descendant of all the others
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if the task is not suspended on barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
#if OMPX_TASKGRAPH
  if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
#else
  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
#endif
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previous locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks acquired successfully
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}
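
// Illustrative note (editor's sketch, not from the original source): the
// mutexinoutset path above is reached for tasks created with, e.g.,
//   #pragma omp task depend(mutexinoutset: x)
// The runtime hands each such task a set of dn.mtx_locks; the task may only
// start once it holds all of them, otherwise it stays queued.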

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}
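
// Worked example (editor's illustration): with size == 4, head == 2,
// tail == 2 and the deque full, the copy loop walks in deque order
// old[2], old[3], old[0], old[1] into new[0..3]; afterwards new_size == 8,
// head == 0 and tail == 4, so ntasks (4) is preserved and there is room to
// push again.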

static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
  kmp_thread_data_t *thread_data = &l->td;
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  thread_data->td.td_deque_last_stolen = -1;
  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
                "for thread_data %p\n",
                __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
  return l;
}

// The function finds the deque of priority tasks with the given priority, or
// allocates a new deque and puts it into the sorted (high -> low) list of
// deques. Deques of non-default priority tasks are shared between all threads
// in the team, as opposed to per-thread deques of tasks with default priority.
// The function is called under the lock task_team->tt.tt_task_pri_lock.
static kmp_thread_data_t *
__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
  kmp_thread_data_t *thread_data;
  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
  if (lst->priority == pri) {
    // Found queue of tasks with given priority.
    thread_data = &lst->td;
  } else if (lst->priority < pri) {
    // All current priority queues contain tasks with lower priority.
    // Allocate new one for given priority tasks.
    kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
    thread_data = &list->td;
    list->priority = pri;
    list->next = lst;
    task_team->tt.tt_task_pri_list = list;
  } else { // task_team->tt.tt_task_pri_list->priority > pri
    kmp_task_pri_t *next_queue = lst->next;
    while (next_queue && next_queue->priority > pri) {
      lst = next_queue;
      next_queue = lst->next;
    }
    // lst->priority > pri && (next_queue == NULL || pri >= next_queue->priority)
    if (next_queue == NULL) {
      // No queue with pri priority, need to allocate new one.
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = NULL;
      lst->next = list;
    } else if (next_queue->priority == pri) {
      // Found queue of tasks with given priority.
      thread_data = &next_queue->td;
    } else { // lst->priority > pri > next_queue->priority
      // insert newly allocated queue between existing queues
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = next_queue;
      lst->next = list;
    }
  }
  return thread_data;
}
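
// Example (editor's illustration): if the list currently holds deques with
// priorities 9 -> 5 -> 2 and a task with priority 7 arrives, a new deque is
// linked in to give 9 -> 7 -> 5 -> 2, keeping the list sorted high -> low.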

// __kmp_push_priority_task: Add a task to the team's priority task deque
static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
                                          kmp_taskdata_t *taskdata,
                                          kmp_task_team_t *task_team,
                                          kmp_int32 pri) {
  kmp_thread_data_t *thread_data = NULL;
  KA_TRACE(20,
           ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
            gtid, taskdata, pri));

  // Find task queue specific to priority value
  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
  if (UNLIKELY(lst == NULL)) {
    __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
    if (task_team->tt.tt_task_pri_list == NULL) {
      // List of queues is still empty, allocate one.
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = NULL;
      task_team->tt.tt_task_pri_list = list;
    } else {
      // Other thread initialized a queue. Check if it fits and get thread_data.
      thread_data = __kmp_get_priority_deque_data(task_team, pri);
    }
    __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
  } else {
    if (lst->priority == pri) {
      // Found queue of tasks with given priority.
      thread_data = &lst->td;
    } else {
      __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
      thread_data = __kmp_get_priority_deque_data(task_team, pri);
      __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
    }
  }
  KMP_DEBUG_ASSERT(thread_data);

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
      KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      // expand deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }
  }
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));
  // Push taskdata.
  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
                "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  task_team->tt.tt_num_task_pri++; // atomic inc
  return TASK_SUCCESSFULLY_PUSHED;
}
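
// Usage note (editor's sketch): this path is taken for constructs such as
//   #pragma omp task priority(5)
// when the OMP_MAX_TASK_PRIORITY environment variable is set to a value >= 1;
// the effective priority is clamped to __kmp_max_task_priority by the caller
// (see __kmp_push_task below).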

// __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  // If we encounter a hidden helper task, and the current thread is not a
  // hidden helper thread, we have to give the task to any hidden helper thread
  // starting from its shadow one.
  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
               !KMP_HIDDEN_HELPER_THREAD(gtid))) {
    kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
    __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
    // Signal the hidden helper threads.
    __kmp_hidden_helper_worker_thread_signal();
    return TASK_SUCCESSFULLY_PUSHED;
  }

  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (UNLIKELY(taskdata->td_flags.task_serial)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
      __kmp_max_task_priority > 0) {
    int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
    return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
  }

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only the owner can allocate. If the task is
  // hidden_helper, we don't need it either because we have initialized the
  // deque for hidden helper thread data.
  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from thread outside of OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room since no thread but the calling thread can add tasks
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}

// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for a
// new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // The current task of the thread is the parent of the just-created implicit
  // tasks of the new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
    if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
        taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
      status = ompt_task_cancel;
    }

    /* let OMPT know that we're returning to the callee task */
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
  }
}
#endif

template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              OMPT_FRAME_FLAGS_APP;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
#ifdef __s390x__
// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
// In order for it to work correctly, the caller also needs to be compiled with
// backchain. If a caller is compiled without backchain,
// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
// crash.
__attribute__((target("backchain")))
#endif
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
  // Clear data to not be re-used later by mistake.
  task->data1.destructors = NULL;
  task->data2.priority = 0;

  taskdata->td_flags.freed = 1;
#if OMPX_TASKGRAPH
  // do not free tasks in taskgraph
  if (!taskdata->is_taskgraph) {
#endif
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
    __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
    __kmp_thread_free(thread, taskdata);
#endif
#if OMPX_TASKGRAPH
  } else {
    taskdata->td_flags.complete = 0;
    taskdata->td_flags.started = 0;
    taskdata->td_flags.freed = 0;
    taskdata->td_flags.executing = 0;
    taskdata->td_flags.task_serial =
        (taskdata->td_parent->td_flags.final ||
         taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);

    // taskdata->td_allow_completion_event.pending_events_count = 1;
    KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
    KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
    // start at one because counts current task and children
    KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  }
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}

// Only need to keep track of child task counts if any of the following:
// 1. team parallel and tasking not serialized;
// 2. it is a proxy or detachable or hidden helper task;
// 3. the children counter of its parent task is greater than 0.
// The reason for the 3rd condition is a serialized team that encountered a
// detached or hidden helper task T: the execution of T is still deferred, and
// a regular task may depend on T. In that case, if we don't track the
// children, task synchronization will be broken.
static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
  kmp_tasking_flags_t flags = taskdata->td_flags;
  bool ret = !(flags.team_serial || flags.tasking_ser);
  ret = ret || flags.proxy == TASK_PROXY ||
        flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
  ret = ret ||
        KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
#if OMPX_TASKGRAPH
  if (taskdata->td_taskgroup && taskdata->is_taskgraph)
    ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
#endif
  return ret;
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled != 0
// the version with ompt=false is inlined, allowing all OMPT code in this case
// to be optimized away
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
#if OMPX_TASKGRAPH
  // to avoid a segfault when accessing taskdata->td_flags after the free when
  // using vanilla taskloop
  bool is_taskgraph;
#endif
#if KMP_DEBUG
  kmp_int32 children = 0;
#endif
  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

#if OMPX_TASKGRAPH
  is_taskgraph = taskdata->is_taskgraph;
#endif

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the task's destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  bool completed = true;
  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task that is not completed, we switch back to the
        // resumed task; a later omp_fulfill_event signals completion.
        // Locking is necessary to avoid a race with ompt_task_late_fulfill.
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        completed = false;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  // Tasks with valid target async handles must be re-enqueued.
  if (taskdata->td_target_data.async_handle != NULL) {
    // Note: no need to translate gtid to its shadow. If the current thread is a
    // hidden helper one, then the gtid is already correct. Otherwise, hidden
    // helper threads are disabled, and gtid refers to an OpenMP thread.
#if OMPT_SUPPORT
    if (ompt) {
      __ompt_task_finish(task, resumed_task, ompt_task_switch);
    }
#endif
    __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
    if (KMP_HIDDEN_HELPER_THREAD(gtid))
      __kmp_hidden_helper_worker_thread_signal();
    completed = false;
  }

  if (completed) {
    taskdata->td_flags.complete = 1; // mark the task as completed
#if OMPX_TASKGRAPH
    taskdata->td_flags.onced = 1; // mark the task as ran once already
#endif

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif
    // TODO: What would be the balance between the conditions in the function
    // and an atomic operation?
    if (__kmp_track_children_task(taskdata)) {
      __kmp_release_deps(gtid, taskdata);
      // Predecrement simulated by "- 1" calculation
#if KMP_DEBUG
      children = -1 +
#endif
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
      KMP_DEBUG_ASSERT(children >= 0);
#if OMPX_TASKGRAPH
      if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
#else
      if (taskdata->td_taskgroup)
#endif
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
                             task_team->tt.tt_hidden_helper_task_encountered)) {
      // if we found proxy or hidden helper tasks there could exist a dependency
      // chain with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task

    // Decrement the counter of hidden helper tasks to be executed.
    if (taskdata->td_flags.hidden_helper) {
      // Hidden helper tasks can only be executed by hidden helper threads.
      KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
      KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
    }
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (completed)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

#if OMPX_TASKGRAPH
  if (is_taskgraph && __kmp_track_children_task(taskdata) &&
      taskdata->td_taskgroup) {
    // TDG: we only release the taskgroup barrier here because
    // free_task_and_ancestors will call __kmp_free_task, which resets all task
    // parameters such as taskdata->started, etc. If we release the barrier
    // earlier, these parameters could be read before being reset. This is not
    // an issue for the non-TDG implementation because we never reuse a
    // task(data) structure.
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
  }
#endif

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}
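
// Illustrative user-level flow for the detach path above (editor's sketch):
//   omp_event_handle_t ev;
//   #pragma omp task detach(ev)
//   { start_async_work(ev); }   // body returns; the task becomes a proxy
//   ...
//   omp_fulfill_event(ev);      // later: completes the detached task
// start_async_work is a hypothetical routine that arranges for the event to
// be fulfilled once the asynchronous work is done.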

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  KMP_DEBUG_ASSERT(gtid >= 0);
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
  }
#endif

  return;
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVs. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
  // in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;
#if OMPX_TASKGRAPH
  task->td_flags.onced = 0;
#endif

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}

// __kmp_finish_implicit_task: Release resources associated with implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the
// next parallel region.
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
#if ENABLE_LIBOMPTARGET
  // Give an opportunity to the offload runtime to synchronize any unfinished
  // target async regions before finishing the implicit task
  if (UNLIKELY(kmp_target_sync_cb != NULL))
    (*kmp_target_sync_cb)(NULL, thread->th.th_info.ds.ds_gtid,
                          KMP_TASKDATA_TO_TASK(task), NULL);
#endif // ENABLE_LIBOMPTARGET
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
#if OMPX_TASKGRAPH
    task->td_flags.onced = 1;
#endif
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when these are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}

// Round up a size to a power of two specified by val: Used to insert padding
// between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val
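
// Worked example (editor's illustration): with val a power of two, sizes that
// are already aligned pass through unchanged and others are bumped to the
// next multiple, e.g. __kmp_round_up_to_val(48, 8) == 48 and
// __kmp_round_up_to_val(49, 8) == 56.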

// __kmp_task_alloc: Allocate the taskdata and task data structures for a task
//
// loc_ref: source location information
// gtid: global thread number.
// flags: include tiedness & task type (explicit vs. implicit) of the "new"
// task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
// sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
// private vars accessed in task.
// sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
// in task.
// task_entry: Pointer to task code entry point generated by compiler.
// returns: a pointer to the allocated kmp_task_t structure (task).
kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
                             kmp_tasking_flags_t *flags,
                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
                             kmp_routine_entry_t task_entry) {
  kmp_task_t *task;
  kmp_taskdata_t *taskdata;
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_team_t *team = thread->th.th_team;
  kmp_taskdata_t *parent_task = thread->th.th_current_task;
  size_t shareds_offset;

  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
    __kmp_middle_initialize();

  if (flags->hidden_helper) {
    if (__kmp_enable_hidden_helper) {
      if (!TCR_4(__kmp_init_hidden_helper))
        __kmp_hidden_helper_initialize();
    } else {
      // If the hidden helper task is not enabled, reset the flag to FALSE.
      flags->hidden_helper = FALSE;
    }
  }

  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
                sizeof_shareds, task_entry));

  KMP_DEBUG_ASSERT(parent_task);
  if (parent_task->td_flags.final) {
    if (flags->merged_if0) {
    }
    flags->final = 1;
  }

  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
    // An untied task encountered causes the TSC algorithm to check the entire
    // deque of the victim thread. If no untied task is encountered, then
    // checking the head of the deque should be enough.
    KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
  }

  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
  // the tasking setup when that happens is too late.
  if (UNLIKELY(flags->proxy == TASK_PROXY ||
               flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
    if (flags->proxy == TASK_PROXY) {
      flags->tiedness = TASK_UNTIED;
      flags->merged_if0 = 1;
    }
    /* are we running in a sequential parallel or tskm_immediate_exec... we need
       tasking support enabled */
    if ((thread->th.th_task_team) == NULL) {
      /* This should only happen if the team is serialized
         setup a task team and propagate it to the thread */
      KMP_DEBUG_ASSERT(team->t.t_serialized);
      KA_TRACE(30,
               ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                gtid));
      __kmp_task_team_setup(thread, team);
      thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
    }
    kmp_task_team_t *task_team = thread->th.th_task_team;

    /* tasking must be enabled now as the task might not be pushed */
    if (!KMP_TASKING_ENABLED(task_team)) {
      KA_TRACE(
          30,
          ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
      __kmp_enable_tasking(task_team, thread);
      kmp_int32 tid = thread->th.th_info.ds.ds_tid;
      kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
      // No lock needed since only owner can allocate
      if (thread_data->td.td_deque == NULL) {
        __kmp_alloc_task_deque(thread, thread_data);
      }
    }

    if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
        task_team->tt.tt_found_proxy_tasks == FALSE)
      TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
    if (flags->hidden_helper &&
        task_team->tt.tt_hidden_helper_task_encountered == FALSE)
      TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
  }
1305 
1306  // Calculate shared structure offset including padding after kmp_task_t struct
1307  // to align pointers in shared struct
1308  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1309  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));
1310 
1311  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1312  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1313  shareds_offset));
1314  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1315  sizeof_shareds));
1316 
1317  // Avoid double allocation here by combining shareds with taskdata
1318 #if USE_FAST_MEMORY
1319  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1320  sizeof_shareds);
1321 #else /* ! USE_FAST_MEMORY */
1322  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1323  sizeof_shareds);
1324 #endif /* USE_FAST_MEMORY */
1325 
1326  task = KMP_TASKDATA_TO_TASK(taskdata);
1327 
1328 // Make sure task & taskdata are aligned appropriately
1329 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
1330  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1331  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1332 #else
1333  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1334  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1335 #endif
1336  if (sizeof_shareds > 0) {
1337  // Avoid double allocation here by combining shareds with taskdata
1338  task->shareds = &((char *)taskdata)[shareds_offset];
1339  // Make sure shareds struct is aligned to pointer size
1340  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1341  0);
1342  } else {
1343  task->shareds = NULL;
1344  }
1345  task->routine = task_entry;
1346  task->part_id = 0; // AC: Always start with 0 part id
1347 
1348  taskdata->td_task_id = KMP_GEN_TASK_ID();
1349  taskdata->td_team = thread->th.th_team;
1350  taskdata->td_alloc_thread = thread;
1351  taskdata->td_parent = parent_task;
1352  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1353  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1354  taskdata->td_ident = loc_ref;
1355  taskdata->td_taskwait_ident = NULL;
1356  taskdata->td_taskwait_counter = 0;
1357  taskdata->td_taskwait_thread = 0;
1358  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1359  // avoid copying icvs for proxy tasks
1360  if (flags->proxy == TASK_FULL)
1361  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1362 
1363  taskdata->td_flags = *flags;
1364  taskdata->td_task_team = thread->th.th_task_team;
1365  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1366  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1367  // If it is hidden helper task, we need to set the team and task team
1368  // correspondingly.
1369  if (flags->hidden_helper) {
1370  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1371  taskdata->td_team = shadow_thread->th.th_team;
1372  taskdata->td_task_team = shadow_thread->th.th_task_team;
1373  }
1374 
1375  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1376  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1377 
1378  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1379  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1380 
1381  // GEH - Note we serialize the task if the team is serialized to make sure
1382  // implicit parallel region tasks are not left until program termination to
1383  // execute. Also, it helps locality to execute immediately.
1384 
1385  taskdata->td_flags.task_serial =
1386  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1387  taskdata->td_flags.tasking_ser || flags->merged_if0);
1388 
1389  taskdata->td_flags.started = 0;
1390  taskdata->td_flags.executing = 0;
1391  taskdata->td_flags.complete = 0;
1392  taskdata->td_flags.freed = 0;
1393 #if OMPX_TASKGRAPH
1394  taskdata->td_flags.onced = 0;
1395  taskdata->is_taskgraph = 0;
1396  taskdata->tdg = nullptr;
1397 #endif
1398  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1399  // start at one because the count includes the current task and its children
1400  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1401  taskdata->td_taskgroup =
1402  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1403  taskdata->td_dephash = NULL;
1404  taskdata->td_depnode = NULL;
1405  taskdata->td_target_data.async_handle = NULL;
1406  if (flags->tiedness == TASK_UNTIED)
1407  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1408  else
1409  taskdata->td_last_tied = taskdata;
1410  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1411 #if OMPT_SUPPORT
1412  if (UNLIKELY(ompt_enabled.enabled))
1413  __ompt_task_init(taskdata, gtid);
1414 #endif
1415  // TODO: What would be the balance between the conditions in the function and
1416  // an atomic operation?
1417  if (__kmp_track_children_task(taskdata)) {
1418  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1419  if (parent_task->td_taskgroup)
1420  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1421  // Only need to keep track of allocated child tasks for explicit tasks,
1422  // since implicit tasks are not deallocated
1423  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1424  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1425  }
1426  if (flags->hidden_helper) {
1427  taskdata->td_flags.task_serial = FALSE;
1428  // Increment the number of hidden helper tasks to be executed
1429  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1430  }
1431  }
1432 
1433 #if OMPX_TASKGRAPH
1434  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1435  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1436  (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1437  taskdata->is_taskgraph = 1;
1438  taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1439  taskdata->td_task_id = KMP_GEN_TASK_ID();
1440  taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1441  }
1442 #endif
1443  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1444  gtid, taskdata, taskdata->td_parent));
1445 
1446  return task;
1447 }
1448 
1449 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1450  kmp_int32 flags, size_t sizeof_kmp_task_t,
1451  size_t sizeof_shareds,
1452  kmp_routine_entry_t task_entry) {
1453  kmp_task_t *retval;
1454  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1455  __kmp_assert_valid_gtid(gtid);
1456  input_flags->native = FALSE;
1457  // __kmp_task_alloc() sets up all other runtime flags
1458  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1459  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1460  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1461  input_flags->proxy ? "proxy" : "",
1462  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1463  sizeof_shareds, task_entry));
1464 
1465  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1466  sizeof_shareds, task_entry);
1467 
1468  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1469 
1470  return retval;
1471 }
1472 
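// Illustrative sketch (hypothetical, not part of the runtime): a compiler
// could lower "#pragma omp task shared(x)" onto this entry point roughly as
// follows. The shareds struct, entry routine and sizes are assumptions made
// for the example; real lowering also appends private data to kmp_task_t.
//
//   struct my_shareds { int *x; };
//   kmp_int32 my_task_entry(kmp_int32 gtid, kmp_task_t *t) {
//     my_shareds *s = (my_shareds *)t->shareds;
//     (*s->x)++; // the outlined task body
//     return 0;
//   }
//   ...
//   kmp_task_t *t = __kmpc_omp_task_alloc(
//       &loc, gtid, /*flags=*/1 /* tied */, sizeof(kmp_task_t),
//       sizeof(my_shareds), (kmp_routine_entry_t)my_task_entry);
//   ((my_shareds *)t->shareds)->x = &x; // fill in the captured pointers
//   __kmpc_omp_task(&loc, gtid, t); // queue the task (or run it if full)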
1473 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1474  kmp_int32 flags,
1475  size_t sizeof_kmp_task_t,
1476  size_t sizeof_shareds,
1477  kmp_routine_entry_t task_entry,
1478  kmp_int64 device_id) {
1479  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1480  // the target task is untied, as defined in the specification
1481  input_flags.tiedness = TASK_UNTIED;
1482  input_flags.target = 1;
1483 
1484  if (__kmp_enable_hidden_helper)
1485  input_flags.hidden_helper = TRUE;
1486 
1487  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1488  sizeof_shareds, task_entry);
1489 }
1490 
// __kmpc_omp_reg_task_with_affinity: register affinity information attached
// to a task (naffins items in affin_list) with the task thunk; returns 0 on
// success. Currently a stub.
1504 kmp_int32
1505 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1506  kmp_task_t *new_task, kmp_int32 naffins,
1507  kmp_task_affinity_info_t *affin_list) {
1508  return 0;
1509 }
1510 
1511 // __kmp_invoke_task: invoke the specified task
1512 //
1513 // gtid: global thread ID of caller
1514 // task: the task to invoke
1515 // current_task: the task to resume after task invocation
1516 #ifdef __s390x__
1517 __attribute__((target("backchain")))
1518 #endif
1519 static void
1520 __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1521  kmp_taskdata_t *current_task) {
1522  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1523  kmp_info_t *thread;
1524  int discard = 0 /* false */;
1525  KA_TRACE(
1526  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1527  gtid, taskdata, current_task));
1528  KMP_DEBUG_ASSERT(task);
1529  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1530  taskdata->td_flags.complete == 1)) {
1531  // This is a proxy task that was already completed but it needs to run
1532  // its bottom-half finish
1533  KA_TRACE(
1534  30,
1535  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1536  gtid, taskdata));
1537 
1538  __kmp_bottom_half_finish_proxy(gtid, task);
1539 
1540  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1541  "proxy task %p, resuming task %p\n",
1542  gtid, taskdata, current_task));
1543 
1544  return;
1545  }
1546 
1547 #if OMPT_SUPPORT
1548  // For untied tasks, the first task executed only calls __kmpc_omp_task and
1549  // does not execute code.
1550  ompt_thread_info_t oldInfo;
1551  if (UNLIKELY(ompt_enabled.enabled)) {
1552  // Store the thread's state and restore it after the task
1553  thread = __kmp_threads[gtid];
1554  oldInfo = thread->th.ompt_thread_info;
1555  thread->th.ompt_thread_info.wait_id = 0;
1556  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1557  ? ompt_state_work_serial
1558  : ompt_state_work_parallel;
1559  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1560  }
1561 #endif
1562 
1563  // Proxy tasks are not handled by the runtime
1564  if (taskdata->td_flags.proxy != TASK_PROXY) {
1565  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1566  }
1567 
1568  // TODO: cancel tasks if the parallel region has also been cancelled
1569  // TODO: check if this sequence can be hoisted above __kmp_task_start
1570  // if cancellation has been enabled for this run ...
1571  if (UNLIKELY(__kmp_omp_cancellation)) {
1572  thread = __kmp_threads[gtid];
1573  kmp_team_t *this_team = thread->th.th_team;
1574  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1575  if ((taskgroup && taskgroup->cancel_request) ||
1576  (this_team->t.t_cancel_request == cancel_parallel)) {
1577 #if OMPT_SUPPORT && OMPT_OPTIONAL
1578  ompt_data_t *task_data;
1579  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1580  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1581  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1582  task_data,
1583  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1584  : ompt_cancel_parallel) |
1585  ompt_cancel_discarded_task,
1586  NULL);
1587  }
1588 #endif
1589  KMP_COUNT_BLOCK(TASK_cancelled);
1590  // this task belongs to a task group and we need to cancel it
1591  discard = 1 /* true */;
1592  }
1593  }
1594 
1595  // Invoke the task routine and pass in relevant data.
1596  // Thunks generated by gcc take a different argument list.
1597  if (!discard) {
1598  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1599  taskdata->td_last_tied = current_task->td_last_tied;
1600  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1601  }
1602 #if KMP_STATS_ENABLED
1603  KMP_COUNT_BLOCK(TASK_executed);
1604  switch (KMP_GET_THREAD_STATE()) {
1605  case FORK_JOIN_BARRIER:
1606  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1607  break;
1608  case PLAIN_BARRIER:
1609  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1610  break;
1611  case TASKYIELD:
1612  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1613  break;
1614  case TASKWAIT:
1615  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1616  break;
1617  case TASKGROUP:
1618  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1619  break;
1620  default:
1621  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1622  break;
1623  }
1624 #endif // KMP_STATS_ENABLED
1625 
1626 // OMPT task begin
1627 #if OMPT_SUPPORT
1628  if (UNLIKELY(ompt_enabled.enabled))
1629  __ompt_task_start(task, current_task, gtid);
1630 #endif
1631 #if OMPT_SUPPORT && OMPT_OPTIONAL
1632  if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1633  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1634  ompt_data_t instance = ompt_data_none;
1635  instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1636  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1637  ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1638  &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1639  ompt_dispatch_taskloop_chunk, instance);
1640  taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1641  }
1642 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1643 
1644 #if OMPD_SUPPORT
1645  if (ompd_state & OMPD_ENABLE_BP)
1646  ompd_bp_task_begin();
1647 #endif
1648 
1649 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1650  kmp_uint64 cur_time;
1651  kmp_int32 kmp_itt_count_task =
1652  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1653  current_task->td_flags.tasktype == TASK_IMPLICIT;
1654  if (kmp_itt_count_task) {
1655  thread = __kmp_threads[gtid];
1656  // Time outer level explicit task on barrier for adjusting imbalance time
1657  if (thread->th.th_bar_arrive_time)
1658  cur_time = __itt_get_timestamp();
1659  else
1660  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1661  }
1662  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1663 #endif
1664 
1665 #if ENABLE_LIBOMPTARGET
1666  if (taskdata->td_target_data.async_handle != NULL) {
1667  // If we have a valid target async handle, that means that we have already
1668  // executed the task routine once. We must query for the handle completion
1669  // instead of re-executing the routine.
1670  KMP_ASSERT(tgt_target_nowait_query);
1671  tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1672  } else
1673 #endif
1674  if (task->routine != NULL) {
1675 #ifdef KMP_GOMP_COMPAT
1676  if (taskdata->td_flags.native) {
1677  ((void (*)(void *))(*(task->routine)))(task->shareds);
1678  } else
1679 #endif /* KMP_GOMP_COMPAT */
1680  {
1681  (*(task->routine))(gtid, task);
1682  }
1683  }
1684  KMP_POP_PARTITIONED_TIMER();
1685 
1686 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1687  if (kmp_itt_count_task) {
1688  // Barrier imbalance - adjust arrive time with the task duration
1689  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1690  }
1691  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1692  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1693 #endif
1694  }
1695 
1696 #if OMPD_SUPPORT
1697  if (ompd_state & OMPD_ENABLE_BP)
1698  ompd_bp_task_end();
1699 #endif
1700 
1701  // Proxy tasks are not handled by the runtime
1702  if (taskdata->td_flags.proxy != TASK_PROXY) {
1703 #if OMPT_SUPPORT
1704  if (UNLIKELY(ompt_enabled.enabled)) {
1705  thread->th.ompt_thread_info = oldInfo;
1706  if (taskdata->td_flags.tiedness == TASK_TIED) {
1707  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1708  }
1709  __kmp_task_finish<true>(gtid, task, current_task);
1710  } else
1711 #endif
1712  __kmp_task_finish<false>(gtid, task, current_task);
1713  }
1714 #if OMPT_SUPPORT
1715  else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1716  __ompt_task_finish(task, current_task, ompt_task_switch);
1717  }
1718 #endif
1719 
1720  KA_TRACE(
1721  30,
1722  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1723  gtid, taskdata, current_task));
1724  return;
1725 }
1726 
1727 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1728 //
1729 // loc_ref: location of original task pragma (ignored)
1730 // gtid: Global Thread ID of encountering thread
1731 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1732 // Returns:
1733 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1734 // be resumed later.
1735 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1736 // resumed later.
1737 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1738  kmp_task_t *new_task) {
1739  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1740 
1741  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1742  loc_ref, new_taskdata));
1743 
1744 #if OMPT_SUPPORT
1745  kmp_taskdata_t *parent;
1746  if (UNLIKELY(ompt_enabled.enabled)) {
1747  parent = new_taskdata->td_parent;
1748  if (ompt_enabled.ompt_callback_task_create) {
1749  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1750  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1751  &(new_taskdata->ompt_task_info.task_data),
1752  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1753  OMPT_GET_RETURN_ADDRESS(0));
1754  }
1755  }
1756 #endif
1757 
1758  /* Should we execute the new task or queue it? For now, let's just always try
1759  to queue it. If the queue fills up, then we'll execute it. */
1760 
1761  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1762  { // Execute this task immediately
1763  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1764  new_taskdata->td_flags.task_serial = 1;
1765  __kmp_invoke_task(gtid, new_task, current_task);
1766  }
1767 
1768  KA_TRACE(
1769  10,
1770  ("__kmpc_omp_task_parts(exit): T#%d returning TASK_CURRENT_NOT_QUEUED: "
1771  "loc=%p task=%p\n",
1772  gtid, loc_ref, new_taskdata));
1773 
1774 #if OMPT_SUPPORT
1775  if (UNLIKELY(ompt_enabled.enabled)) {
1776  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1777  parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1778  }
1779 #endif
1780  return TASK_CURRENT_NOT_QUEUED;
1781 }
1782 
1783 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1784 //
1785 // gtid: Global Thread ID of encountering thread
1786 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1787 // serialize_immediate: if TRUE then if the task is executed immediately its
1788 // execution will be serialized
1789 // Returns:
1790 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1791 // be resumed later.
1792 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1793 // resumed later.
1794 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1795  bool serialize_immediate) {
1796  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1797 
1798 #if OMPX_TASKGRAPH
1799  if (new_taskdata->is_taskgraph &&
1800  __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
1801  kmp_tdg_info_t *tdg = new_taskdata->tdg;
1802  // extend the record_map if needed
1803  if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
1804  __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
1805  // map_size could have been updated by another thread in case of a
1806  // recursive taskloop
1807  if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
1808  kmp_uint old_size = tdg->map_size;
1809  kmp_uint new_size = old_size * 2;
1810  kmp_node_info_t *old_record = tdg->record_map;
1811  kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
1812  new_size * sizeof(kmp_node_info_t));
1813 
1814  KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
1815  tdg->record_map = new_record;
1816 
1817  __kmp_free(old_record);
1818 
1819  for (kmp_int i = old_size; i < new_size; i++) {
1820  kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
1821  __kmp_successors_size * sizeof(kmp_int32));
1822  new_record[i].task = nullptr;
1823  new_record[i].successors = successorsList;
1824  new_record[i].nsuccessors = 0;
1825  new_record[i].npredecessors = 0;
1826  new_record[i].successors_size = __kmp_successors_size;
1827  KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
1828  }
1829  // update the size at the end, so that other threads do not use
1830  // old_record while map_size has already been updated
1831  tdg->map_size = new_size;
1832  }
1833  __kmp_release_bootstrap_lock(&tdg->graph_lock);
1834  }
1835  // record a task
1836  if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
1837  tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
1838  tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
1839  new_taskdata->td_parent;
1840  KMP_ATOMIC_INC(&tdg->num_tasks);
1841  }
1842  }
1843 #endif
1844 
1845  /* Should we execute the new task or queue it? For now, let's just always try
1846  to queue it. If the queue fills up, then we'll execute it. */
1847  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1848  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1849  { // Execute this task immediately
1850  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1851  if (serialize_immediate)
1852  new_taskdata->td_flags.task_serial = 1;
1853  __kmp_invoke_task(gtid, new_task, current_task);
1854  } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1855  __kmp_wpolicy_passive) {
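  // The new task stayed in the deque and the wait policy is passive: peer
  // workers may be sleeping, so wake one of them up to pick the task up.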
1856  kmp_info_t *this_thr = __kmp_threads[gtid];
1857  kmp_team_t *team = this_thr->th.th_team;
1858  kmp_int32 nthreads = this_thr->th.th_team_nproc;
1859  for (int i = 0; i < nthreads; ++i) {
1860  kmp_info_t *thread = team->t.t_threads[i];
1861  if (thread == this_thr)
1862  continue;
1863  if (thread->th.th_sleep_loc != NULL) {
1864  __kmp_null_resume_wrapper(thread);
1865  break; // awake one thread at a time
1866  }
1867  }
1868  }
1869  return TASK_CURRENT_NOT_QUEUED;
1870 }
1871 
1872 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1873 // non-thread-switchable task from the parent thread only!
1874 //
1875 // loc_ref: location of original task pragma (ignored)
1876 // gtid: Global Thread ID of encountering thread
1877 // new_task: non-thread-switchable task thunk allocated by
1878 // __kmp_omp_task_alloc()
1879 // Returns:
1880 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1881 // be resumed later.
1882 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1883 // resumed later.
1884 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1885  kmp_task_t *new_task) {
1886  kmp_int32 res;
1887  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1888 
1889 #if KMP_DEBUG || OMPT_SUPPORT
1890  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1891 #endif
1892  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1893  new_taskdata));
1894  __kmp_assert_valid_gtid(gtid);
1895 
1896 #if OMPT_SUPPORT
1897  kmp_taskdata_t *parent = NULL;
1898  if (UNLIKELY(ompt_enabled.enabled)) {
1899  if (!new_taskdata->td_flags.started) {
1900  OMPT_STORE_RETURN_ADDRESS(gtid);
1901  parent = new_taskdata->td_parent;
1902  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1903  parent->ompt_task_info.frame.enter_frame.ptr =
1904  OMPT_GET_FRAME_ADDRESS(0);
1905  }
1906  if (ompt_enabled.ompt_callback_task_create) {
1907  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1908  &(parent->ompt_task_info.task_data),
1909  &(parent->ompt_task_info.frame),
1910  &(new_taskdata->ompt_task_info.task_data),
1911  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1912  OMPT_LOAD_RETURN_ADDRESS(gtid));
1913  }
1914  } else {
1915  // We are scheduling the continuation of an UNTIED task.
1916  // Scheduling back to the parent task.
1917  __ompt_task_finish(new_task,
1918  new_taskdata->ompt_task_info.scheduling_parent,
1919  ompt_task_switch);
1920  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1921  }
1922  }
1923 #endif
1924 
1925  res = __kmp_omp_task(gtid, new_task, true);
1926 
1927  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1928  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1929  gtid, loc_ref, new_taskdata));
1930 #if OMPT_SUPPORT
1931  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1932  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1933  }
1934 #endif
1935  return res;
1936 }
1937 
1938 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1939 // a taskloop task with the correct OMPT return address
1940 //
1941 // loc_ref: location of original task pragma (ignored)
1942 // gtid: Global Thread ID of encountering thread
1943 // new_task: non-thread-switchable task thunk allocated by
1944 // __kmp_omp_task_alloc()
1945 // codeptr_ra: return address for OMPT callback
1946 // Returns:
1947 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1948 // be resumed later.
1949 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1950 // resumed later.
1951 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1952  kmp_task_t *new_task, void *codeptr_ra) {
1953  kmp_int32 res;
1954  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1955 
1956 #if KMP_DEBUG || OMPT_SUPPORT
1957  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1958 #endif
1959  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1960  new_taskdata));
1961 
1962 #if OMPT_SUPPORT
1963  kmp_taskdata_t *parent = NULL;
1964  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1965  parent = new_taskdata->td_parent;
1966  if (!parent->ompt_task_info.frame.enter_frame.ptr)
1967  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1968  if (ompt_enabled.ompt_callback_task_create) {
1969  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1970  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1971  &(new_taskdata->ompt_task_info.task_data),
1972  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
1973  }
1974  }
1975 #endif
1976 
1977  res = __kmp_omp_task(gtid, new_task, true);
1978 
1979  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
1980  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1981  gtid, loc_ref, new_taskdata));
1982 #if OMPT_SUPPORT
1983  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1984  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1985  }
1986 #endif
1987  return res;
1988 }
1989 
1990 template <bool ompt>
1991 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
1992  void *frame_address,
1993  void *return_address) {
1994  kmp_taskdata_t *taskdata = nullptr;
1995  kmp_info_t *thread;
1996  int thread_finished = FALSE;
1997  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
1998 
1999  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2000  KMP_DEBUG_ASSERT(gtid >= 0);
2001 
2002  if (__kmp_tasking_mode != tskm_immediate_exec) {
2003  thread = __kmp_threads[gtid];
2004  taskdata = thread->th.th_current_task;
2005 
2006 #if OMPT_SUPPORT && OMPT_OPTIONAL
2007  ompt_data_t *my_task_data;
2008  ompt_data_t *my_parallel_data;
2009 
2010  if (ompt) {
2011  my_task_data = &(taskdata->ompt_task_info.task_data);
2012  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2013 
2014  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2015 
2016  if (ompt_enabled.ompt_callback_sync_region) {
2017  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2018  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2019  my_task_data, return_address);
2020  }
2021 
2022  if (ompt_enabled.ompt_callback_sync_region_wait) {
2023  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2024  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2025  my_task_data, return_address);
2026  }
2027  }
2028 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2029 
2030 #if ENABLE_LIBOMPTARGET
2031  // Give an opportunity to the offload runtime to make progress and create
2032  // any necessary proxy tasks
2033  if (UNLIKELY(kmp_target_sync_cb))
2034  (*kmp_target_sync_cb)(loc_ref, gtid, KMP_TASKDATA_TO_TASK(taskdata),
2035  NULL);
2036 #endif // ENABLE_LIBOMPTARGET
2037 
2038 // Debugger: the taskwait is active. Store the location and the thread that
2039 // encountered the taskwait.
2040 #if USE_ITT_BUILD
2041 // Note: These values are used by ITT events as well.
2042 #endif /* USE_ITT_BUILD */
2043  taskdata->td_taskwait_counter += 1;
2044  taskdata->td_taskwait_ident = loc_ref;
2045  taskdata->td_taskwait_thread = gtid + 1;
2046 
2047 #if USE_ITT_BUILD
2048  void *itt_sync_obj = NULL;
2049 #if USE_ITT_NOTIFY
2050  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2051 #endif /* USE_ITT_NOTIFY */
2052 #endif /* USE_ITT_BUILD */
2053 
2054  bool must_wait =
2055  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2056 
2057  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2058  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2059  // If a hidden helper task has been encountered, we must enable the wait here.
2060  must_wait =
2061  must_wait ||
2062  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2063  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2064 
2065  if (must_wait) {
2066  kmp_flag_32<false, false> flag(
2067  RCAST(std::atomic<kmp_uint32> *,
2068  &(taskdata->td_incomplete_child_tasks)),
2069  0U);
2070  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2071  flag.execute_tasks(thread, gtid, FALSE,
2072  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2073  __kmp_task_stealing_constraint);
2074  }
2075  }
2076 #if USE_ITT_BUILD
2077  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2078  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2079 #endif /* USE_ITT_BUILD */
2080 
2081  // Debugger: The taskwait is completed. Location remains, but thread is
2082  // negated.
2083  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2084 
2085 #if OMPT_SUPPORT && OMPT_OPTIONAL
2086  if (ompt) {
2087  if (ompt_enabled.ompt_callback_sync_region_wait) {
2088  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2089  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2090  my_task_data, return_address);
2091  }
2092  if (ompt_enabled.ompt_callback_sync_region) {
2093  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2094  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2095  my_task_data, return_address);
2096  }
2097  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2098  }
2099 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2100  }
2101 
2102  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2103  "returning TASK_CURRENT_NOT_QUEUED\n",
2104  gtid, taskdata));
2105 
2106  return TASK_CURRENT_NOT_QUEUED;
2107 }
2108 
2109 #if OMPT_SUPPORT && OMPT_OPTIONAL
2110 OMPT_NOINLINE
2111 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2112  void *frame_address,
2113  void *return_address) {
2114  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2115  return_address);
2116 }
2117 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2118 
2119 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2120 // complete
2121 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2122 #if OMPT_SUPPORT && OMPT_OPTIONAL
2123  if (UNLIKELY(ompt_enabled.enabled)) {
2124  OMPT_STORE_RETURN_ADDRESS(gtid);
2125  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2126  OMPT_LOAD_RETURN_ADDRESS(gtid));
2127  }
2128 #endif
2129  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2130 }
2131 
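// Illustrative sketch (hypothetical, not part of the runtime): a compiler
// lowers "#pragma omp taskwait" to a single call, e.g.
//
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
//
// The calling thread does not idle: while td_incomplete_child_tasks of the
// current task is non-zero it keeps executing other queued tasks.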
2132 // __kmpc_omp_taskyield: switch to a different task
2133 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2134  kmp_taskdata_t *taskdata = NULL;
2135  kmp_info_t *thread;
2136  int thread_finished = FALSE;
2137 
2138  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2139  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2140 
2141  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2142  gtid, loc_ref, end_part));
2143  __kmp_assert_valid_gtid(gtid);
2144 
2145  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2146  thread = __kmp_threads[gtid];
2147  taskdata = thread->th.th_current_task;
2148 // Should we model this as a task wait or not?
2149 // Debugger: the taskwait is active. Store the location and the thread that
2150 // encountered the taskwait.
2151 #if USE_ITT_BUILD
2152 // Note: These values are used by ITT events as well.
2153 #endif /* USE_ITT_BUILD */
2154  taskdata->td_taskwait_counter += 1;
2155  taskdata->td_taskwait_ident = loc_ref;
2156  taskdata->td_taskwait_thread = gtid + 1;
2157 
2158 #if USE_ITT_BUILD
2159  void *itt_sync_obj = NULL;
2160 #if USE_ITT_NOTIFY
2161  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2162 #endif /* USE_ITT_NOTIFY */
2163 #endif /* USE_ITT_BUILD */
2164  if (!taskdata->td_flags.team_serial) {
2165  kmp_task_team_t *task_team = thread->th.th_task_team;
2166  if (task_team != NULL) {
2167  if (KMP_TASKING_ENABLED(task_team)) {
2168 #if OMPT_SUPPORT
2169  if (UNLIKELY(ompt_enabled.enabled))
2170  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2171 #endif
2172  __kmp_execute_tasks_32(
2173  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2174  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2175  __kmp_task_stealing_constraint);
2176 #if OMPT_SUPPORT
2177  if (UNLIKELY(ompt_enabled.enabled))
2178  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2179 #endif
2180  }
2181  }
2182  }
2183 #if USE_ITT_BUILD
2184  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2185 #endif /* USE_ITT_BUILD */
2186 
2187  // Debugger: The taskwait is completed. Location remains, but thread is
2188  // negated.
2189  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2190  }
2191 
2192  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2193  "returning TASK_CURRENT_NOT_QUEUED\n",
2194  gtid, taskdata));
2195 
2196  return TASK_CURRENT_NOT_QUEUED;
2197 }
2198 
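// Illustrative sketch (hypothetical, not part of the runtime): a compiler
// lowers "#pragma omp taskyield" to e.g.
//
//   __kmpc_omp_taskyield(&loc, gtid, /*end_part=*/0);
//
// giving the thread an explicit scheduling point to run other queued tasks.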
2199 // Task Reduction implementation
2200 //
2201 // Note: the initial implementation did not account for the possibility of
2202 // specifying omp_orig for the initializer of a UDR (user-defined reduction).
2203 // The corrected implementation takes the omp_orig object into account; the
2204 // compiler is free to use the old implementation if omp_orig is not specified.
2205 
// Flags for special info per task reduction item.
2214 typedef struct kmp_taskred_flags {
2216  unsigned lazy_priv : 1; // 1 - use lazy alloc/init (big objects, few tasks)
2217  unsigned reserved31 : 31;
2218 } kmp_taskred_flags_t;
2219 
// Internal struct for reduction data item related info set up by the
// compiler (old interface, without omp_orig).
2223 typedef struct kmp_task_red_input {
2224  void *reduce_shar; // shared between tasks item to reduce into
2225  size_t reduce_size; // size of data item in bytes
2226  // three compiler-generated routines (init, fini are optional):
2227  void *reduce_init; // data initialization routine (single parameter)
2228  void *reduce_fini; // data finalization routine
2229  void *reduce_comb; // data combiner routine
2230  kmp_taskred_flags_t flags; // flags for additional info from compiler
2231 } kmp_task_red_input_t;
2232 
// Internal struct for reduction data item related info saved by the library.
2236 typedef struct kmp_taskred_data {
2237  void *reduce_shar; // shared between tasks item to reduce into
2238  size_t reduce_size; // size of data item
2239  kmp_taskred_flags_t flags; // flags for additional info from compiler
2240  void *reduce_priv; // array of thread specific items
2241  void *reduce_pend; // end of private data for faster comparison op
2242  // three compiler-generated routines (init, fini are optional):
2243  void *reduce_comb; // data combiner routine
2244  void *reduce_init; // data initialization routine (two parameters)
2245  void *reduce_fini; // data finalization routine
2246  void *reduce_orig; // original item (can be used in UDR initializer)
2247 } kmp_taskred_data_t;
2248 
// Internal struct for reduction data item related info set up by the
// compiler (new interface; reduce_orig carries omp_orig for the UDR
// initializer).
2254 typedef struct kmp_taskred_input {
2255  void *reduce_shar; // shared between tasks item to reduce into
2256  void *reduce_orig; // original reduction item used for initialization
2257  size_t reduce_size; // size of data item
2258  // three compiler-generated routines (init, fini are optional):
2259  void *reduce_init; // data initialization routine (two parameters)
2260  void *reduce_fini; // data finalization routine
2261  void *reduce_comb; // data combiner routine
2262  kmp_taskred_flags_t flags; // flags for additional info from compiler
2263 } kmp_taskred_input_t;
2264 
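// Illustrative sketch (hypothetical, not part of the runtime): for
//   #pragma omp taskgroup task_reduction(+ : sum)
// a compiler could describe the item with the new interface and register it
// inside the taskgroup; the thunks below are assumptions for the example.
//
//   void my_red_init(void *priv, void *orig) { *(int *)priv = 0; }
//   void my_red_comb(void *shar, void *priv) { *(int *)shar += *(int *)priv; }
//   ...
//   kmp_taskred_input_t in;
//   in.reduce_shar = &sum;
//   in.reduce_orig = &sum; // omp_orig for a UDR initializer
//   in.reduce_size = sizeof(int);
//   in.reduce_init = (void *)my_red_init;
//   in.reduce_fini = NULL; // finalizer is optional
//   in.reduce_comb = (void *)my_red_comb;
//   in.flags = {}; // eager (non-lazy) private copies
//   void *tg = __kmpc_taskred_init(gtid, /*num=*/1, &in);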
2268 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2269 template <>
2270 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2271  kmp_task_red_input_t &src) {
2272  item.reduce_orig = NULL;
2273 }
2274 template <>
2275 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2276  kmp_taskred_input_t &src) {
2277  if (src.reduce_orig != NULL) {
2278  item.reduce_orig = src.reduce_orig;
2279  } else {
2280  item.reduce_orig = src.reduce_shar;
2281  } // non-NULL reduce_orig means new interface used
2282 }
2283 
2284 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2285 template <>
2286 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2287  size_t offset) {
2288  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2289 }
2290 template <>
2291 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2292  size_t offset) {
2293  ((void (*)(void *, void *))item.reduce_init)(
2294  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2295 }
2296 
2297 template <typename T>
2298 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2299  __kmp_assert_valid_gtid(gtid);
2300  kmp_info_t *thread = __kmp_threads[gtid];
2301  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2302  kmp_uint32 nth = thread->th.th_team_nproc;
2303  kmp_taskred_data_t *arr;
2304 
2305  // check input data just in case
2306  KMP_ASSERT(tg != NULL);
2307  KMP_ASSERT(data != NULL);
2308  KMP_ASSERT(num > 0);
2309  if (nth == 1 && !__kmp_enable_hidden_helper) {
2310  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2311  gtid, tg));
2312  return (void *)tg;
2313  }
2314  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2315  gtid, tg, num));
2316  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2317  thread, num * sizeof(kmp_taskred_data_t));
2318  for (int i = 0; i < num; ++i) {
2319  size_t size = data[i].reduce_size - 1;
2320  // round the size up to cache line per thread-specific item
2321  size += CACHE_LINE - size % CACHE_LINE;
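  // e.g. with CACHE_LINE == 64: reduce_size 100 -> 99 + (64 - 99 % 64) = 128,
  // while an exact multiple such as 64 -> 63 + (64 - 63 % 64) stays 64; the
  // "- 1" above is what keeps exact multiples unchanged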
2322  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2323  arr[i].reduce_shar = data[i].reduce_shar;
2324  arr[i].reduce_size = size;
2325  arr[i].flags = data[i].flags;
2326  arr[i].reduce_comb = data[i].reduce_comb;
2327  arr[i].reduce_init = data[i].reduce_init;
2328  arr[i].reduce_fini = data[i].reduce_fini;
2329  __kmp_assign_orig<T>(arr[i], data[i]);
2330  if (!arr[i].flags.lazy_priv) {
2331  // allocate cache-line aligned block and fill it with zeros
2332  arr[i].reduce_priv = __kmp_allocate(nth * size);
2333  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2334  if (arr[i].reduce_init != NULL) {
2335  // initialize all thread-specific items
2336  for (size_t j = 0; j < nth; ++j) {
2337  __kmp_call_init<T>(arr[i], j * size);
2338  }
2339  }
2340  } else {
2341  // only allocate space for pointers now,
2342  // objects will be lazily allocated/initialized if/when requested
2343  // note that __kmp_allocate zeroes the allocated memory
2344  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2345  }
2346  }
2347  tg->reduce_data = (void *)arr;
2348  tg->reduce_num_data = num;
2349  return (void *)tg;
2350 }
2351 
// __kmpc_task_reduction_init: initialize task reduction for the taskgroup
// (old kmp_task_red_input_t interface); returns the taskgroup identifier.
2366 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2367 #if OMPX_TASKGRAPH
2368  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2369  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2370  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2371  this_tdg->rec_taskred_data =
2372  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2373  this_tdg->rec_num_taskred = num;
2374  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2375  sizeof(kmp_task_red_input_t) * num);
2376  }
2377 #endif
2378  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2379 }
2380 
// __kmpc_taskred_init: initialize task reduction for the taskgroup (new
// kmp_taskred_input_t interface with omp_orig); returns the taskgroup
// identifier.
2393 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2394 #if OMPX_TASKGRAPH
2395  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2396  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2397  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2398  this_tdg->rec_taskred_data =
2399  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2400  this_tdg->rec_num_taskred = num;
2401  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2402  sizeof(kmp_task_red_input_t) * num);
2403  }
2404 #endif
2405  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2406 }
2407 
2408 // Copy task reduction data (except for shared pointers).
2409 template <typename T>
2410 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2411  kmp_taskgroup_t *tg, void *reduce_data) {
2412  kmp_taskred_data_t *arr;
2413  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2414  " from data %p\n",
2415  thr, tg, reduce_data));
2416  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2417  thr, num * sizeof(kmp_taskred_data_t));
2418  // threads will share private copies, thunk routines, sizes, flags, etc.:
2419  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2420  for (int i = 0; i < num; ++i) {
2421  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2422  }
2423  tg->reduce_data = (void *)arr;
2424  tg->reduce_num_data = num;
2425 }
2426 
// __kmpc_task_reduction_get_th_data: get the thread-specific location for a
// reduction item; tskgrp may be NULL to mean the current taskgroup, and data
// is the address of the shared item (or of another thread's private copy).
2436 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2437  __kmp_assert_valid_gtid(gtid);
2438  kmp_info_t *thread = __kmp_threads[gtid];
2439  kmp_int32 nth = thread->th.th_team_nproc;
2440  if (nth == 1)
2441  return data; // nothing to do
2442 
2443  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2444  if (tg == NULL)
2445  tg = thread->th.th_current_task->td_taskgroup;
2446  KMP_ASSERT(tg != NULL);
2447  kmp_taskred_data_t *arr;
2448  kmp_int32 num;
2449  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2450 
2451 #if OMPX_TASKGRAPH
2452  if ((thread->th.th_current_task->is_taskgraph) &&
2453  (!__kmp_tdg_is_recording(
2454  __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2455  tg = thread->th.th_current_task->td_taskgroup;
2456  KMP_ASSERT(tg != NULL);
2457  KMP_ASSERT(tg->reduce_data != NULL);
2458  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2459  num = tg->reduce_num_data;
2460  }
2461 #endif
2462 
2463  KMP_ASSERT(data != NULL);
2464  while (tg != NULL) {
2465  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2466  num = tg->reduce_num_data;
2467  for (int i = 0; i < num; ++i) {
2468  if (!arr[i].flags.lazy_priv) {
2469  if (data == arr[i].reduce_shar ||
2470  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2471  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2472  } else {
2473  // check shared location first
2474  void **p_priv = (void **)(arr[i].reduce_priv);
2475  if (data == arr[i].reduce_shar)
2476  goto found;
2477  // check if we get some thread specific location as parameter
2478  for (int j = 0; j < nth; ++j)
2479  if (data == p_priv[j])
2480  goto found;
2481  continue; // not found, continue search
2482  found:
2483  if (p_priv[tid] == NULL) {
2484  // allocate thread specific object lazily
2485  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2486  if (arr[i].reduce_init != NULL) {
2487  if (arr[i].reduce_orig != NULL) { // new interface
2488  ((void (*)(void *, void *))arr[i].reduce_init)(
2489  p_priv[tid], arr[i].reduce_orig);
2490  } else { // old interface (single parameter)
2491  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2492  }
2493  }
2494  }
2495  return p_priv[tid];
2496  }
2497  }
2498  KMP_ASSERT(tg->parent);
2499  tg = tg->parent;
2500  }
2501  KMP_ASSERT2(0, "Unknown task reduction item");
2502  return NULL; // ERROR, this line never executed
2503 }
2504 
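// Illustrative sketch (hypothetical, not part of the runtime): inside a
// participating task body the compiler fetches the thread-specific copy
// before updating it, e.g.
//
//   int *p = (int *)__kmpc_task_reduction_get_th_data(gtid, NULL, &sum);
//   *p += local_contribution;
//
// Passing NULL for tskgrp selects the current taskgroup; the private copies
// are combined into the shared item when the taskgroup ends.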
2505 // Finalize task reduction.
2506 // Called from __kmpc_end_taskgroup()
2507 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2508  kmp_int32 nth = th->th.th_team_nproc;
2509  KMP_DEBUG_ASSERT(
2510  nth > 1 ||
2511  __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2512  // are using hidden helper threads
2513  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2514  kmp_int32 num = tg->reduce_num_data;
2515  for (int i = 0; i < num; ++i) {
2516  void *sh_data = arr[i].reduce_shar;
2517  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2518  void (*f_comb)(void *, void *) =
2519  (void (*)(void *, void *))(arr[i].reduce_comb);
2520  if (!arr[i].flags.lazy_priv) {
2521  void *pr_data = arr[i].reduce_priv;
2522  size_t size = arr[i].reduce_size;
2523  for (int j = 0; j < nth; ++j) {
2524  void *priv_data = (char *)pr_data + j * size;
2525  f_comb(sh_data, priv_data); // combine results
2526  if (f_fini)
2527  f_fini(priv_data); // finalize if needed
2528  }
2529  } else {
2530  void **pr_data = (void **)(arr[i].reduce_priv);
2531  for (int j = 0; j < nth; ++j) {
2532  if (pr_data[j] != NULL) {
2533  f_comb(sh_data, pr_data[j]); // combine results
2534  if (f_fini)
2535  f_fini(pr_data[j]); // finalize if needed
2536  __kmp_free(pr_data[j]);
2537  }
2538  }
2539  }
2540  __kmp_free(arr[i].reduce_priv);
2541  }
2542  __kmp_thread_free(th, arr);
2543  tg->reduce_data = NULL;
2544  tg->reduce_num_data = 0;
2545 }
2546 
2547 // Cleanup task reduction data for parallel or worksharing,
2548 // do not touch task private data other threads still working with.
2549 // Called from __kmpc_end_taskgroup()
2550 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2551  __kmp_thread_free(th, tg->reduce_data);
2552  tg->reduce_data = NULL;
2553  tg->reduce_num_data = 0;
2554 }
2555 
2556 template <typename T>
2557 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2558  int num, T *data) {
2559  __kmp_assert_valid_gtid(gtid);
2560  kmp_info_t *thr = __kmp_threads[gtid];
2561  kmp_int32 nth = thr->th.th_team_nproc;
2562  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2563  if (nth == 1) {
2564  KA_TRACE(10,
2565  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2566  gtid, thr->th.th_current_task->td_taskgroup));
2567  return (void *)thr->th.th_current_task->td_taskgroup;
2568  }
2569  kmp_team_t *team = thr->th.th_team;
2570  void *reduce_data;
2571  kmp_taskgroup_t *tg;
2572  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2573  if (reduce_data == NULL &&
2574  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2575  (void *)1)) {
2576  // single thread enters this block to initialize common reduction data
2577  KMP_DEBUG_ASSERT(reduce_data == NULL);
2578  // first initialize own data, then make a copy other threads can use
2579  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2580  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2581  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2582  // fini counters should be 0 at this point
2583  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2584  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2585  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2586  } else {
2587  while (
2588  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2589  (void *)1) { // wait for task reduction initialization
2590  KMP_CPU_PAUSE();
2591  }
2592  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2593  tg = thr->th.th_current_task->td_taskgroup;
2594  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2595  }
2596  return tg;
2597 }
2598 
// __kmpc_task_reduction_modifier_init: initialize task reduction for a
// parallel or worksharing construct (old interface); is_ws is 1 for
// worksharing, 0 for parallel. Forms an internal taskgroup and returns it.
2615 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2616  int num, void *data) {
2617  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2618  (kmp_task_red_input_t *)data);
2619 }
2620 
// __kmpc_taskred_modifier_init: same as above for the new interface
// (kmp_taskred_input_t with omp_orig support).
2635 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2636  void *data) {
2637  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2638  (kmp_taskred_input_t *)data);
2639 }
2640 
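// Illustrative sketch (hypothetical, not part of the runtime): for
//   #pragma omp parallel reduction(task, + : sum)
// each thread runs the modifier variant around the region body, e.g.
//
//   void *tg = __kmpc_taskred_modifier_init(&loc, gtid, /*is_ws=*/0,
//                                           /*num=*/1, &in);
//   ... region body creating in_reduction tasks ...
//   __kmpc_task_reduction_modifier_fini(&loc, gtid, /*is_ws=*/0);
//
// with "in" a kmp_taskred_input_t describing the item as sketched earlier.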
// __kmpc_task_reduction_modifier_fini: finalize task reduction for a
// parallel or worksharing construct by ending the internal taskgroup.
2649 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2650  __kmpc_end_taskgroup(loc, gtid);
2651 }
2652 
2653 // __kmpc_taskgroup: Start a new taskgroup
2654 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2655  __kmp_assert_valid_gtid(gtid);
2656  kmp_info_t *thread = __kmp_threads[gtid];
2657  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2658  kmp_taskgroup_t *tg_new =
2659  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2660  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2661  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2662  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2663  tg_new->parent = taskdata->td_taskgroup;
2664  tg_new->reduce_data = NULL;
2665  tg_new->reduce_num_data = 0;
2666  tg_new->gomp_data = NULL;
2667  taskdata->td_taskgroup = tg_new;
2668 
2669 #if OMPT_SUPPORT && OMPT_OPTIONAL
2670  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2671  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2672  if (!codeptr)
2673  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2674  kmp_team_t *team = thread->th.th_team;
2675  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2676  // FIXME: I think this is wrong for lwt!
2677  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2678 
2679  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2680  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2681  &(my_task_data), codeptr);
2682  }
2683 #endif
2684 }
2685 
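// Illustrative sketch (hypothetical, not part of the runtime): a compiler
// brackets the structured block of "#pragma omp taskgroup" with these two
// entry points, e.g.
//
//   __kmpc_taskgroup(&loc, gtid);
//   ... create tasks ...
//   __kmpc_end_taskgroup(&loc, gtid); // waits for all descendant tasks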
2686 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2687 // and its descendants are complete
2688 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2689  __kmp_assert_valid_gtid(gtid);
2690  kmp_info_t *thread = __kmp_threads[gtid];
2691  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2692  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2693  int thread_finished = FALSE;
2694 
2695 #if OMPT_SUPPORT && OMPT_OPTIONAL
2696  kmp_team_t *team;
2697  ompt_data_t my_task_data;
2698  ompt_data_t my_parallel_data;
2699  void *codeptr = nullptr;
2700  if (UNLIKELY(ompt_enabled.enabled)) {
2701  team = thread->th.th_team;
2702  my_task_data = taskdata->ompt_task_info.task_data;
2703  // FIXME: I think this is wrong for lwt!
2704  my_parallel_data = team->t.ompt_team_info.parallel_data;
2705  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2706  if (!codeptr)
2707  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2708  }
2709 #endif
2710 
2711  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2712  KMP_DEBUG_ASSERT(taskgroup != NULL);
2713  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2714 
2715  if (__kmp_tasking_mode != tskm_immediate_exec) {
2716  // mark the task as waiting (positive value: not suspended on a barrier)
2717  taskdata->td_taskwait_counter += 1;
2718  taskdata->td_taskwait_ident = loc;
2719  taskdata->td_taskwait_thread = gtid + 1;
2720 #if USE_ITT_BUILD
2721  // For ITT the taskgroup wait is similar to taskwait until we need to
2722  // distinguish them
2723  void *itt_sync_obj = NULL;
2724 #if USE_ITT_NOTIFY
2725  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2726 #endif /* USE_ITT_NOTIFY */
2727 #endif /* USE_ITT_BUILD */
2728 
2729 #if OMPT_SUPPORT && OMPT_OPTIONAL
2730  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2731  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2732  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2733  &(my_task_data), codeptr);
2734  }
2735 #endif
2736 
2737 #if ENABLE_LIBOMPTARGET
2738  // Give an opportunity to the offload runtime to make progress and create
2739  // any necessary proxy tasks
2740  if (UNLIKELY(kmp_target_sync_cb))
2741  (*kmp_target_sync_cb)(loc, gtid, KMP_TASKDATA_TO_TASK(taskdata), NULL);
2742 #endif // ENABLE_LIBOMPTARGET
2743 
2744  if (!taskdata->td_flags.team_serial ||
2745  (thread->th.th_task_team != NULL &&
2746  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2747  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2748  kmp_flag_32<false, false> flag(
2749  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2750  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2751  flag.execute_tasks(thread, gtid, FALSE,
2752  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2753  __kmp_task_stealing_constraint);
2754  }
2755  }
2756  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2757 
2758 #if OMPT_SUPPORT && OMPT_OPTIONAL
2759  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2760  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2761  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2762  &(my_task_data), codeptr);
2763  }
2764 #endif
2765 
2766 #if USE_ITT_BUILD
2767  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2768  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2769 #endif /* USE_ITT_BUILD */
2770  }
2771  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2772 
2773  if (taskgroup->reduce_data != NULL &&
2774  !taskgroup->gomp_data) { // need to reduce?
2775  int cnt;
2776  void *reduce_data;
2777  kmp_team_t *t = thread->th.th_team;
2778  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2779  // check if the <priv> data of the first reduction variable is shared for the team
2780  void *priv0 = arr[0].reduce_priv;
2781  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2782  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2783  // finishing task reduction on parallel
2784  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2785  if (cnt == thread->th.th_team_nproc - 1) {
2786  // we are the last thread passing __kmpc_reduction_modifier_fini()
2787  // finalize task reduction:
2788  __kmp_task_reduction_fini(thread, taskgroup);
2789  // cleanup fields in the team structure:
2790  // TODO: is relaxed store enough here (whole barrier should follow)?
2791  __kmp_thread_free(thread, reduce_data);
2792  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2793  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2794  } else {
2795  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2796  // so do not finalize reduction, just clean own copy of the data
2797  __kmp_task_reduction_clean(thread, taskgroup);
2798  }
2799  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2800  NULL &&
2801  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2802  // finishing task reduction on worksharing
2803  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2804  if (cnt == thread->th.th_team_nproc - 1) {
2805  // we are the last thread passing __kmpc_reduction_modifier_fini()
2806  __kmp_task_reduction_fini(thread, taskgroup);
2807  // cleanup fields in team structure:
2808  // TODO: is relaxed store enough here (whole barrier should follow)?
2809  __kmp_thread_free(thread, reduce_data);
2810  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2811  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2812  } else {
2813  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2814  // so do not finalize reduction, just clean own copy of the data
2815  __kmp_task_reduction_clean(thread, taskgroup);
2816  }
2817  } else {
2818  // finishing task reduction on taskgroup
2819  __kmp_task_reduction_fini(thread, taskgroup);
2820  }
2821  }
2822  // Restore parent taskgroup for the current task
2823  taskdata->td_taskgroup = taskgroup->parent;
2824  __kmp_thread_free(thread, taskgroup);
2825 
2826  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2827  gtid, taskdata));
2828 
2829 #if OMPT_SUPPORT && OMPT_OPTIONAL
2830  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2831  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2832  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2833  &(my_task_data), codeptr);
2834  }
2835 #endif
2836 }
2837 
2838 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
2839  kmp_task_team_t *task_team,
2840  kmp_int32 is_constrained) {
2841  kmp_task_t *task = NULL;
2842  kmp_taskdata_t *taskdata;
2843  kmp_taskdata_t *current;
2844  kmp_thread_data_t *thread_data;
2845  int ntasks = task_team->tt.tt_num_task_pri;
2846  if (ntasks == 0) {
2847  KA_TRACE(
2848  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
2849  return NULL;
2850  }
2851  do {
2852  // decrement num_tasks to "reserve" one task for execution
2853  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
2854  ntasks - 1))
2855  break;
2856  ntasks = task_team->tt.tt_num_task_pri;
2857  } while (ntasks > 0);
2858  if (ntasks == 0) {
2859  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
2860  __kmp_get_gtid()));
2861  return NULL;
2862  }
2863  // We got a "ticket" to get a "reserved" priority task
2864  int deque_ntasks;
2865  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
2866  do {
2867  KMP_ASSERT(list != NULL);
2868  thread_data = &list->td;
2869  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2870  deque_ntasks = thread_data->td.td_deque_ntasks;
2871  if (deque_ntasks == 0) {
2872  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2873  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
2874  __kmp_get_gtid(), thread_data));
2875  list = list->next;
2876  }
2877  } while (deque_ntasks == 0);
2878  KMP_DEBUG_ASSERT(deque_ntasks);
2879  int target = thread_data->td.td_deque_head;
2880  current = __kmp_threads[gtid]->th.th_current_task;
2881  taskdata = thread_data->td.td_deque[target];
2882  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2883  // Bump the head pointer and wrap.
2884  thread_data->td.td_deque_head =
2885  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2886  } else {
2887  if (!task_team->tt.tt_untied_task_encountered) {
2888  // The TSC does not allow stealing the victim task
2889  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2890  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
2891  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
2892  gtid, thread_data, task_team, deque_ntasks, target,
2893  thread_data->td.td_deque_tail));
2894  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2895  return NULL;
2896  }
2897  int i;
2898  // walk through the deque trying to steal any task
2899  taskdata = NULL;
2900  for (i = 1; i < deque_ntasks; ++i) {
2901  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2902  taskdata = thread_data->td.td_deque[target];
2903  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2904  break; // found task to execute
2905  } else {
2906  taskdata = NULL;
2907  }
2908  }
2909  if (taskdata == NULL) {
2910  // No appropriate candidate found to execute
2911  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2912  KA_TRACE(
2913  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
2914  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
2915  gtid, thread_data, task_team, deque_ntasks,
2916  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2917  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2918  return NULL;
2919  }
2920  int prev = target;
2921  for (i = i + 1; i < deque_ntasks; ++i) {
2922  // shift remaining tasks in the deque left by 1
2923  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2924  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
2925  prev = target;
2926  }
2927  KMP_DEBUG_ASSERT(
2928  thread_data->td.td_deque_tail ==
2929  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
2930  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
2931  }
2932  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
2933  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2934  task = KMP_TASKDATA_TO_TASK(taskdata);
2935  return task;
2936 }
2937 
2938 // __kmp_remove_my_task: remove a task from my own deque
2939 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2940  kmp_task_team_t *task_team,
2941  kmp_int32 is_constrained) {
2942  kmp_task_t *task;
2943  kmp_taskdata_t *taskdata;
2944  kmp_thread_data_t *thread_data;
2945  kmp_uint32 tail;
2946 
2947  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2948  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2949  NULL); // Caller should check this condition
2950 
2951  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2952 
2953  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2954  gtid, thread_data->td.td_deque_ntasks,
2955  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2956 
2957  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2958  KA_TRACE(10,
2959  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2960  "ntasks=%d head=%u tail=%u\n",
2961  gtid, thread_data->td.td_deque_ntasks,
2962  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2963  return NULL;
2964  }
2965 
2966  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2967 
2968  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2969  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2970  KA_TRACE(10,
2971  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2972  "ntasks=%d head=%u tail=%u\n",
2973  gtid, thread_data->td.td_deque_ntasks,
2974  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2975  return NULL;
2976  }
2977 
2978  tail = (thread_data->td.td_deque_tail - 1) &
2979  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2980  taskdata = thread_data->td.td_deque[tail];
2981 
2982  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2983  thread->th.th_current_task)) {
2984  // The TSC does not allow stealing the victim task
2985  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2986  KA_TRACE(10,
2987  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
2988  "ntasks=%d head=%u tail=%u\n",
2989  gtid, thread_data->td.td_deque_ntasks,
2990  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2991  return NULL;
2992  }
2993 
2994  thread_data->td.td_deque_tail = tail;
2995  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
2996 
2997  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2998 
2999  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3000  "ntasks=%d head=%u tail=%u\n",
3001  gtid, taskdata, thread_data->td.td_deque_ntasks,
3002  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3003 
3004  task = KMP_TASKDATA_TO_TASK(taskdata);
3005  return task;
3006 }
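// Editorial note: the tail pop above relies on the deque capacity being a
// power of two, so TASK_DEQUE_MASK(td) == capacity - 1 and the decrement
// wraps without a branch. A minimal sketch of the same arithmetic, assuming
// a capacity of 256:
//
//   kmp_uint32 mask = 256 - 1; // 0xFF
//   kmp_uint32 tail = 0;
//   tail = (tail - 1) & mask; // wraps to 255 instead of underflowing
//
// Popping at the tail (LIFO) keeps a thread working on its own most recently
// pushed, cache-warm tasks; thieves take from the head instead (see
// __kmp_steal_task below).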
3007 
3008 // __kmp_steal_task: remove a task from another thread's deque
3009 // Assumes that the calling thread has already checked the existence of the
3010 // task_team's thread_data before calling this routine.
3011 static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
3012  kmp_task_team_t *task_team,
3013  std::atomic<kmp_int32> *unfinished_threads,
3014  int *thread_finished,
3015  kmp_int32 is_constrained) {
3016  kmp_task_t *task;
3017  kmp_taskdata_t *taskdata;
3018  kmp_taskdata_t *current;
3019  kmp_thread_data_t *victim_td, *threads_data;
3020  kmp_int32 target;
3021  kmp_info_t *victim_thr;
3022 
3023  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3024 
3025  threads_data = task_team->tt.tt_threads_data;
3026  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3027  KMP_DEBUG_ASSERT(victim_tid >= 0);
3028  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
3029 
3030  victim_td = &threads_data[victim_tid];
3031  victim_thr = victim_td->td.td_thr;
3032  (void)victim_thr; // Use in TRACE messages which aren't always enabled.
3033 
3034  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3035  "task_team=%p ntasks=%d head=%u tail=%u\n",
3036  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3037  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3038  victim_td->td.td_deque_tail));
3039 
3040  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3041  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3042  "task_team=%p ntasks=%d head=%u tail=%u\n",
3043  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3044  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3045  victim_td->td.td_deque_tail));
3046  return NULL;
3047  }
3048 
3049  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3050 
3051  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3052  // Check again after we acquire the lock
3053  if (ntasks == 0) {
3054  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3055  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3056  "task_team=%p ntasks=%d head=%u tail=%u\n",
3057  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3058  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3059  return NULL;
3060  }
3061 
3062  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3063  current = __kmp_threads[gtid]->th.th_current_task;
3064  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3065  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3066  // Bump head pointer and wrap.
3067  victim_td->td.td_deque_head =
3068  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3069  } else {
3070  if (!task_team->tt.tt_untied_task_encountered) {
3071  // The TSC does not allow stealing the victim task
3072  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3073  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3074  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3075  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3076  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3077  return NULL;
3078  }
3079  int i;
3080  // walk through victim's deque trying to steal any task
3081  target = victim_td->td.td_deque_head;
3082  taskdata = NULL;
3083  for (i = 1; i < ntasks; ++i) {
3084  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3085  taskdata = victim_td->td.td_deque[target];
3086  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3087  break; // found victim task
3088  } else {
3089  taskdata = NULL;
3090  }
3091  }
3092  if (taskdata == NULL) {
3093  // No appropriate candidate to steal found
3094  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3095  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3096  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3097  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3098  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3099  return NULL;
3100  }
3101  int prev = target;
3102  for (i = i + 1; i < ntasks; ++i) {
3103  // shift remaining tasks in the deque left by 1
3104  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3105  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3106  prev = target;
3107  }
3108  KMP_DEBUG_ASSERT(
3109  victim_td->td.td_deque_tail ==
3110  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3111  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3112  }
3113  if (*thread_finished) {
3114  // We need to un-mark this victim as a finished victim. This must be done
3115  // before releasing the lock, or else other threads (starting with the
3116  // primary thread victim) might be prematurely released from the barrier!!!
3117 #if KMP_DEBUG
3118  kmp_int32 count =
3119 #endif
3120  KMP_ATOMIC_INC(unfinished_threads);
3121  KA_TRACE(
3122  20,
3123  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3124  gtid, count + 1, task_team));
3125  *thread_finished = FALSE;
3126  }
3127  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3128 
3129  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3130 
3131  KMP_COUNT_BLOCK(TASK_stolen);
3132  KA_TRACE(10,
3133  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3134  "task_team=%p ntasks=%d head=%u tail=%u\n",
3135  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3136  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3137 
3138  task = KMP_TASKDATA_TO_TASK(taskdata);
3139  return task;
3140 }
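// Editorial note: a sketch of why the two deque ends are used. The owner
// pushes and pops at the tail (LIFO), while __kmp_steal_task above always
// starts at the victim's head (FIFO):
//
//   owner:  push tail -> pop tail  (newest tasks, warm in the owner's cache)
//   thief:  pop head               (oldest tasks in the victim's deque)
//
// Working opposite ends keeps the owner and thieves from contending for the
// same slot except when the deque is nearly empty; the deque lock makes the
// race harmless even then.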
3141 
3142 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3143 // condition is satisfied (return true) or there are none left (return false).
3144 //
3145 // final_spin is TRUE if this is the spin at the release barrier.
3146 // thread_finished indicates whether the thread is finished executing all
3147 // the tasks it has on its deque, and is at the release barrier.
3148 // spinner is the location on which to spin.
3149 // spinner == NULL means only execute a single task and return.
3150 // checker is the value to check to terminate the spin.
3151 template <class C>
3152 static inline int __kmp_execute_tasks_template(
3153  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3154  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3155  kmp_int32 is_constrained) {
3156  kmp_task_team_t *task_team = thread->th.th_task_team;
3157  kmp_thread_data_t *threads_data;
3158  kmp_task_t *task;
3159  kmp_info_t *other_thread;
3160  kmp_taskdata_t *current_task = thread->th.th_current_task;
3161  std::atomic<kmp_int32> *unfinished_threads;
3162  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3163  tid = thread->th.th_info.ds.ds_tid;
3164 
3165  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3166  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3167 
3168  if (task_team == NULL || current_task == NULL)
3169  return FALSE;
3170 
3171  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3172  "*thread_finished=%d\n",
3173  gtid, final_spin, *thread_finished));
3174 
3175  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3176  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3177 
3178  KMP_DEBUG_ASSERT(threads_data != NULL);
3179 
3180  nthreads = task_team->tt.tt_nproc;
3181  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3182  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3183 
3184  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3185  // getting tasks from target constructs
3186  while (1) { // Inner loop to find a task and execute it
3187 #if ENABLE_LIBOMPTARGET
3188  // Give an opportunity to the offload runtime to make progress
3189  if (UNLIKELY(kmp_target_sync_cb))
3190  (*kmp_target_sync_cb)(NULL, gtid, KMP_TASKDATA_TO_TASK(current_task),
3191  NULL);
3192 #endif // ENABLE_LIBOMPTARGET
3193 
3194  task = NULL;
3195  if (task_team->tt.tt_num_task_pri) { // get priority task first
3196  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3197  }
3198  if (task == NULL && use_own_tasks) { // check own queue next
3199  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3200  }
3201  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3202  int asleep = 1;
3203  use_own_tasks = 0;
3204  // Try to steal from the last place I stole from successfully.
3205  if (victim_tid == -2) { // haven't stolen anything yet
3206  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3207  if (victim_tid !=
3208  -1) // if we have a last stolen from victim, get the thread
3209  other_thread = threads_data[victim_tid].td.td_thr;
3210  }
3211  if (victim_tid != -1) { // found last victim
3212  asleep = 0;
3213  } else if (!new_victim) { // no recent steals and we haven't already
3214  // used a new victim; select a random thread
3215  do { // Find a different thread to steal work from.
3216  // Pick a random thread. Initial plan was to cycle through all the
3217  // threads, and only return if we tried to steal from every thread,
3218  // and failed. Arch says that's not such a great idea.
3219  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3220  if (victim_tid >= tid) {
3221  ++victim_tid; // Adjusts random distribution to exclude self
3222  }
3223  // Found a potential victim
3224  other_thread = threads_data[victim_tid].td.td_thr;
3225  // There is a slight chance that __kmp_enable_tasking() did not wake
3226  // up all threads waiting at the barrier. If the victim is sleeping,
3227  // then wake it up. Since we were going to pay the cache miss
3228  // penalty for referencing another thread's kmp_info_t struct
3229  // anyway, the check shouldn't cost too much performance at this
3230  // point. In extra barrier mode, tasks do not sleep at the separate
3231  // tasking barrier, so this isn't a problem.
3233  asleep = 0;
3234  if ((__kmp_tasking_mode == tskm_task_teams) &&
3235  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3236  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3237  NULL)) {
3238  asleep = 1;
3239  __kmp_null_resume_wrapper(other_thread);
3240  // A sleeping thread should not have any tasks on its queue.
3241  // There is a slight possibility that it resumes, steals a task
3242  // from another thread, which spawns more tasks, all in the time
3243  // that it takes this thread to check => don't write an assertion
3244  // that the victim's queue is empty. Try stealing from a
3245  // different thread.
3246  }
3247  } while (asleep);
3248  }
3249 
3250  if (!asleep) {
3251  // We have a victim to try to steal from
3252  task =
3253  __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3254  thread_finished, is_constrained);
3255  }
3256  if (task != NULL) { // set last stolen to victim
3257  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3258  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3259  // The pre-refactored code did not try more than 1 successful new
3260  // victim, unless the last one generated more local tasks;
3261  // new_victim keeps track of this
3262  new_victim = 1;
3263  }
3264  } else { // No tasks found; unset last_stolen
3265  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3266  victim_tid = -2; // no successful victim found
3267  }
3268  }
3269 
3270  if (task == NULL)
3271  break; // break out of tasking loop
3272 
3273 // Found a task; execute it
3274 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3275  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3276  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3277  // get the object reliably
3278  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3279  }
3280  __kmp_itt_task_starting(itt_sync_obj);
3281  }
3282 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3283  __kmp_invoke_task(gtid, task, current_task);
3284 #if USE_ITT_BUILD
3285  if (itt_sync_obj != NULL)
3286  __kmp_itt_task_finished(itt_sync_obj);
3287 #endif /* USE_ITT_BUILD */
3288  // If this thread is only partway through the barrier and the condition is
3289  // met, then return now, so that the barrier gather/release pattern can
3290  // proceed. If this thread is in the last spin loop in the barrier,
3291  // waiting to be released, we know that the termination condition will not
3292  // be satisfied, so don't waste any cycles checking it.
3293  if (flag == NULL || (!final_spin && flag->done_check())) {
3294  KA_TRACE(
3295  15,
3296  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3297  gtid));
3298  return TRUE;
3299  }
3300  if (thread->th.th_task_team == NULL) {
3301  break;
3302  }
3303  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3304  // If execution of a stolen task results in more tasks being placed on our
3305  // run queue, reset use_own_tasks
3306  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3307  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3308  "other tasks, restart\n",
3309  gtid));
3310  use_own_tasks = 1;
3311  new_victim = 0;
3312  }
3313  }
3314 
3315  // The task source has been exhausted. If in final spin loop of barrier,
3316  // check if termination condition is satisfied. The work queue may be empty
3317  // but there might be proxy tasks still executing.
3318  if (final_spin &&
3319  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3320  // First, decrement the #unfinished threads, if that has not already been
3321  // done. This decrement might be to the spin location, and result in the
3322  // termination condition being satisfied.
3323  if (!*thread_finished) {
3324 #if KMP_DEBUG
3325  kmp_int32 count = -1 +
3326 #endif
3327  KMP_ATOMIC_DEC(unfinished_threads);
3328  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3329  "unfinished_threads to %d task_team=%p\n",
3330  gtid, count, task_team));
3331  *thread_finished = TRUE;
3332  }
3333 
3334  // It is now unsafe to reference thread->th.th_team !!!
3335  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3336  // thread to pass through the barrier, where it might reset each thread's
3337  // th.th_team field for the next parallel region. If we can steal more
3338  // work, we know that this has not happened yet.
3339  if (flag != NULL && flag->done_check()) {
3340  KA_TRACE(
3341  15,
3342  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3343  gtid));
3344  return TRUE;
3345  }
3346  }
3347 
3348  // If this thread's task team is NULL, primary thread has recognized that
3349  // there are no more tasks; bail out
3350  if (thread->th.th_task_team == NULL) {
3351  KA_TRACE(15,
3352  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3353  return FALSE;
3354  }
3355 
3356  // Check the flag again to see if the condition has already been satisfied,
3357  // to avoid being trapped in an infinite loop when an if0 task depends on a
3358  // hidden helper task outside any parallel region. Detached tasks are not
3359  // impacted here because the only thread executing this function has to
3360  // execute the proxy task, so it is in another code path with the same check.
3361  if (flag == NULL || (!final_spin && flag->done_check())) {
3362  KA_TRACE(15,
3363  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3364  gtid));
3365  return TRUE;
3366  }
3367 
3368  // We could be getting tasks from target constructs; if this is the only
3369  // thread, keep trying to execute tasks from own queue
3370  if (nthreads == 1 &&
3371  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3372  use_own_tasks = 1;
3373  else {
3374  KA_TRACE(15,
3375  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3376  return FALSE;
3377  }
3378  }
3379 }
3380 
3381 template <bool C, bool S>
3382 int __kmp_execute_tasks_32(
3383  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3384  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3385  kmp_int32 is_constrained) {
3386  return __kmp_execute_tasks_template(
3387  thread, gtid, flag, final_spin,
3388  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3389 }
3390 
3391 template <bool C, bool S>
3392 int __kmp_execute_tasks_64(
3393  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3394  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3395  kmp_int32 is_constrained) {
3396  return __kmp_execute_tasks_template(
3397  thread, gtid, flag, final_spin,
3398  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3399 }
3400 
3401 template <bool C, bool S>
3402 int __kmp_atomic_execute_tasks_64(
3403  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3404  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3405  kmp_int32 is_constrained) {
3406  return __kmp_execute_tasks_template(
3407  thread, gtid, flag, final_spin,
3408  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3409 }
3410 
3411 int __kmp_execute_tasks_oncore(
3412  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3413  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3414  kmp_int32 is_constrained) {
3415  return __kmp_execute_tasks_template(
3416  thread, gtid, flag, final_spin,
3417  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3418 }
3419 
3420 template int
3421 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3422  kmp_flag_32<false, false> *, int,
3423  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3424 
3425 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3426  kmp_flag_64<false, true> *,
3427  int,
3428  int *USE_ITT_BUILD_ARG(void *),
3429  kmp_int32);
3430 
3431 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3432  kmp_flag_64<true, false> *,
3433  int,
3434  int *USE_ITT_BUILD_ARG(void *),
3435  kmp_int32);
3436 
3437 template int __kmp_atomic_execute_tasks_64<false, true>(
3438  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3439  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3440 
3441 template int __kmp_atomic_execute_tasks_64<true, false>(
3442  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3443  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3444 
3445 // __kmp_enable_tasking: Allocate task team and resume threads sleeping at the
3446 // next barrier so they can assist in executing enqueued tasks.
3447 // The first thread to arrive allocates the task team atomically.
3448 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3449  kmp_info_t *this_thr) {
3450  kmp_thread_data_t *threads_data;
3451  int nthreads, i, is_init_thread;
3452 
3453  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3454  __kmp_gtid_from_thread(this_thr)));
3455 
3456  KMP_DEBUG_ASSERT(task_team != NULL);
3457  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3458 
3459  nthreads = task_team->tt.tt_nproc;
3460  KMP_DEBUG_ASSERT(nthreads > 0);
3461  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3462 
3463  // Allocate or increase the size of threads_data if necessary
3464  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3465 
3466  if (!is_init_thread) {
3467  // Some other thread already set up the array.
3468  KA_TRACE(
3469  20,
3470  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3471  __kmp_gtid_from_thread(this_thr)));
3472  return;
3473  }
3474  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3475  KMP_DEBUG_ASSERT(threads_data != NULL);
3476 
3477  if (__kmp_tasking_mode == tskm_task_teams &&
3478  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3479  // Release any threads sleeping at the barrier, so that they can steal
3480  // tasks and execute them. In extra barrier mode, tasks do not sleep
3481  // at the separate tasking barrier, so this isn't a problem.
3482  for (i = 0; i < nthreads; i++) {
3483  void *sleep_loc;
3484  kmp_info_t *thread = threads_data[i].td.td_thr;
3485 
3486  if (i == this_thr->th.th_info.ds.ds_tid) {
3487  continue;
3488  }
3489  // Since we haven't locked the thread's suspend mutex lock at this
3490  // point, there is a small window where a thread might be putting
3491  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3492  // To work around this, __kmp_execute_tasks_template() periodically checks
3493  // see if other threads are sleeping (using the same random mechanism that
3494  // is used for task stealing) and awakens them if they are.
3495  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3496  NULL) {
3497  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3498  __kmp_gtid_from_thread(this_thr),
3499  __kmp_gtid_from_thread(thread)));
3500  __kmp_null_resume_wrapper(thread);
3501  } else {
3502  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3503  __kmp_gtid_from_thread(this_thr),
3504  __kmp_gtid_from_thread(thread)));
3505  }
3506  }
3507  }
3508 
3509  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3510  __kmp_gtid_from_thread(this_thr)));
3511 }
3512 
3513 /* // TODO: Check the comment consistency
3514  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind of
3515  * like a shadow of the kmp_team_t data struct, with a different lifetime.
3516  * After a child thread checks into a barrier and calls __kmp_release() from
3517  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3518  * longer assume that the kmp_team_t structure is intact (at any moment, the
3519  * primary thread may exit the barrier code and free the team data structure,
3520  * and return the threads to the thread pool).
3521  *
3522  * This does not work with the tasking code, as the thread is still
3523  * expected to participate in the execution of any tasks that may have been
3524  * spawned by a member of the team, and the thread still needs access to
3525  * each of the other threads in the team, so that it can steal work from them.
3526  *
3527  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3528  * counting mechanism, and is allocated by the primary thread before calling
3529  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3530  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3531  * of the kmp_task_team_t structs for consecutive barriers can overlap
3532  * (and will, unless the primary thread is the last thread to exit the barrier
3533  * release phase, which is not typical). The existence of such a struct is
3534  * useful outside the context of tasking.
3535  *
3536  * We currently use the existence of the threads array as an indicator that
3537  * tasks were spawned since the last barrier. If the structure is to be
3538  * useful outside the context of tasking, then this will have to change, but
3539  * not setting the field minimizes the performance impact of tasking on
3540  * barriers, when no explicit tasks were spawned (pushed, actually).
3541  */
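/* Editorial sketch of the lifetime overlap described above, for two
 * consecutive barriers B1 and B2 (thread roles illustrative):
 *
 *   primary: allocates/initializes task team A before releasing B1
 *   workers: spin in the B1 release phase, still referencing task team A
 *   primary: prepares task team B for the next region while stragglers
 *            may still be draining tasks from task team A
 *   last thread to leave B1's release phase: drops the final reference to
 *            task team A, which can then be recycled via the free list below
 */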
3542 
3543 static kmp_task_team_t *__kmp_free_task_teams =
3544  NULL; // Free list for task_team data structures
3545 // Lock for task team data structures
3546 kmp_bootstrap_lock_t __kmp_task_team_lock =
3547  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3548 
3549 // __kmp_alloc_task_deque:
3550 // Allocates a task deque for a particular thread, and initializes the necessary
3551 // data structures relating to the deque. This only happens once per thread
3552 // per task team since task teams are recycled. No lock is needed during
3553 // allocation since each thread allocates its own deque.
3554 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3555  kmp_thread_data_t *thread_data) {
3556  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3557  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3558 
3559  // Initialize last stolen task field to "none"
3560  thread_data->td.td_deque_last_stolen = -1;
3561 
3562  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3563  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3564  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3565 
3566  KE_TRACE(
3567  10,
3568  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3569  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3570  // Allocate space for task deque, and zero the deque
3571  // Cannot use __kmp_thread_calloc() because threads not around for
3572  // kmp_reap_task_team( ).
3573  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3574  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3575  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3576 }
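// Editorial note: INITIAL_TASK_DEQUE_SIZE, and every size the deque is later
// grown to, must be a power of two for TASK_DEQUE_MASK(td) to act as an index
// wrap. A minimal sketch, assuming an initial size of 256:
//
//   idx = (idx + 1) & (256 - 1); // head/tail advance with wrap-around
//   // full when td_deque_ntasks == TASK_DEQUE_SIZE(td)
//
// __kmp_realloc_task_deque doubles the capacity, preserving this invariant.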
3577 
3578 // __kmp_free_task_deque:
3579 // Deallocates a task deque for a particular thread. Happens at library
3580 // deallocation so don't need to reset all thread data fields.
3581 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3582  if (thread_data->td.td_deque != NULL) {
3583  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3584  TCW_4(thread_data->td.td_deque_ntasks, 0);
3585  __kmp_free(thread_data->td.td_deque);
3586  thread_data->td.td_deque = NULL;
3587  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3588  }
3589 }
3590 
3591 // __kmp_realloc_task_threads_data:
3592 // Allocates a threads_data array for a task team, either by allocating an
3593 // initial array or enlarging an existing array. Only the first thread to get
3594 // the lock allocs or enlarges the array and re-initializes the array elements.
3595 // That thread returns "TRUE", the rest return "FALSE".
3596 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3597 // The current size is given by task_team -> tt.tt_max_threads.
3598 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3599  kmp_task_team_t *task_team) {
3600  kmp_thread_data_t **threads_data_p;
3601  kmp_int32 nthreads, maxthreads;
3602  int is_init_thread = FALSE;
3603 
3604  if (TCR_4(task_team->tt.tt_found_tasks)) {
3605  // Already reallocated and initialized.
3606  return FALSE;
3607  }
3608 
3609  threads_data_p = &task_team->tt.tt_threads_data;
3610  nthreads = task_team->tt.tt_nproc;
3611  maxthreads = task_team->tt.tt_max_threads;
3612 
3613  // All threads must lock when they encounter the first task of the implicit
3614  // task region to make sure threads_data fields are (re)initialized before
3615  // they are used.
3616  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3617 
3618  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3619  // first thread to enable tasking
3620  kmp_team_t *team = thread->th.th_team;
3621  int i;
3622 
3623  is_init_thread = TRUE;
3624  if (maxthreads < nthreads) {
3625 
3626  if (*threads_data_p != NULL) {
3627  kmp_thread_data_t *old_data = *threads_data_p;
3628  kmp_thread_data_t *new_data = NULL;
3629 
3630  KE_TRACE(
3631  10,
3632  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3633  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3634  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3635  // Reallocate threads_data to have more elements than current array
3636  // Cannot use __kmp_thread_realloc() because threads not around for
3637  // kmp_reap_task_team( ). Note all new array entries are initialized
3638  // to zero by __kmp_allocate().
3639  new_data = (kmp_thread_data_t *)__kmp_allocate(
3640  nthreads * sizeof(kmp_thread_data_t));
3641  // copy old data to new data
3642  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3643  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3644 
3645  // Install the new data and free the old data
3646  (*threads_data_p) = new_data;
3647  __kmp_free(old_data);
3648  } else {
3649  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3650  "threads data for task_team %p, size = %d\n",
3651  __kmp_gtid_from_thread(thread), task_team, nthreads));
3652  // Make the initial allocate for threads_data array, and zero entries
3653  // Cannot use __kmp_thread_calloc() because threads not around for
3654  // kmp_reap_task_team( ).
3655  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3656  nthreads * sizeof(kmp_thread_data_t));
3657  }
3658  task_team->tt.tt_max_threads = nthreads;
3659  } else {
3660  // If array has (more than) enough elements, go ahead and use it
3661  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3662  }
3663 
3664  // initialize threads_data pointers back to thread_info structures
3665  for (i = 0; i < nthreads; i++) {
3666  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3667  thread_data->td.td_thr = team->t.t_threads[i];
3668 
3669  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3670  // The last stolen field survives across teams / barrier, and the number
3671  // of threads may have changed. It's possible (likely?) that a new
3672  // parallel region will exhibit the same behavior as the previous region.
3673  thread_data->td.td_deque_last_stolen = -1;
3674  }
3675  }
3676 
3677  KMP_MB();
3678  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3679  }
3680 
3681  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3682  return is_init_thread;
3683 }
3684 
3685 // __kmp_free_task_threads_data:
3686 // Deallocates a threads_data array for a task team, including any attached
3687 // tasking deques. Only occurs at library shutdown.
3688 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3689  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3690  if (task_team->tt.tt_threads_data != NULL) {
3691  int i;
3692  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3693  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3694  }
3695  __kmp_free(task_team->tt.tt_threads_data);
3696  task_team->tt.tt_threads_data = NULL;
3697  }
3698  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3699 }
3700 
3701 // __kmp_free_task_pri_list:
3702 // Deallocates tasking deques used for priority tasks.
3703 // Only occurs at library shutdown.
3704 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3705  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3706  if (task_team->tt.tt_task_pri_list != NULL) {
3707  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3708  while (list != NULL) {
3709  kmp_task_pri_t *next = list->next;
3710  __kmp_free_task_deque(&list->td);
3711  __kmp_free(list);
3712  list = next;
3713  }
3714  task_team->tt.tt_task_pri_list = NULL;
3715  }
3716  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3717 }
3718 
3719 static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3720  kmp_team_t *team) {
3721  int team_nth = team->t.t_nproc;
3722  // Only need to init if task team isn't active or the team size changed
3723  if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3724  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3725  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3726  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3727  TCW_4(task_team->tt.tt_nproc, team_nth);
3728  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3729  TCW_4(task_team->tt.tt_active, TRUE);
3730  }
3731 }
3732 
3733 // __kmp_allocate_task_team:
3734 // Allocates a task team associated with a specific team, taking it from
3735 // the global task team free list if possible. Also initializes data
3736 // structures.
3737 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3738  kmp_team_t *team) {
3739  kmp_task_team_t *task_team = NULL;
3740 
3741  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3742  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3743 
3744  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3745  // Take a task team from the task team pool
3746  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3747  if (__kmp_free_task_teams != NULL) {
3748  task_team = __kmp_free_task_teams;
3749  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3750  task_team->tt.tt_next = NULL;
3751  }
3752  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3753  }
3754 
3755  if (task_team == NULL) {
3756  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3757  "task team for team %p\n",
3758  __kmp_gtid_from_thread(thread), team));
3759  // Allocate a new task team if one is not available. Cannot use
3760  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3761  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3762  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3763  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3764 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3765  // suppress race conditions detection on synchronization flags in debug mode
3766  // this helps to analyze library internals eliminating false positives
3767  __itt_suppress_mark_range(
3768  __itt_suppress_range, __itt_suppress_threading_errors,
3769  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3770  __itt_suppress_mark_range(__itt_suppress_range,
3771  __itt_suppress_threading_errors,
3772  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3773  sizeof(task_team->tt.tt_active));
3774 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3775  // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3776  // task_team->tt.tt_threads_data = NULL;
3777  // task_team->tt.tt_max_threads = 0;
3778  // task_team->tt.tt_next = NULL;
3779  }
3780 
3781  __kmp_task_team_init(task_team, team);
3782 
3783  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3784  "unfinished_threads init'd to %d\n",
3785  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3786  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3787  return task_team;
3788 }
3789 
3790 // __kmp_free_task_team:
3791 // Frees the task team associated with a specific thread, and adds it
3792 // to the global task team free list.
3793 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3794  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3795  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3796 
3797  // Put task team back on free list
3798  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3799 
3800  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3801  task_team->tt.tt_next = __kmp_free_task_teams;
3802  TCW_PTR(__kmp_free_task_teams, task_team);
3803 
3804  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3805 }
3806 
3807 // __kmp_reap_task_teams:
3808 // Free all the task teams on the task team free list.
3809 // Should only be done during library shutdown.
3810 // Cannot do anything that needs a thread structure or gtid since they are
3811 // already gone.
3812 void __kmp_reap_task_teams(void) {
3813  kmp_task_team_t *task_team;
3814 
3815  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3816  // Free all task_teams on the free list
3817  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3818  while ((task_team = __kmp_free_task_teams) != NULL) {
3819  __kmp_free_task_teams = task_team->tt.tt_next;
3820  task_team->tt.tt_next = NULL;
3821 
3822  // Free threads_data if necessary
3823  if (task_team->tt.tt_threads_data != NULL) {
3824  __kmp_free_task_threads_data(task_team);
3825  }
3826  if (task_team->tt.tt_task_pri_list != NULL) {
3827  __kmp_free_task_pri_list(task_team);
3828  }
3829  __kmp_free(task_team);
3830  }
3831  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3832  }
3833 }
3834 
3835 // View the array of two task team pointers as a pair of pointers:
3836 // 1) a single task_team pointer
3837 // 2) next pointer for stack
3838 // Serial teams can create a stack of task teams for nested serial teams.
3839 void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
3840  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3841  kmp_task_team_list_t *current =
3842  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3843  kmp_task_team_list_t *node =
3844  (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
3845  node->task_team = current->task_team;
3846  node->next = current->next;
3847  thread->th.th_task_team = current->task_team = NULL;
3848  current->next = node;
3849 }
3850 
3851 // Serial team pops a task team off the stack
3852 void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
3853  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3854  kmp_task_team_list_t *current =
3855  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3856  if (current->task_team) {
3857  __kmp_free_task_team(thread, current->task_team);
3858  }
3859  kmp_task_team_list_t *next = current->next;
3860  if (next) {
3861  current->task_team = next->task_team;
3862  current->next = next->next;
3863  KMP_DEBUG_ASSERT(next != current);
3864  __kmp_free(next);
3865  thread->th.th_task_team = current->task_team;
3866  }
3867 }
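// Editorial trace of the node stack above for two nested serial regions run
// by the same thread (task team names illustrative):
//
//   enter outer serial: push -> node saves task team T0, current becomes NULL
//   enter inner serial: push -> node saves the outer serial region's task team
//   leave inner serial: pop  -> frees the inner task team, restores the saved one
//   leave outer serial: pop  -> frees the outer task team, restores T0
//
// Reinterpreting t_task_team[0]/t_task_team[1] as {task_team, next} gives the
// stack without adding storage to kmp_team_t.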
3868 
3869 // __kmp_wait_to_unref_task_teams:
3870 // Some threads could still be in the fork barrier release code, possibly
3871 // trying to steal tasks. Wait for each thread to unreference its task team.
3872 void __kmp_wait_to_unref_task_teams(void) {
3873  kmp_info_t *thread;
3874  kmp_uint32 spins;
3875  kmp_uint64 time;
3876  int done;
3877 
3878  KMP_INIT_YIELD(spins);
3879  KMP_INIT_BACKOFF(time);
3880 
3881  for (;;) {
3882  done = TRUE;
3883 
3884  // TODO: GEH - this may be wrong because some sync would be necessary
3885  // in case threads are added to the pool during the traversal. Need to
3886  // verify that lock for thread pool is held when calling this routine.
3887  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3888  thread = thread->th.th_next_pool) {
3889 #if KMP_OS_WINDOWS
3890  DWORD exit_val;
3891 #endif
3892  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3893  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3894  __kmp_gtid_from_thread(thread)));
3895  continue;
3896  }
3897 #if KMP_OS_WINDOWS
3898  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3899  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3900  thread->th.th_task_team = NULL;
3901  continue;
3902  }
3903 #endif
3904 
3905  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3906 
3907  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3908  "unreference task_team\n",
3909  __kmp_gtid_from_thread(thread)));
3910 
3911  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3912  void *sleep_loc;
3913  // If the thread is sleeping, awaken it.
3914  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3915  NULL) {
3916  KA_TRACE(
3917  10,
3918  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3919  __kmp_gtid_from_thread(thread), __kmp_gtid_from_thread(thread)));
3920  __kmp_null_resume_wrapper(thread);
3921  }
3922  }
3923  }
3924  if (done) {
3925  break;
3926  }
3927 
3928  // If oversubscribed or have waited a bit, yield.
3929  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3930  }
3931 }
3932 
3933 // __kmp_task_team_setup: Create a task_team for the current team, but use
3934 // an already created, unused one if it already exists.
3935 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
3936  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3937 
3938  // For the serial and root teams, set up the first task team pointer to point
3939  // to the task team. The other pointer is a stack of task teams from previous
3940  // serial levels.
3941  if (team == this_thr->th.th_serial_team ||
3942  team == this_thr->th.th_root->r.r_root_team) {
3943  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3944  if (team->t.t_task_team[0] == NULL) {
3945  team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
3946  KA_TRACE(
3947  20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3948  " for serial/root team %p\n",
3949  __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
3950 
3951  } else
3952  __kmp_task_team_init(team->t.t_task_team[0], team);
3953  return;
3954  }
3955 
3956  // If this task_team hasn't been created yet, allocate it. It will be used in
3957  // the region after the next.
3958  // If it exists, it is the current task team and shouldn't be touched yet as
3959  // it may still be in use.
3960  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
3961  team->t.t_task_team[this_thr->th.th_task_state] =
3962  __kmp_allocate_task_team(this_thr, team);
3963  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3964  " for team %d at parity=%d\n",
3965  __kmp_gtid_from_thread(this_thr),
3966  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3967  this_thr->th.th_task_state));
3968  }
3969 
3970  // After threads exit the release, they will call sync, and then point to this
3971  // other task_team; make sure it is allocated and properly initialized. As
3972  // threads spin in the barrier release phase, they will continue to use the
3973  // previous task_team struct(above), until they receive the signal to stop
3974  // checking for tasks (they can't safely reference the kmp_team_t struct,
3975  // which could be reallocated by the primary thread).
3976  int other_team = 1 - this_thr->th.th_task_state;
3977  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3978  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3979  team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
3980  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3981  "task_team %p for team %d at parity=%d\n",
3982  __kmp_gtid_from_thread(this_thr),
3983  team->t.t_task_team[other_team], team->t.t_id, other_team));
3984  } else { // Leave the old task team struct in place for the upcoming region;
3985  // adjust as needed
3986  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3987  __kmp_task_team_init(task_team, team);
3988  // if team size has changed, the first thread to enable tasking will
3989  // realloc threads_data if necessary
3990  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
3991  "%p for team %d at parity=%d\n",
3992  __kmp_gtid_from_thread(this_thr),
3993  team->t.t_task_team[other_team], team->t.t_id, other_team));
3994  }
3995 
3996  // For a regular thread, task enabling should be called when the task is
3997  // going to be pushed to a deque. However, for the hidden helper thread, we need
3998  // it ahead of time so that some operations can be performed without race
3999  // condition.
4000  if (this_thr == __kmp_hidden_helper_main_thread) {
4001  for (int i = 0; i < 2; ++i) {
4002  kmp_task_team_t *task_team = team->t.t_task_team[i];
4003  if (KMP_TASKING_ENABLED(task_team)) {
4004  continue;
4005  }
4006  __kmp_enable_tasking(task_team, this_thr);
4007  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4008  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4009  if (thread_data->td.td_deque == NULL) {
4010  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
4011  }
4012  }
4013  }
4014  }
4015 }
4016 
4017 // __kmp_task_team_sync: Propagation of task team data from team to threads
4018 // which happens just after the release phase of a team barrier. This may be
4019 // called by any thread. This is not called for serial or root teams.
4020 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
4021  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4022  KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
4023  KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
4024 
4025  // Toggle the th_task_state field, to switch which task_team this thread
4026  // refers to
4027  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4028 
4029  // It is now safe to propagate the task team pointer from the team struct to
4030  // the current thread.
4031  TCW_PTR(this_thr->th.th_task_team,
4032  team->t.t_task_team[this_thr->th.th_task_state]);
4033  KA_TRACE(20,
4034  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4035  "%p from Team #%d (parity=%d)\n",
4036  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4037  team->t.t_id, this_thr->th.th_task_state));
4038 }
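// Editorial timeline of the parity toggle above (a sketch, not exhaustive):
//
//   region N:   th_task_state == 0 -> thread uses team->t.t_task_team[0]
//   barrier:    __kmp_task_team_sync flips th_task_state to 1
//   region N+1: thread uses team->t.t_task_team[1], while stragglers from
//               region N may still be draining t_task_team[0]
//
// This double buffering is what lets __kmp_task_team_setup prepare the
// "other" task team while the current one is still in use.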
4039 
4040 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4041 // barrier gather phase. Only called by the primary thread.
4042 //
4043 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4044 // by passing in 0 optionally as the last argument. When wait is zero, primary
4045 // thread does not wait for unfinished_threads to reach 0.
4046 void __kmp_task_team_wait(
4047  kmp_info_t *this_thr,
4048  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4049  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4050 
4051  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4052  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4053 
4054  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4055  if (wait) {
4056  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4057  "(for unfinished_threads to reach 0) on task_team = %p\n",
4058  __kmp_gtid_from_thread(this_thr), task_team));
4059  // Worker threads may have dropped through to release phase, but could
4060  // still be executing tasks. Wait here for tasks to complete. To avoid
4061  // memory contention, only primary thread checks termination condition.
4062  kmp_flag_32<false, false> flag(
4063  RCAST(std::atomic<kmp_uint32> *,
4064  &task_team->tt.tt_unfinished_threads),
4065  0U);
4066  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4067  }
4068  // Deactivate the old task team, so that the worker threads will stop
4069  // referencing it while spinning.
4070  KA_TRACE(
4071  20,
4072  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4073  "setting active to false, setting local and team's pointer to NULL\n",
4074  __kmp_gtid_from_thread(this_thr), task_team));
4075  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4076  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4077  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4078  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4079  KMP_MB();
4080 
4081  TCW_PTR(this_thr->th.th_task_team, NULL);
4082  }
4083 }
4084 
4085 // __kmp_tasking_barrier:
4086 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4087 // Internal function to execute all tasks prior to a regular barrier or a join
4088 // barrier. It is a full barrier itself, which unfortunately turns regular
4089 // barriers into double barriers and join barriers into 1 1/2 barriers.
4090 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4091  std::atomic<kmp_uint32> *spin = RCAST(
4092  std::atomic<kmp_uint32> *,
4093  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4094  int flag = FALSE;
4095  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4096 
4097 #if USE_ITT_BUILD
4098  KMP_FSYNC_SPIN_INIT(spin, NULL);
4099 #endif /* USE_ITT_BUILD */
4100  kmp_flag_32<false, false> spin_flag(spin, 0U);
4101  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4102  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4103 #if USE_ITT_BUILD
4104  // TODO: What about itt_sync_obj??
4105  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4106 #endif /* USE_ITT_BUILD */
4107 
4108  if (TCR_4(__kmp_global.g.g_done)) {
4109  if (__kmp_global.g.g_abort)
4110  __kmp_abort_thread();
4111  break;
4112  }
4113  KMP_YIELD(TRUE);
4114  }
4115 #if USE_ITT_BUILD
4116  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4117 #endif /* USE_ITT_BUILD */
4118 }
4119 
4120 // __kmp_give_task puts a task into a given thread queue if:
4121 // - the queue for that thread was created
4122 // - there's space in that queue
4123 // Because of this, __kmp_push_task needs to check if there's space after
4124 // getting the lock
4125 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4126  kmp_int32 pass) {
4127  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4128  kmp_task_team_t *task_team = taskdata->td_task_team;
4129 
4130  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4131  taskdata, tid));
4132 
4133  // If task_team is NULL something went really bad...
4134  KMP_DEBUG_ASSERT(task_team != NULL);
4135 
4136  bool result = false;
4137  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4138 
4139  if (thread_data->td.td_deque == NULL) {
4140  // There's no queue in this thread, go find another one
4141  // We're guaranteed that at least one thread has a queue
4142  KA_TRACE(30,
4143  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4144  tid, taskdata));
4145  return result;
4146  }
4147 
4148  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4149  TASK_DEQUE_SIZE(thread_data->td)) {
4150  KA_TRACE(
4151  30,
4152  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4153  taskdata, tid));
4154 
4155  // If this deque has already grown past what the current pass allows, give
4156  // another thread a chance
4157  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4158  return result;
4159 
4160  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4161  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4162  TASK_DEQUE_SIZE(thread_data->td)) {
4163  // expand deque to push the task which is not allowed to execute
4164  __kmp_realloc_task_deque(thread, thread_data);
4165  }
4166 
4167  } else {
4168 
4169  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4170 
4171  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4172  TASK_DEQUE_SIZE(thread_data->td)) {
4173  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4174  "thread %d.\n",
4175  taskdata, tid));
4176 
4177  // If this deque has already grown past what the current pass allows, give
4178  // another thread a chance
4179  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4180  goto release_and_exit;
4181 
4182  __kmp_realloc_task_deque(thread, thread_data);
4183  }
4184  }
4185 
4186  // lock is held here, and there is space in the deque
4187 
4188  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4189  // Wrap index.
4190  thread_data->td.td_deque_tail =
4191  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4192  TCW_4(thread_data->td.td_deque_ntasks,
4193  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4194 
4195  result = true;
4196  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4197  taskdata, tid));
4198 
4199 release_and_exit:
4200  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4201 
4202  return result;
4203 }
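// Editorial walk-through of the pass heuristic above, assuming
// INITIAL_TASK_DEQUE_SIZE == 256 (pass doubles after each full sweep over
// the team; see __kmpc_give_task below):
//
//   pass == 1: a full deque is never expanded; the task is only enqueued
//              where free space already exists
//   pass == 2: deques still at the initial size (256) may be doubled
//   pass == 4: deques up to twice the initial size may be expanded, and so on
//
// Growth is therefore spread across the team instead of one unlucky thread's
// deque expanding unboundedly on the first attempt.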
4204 
4205 #define PROXY_TASK_FLAG 0x40000000
4206 /* The finish of a proxy task is divided into two pieces:
4207  - the top half is the one that can be done from a thread outside the team
4208  - the bottom half must be run from a thread within the team
4209 
4210  In order to run the bottom half the task gets queued back into one of the
4211  threads of the team. Once the td_incomplete_child_tasks counter of the parent
4212  is decremented, the threads can leave the barriers. So, the bottom half needs
4213  to be queued before the counter is decremented. The top half is therefore
4214  divided in two parts:
4215  - things that can be run before queuing the bottom half
4216  - things that must be run after queuing the bottom half
4217 
4218  This creates a second race as the bottom half can free the task before the
4219  second top half is executed. To avoid this we use the
4220  td_incomplete_child_task of the proxy task to synchronize the top and bottom
4221  half. */
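/* Editorial ordering sketch of the scheme described above (thread roles
   illustrative):

     external thread: first top half  -> mark complete, set PROXY_TASK_FLAG
     external thread: queue the bottom half into a team thread's deque
     external thread: second top half -> decrement the parent's counter,
                                         clear PROXY_TASK_FLAG
     team thread:     bottom half     -> spin until PROXY_TASK_FLAG clears,
                                         then release deps and free the task

   PROXY_TASK_FLAG is the "imaginary child" that keeps the bottom half from
   freeing the task while the second top half is still running. */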
4222 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4223  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4224  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4225  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4226  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4227 
4228  taskdata->td_flags.complete = 1; // mark the task as completed
4229 #if OMPX_TASKGRAPH
4230  taskdata->td_flags.onced = 1;
4231 #endif
4232 
4233  if (taskdata->td_taskgroup)
4234  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4235 
4236  // Create an imaginary child for this task so the bottom half cannot
4237  // release the task before we have completed the second top half
4238  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4239 }
4240 
4241 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4242 #if KMP_DEBUG
4243  kmp_int32 children = 0;
4244  // Predecrement simulated by "- 1" calculation
4245  children = -1 +
4246 #endif
4247  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4248  KMP_DEBUG_ASSERT(children >= 0);
4249 
4250  // Remove the imaginary child
4251  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4252 }
4253 
4254 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4255  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4256  kmp_info_t *thread = __kmp_threads[gtid];
4257 
4258  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4259  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4260  1); // top half must run before bottom half
4261 
4262  // We need to wait to make sure the top half is finished
4263  // Spinning here should be ok as this should happen quickly
4264  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4265  PROXY_TASK_FLAG) > 0)
4266  ;
4267 
4268  __kmp_release_deps(gtid, taskdata);
4269  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4270 }
4271 
4280 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4281  KMP_DEBUG_ASSERT(ptask != NULL);
4282  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4283  KA_TRACE(
4284  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4285  gtid, taskdata));
4286  __kmp_assert_valid_gtid(gtid);
4287  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4288 
4289  __kmp_first_top_half_finish_proxy(taskdata);
4290  __kmp_second_top_half_finish_proxy(taskdata);
4291  __kmp_bottom_half_finish_proxy(gtid, ptask);
4292 
4293  KA_TRACE(10,
4294  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4295  gtid, taskdata));
4296 }
4297 
4298 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4299  KMP_DEBUG_ASSERT(ptask != NULL);
4300  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4301 
4302  // Enqueue task to complete bottom half completion from a thread within the
4303  // corresponding team
4304  kmp_team_t *team = taskdata->td_team;
4305  kmp_int32 nthreads = team->t.t_nproc;
4306  kmp_info_t *thread;
4307 
4308  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4309  // but we cannot use __kmp_get_random here
4310  kmp_int32 start_k = start % nthreads;
4311  kmp_int32 pass = 1;
4312  kmp_int32 k = start_k;
4313 
4314  do {
4315  // For now we're just linearly trying to find a thread
4316  thread = team->t.t_threads[k];
4317  k = (k + 1) % nthreads;
4318 
4319  // we did a full pass through all the threads
4320  if (k == start_k)
4321  pass = pass << 1;
4322 
4323  } while (!__kmp_give_task(thread, k, ptask, pass));
4324 
4325  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4326  // awake at least one thread to execute given task
4327  for (int i = 0; i < nthreads; ++i) {
4328  thread = team->t.t_threads[i];
4329  if (thread->th.th_sleep_loc != NULL) {
4330  __kmp_null_resume_wrapper(thread);
4331  break;
4332  }
4333  }
4334  }
4335 }
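// A standalone sketch of the probing loop above: walk the team round-robin
// and double `pass` after every full sweep, so an acceptance test on the
// target side (roughly "deque growth factor >= pass" in __kmp_give_task)
// becomes progressively easier to satisfy. The acceptance predicate below is
// a made-up stand-in (illustrative only, not part of the build):
#if 0
#include <cstdio>

static bool try_give(int tid, int pass) {
  return tid == 3 && pass >= 4; // pretend thread 3 accepts once pass hits 4
}

int main() {
  const int nthreads = 4;
  int start_k = 1 % nthreads, pass = 1, k = start_k, tid;
  do {
    tid = k;
    k = (k + 1) % nthreads;
    if (k == start_k)
      pass <<= 1; // completed one full pass over the team
  } while (!try_give(tid, pass));
  std::printf("gave task to thread %d at pass %d\n", tid, pass);
}
#endif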
4336 
4344 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4345  KMP_DEBUG_ASSERT(ptask != NULL);
4346  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4347 
4348  KA_TRACE(
4349  10,
4350  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4351  taskdata));
4352 
4353  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4354 
4355  __kmp_first_top_half_finish_proxy(taskdata);
4356 
4357  __kmpc_give_task(ptask);
4358 
4359  __kmp_second_top_half_finish_proxy(taskdata);
4360 
4361  KA_TRACE(
4362  10,
4363  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4364  taskdata));
4365 }
4366 
4367 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4368  kmp_task_t *task) {
4369  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4370  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4371  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4372  td->td_allow_completion_event.ed.task = task;
4373  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4374  }
4375  return &td->td_allow_completion_event;
4376 }
4377 
4378 void __kmp_fulfill_event(kmp_event_t *event) {
4379  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4380  kmp_task_t *ptask = event->ed.task;
4381  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4382  bool detached = false;
4383  int gtid = __kmp_get_gtid();
4384 
4385  // The associated task might have completed or could be completing at this
4386  // point.
4387  // We need to take the lock to avoid races
4388  __kmp_acquire_tas_lock(&event->lock, gtid);
4389  if (taskdata->td_flags.proxy == TASK_PROXY) {
4390  detached = true;
4391  } else {
4392 #if OMPT_SUPPORT
4393  // The OMPT event must occur under mutual exclusion,
4394  // otherwise the tool might access ptask after free
4395  if (UNLIKELY(ompt_enabled.enabled))
4396  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4397 #endif
4398  }
4399  event->type = KMP_EVENT_UNINITIALIZED;
4400  __kmp_release_tas_lock(&event->lock, gtid);
4401 
4402  if (detached) {
4403 #if OMPT_SUPPORT
4404  // We free ptask afterwards and know the task is finished,
4405  // so locking is not necessary
4406  if (UNLIKELY(ompt_enabled.enabled))
4407  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4408 #endif
4409  // If the task detached, complete the proxy task
4410  if (gtid >= 0) {
4411  kmp_team_t *team = taskdata->td_team;
4412  kmp_info_t *thread = __kmp_get_thread();
4413  if (thread->th.th_team == team) {
4414  __kmpc_proxy_task_completed(gtid, ptask);
4415  return;
4416  }
4417  }
4418 
4419  // fallback
4420  __kmpc_proxy_task_completed_ooo(ptask);
4421  }
4422  }
4423 }
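// How this path is reached from user code: a task with a detach clause gets
// its completion event via __kmpc_task_allow_completion_event, and a later
// omp_fulfill_event on that handle ends up in __kmp_fulfill_event, taking
// either the early- or late-fulfill branch above. A minimal user-level sketch
// (OpenMP 5.0 detach; illustrative only, not part of the build):
#if 0
#include <omp.h>
#include <stdio.h>

int main() {
  omp_event_handle_t ev;
#pragma omp parallel
#pragma omp single
  {
#pragma omp task detach(ev)
    { printf("task body done; completion deferred until fulfill\n"); }
    // Any thread (even one outside the team) may later call:
    omp_fulfill_event(ev); // early or late fulfill, depending on timing
#pragma omp taskwait // does not return until the event has been fulfilled
  }
}
#endif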
4424 
4425 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4426 // for taskloop
4427 //
4428 // thread: allocating thread
4429 // task_src: pointer to source task to be duplicated
4430 // taskloop_recur: used only when dealing with taskgraph,
4431 // indicating whether we need to update task->td_task_id
4432 // returns: a pointer to the allocated kmp_task_t structure (task).
4433 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4434 #if OMPX_TASKGRAPH
4435  , int taskloop_recur
4436 #endif
4437 ) {
4438  kmp_task_t *task;
4439  kmp_taskdata_t *taskdata;
4440  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4441  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4442  size_t shareds_offset;
4443  size_t task_size;
4444 
4445  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4446  task_src));
4447  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4448  TASK_FULL); // it should not be proxy task
4449  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4450  task_size = taskdata_src->td_size_alloc;
4451 
4452  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4453  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4454  task_size));
4455 #if USE_FAST_MEMORY
4456  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4457 #else
4458  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4459 #endif /* USE_FAST_MEMORY */
4460  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4461 
4462  task = KMP_TASKDATA_TO_TASK(taskdata);
4463 
4464  // Initialize new task (only specific fields not affected by memcpy)
4465 #if OMPX_TASKGRAPH
4466  if (taskdata->is_taskgraph && !taskloop_recur &&
4467  __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4468  taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4469 #endif
4470  taskdata->td_task_id = KMP_GEN_TASK_ID();
4471  if (task->shareds != NULL) { // need to set up shareds pointer
4472  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4473  task->shareds = &((char *)taskdata)[shareds_offset];
4474  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4475  0);
4476  }
4477  taskdata->td_alloc_thread = thread;
4478  taskdata->td_parent = parent_task;
4479  // task inherits the taskgroup from the parent task
4480  taskdata->td_taskgroup = parent_task->td_taskgroup;
4481  // tied task needs to initialize the td_last_tied at creation,
4482  // untied one does this when it is scheduled for execution
4483  if (taskdata->td_flags.tiedness == TASK_TIED)
4484  taskdata->td_last_tied = taskdata;
4485 
4486  // Only need to keep track of child task counts if team parallel and tasking
4487  // not serialized
4488  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4489  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4490  if (parent_task->td_taskgroup)
4491  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4492  // Only need to keep track of allocated child tasks for explicit tasks
4493  // since implicit ones are not deallocated
4494  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4495  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4496  }
4497 
4498  KA_TRACE(20,
4499  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4500  thread, taskdata, taskdata->td_parent));
4501 #if OMPT_SUPPORT
4502  if (UNLIKELY(ompt_enabled.enabled))
4503  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4504 #endif
4505  return task;
4506 }
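// The shareds fix-up above is plain pointer rebasing: after memcpy of the
// whole allocation, an interior pointer must be recomputed as
// new_base + (old_pointer - old_base). A tiny standalone illustration
// (not part of the build):
#if 0
#include <cassert>
#include <cstring>

struct blob {
  char *interior; // points into payload of the same allocation
  char payload[32];
};

int main() {
  blob src;
  src.interior = src.payload + 8;
  blob dst;
  std::memcpy(&dst, &src, sizeof(blob)); // dst.interior still points into src
  std::size_t offset = src.interior - (char *)&src;
  dst.interior = (char *)&dst + offset; // rebase, as done for task->shareds
  assert(dst.interior == dst.payload + 8);
}
#endif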
4507 
4508 // Routine optionally generated by the compiler for setting the lastprivate flag
4509 // and calling needed constructors for private/firstprivate objects
4510 // (used to form taskloop tasks from pattern task)
4511 // Parameters: dest task, src task, lastprivate flag.
4512 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4513 
4514 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4515 
4516 // class to encapsulate manipulating loop bounds in a taskloop task.
4517 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4518 // the loop bound variables.
4519 class kmp_taskloop_bounds_t {
4520  kmp_task_t *task;
4521  const kmp_taskdata_t *taskdata;
4522  size_t lower_offset;
4523  size_t upper_offset;
4524 
4525 public:
4526  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4527  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4528  lower_offset((char *)lb - (char *)task),
4529  upper_offset((char *)ub - (char *)task) {
4530  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4531  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4532  }
4533  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4534  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4535  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4536  size_t get_lower_offset() const { return lower_offset; }
4537  size_t get_upper_offset() const { return upper_offset; }
4538  kmp_uint64 get_lb() const {
4539  kmp_int64 retval;
4540 #if defined(KMP_GOMP_COMPAT)
4541  // Intel task just returns the lower bound normally
4542  if (!taskdata->td_flags.native) {
4543  retval = *(kmp_int64 *)((char *)task + lower_offset);
4544  } else {
4545  // GOMP task has to take into account the sizeof(long)
4546  if (taskdata->td_size_loop_bounds == 4) {
4547  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4548  retval = (kmp_int64)*lb;
4549  } else {
4550  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4551  retval = (kmp_int64)*lb;
4552  }
4553  }
4554 #else
4555  (void)taskdata;
4556  retval = *(kmp_int64 *)((char *)task + lower_offset);
4557 #endif // defined(KMP_GOMP_COMPAT)
4558  return retval;
4559  }
4560  kmp_uint64 get_ub() const {
4561  kmp_int64 retval;
4562 #if defined(KMP_GOMP_COMPAT)
4563  // Intel task just returns the upper bound normally
4564  if (!taskdata->td_flags.native) {
4565  retval = *(kmp_int64 *)((char *)task + upper_offset);
4566  } else {
4567  // GOMP task has to take into account the sizeof(long)
4568  if (taskdata->td_size_loop_bounds == 4) {
4569  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4570  retval = (kmp_int64)*ub;
4571  } else {
4572  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4573  retval = (kmp_int64)*ub;
4574  }
4575  }
4576 #else
4577  retval = *(kmp_int64 *)((char *)task + upper_offset);
4578 #endif // defined(KMP_GOMP_COMPAT)
4579  return retval;
4580  }
4581  void set_lb(kmp_uint64 lb) {
4582 #if defined(KMP_GOMP_COMPAT)
4583  // Intel task just sets the lower bound normally
4584  if (!taskdata->td_flags.native) {
4585  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4586  } else {
4587  // GOMP task has to take into account the sizeof(long)
4588  if (taskdata->td_size_loop_bounds == 4) {
4589  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4590  *lower = (kmp_uint32)lb;
4591  } else {
4592  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4593  *lower = (kmp_uint64)lb;
4594  }
4595  }
4596 #else
4597  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4598 #endif // defined(KMP_GOMP_COMPAT)
4599  }
4600  void set_ub(kmp_uint64 ub) {
4601 #if defined(KMP_GOMP_COMPAT)
4602  // Intel task just sets the upper bound normally
4603  if (!taskdata->td_flags.native) {
4604  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4605  } else {
4606  // GOMP task has to take into account the sizeof(long)
4607  if (taskdata->td_size_loop_bounds == 4) {
4608  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4609  *upper = (kmp_uint32)ub;
4610  } else {
4611  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4612  *upper = (kmp_uint64)ub;
4613  }
4614  }
4615 #else
4616  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4617 #endif // defined(KMP_GOMP_COMPAT)
4618  }
4619 };
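// The two layouts the class above hides: Intel-style tasks keep 64-bit bounds
// at fixed byte offsets inside the task, while GOMP (native) tasks keep them
// as shareds[0]/shareds[1] sized by sizeof(long). A condensed sketch of the
// GOMP side for both long sizes (illustrative only, not part of the build):
#if 0
#include <cstdint>
#include <cstdio>

static int64_t read_ub(void *shareds, int size_loop_bounds) {
  if (size_loop_bounds == 4)
    return ((int32_t *)shareds)[1]; // ub is the second 32-bit slot
  return ((int64_t *)shareds)[1]; // ub is the second 64-bit slot
}

int main() {
  int32_t b32[2] = {0, 99};
  int64_t b64[2] = {0, 99};
  std::printf("%lld %lld\n", (long long)read_ub(b32, 4),
              (long long)read_ub(b64, 8));
}
#endif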
4620 
4621 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4622 //
4623 // loc Source location information
4624 // gtid Global thread ID
4625 // task Pattern task, exposes the loop iteration range
4626 // lb Pointer to loop lower bound in task structure
4627 // ub Pointer to loop upper bound in task structure
4628 // st Loop stride
4629 // ub_glob Global upper bound (used for lastprivate check)
4630 // num_tasks Number of tasks to execute
4631 // grainsize Number of loop iterations per task
4632 // extras Number of chunks with grainsize+1 iterations
4633 // last_chunk Reduction of grainsize for last task
4634 // tc Iterations count
4635 // task_dup Tasks duplication routine
4636 // codeptr_ra Return address for OMPT events
4637 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4638  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4639  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4640  kmp_uint64 grainsize, kmp_uint64 extras,
4641  kmp_int64 last_chunk, kmp_uint64 tc,
4642 #if OMPT_SUPPORT
4643  void *codeptr_ra,
4644 #endif
4645  void *task_dup) {
4646  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4647  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4648  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4649  // compiler provides global bounds here
4650  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4651  kmp_uint64 lower = task_bounds.get_lb();
4652  kmp_uint64 upper = task_bounds.get_ub();
4653  kmp_uint64 i;
4654  kmp_info_t *thread = __kmp_threads[gtid];
4655  kmp_taskdata_t *current_task = thread->th.th_current_task;
4656  kmp_task_t *next_task;
4657  kmp_int32 lastpriv = 0;
4658 
4659  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4660  (last_chunk < 0 ? last_chunk : extras));
4661  KMP_DEBUG_ASSERT(num_tasks > extras);
4662  KMP_DEBUG_ASSERT(num_tasks > 0);
4663  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4664  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4665  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4666  ub_glob, st, task_dup));
4667 
4668  // Launch num_tasks tasks, assign grainsize iterations each task
4669  for (i = 0; i < num_tasks; ++i) {
4670  kmp_uint64 chunk_minus_1;
4671  if (extras == 0) {
4672  chunk_minus_1 = grainsize - 1;
4673  } else {
4674  chunk_minus_1 = grainsize;
4675  --extras; // first extras iterations get bigger chunk (grainsize+1)
4676  }
4677  upper = lower + st * chunk_minus_1;
4678  if (upper > *ub) {
4679  upper = *ub;
4680  }
4681  if (i == num_tasks - 1) {
4682  // schedule the last task, set lastprivate flag if needed
4683  if (st == 1) { // most common case
4684  KMP_DEBUG_ASSERT(upper == *ub);
4685  if (upper == ub_glob)
4686  lastpriv = 1;
4687  } else if (st > 0) { // positive loop stride
4688  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4689  if ((kmp_uint64)st > ub_glob - upper)
4690  lastpriv = 1;
4691  } else { // negative loop stride
4692  KMP_DEBUG_ASSERT(upper + st < *ub);
4693  if (upper - ub_glob < (kmp_uint64)(-st))
4694  lastpriv = 1;
4695  }
4696  }
4697 
4698 #if OMPX_TASKGRAPH
4699  next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4700 #else
4701  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4702 #endif
4703 
4704  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4705  kmp_taskloop_bounds_t next_task_bounds =
4706  kmp_taskloop_bounds_t(next_task, task_bounds);
4707 
4708  // adjust task-specific bounds
4709  next_task_bounds.set_lb(lower);
4710  if (next_taskdata->td_flags.native) {
4711  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4712  } else {
4713  next_task_bounds.set_ub(upper);
4714  }
4715  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4716  // etc.
4717  ptask_dup(next_task, task, lastpriv);
4718  KA_TRACE(40,
4719  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4720  "upper %lld stride %lld, (offsets %p %p)\n",
4721  gtid, i, next_task, lower, upper, st,
4722  next_task_bounds.get_lower_offset(),
4723  next_task_bounds.get_upper_offset()));
4724 #if OMPT_SUPPORT
4725  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4726  codeptr_ra); // schedule new task
4727 #if OMPT_OPTIONAL
4728  if (ompt_enabled.ompt_callback_dispatch) {
4729  OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4730  lower, upper, st);
4731  }
4732 #endif // OMPT_OPTIONAL
4733 #else
4734  __kmp_omp_task(gtid, next_task, true); // schedule new task
4735 #endif
4736  lower = upper + st; // adjust lower bound for the next iteration
4737  }
4738  // free the pattern task and exit
4739  __kmp_task_start(gtid, task, current_task); // internal bookkeeping only
4740  // do not execute the pattern task, just do internal bookkeeping
4741  __kmp_task_finish<false>(gtid, task, current_task);
4742 }
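// Worked check of the chunking above: the first `extras` tasks take
// grainsize+1 iterations, the rest take grainsize, and together the chunks
// tile the trip count exactly. Standalone sketch of the non-strict st == 1
// case (not part of the build):
#if 0
#include <cassert>

int main() {
  const unsigned long long tc = 23, num_tasks = 5;
  unsigned long long grainsize = tc / num_tasks; // 4
  unsigned long long extras = tc % num_tasks; // 3 -> chunks 5,5,5,4,4
  assert(tc == num_tasks * grainsize + extras);

  unsigned long long covered = 0, e = extras;
  for (unsigned long long i = 0; i < num_tasks; ++i) {
    unsigned long long chunk = grainsize;
    if (e) { // first `extras` tasks get one extra iteration
      ++chunk;
      --e;
    }
    covered += chunk; // the next task's lower bound advances by `chunk`
  }
  assert(covered == tc);
}
#endif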
4743 
4744 // Structure to keep taskloop parameters for auxiliary task
4745 // kept in the shareds of the task structure.
4746 typedef struct __taskloop_params {
4747  kmp_task_t *task;
4748  kmp_uint64 *lb;
4749  kmp_uint64 *ub;
4750  void *task_dup;
4751  kmp_int64 st;
4752  kmp_uint64 ub_glob;
4753  kmp_uint64 num_tasks;
4754  kmp_uint64 grainsize;
4755  kmp_uint64 extras;
4756  kmp_int64 last_chunk;
4757  kmp_uint64 tc;
4758  kmp_uint64 num_t_min;
4759 #if OMPT_SUPPORT
4760  void *codeptr_ra;
4761 #endif
4762 } __taskloop_params_t;
4763 
4764 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4765  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4766  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4767  kmp_uint64,
4768 #if OMPT_SUPPORT
4769  void *,
4770 #endif
4771  void *);
4772 
4773 // Execute part of the taskloop submitted as a task.
4774 int __kmp_taskloop_task(int gtid, void *ptask) {
4775  __taskloop_params_t *p =
4776  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4777  kmp_task_t *task = p->task;
4778  kmp_uint64 *lb = p->lb;
4779  kmp_uint64 *ub = p->ub;
4780  void *task_dup = p->task_dup;
4781  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4782  kmp_int64 st = p->st;
4783  kmp_uint64 ub_glob = p->ub_glob;
4784  kmp_uint64 num_tasks = p->num_tasks;
4785  kmp_uint64 grainsize = p->grainsize;
4786  kmp_uint64 extras = p->extras;
4787  kmp_int64 last_chunk = p->last_chunk;
4788  kmp_uint64 tc = p->tc;
4789  kmp_uint64 num_t_min = p->num_t_min;
4790 #if OMPT_SUPPORT
4791  void *codeptr_ra = p->codeptr_ra;
4792 #endif
4793 #if KMP_DEBUG
4794  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4795  KMP_DEBUG_ASSERT(task != NULL);
4796  KA_TRACE(20,
4797  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4798  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4799  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4800  st, task_dup));
4801 #endif
4802  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4803  if (num_tasks > num_t_min)
4804  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4805  grainsize, extras, last_chunk, tc, num_t_min,
4806 #if OMPT_SUPPORT
4807  codeptr_ra,
4808 #endif
4809  task_dup);
4810  else
4811  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4812  grainsize, extras, last_chunk, tc,
4813 #if OMPT_SUPPORT
4814  codeptr_ra,
4815 #endif
4816  task_dup);
4817 
4818  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4819  return 0;
4820 }
4821 
4822 // Schedule part of the taskloop as a task,
4823 // execute the rest of the taskloop.
4824 //
4825 // loc Source location information
4826 // gtid Global thread ID
4827 // task Pattern task, exposes the loop iteration range
4828 // lb Pointer to loop lower bound in task structure
4829 // ub Pointer to loop upper bound in task structure
4830 // st Loop stride
4831 // ub_glob Global upper bound (used for lastprivate check)
4832 // num_tasks Number of tasks to execute
4833 // grainsize Number of loop iterations per task
4834 // extras Number of chunks with grainsize+1 iterations
4835 // last_chunk Reduction of grainsize for last task
4836 // tc Iterations count
4837 // num_t_min Threshold to launch tasks recursively
4838 // task_dup Tasks duplication routine
4839 // codeptr_ra Return address for OMPT events
4840 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4841  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4842  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4843  kmp_uint64 grainsize, kmp_uint64 extras,
4844  kmp_int64 last_chunk, kmp_uint64 tc,
4845  kmp_uint64 num_t_min,
4846 #if OMPT_SUPPORT
4847  void *codeptr_ra,
4848 #endif
4849  void *task_dup) {
4850  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4851  KMP_DEBUG_ASSERT(task != NULL);
4852  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4853  KA_TRACE(20,
4854  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4855  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4856  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4857  st, task_dup));
4858  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4859  kmp_uint64 lower = *lb;
4860  kmp_info_t *thread = __kmp_threads[gtid];
4861  // kmp_taskdata_t *current_task = thread->th.th_current_task;
4862  kmp_task_t *next_task;
4863  size_t lower_offset =
4864  (char *)lb - (char *)task; // remember offset of lb in the task structure
4865  size_t upper_offset =
4866  (char *)ub - (char *)task; // remember offset of ub in the task structure
4867 
4868  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4869  (last_chunk < 0 ? last_chunk : extras));
4870  KMP_DEBUG_ASSERT(num_tasks > extras);
4871  KMP_DEBUG_ASSERT(num_tasks > 0);
4872 
4873  // split the loop in two halves
4874  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4875  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4876  kmp_uint64 gr_size0 = grainsize;
4877  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4878  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4879  if (last_chunk < 0) {
4880  ext0 = ext1 = 0;
4881  last_chunk1 = last_chunk;
4882  tc0 = grainsize * n_tsk0;
4883  tc1 = tc - tc0;
4884  } else if (n_tsk0 <= extras) {
4885  gr_size0++; // integrate extras into grainsize
4886  ext0 = 0; // no extra iters in 1st half
4887  ext1 = extras - n_tsk0; // remaining extras
4888  tc0 = gr_size0 * n_tsk0;
4889  tc1 = tc - tc0;
4890  } else { // n_tsk0 > extras
4891  ext1 = 0; // no extra iters in 2nd half
4892  ext0 = extras;
4893  tc1 = grainsize * n_tsk1;
4894  tc0 = tc - tc1;
4895  }
4896  ub0 = lower + st * (tc0 - 1);
4897  lb1 = ub0 + st;
4898 
4899  // create pattern task for 2nd half of the loop
4900 #if OMPX_TASKGRAPH
4901  next_task = __kmp_task_dup_alloc(thread, task,
4902  /* taskloop_recur */ 1);
4903 #else
4904  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4905 #endif
4906  // adjust lower bound (upper bound is not changed) for the 2nd half
4907  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4908  if (ptask_dup != NULL) // construct firstprivates, etc.
4909  ptask_dup(next_task, task, 0);
4910  *ub = ub0; // adjust upper bound for the 1st half
4911 
4912  // create auxiliary task for 2nd half of the loop
4913  // make sure new task has same parent task as the pattern task
4914  kmp_taskdata_t *current_task = thread->th.th_current_task;
4915  thread->th.th_current_task = taskdata->td_parent;
4916  kmp_task_t *new_task =
4917  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4918  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4919  // restore current task
4920  thread->th.th_current_task = current_task;
4921  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4922  p->task = next_task;
4923  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4924  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4925  p->task_dup = task_dup;
4926  p->st = st;
4927  p->ub_glob = ub_glob;
4928  p->num_tasks = n_tsk1;
4929  p->grainsize = grainsize;
4930  p->extras = ext1;
4931  p->last_chunk = last_chunk1;
4932  p->tc = tc1;
4933  p->num_t_min = num_t_min;
4934 #if OMPT_SUPPORT
4935  p->codeptr_ra = codeptr_ra;
4936 #endif
4937 
4938 #if OMPX_TASKGRAPH
4939  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
4940  new_task_data->tdg = taskdata->tdg;
4941  new_task_data->is_taskgraph = 0;
4942 #endif
4943 
4944 #if OMPT_SUPPORT
4945  // schedule new task with correct return address for OMPT events
4946  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4947 #else
4948  __kmp_omp_task(gtid, new_task, true); // schedule new task
4949 #endif
4950 
4951  // execute the 1st half of current subrange
4952  if (n_tsk0 > num_t_min)
4953  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4954  ext0, last_chunk0, tc0, num_t_min,
4955 #if OMPT_SUPPORT
4956  codeptr_ra,
4957 #endif
4958  task_dup);
4959  else
4960  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4961  gr_size0, ext0, last_chunk0, tc0,
4962 #if OMPT_SUPPORT
4963  codeptr_ra,
4964 #endif
4965  task_dup);
4966 
4967  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4968 }
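// Standalone check of the three splitting cases above: in every case the two
// halves exactly partition the tasks, the extras, and the trip count
// (illustrative only, not part of the build):
#if 0
#include <cassert>

static void check_split(long long num_tasks, long long grainsize,
                        long long extras, long long last_chunk) {
  long long tc = num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras);
  long long gr_size0 = grainsize, ext0, ext1, tc0, tc1;
  long long last_chunk0 = 0, last_chunk1 = 0;
  long long n_tsk0 = num_tasks >> 1, n_tsk1 = num_tasks - n_tsk0;
  if (last_chunk < 0) { // strict: the short chunk stays in the 2nd half
    ext0 = ext1 = 0;
    last_chunk1 = last_chunk;
    tc0 = grainsize * n_tsk0;
    tc1 = tc - tc0;
  } else if (n_tsk0 <= extras) { // every 1st-half task absorbs an extra iter
    gr_size0++;
    ext0 = 0;
    ext1 = extras - n_tsk0;
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else { // extras fit entirely in the 1st half
    ext1 = 0;
    ext0 = extras;
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  assert(tc0 + tc1 == tc);
  assert(tc0 == n_tsk0 * gr_size0 + ext0 + last_chunk0);
  assert(tc1 == n_tsk1 * grainsize + ext1 + last_chunk1);
}

int main() {
  check_split(7, 4, 2, 0); // extras fit in the 1st half
  check_split(7, 4, 5, 0); // 1st-half tasks all take an extra iteration
  check_split(7, 4, 0, -2); // strict grainsize with a short last chunk
}
#endif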
4969 
4970 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4971  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4972  int nogroup, int sched, kmp_uint64 grainsize,
4973  int modifier, void *task_dup) {
4974  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4975  KMP_DEBUG_ASSERT(task != NULL);
4976  if (nogroup == 0) {
4977 #if OMPT_SUPPORT && OMPT_OPTIONAL
4978  OMPT_STORE_RETURN_ADDRESS(gtid);
4979 #endif
4980  __kmpc_taskgroup(loc, gtid);
4981  }
4982 
4983 #if OMPX_TASKGRAPH
4984  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
4985 #endif
4986  // =========================================================================
4987  // calculate loop parameters
4988  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4989  kmp_uint64 tc;
4990  // compiler provides global bounds here
4991  kmp_uint64 lower = task_bounds.get_lb();
4992  kmp_uint64 upper = task_bounds.get_ub();
4993  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
4994  kmp_uint64 num_tasks = 0, extras = 0;
4995  kmp_int64 last_chunk =
4996  0; // reduce grainsize of last task by last_chunk in strict mode
4997  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
4998  kmp_info_t *thread = __kmp_threads[gtid];
4999  kmp_taskdata_t *current_task = thread->th.th_current_task;
5000 
5001  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5002  "grain %llu(%d, %d), dup %p\n",
5003  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5004  task_dup));
5005 
5006  // compute trip count
5007  if (st == 1) { // most common case
5008  tc = upper - lower + 1;
5009  } else if (st < 0) {
5010  tc = (lower - upper) / (-st) + 1;
5011  } else { // st > 0
5012  tc = (upper - lower) / st + 1;
5013  }
5014  if (tc == 0) {
5015  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5016  // free the pattern task and exit
5017  __kmp_task_start(gtid, task, current_task);
5018  // do not execute anything for zero-trip loop
5019  __kmp_task_finish<false>(gtid, task, current_task);
5020  return;
5021  }
5022 
5023 #if OMPT_SUPPORT && OMPT_OPTIONAL
5024  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
5025  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
5026  if (ompt_enabled.ompt_callback_work) {
5027  ompt_callbacks.ompt_callback(ompt_callback_work)(
5028  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5029  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5030  }
5031 #endif
5032 
5033  if (num_tasks_min == 0)
5034  // TODO: can we choose a better default heuristic?
5035  num_tasks_min =
5036  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5037 
5038  // compute num_tasks/grainsize based on the input provided
5039  switch (sched) {
5040  case 0: // no schedule clause specified, we can choose the default
5041  // let's try to schedule (team_size*10) tasks
5042  grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
5043  KMP_FALLTHROUGH();
5044  case 2: // num_tasks provided
5045  if (grainsize > tc) {
5046  num_tasks = tc; // too big num_tasks requested, adjust values
5047  grainsize = 1;
5048  extras = 0;
5049  } else {
5050  num_tasks = grainsize;
5051  grainsize = tc / num_tasks;
5052  extras = tc % num_tasks;
5053  }
5054  break;
5055  case 1: // grainsize provided
5056  if (grainsize > tc) {
5057  num_tasks = 1;
5058  grainsize = tc; // too big grainsize requested, adjust values
5059  extras = 0;
5060  } else {
5061  if (modifier) {
5062  num_tasks = (tc + grainsize - 1) / grainsize;
5063  last_chunk = tc - (num_tasks * grainsize);
5064  extras = 0;
5065  } else {
5066  num_tasks = tc / grainsize;
5067  // adjust grainsize for balanced distribution of iterations
5068  grainsize = tc / num_tasks;
5069  extras = tc % num_tasks;
5070  }
5071  }
5072  break;
5073  default:
5074  KMP_ASSERT2(0, "unknown scheduling of taskloop");
5075  }
5076 
5077  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5078  (last_chunk < 0 ? last_chunk : extras));
5079  KMP_DEBUG_ASSERT(num_tasks > extras);
5080  KMP_DEBUG_ASSERT(num_tasks > 0);
5081  // =========================================================================
5082 
5083  // check the value of the if clause first
5084  // Also force GOMP taskloop (taskdata->td_flags.native) down the linear path
5085  if (if_val == 0) { // if(0) specified, mark task as serial
5086  taskdata->td_flags.task_serial = 1;
5087  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5088  // always start serial tasks linearly
5089  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5090  grainsize, extras, last_chunk, tc,
5091 #if OMPT_SUPPORT
5092  OMPT_GET_RETURN_ADDRESS(0),
5093 #endif
5094  task_dup);
5095  // !taskdata->td_flags.native => currently force linear spawning of tasks
5096  // for GOMP_taskloop
5097  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5098  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5099  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5100  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5101  last_chunk));
5102  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5103  grainsize, extras, last_chunk, tc, num_tasks_min,
5104 #if OMPT_SUPPORT
5105  OMPT_GET_RETURN_ADDRESS(0),
5106 #endif
5107  task_dup);
5108  } else {
5109  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5110  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5111  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5112  last_chunk));
5113  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5114  grainsize, extras, last_chunk, tc,
5115 #if OMPT_SUPPORT
5116  OMPT_GET_RETURN_ADDRESS(0),
5117 #endif
5118  task_dup);
5119  }
5120 
5121 #if OMPT_SUPPORT && OMPT_OPTIONAL
5122  if (ompt_enabled.ompt_callback_work) {
5123  ompt_callbacks.ompt_callback(ompt_callback_work)(
5124  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5125  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5126  }
5127 #endif
5128 
5129  if (nogroup == 0) {
5130 #if OMPT_SUPPORT && OMPT_OPTIONAL
5131  OMPT_STORE_RETURN_ADDRESS(gtid);
5132 #endif
5133  __kmpc_end_taskgroup(loc, gtid);
5134  }
5135  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5136 }
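// A worked pass through the schedule cases above for tc = 1000 (standalone
// arithmetic mirroring the switch; not part of the build):
#if 0
#include <cassert>

int main() {
  long long tc = 1000;
  // sched 0/2: num_tasks requested (e.g. team_size*10 = 80 by default)
  long long num_tasks = 80, grainsize = tc / num_tasks, extras = tc % num_tasks;
  assert(grainsize == 12 && extras == 40); // 40 tasks of 13, then 40 of 12
  assert(tc == num_tasks * grainsize + extras);
  // sched 1: grainsize(300), non-strict: rebalance into 3 tasks of ~333
  long long g = 300, n = tc / g, gbal = tc / n, e = tc % n;
  assert(n == 3 && gbal == 333 && e == 1);
  // sched 1 with strict modifier: keep 300, shorten only the last task
  long long ns = (tc + g - 1) / g, last_chunk = tc - ns * g;
  assert(ns == 4 && last_chunk == -200); // three 300s plus one 100
  assert(tc == ns * g + last_chunk);
}
#endif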
5137 
5154 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5155  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5156  int sched, kmp_uint64 grainsize, void *task_dup) {
5157  __kmp_assert_valid_gtid(gtid);
5158  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5159  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5160  0, task_dup);
5161  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5162 }
5163 
5181 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5182  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5183  int nogroup, int sched, kmp_uint64 grainsize,
5184  int modifier, void *task_dup) {
5185  __kmp_assert_valid_gtid(gtid);
5186  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5187  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5188  modifier, task_dup);
5189  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5190 }
5191 
5200 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5201  if (gtid == KMP_GTID_DNE)
5202  return NULL;
5203 
5204  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5205  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5206 
5207  if (!taskdata)
5208  return NULL;
5209 
5210  return &taskdata->td_target_data.async_handle;
5211 }
5212 
5221 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5222  if (gtid == KMP_GTID_DNE)
5223  return FALSE;
5224 
5225  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5226  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5227 
5228  if (!taskdata)
5229  return FALSE;
5230 
5231  return taskdata->td_task_team != NULL;
5232 }
5233 
5234 #if OMPX_TASKGRAPH
5235 // __kmp_find_tdg: identify a TDG through its ID
5236 // tdg_id: ID of the TDG
5237 // returns: a pointer to the TDG if one with this ID exists and is not in
5238 // its initial state, otherwise nullptr
5239 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5240  kmp_tdg_info_t *res = nullptr;
5241  if (__kmp_max_tdgs == 0)
5242  return res;
5243 
5244  if (__kmp_global_tdgs == NULL)
5245  __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5246  sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5247 
5248  if ((__kmp_global_tdgs[tdg_id]) &&
5249  (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5250  res = __kmp_global_tdgs[tdg_id];
5251  return res;
5252 }
5253 
5254 // __kmp_print_tdg_dot: prints the TDG to a dot file
5255 // tdg: ID of the TDG
5256 // gtid: Global Thread ID
5257 void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
5258  kmp_int32 tdg_id = tdg->tdg_id;
5259  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5260 
5261  char file_name[32];
5262  snprintf(file_name, sizeof(file_name), "tdg_%d.dot", tdg_id);
5263  kmp_safe_raii_file_t tdg_file(file_name, "w");
5264 
5265  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5266  fprintf(tdg_file,
5267  "digraph TDG {\n"
5268  " compound=true\n"
5269  " subgraph cluster {\n"
5270  " label=TDG_%d\n",
5271  tdg_id);
5272  for (kmp_int32 i = 0; i < num_tasks; i++) {
5273  fprintf(tdg_file, " %d[style=bold]\n", i);
5274  }
5275  fprintf(tdg_file, " }\n");
5276  for (kmp_int32 i = 0; i < num_tasks; i++) {
5277  kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5278  kmp_int32 *successors = tdg->record_map[i].successors;
5279  if (nsuccessors > 0) {
5280  for (kmp_int32 j = 0; j < nsuccessors; j++)
5281  fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5282  }
5283  }
5284  fprintf(tdg_file, "}");
5285  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5286 }
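// For a three-task TDG with id 0 where task 0 precedes tasks 1 and 2, the
// routine above emits (whitespace condensed):
//
//   digraph TDG {
//     compound=true
//     subgraph cluster {
//       label=TDG_0
//       0[style=bold]
//       1[style=bold]
//       2[style=bold]
//     }
//     0 -> 1
//     0 -> 2
//   }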
5287 
5288 // __kmp_exec_tdg: launch the execution of a previous
5289 // recorded TDG
5290 // gtid: Global Thread ID
5291 // tdg: ID of the TDG
5292 void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5293  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5294  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5295  tdg->tdg_id, tdg->num_roots));
5296  kmp_node_info_t *this_record_map = tdg->record_map;
5297  kmp_int32 *this_root_tasks = tdg->root_tasks;
5298  kmp_int32 this_num_roots = tdg->num_roots;
5299  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5300 
5301  kmp_info_t *thread = __kmp_threads[gtid];
5302  kmp_taskdata_t *parent_task = thread->th.th_current_task;
5303 
5304  if (tdg->rec_taskred_data) {
5305  __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5306  }
5307 
5308  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5309  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5310 
5311  td->td_parent = parent_task;
5312  this_record_map[j].parent_task = parent_task;
5313 
5314  kmp_taskgroup_t *parent_taskgroup =
5315  this_record_map[j].parent_task->td_taskgroup;
5316 
5317  KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5318  this_record_map[j].npredecessors);
5319  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5320 
5321  if (parent_taskgroup) {
5322  KMP_ATOMIC_INC(&parent_taskgroup->count);
5323  // The taskgroup is different so we must update it
5324  td->td_taskgroup = parent_taskgroup;
5325  } else if (td->td_taskgroup != nullptr) {
5326  // If the parent doesn't have a taskgroup, remove it from the task
5327  td->td_taskgroup = nullptr;
5328  }
5329  if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5330  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5331  }
5332 
5333  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5334  __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5335  }
5336  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5337  tdg->tdg_id, tdg->num_roots));
5338 }
5339 
5340 // __kmp_start_record: set up a TDG structure and turn the
5341 // recording flag to true
5342 // gtid: Global Thread ID of the encountering thread
5343 // flags: Flags associated with the TDG
5344 // tdg_id: ID of the TDG to record
5345 static inline void __kmp_start_record(kmp_int32 gtid,
5346  kmp_taskgraph_flags_t *flags,
5347  kmp_int32 tdg_id) {
5348  kmp_tdg_info_t *tdg =
5349  (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5350  __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5351  // Initializing the TDG structure
5352  tdg->tdg_id = tdg_id;
5353  tdg->map_size = INIT_MAPSIZE;
5354  tdg->num_roots = -1;
5355  tdg->root_tasks = nullptr;
5356  tdg->tdg_status = KMP_TDG_RECORDING;
5357  tdg->rec_num_taskred = 0;
5358  tdg->rec_taskred_data = nullptr;
5359  KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5360 
5361  // Initializing the list of nodes in this TDG
5362  kmp_node_info_t *this_record_map =
5363  (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5364  for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5365  kmp_int32 *successorsList =
5366  (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5367  this_record_map[i].task = nullptr;
5368  this_record_map[i].successors = successorsList;
5369  this_record_map[i].nsuccessors = 0;
5370  this_record_map[i].npredecessors = 0;
5371  this_record_map[i].successors_size = __kmp_successors_size;
5372  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5373  }
5374 
5375  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5376 }
5377 
5378 // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5379 // the beginning of the record process of a task region
5380 // loc_ref: Location of TDG, not used yet
5381 // gtid: Global Thread ID of the encountering thread
5382 // input_flags: Flags associated with the TDG
5383 // tdg_id: ID of the TDG to record; for now, an incremental integer
5384 // returns: 1 if we record (or TDG support is off), 0 if we replay instead
5385 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5386  kmp_int32 input_flags, kmp_int32 tdg_id) {
5387 
5388  kmp_int32 res;
5389  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5390  KA_TRACE(10,
5391  ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5392  gtid, loc_ref, input_flags, tdg_id));
5393 
5394  if (__kmp_max_tdgs == 0) {
5395  KA_TRACE(
5396  10,
5397  ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5398  "__kmp_max_tdgs = 0\n",
5399  gtid, loc_ref, input_flags, tdg_id));
5400  return 1;
5401  }
5402 
5403  __kmpc_taskgroup(loc_ref, gtid);
5404  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5405  // TODO: use re_record flag
5406  __kmp_exec_tdg(gtid, tdg);
5407  res = 0;
5408  } else {
5409  __kmp_curr_tdg_idx = tdg_id;
5410  KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5411  __kmp_start_record(gtid, flags, tdg_id);
5412  __kmp_num_tdg++;
5413  res = 1;
5414  }
5415  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5416  gtid, tdg_id, res ? "record" : "execute"));
5417  return res;
5418 }
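// Shape of the record/replay protocol around a task region, as a condensed
// caller-side sketch; the declarations below are trimmed to the essentials
// (real callers pass an ident_t and flags), illustrative only:
#if 0
extern "C" {
int __kmpc_start_record_task(void *loc_ref, int gtid, int input_flags,
                             int tdg_id);
void __kmpc_end_record_task(void *loc_ref, int gtid, int input_flags,
                            int tdg_id);
}

void run_region(int gtid, void (*spawn_tasks)(void)) {
  const int tdg_id = 0;
  if (__kmpc_start_record_task(nullptr, gtid, /*flags=*/0, tdg_id)) {
    // First encounter: tasks are created normally and recorded into the TDG.
    spawn_tasks();
  } // Otherwise the recorded TDG was already replayed by __kmp_exec_tdg.
  __kmpc_end_record_task(nullptr, gtid, /*flags=*/0, tdg_id); // ends taskgroup
}
#endif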
5419 
5420 // __kmp_end_record: set up a TDG after recording it
5421 // gtid: Global thread ID
5422 // tdg: Pointer to the TDG
5423 void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5424  // Store roots
5425  kmp_node_info_t *this_record_map = tdg->record_map;
5426  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5427  kmp_int32 *this_root_tasks =
5428  (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5429  kmp_int32 this_map_size = tdg->map_size;
5430  kmp_int32 this_num_roots = 0;
5431  kmp_info_t *thread = __kmp_threads[gtid];
5432 
5433  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5434  if (this_record_map[i].npredecessors == 0) {
5435  this_root_tasks[this_num_roots++] = i;
5436  }
5437  }
5438 
5439  // Update with roots info and mapsize
5440  tdg->map_size = this_map_size;
5441  tdg->num_roots = this_num_roots;
5442  tdg->root_tasks = this_root_tasks;
5443  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5444  tdg->tdg_status = KMP_TDG_READY;
5445 
5446  if (thread->th.th_current_task->td_dephash) {
5447  __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5448  thread->th.th_current_task->td_dephash = NULL;
5449  }
5450 
5451  // Reset predecessor counter
5452  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5453  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5454  this_record_map[i].npredecessors);
5455  }
5456  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5457 
5458  if (__kmp_tdg_dot)
5459  __kmp_print_tdg_dot(tdg, gtid);
5460 }
5461 
5462 // __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5463 // the end of recording phase
5464 //
5465 // loc_ref: Source location information
5466 // gtid: Global thread ID
5467 // input_flags: Flags attached to the graph
5468 // tdg_id: ID of the TDG just finished recording
5469 void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5470  kmp_int32 input_flags, kmp_int32 tdg_id) {
5471  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5472 
5473  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5474  " tdg=%d with flags=%d\n",
5475  gtid, loc_ref, tdg_id, input_flags));
5476  if (__kmp_max_tdgs) {
5477  // TODO: use input_flags->nowait
5478  __kmpc_end_taskgroup(loc_ref, gtid);
5479  if (__kmp_tdg_is_recording(tdg->tdg_status))
5480  __kmp_end_record(gtid, tdg);
5481  }
5482  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5483  " tdg=%d, its status is now READY\n",
5484  gtid, loc_ref, tdg_id));
5485 }
5486 #endif