/*
 * kmp_tasking.cpp -- OpenMP 3.0 tasking support.
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_wait_release.h"
#include "kmp_taskdeps.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#if ENABLE_LIBOMPTARGET
static void (*tgt_target_nowait_query)(void **);

void __kmp_init_target_task() {
  *(void **)(&tgt_target_nowait_query) = KMP_DLSYM("__tgt_target_nowait_query");
}
#endif

/* forward declarations */
static void __kmp_enable_tasking(kmp_task_team_t *task_team,
                                 kmp_info_t *this_thr);
static void __kmp_alloc_task_deque(kmp_info_t *thread,
                                   kmp_thread_data_t *thread_data);
static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                           kmp_task_team_t *task_team);
static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
#if OMPX_TASKGRAPH
static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
int __kmp_taskloop_task(int gtid, void *ptask);
#endif

// returns true if the new task is allowed to execute, false otherwise
// checks Task Scheduling constraint (if requested) and
// mutexinoutset dependencies if any
static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
                                  const kmp_taskdata_t *tasknew,
                                  const kmp_taskdata_t *taskcurr) {
  if (is_constrained && (tasknew->td_flags.tiedness == TASK_TIED)) {
    // Check if the candidate obeys the Task Scheduling Constraints (TSC):
    // only a descendant of all deferred tied tasks can be scheduled; checking
    // the last one is enough, as it in turn is a descendant of all the others
    kmp_taskdata_t *current = taskcurr->td_last_tied;
    KMP_DEBUG_ASSERT(current != NULL);
    // check if the task is not suspended on barrier
    if (current->td_flags.tasktype == TASK_EXPLICIT ||
        current->td_taskwait_thread > 0) { // <= 0 on barrier
      kmp_int32 level = current->td_level;
      kmp_taskdata_t *parent = tasknew->td_parent;
      while (parent != current && parent->td_level > level) {
        // check generation up to the level of the current task
        parent = parent->td_parent;
        KMP_DEBUG_ASSERT(parent != NULL);
      }
      if (parent != current)
        return false;
    }
  }
  // Check mutexinoutset dependencies, acquire locks
  kmp_depnode_t *node = tasknew->td_depnode;
#if OMPX_TASKGRAPH
  if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
#else
  if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
#endif
    for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
      KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
      if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
        continue;
      // could not get the lock, release previous locks
      for (int j = i - 1; j >= 0; --j)
        __kmp_release_lock(node->dn.mtx_locks[j], gtid);
      return false;
    }
    // negative num_locks means all locks were acquired successfully
    node->dn.mtx_num_locks = -node->dn.mtx_num_locks;
  }
  return true;
}
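
// The sign flip above lets dn.mtx_num_locks double as state: a positive value
// counts locks still to be taken, a negative value means all locks are held.
// A condensed, standalone restatement of that all-or-nothing protocol
// (illustrative sketch only, hence compiled out):
#if 0
static bool acquire_all_or_none(kmp_lock_t **locks, kmp_int32 *num_locks,
                                kmp_int32 gtid) {
  for (int i = 0; i < *num_locks; ++i) {
    if (__kmp_test_lock(locks[i], gtid))
      continue; // got lock i, try the next one
    for (int j = i - 1; j >= 0; --j) // failed: roll back everything taken
      __kmp_release_lock(locks[j], gtid);
    return false; // caller retries the task later
  }
  *num_locks = -*num_locks; // negative count == "all locks held"
  return true;
}
#endif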

// __kmp_realloc_task_deque:
// Re-allocates a task deque for a particular thread, copies the content from
// the old deque and adjusts the necessary data structures relating to the
// deque. This operation must be done with the deque_lock being held
static void __kmp_realloc_task_deque(kmp_info_t *thread,
                                     kmp_thread_data_t *thread_data) {
  kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == size);
  kmp_int32 new_size = 2 * size;

  KE_TRACE(10, ("__kmp_realloc_task_deque: T#%d reallocating deque[from %d to "
                "%d] for thread_data %p\n",
                __kmp_gtid_from_thread(thread), size, new_size, thread_data));

  kmp_taskdata_t **new_deque =
      (kmp_taskdata_t **)__kmp_allocate(new_size * sizeof(kmp_taskdata_t *));

  int i, j;
  for (i = thread_data->td.td_deque_head, j = 0; j < size;
       i = (i + 1) & TASK_DEQUE_MASK(thread_data->td), j++)
    new_deque[j] = thread_data->td.td_deque[i];

  __kmp_free(thread_data->td.td_deque);

  thread_data->td.td_deque_head = 0;
  thread_data->td.td_deque_tail = size;
  thread_data->td.td_deque = new_deque;
  thread_data->td.td_deque_size = new_size;
}
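
// Deque sizes stay powers of two, so TASK_DEQUE_MASK(td) == size - 1 and the
// "& mask" above replaces a modulo while unwrapping the old ring so that the
// new head is 0 and the new tail equals the old element count. The same
// doubling scheme in isolation (illustrative sketch with hypothetical names,
// compiled out):
#if 0
static void grow_ring(void ***buf, int *size, int *head, int *tail) {
  int old_size = *size, mask = old_size - 1; // old_size is a power of two
  void **nbuf = (void **)malloc(2 * old_size * sizeof(void *));
  // Linearize head..tail of the full ring into the front of the new array.
  for (int i = *head, j = 0; j < old_size; i = (i + 1) & mask, ++j)
    nbuf[j] = (*buf)[i];
  free(*buf);
  *buf = nbuf;
  *head = 0;
  *tail = old_size; // the ring was full, so old_size elements were copied
  *size = 2 * old_size;
}
#endif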

static kmp_task_pri_t *__kmp_alloc_task_pri_list() {
  kmp_task_pri_t *l = (kmp_task_pri_t *)__kmp_allocate(sizeof(kmp_task_pri_t));
  kmp_thread_data_t *thread_data = &l->td;
  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
  thread_data->td.td_deque_last_stolen = -1;
  KE_TRACE(20, ("__kmp_alloc_task_pri_list: T#%d allocating deque[%d] "
                "for thread_data %p\n",
                __kmp_get_gtid(), INITIAL_TASK_DEQUE_SIZE, thread_data));
  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
      INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
  return l;
}

// The function finds the deque of priority tasks with the given priority, or
// allocates a new deque and puts it into the sorted (high -> low) list of
// deques. Deques of non-default priority tasks are shared between all threads
// in the team, as opposed to per-thread deques of tasks with default priority.
// The function is called under the lock task_team->tt.tt_task_pri_lock.
static kmp_thread_data_t *
__kmp_get_priority_deque_data(kmp_task_team_t *task_team, kmp_int32 pri) {
  kmp_thread_data_t *thread_data;
  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
  if (lst->priority == pri) {
    // Found queue of tasks with given priority.
    thread_data = &lst->td;
  } else if (lst->priority < pri) {
    // All current priority queues contain tasks with lower priority.
    // Allocate new one for given priority tasks.
    kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
    thread_data = &list->td;
    list->priority = pri;
    list->next = lst;
    task_team->tt.tt_task_pri_list = list;
  } else { // task_team->tt.tt_task_pri_list->priority > pri
    kmp_task_pri_t *next_queue = lst->next;
    while (next_queue && next_queue->priority > pri) {
      lst = next_queue;
      next_queue = lst->next;
    }
    // lst->priority > pri && (next == NULL || pri >= next->priority)
    if (next_queue == NULL) {
      // No queue with pri priority, need to allocate new one.
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = NULL;
      lst->next = list;
    } else if (next_queue->priority == pri) {
      // Found queue of tasks with given priority.
      thread_data = &next_queue->td;
    } else { // lst->priority > pri > next->priority
      // insert newly allocated queue between existing queues
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = next_queue;
      lst->next = list;
    }
  }
  return thread_data;
}
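
// The three insertion cases above (new head, middle, tail) are the standard
// sorted-singly-linked-list insert; the same logic collapses into one loop
// with the pointer-to-pointer idiom (illustrative sketch, hypothetical types,
// compiled out):
#if 0
struct pri_node { int priority; struct pri_node *next; };
static struct pri_node *find_or_insert(struct pri_node **head, int pri) {
  struct pri_node **link = head;
  while (*link && (*link)->priority > pri) // walk the descending list
    link = &(*link)->next;
  if (*link && (*link)->priority == pri)
    return *link; // queue for this priority already exists
  struct pri_node *n = (struct pri_node *)calloc(1, sizeof(*n));
  n->priority = pri;
  n->next = *link; // covers head, middle, and tail insertion alike
  *link = n;
  return n;
}
#endif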

// __kmp_push_priority_task: Add a task to the team's priority task deque
static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
                                          kmp_taskdata_t *taskdata,
                                          kmp_task_team_t *task_team,
                                          kmp_int32 pri) {
  kmp_thread_data_t *thread_data = NULL;
  KA_TRACE(20,
           ("__kmp_push_priority_task: T#%d trying to push task %p, pri %d.\n",
            gtid, taskdata, pri));

  // Find task queue specific to priority value
  kmp_task_pri_t *lst = task_team->tt.tt_task_pri_list;
  if (UNLIKELY(lst == NULL)) {
    __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
    if (task_team->tt.tt_task_pri_list == NULL) {
      // List of queues is still empty, allocate one.
      kmp_task_pri_t *list = __kmp_alloc_task_pri_list();
      thread_data = &list->td;
      list->priority = pri;
      list->next = NULL;
      task_team->tt.tt_task_pri_list = list;
    } else {
      // Another thread initialized a queue. Check if it fits and get
      // thread_data.
      thread_data = __kmp_get_priority_deque_data(task_team, pri);
    }
    __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
  } else {
    if (lst->priority == pri) {
      // Found queue of tasks with given priority.
      thread_data = &lst->td;
    } else {
      __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
      thread_data = __kmp_get_priority_deque_data(task_team, pri);
      __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
    }
  }
  KMP_DEBUG_ASSERT(thread_data);

  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
      KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      // expand deque to push the task which is not allowed to execute
      __kmp_realloc_task_deque(thread, thread_data);
    }
  }
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));
  // Push taskdata.
  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_priority_task: T#%d returning "
                "TASK_SUCCESSFULLY_PUSHED: task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
  task_team->tt.tt_num_task_pri++; // atomic inc
  return TASK_SUCCESSFULLY_PUSHED;
}

// __kmp_push_task: Add a task to the thread's deque
static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);

  // If we encounter a hidden helper task, and the current thread is not a
  // hidden helper thread, we have to give the task to any hidden helper thread
  // starting from its shadow one.
  if (UNLIKELY(taskdata->td_flags.hidden_helper &&
               !KMP_HIDDEN_HELPER_THREAD(gtid))) {
    kmp_int32 shadow_gtid = KMP_GTID_TO_SHADOW_GTID(gtid);
    __kmpc_give_task(task, __kmp_tid_from_gtid(shadow_gtid));
    // Signal the hidden helper threads.
    __kmp_hidden_helper_worker_thread_signal();
    return TASK_SUCCESSFULLY_PUSHED;
  }

  kmp_task_team_t *task_team = thread->th.th_task_team;
  kmp_int32 tid = __kmp_tid_from_gtid(gtid);
  kmp_thread_data_t *thread_data;

  KA_TRACE(20,
           ("__kmp_push_task: T#%d trying to push task %p.\n", gtid, taskdata));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(
        20,
        ("__kmp_push_task: T#%d untied_count (%d) incremented for task %p\n",
         gtid, counter, taskdata));
  }

  // The first check avoids building task_team thread data if serialized
  if (UNLIKELY(taskdata->td_flags.task_serial)) {
    KA_TRACE(20, ("__kmp_push_task: T#%d team serialized; returning "
                  "TASK_NOT_PUSHED for task %p\n",
                  gtid, taskdata));
    return TASK_NOT_PUSHED;
  }

  // Now that serialized tasks have returned, we can assume that we are not in
  // immediate exec mode
  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
  if (UNLIKELY(!KMP_TASKING_ENABLED(task_team))) {
    __kmp_enable_tasking(task_team, thread);
  }
  KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE);
  KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL);

  if (taskdata->td_flags.priority_specified && task->data2.priority > 0 &&
      __kmp_max_task_priority > 0) {
    int pri = KMP_MIN(task->data2.priority, __kmp_max_task_priority);
    return __kmp_push_priority_task(gtid, thread, taskdata, task_team, pri);
  }

  // Find tasking deque specific to encountering thread
  thread_data = &task_team->tt.tt_threads_data[tid];

  // No lock needed since only the owner can allocate. If the task is
  // hidden_helper, we don't need it either because we have initialized the
  // deque for hidden helper thread data.
  if (UNLIKELY(thread_data->td.td_deque == NULL)) {
    __kmp_alloc_task_deque(thread, thread_data);
  }

  int locked = 0;
  // Check if deque is full
  if (TCR_4(thread_data->td.td_deque_ntasks) >=
      TASK_DEQUE_SIZE(thread_data->td)) {
    if (__kmp_enable_task_throttling &&
        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                              thread->th.th_current_task)) {
      KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                    "TASK_NOT_PUSHED for task %p\n",
                    gtid, taskdata));
      return TASK_NOT_PUSHED;
    } else {
      __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
      locked = 1;
      if (TCR_4(thread_data->td.td_deque_ntasks) >=
          TASK_DEQUE_SIZE(thread_data->td)) {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Lock the deque for the task push operation
  if (!locked) {
    __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
    // Need to recheck as we can get a proxy task from a thread outside of
    // OpenMP
    if (TCR_4(thread_data->td.td_deque_ntasks) >=
        TASK_DEQUE_SIZE(thread_data->td)) {
      if (__kmp_enable_task_throttling &&
          __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                thread->th.th_current_task)) {
        __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
        KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                      "returning TASK_NOT_PUSHED for task %p\n",
                      gtid, taskdata));
        return TASK_NOT_PUSHED;
      } else {
        // expand deque to push the task which is not allowed to execute
        __kmp_realloc_task_deque(thread, thread_data);
      }
    }
  }
  // Must have room now, since no thread but the calling thread can add tasks
  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                   TASK_DEQUE_SIZE(thread_data->td));

  thread_data->td.td_deque[thread_data->td.td_deque_tail] =
      taskdata; // Push taskdata
  // Wrap index.
  thread_data->td.td_deque_tail =
      (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
  TCW_4(thread_data->td.td_deque_ntasks,
        TCR_4(thread_data->td.td_deque_ntasks) + 1); // Adjust task count
  KMP_FSYNC_RELEASING(thread->th.th_current_task); // releasing self
  KMP_FSYNC_RELEASING(taskdata); // releasing child
  KA_TRACE(20, ("__kmp_push_task: T#%d returning TASK_SUCCESSFULLY_PUSHED: "
                "task=%p ntasks=%d head=%u tail=%u\n",
                gtid, taskdata, thread_data->td.td_deque_ntasks,
                thread_data->td.td_deque_head, thread_data->td.td_deque_tail));

  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);

  return TASK_SUCCESSFULLY_PUSHED;
}
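
// For reference, compiler-generated code typically reaches this routine
// roughly as follows (sketch, not emitted verbatim; __kmpc_omp_task is the
// deferring entry point declared in kmp.h):
#if 0
// Rough shape of the lowering of "#pragma omp task":
kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags=*/1 /* tied */,
                                      sizeof_kmp_task_t, sizeof_shareds,
                                      &task_entry);
// ... copy firstprivate data and fill t->shareds ...
__kmpc_omp_task(&loc, gtid, t); // may defer via __kmp_push_task
#endif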

// __kmp_pop_current_task_from_thread: set up current task from called thread
// when team ends
//
// this_thr: thread structure to set current_task in.
void __kmp_pop_current_task_from_thread(kmp_info_t *this_thr) {
  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(enter): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));

  this_thr->th.th_current_task = this_thr->th.th_current_task->td_parent;

  KF_TRACE(10, ("__kmp_pop_current_task_from_thread(exit): T#%d "
                "this_thread=%p, curtask=%p, "
                "curtask_parent=%p\n",
                0, this_thr, this_thr->th.th_current_task,
                this_thr->th.th_current_task->td_parent));
}

// __kmp_push_current_task_to_thread: set up current task in called thread for
// a new team
//
// this_thr: thread structure to set up
// team: team for implicit task data
// tid: thread within team to set up
void __kmp_push_current_task_to_thread(kmp_info_t *this_thr, kmp_team_t *team,
                                       int tid) {
  // The current task of the thread is the parent of the just-created implicit
  // tasks of the new team
  KF_TRACE(10, ("__kmp_push_current_task_to_thread(enter): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));

  KMP_DEBUG_ASSERT(this_thr != NULL);

  if (tid == 0) {
    if (this_thr->th.th_current_task != &team->t.t_implicit_task_taskdata[0]) {
      team->t.t_implicit_task_taskdata[0].td_parent =
          this_thr->th.th_current_task;
      this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[0];
    }
  } else {
    team->t.t_implicit_task_taskdata[tid].td_parent =
        team->t.t_implicit_task_taskdata[0].td_parent;
    this_thr->th.th_current_task = &team->t.t_implicit_task_taskdata[tid];
  }

  KF_TRACE(10, ("__kmp_push_current_task_to_thread(exit): T#%d this_thread=%p "
                "curtask=%p "
                "parent_task=%p\n",
                tid, this_thr, this_thr->th.th_current_task,
                team->t.t_implicit_task_taskdata[tid].td_parent));
}

// __kmp_task_start: bookkeeping for a task starting execution
//
// GTID: global thread id of calling thread
// task: task starting execution
// current_task: task suspending
static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task,
                             kmp_taskdata_t *current_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];

  KA_TRACE(10,
           ("__kmp_task_start(enter): T#%d starting task %p: current_task=%p\n",
            gtid, taskdata, current_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  // mark currently executing task as suspended
  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 );
  current_task->td_flags.executing = 0;

  // mark starting task as executing and as current task
  thread->th.th_current_task = taskdata;

  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0 ||
                   taskdata->td_flags.tiedness == TASK_UNTIED);
  taskdata->td_flags.started = 1;
  taskdata->td_flags.executing = 1;
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  // GEH TODO: shouldn't we pass some sort of location identifier here?
  // APT: yes, we will pass location here.
  // need to store current thread state (in a thread or taskdata structure)
  // before setting work_state, otherwise wrong state is set after end of task

  KA_TRACE(10, ("__kmp_task_start(exit): T#%d task=%p\n", gtid, taskdata));

  return;
}

#if OMPT_SUPPORT
//------------------------------------------------------------------------------

// __ompt_task_start:
// Build and trigger task-begin event
static inline void __ompt_task_start(kmp_task_t *task,
                                     kmp_taskdata_t *current_task,
                                     kmp_int32 gtid) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  ompt_task_status_t status = ompt_task_switch;
  if (__kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded) {
    status = ompt_task_yield;
    __kmp_threads[gtid]->th.ompt_thread_info.ompt_task_yielded = 0;
  }
  /* let OMPT know that we're about to run this task */
  if (ompt_enabled.ompt_callback_task_schedule) {
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(current_task->ompt_task_info.task_data), status,
        &(taskdata->ompt_task_info.task_data));
  }
  taskdata->ompt_task_info.scheduling_parent = current_task;
}

// __ompt_task_finish:
// Build and trigger final task-schedule event
static inline void __ompt_task_finish(kmp_task_t *task,
                                      kmp_taskdata_t *resumed_task,
                                      ompt_task_status_t status) {
  if (ompt_enabled.ompt_callback_task_schedule) {
    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
    if (__kmp_omp_cancellation && taskdata->td_taskgroup &&
        taskdata->td_taskgroup->cancel_request == cancel_taskgroup) {
      status = ompt_task_cancel;
    }

    /* let OMPT know that we're returning to the callee task */
    ompt_callbacks.ompt_callback(ompt_callback_task_schedule)(
        &(taskdata->ompt_task_info.task_data), status,
        (resumed_task ? &(resumed_task->ompt_task_info.task_data) : NULL));
  }
}
#endif
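
// The task-schedule events built above are what an OMPT tool observes. A
// minimal sketch of a first-party tool subscribing to them (standard OMPT
// boilerplate from <omp-tools.h>, abbreviated and compiled out):
#if 0
#include <omp-tools.h>
static void on_task_schedule(ompt_data_t *prior, ompt_task_status_t status,
                             ompt_data_t *next) {
  // prior/next identify the suspended and resumed tasks; status is one of
  // ompt_task_complete, ompt_task_yield, ompt_task_detach, ompt_task_switch...
}
static int tool_init(ompt_function_lookup_t lookup, int initial_device_num,
                     ompt_data_t *tool_data) {
  ompt_set_callback_t set_cb = (ompt_set_callback_t)lookup("ompt_set_callback");
  set_cb(ompt_callback_task_schedule, (ompt_callback_t)on_task_schedule);
  return 1; // nonzero keeps the tool active
}
static void tool_fini(ompt_data_t *tool_data) {}
ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  static ompt_start_tool_result_t result = {tool_init, tool_fini, {0}};
  return &result;
}
#endif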

template <bool ompt>
static void __kmpc_omp_task_begin_if0_template(ident_t *loc_ref, kmp_int32 gtid,
                                               kmp_task_t *task,
                                               void *frame_address,
                                               void *return_address) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(enter): T#%d loc=%p task=%p "
                "current_task=%p\n",
                gtid, loc_ref, taskdata, current_task));

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to increment counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = 1 + KMP_ATOMIC_INC(&taskdata->td_untied_count);
    KMP_DEBUG_USE_VAR(counter);
    KA_TRACE(20, ("__kmpc_omp_task_begin_if0: T#%d untied_count (%d) "
                  "incremented for task %p\n",
                  gtid, counter, taskdata));
  }

  taskdata->td_flags.task_serial =
      1; // Execute this task immediately, not deferred.
  __kmp_task_start(gtid, task, current_task);

#if OMPT_SUPPORT
  if (ompt) {
    if (current_task->ompt_task_info.frame.enter_frame.ptr == NULL) {
      current_task->ompt_task_info.frame.enter_frame.ptr =
          taskdata->ompt_task_info.frame.exit_frame.ptr = frame_address;
      current_task->ompt_task_info.frame.enter_frame_flags =
          taskdata->ompt_task_info.frame.exit_frame_flags =
              OMPT_FRAME_FLAGS_APP;
    }
    if (ompt_enabled.ompt_callback_task_create) {
      ompt_task_info_t *parent_info = &(current_task->ompt_task_info);
      ompt_callbacks.ompt_callback(ompt_callback_task_create)(
          &(parent_info->task_data), &(parent_info->frame),
          &(taskdata->ompt_task_info.task_data),
          TASK_TYPE_DETAILS_FORMAT(taskdata), 0, return_address);
    }
    __ompt_task_start(task, current_task, gtid);
  }
#endif // OMPT_SUPPORT

  KA_TRACE(10, ("__kmpc_omp_task_begin_if0(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, taskdata));
}

#if OMPT_SUPPORT
OMPT_NOINLINE
static void __kmpc_omp_task_begin_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_task_t *task,
                                           void *frame_address,
                                           void *return_address) {
  __kmpc_omp_task_begin_if0_template<true>(loc_ref, gtid, task, frame_address,
                                           return_address);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_begin_if0: report that a given serialized task has started
// execution
//
// loc_ref: source location information; points to beginning of task block.
// gtid: global thread number.
// task: task thunk for the started task.
#ifdef __s390x__
// This is required for OMPT_GET_FRAME_ADDRESS(1) to compile on s390x.
// In order for it to work correctly, the caller also needs to be compiled with
// backchain. If a caller is compiled without backchain,
// OMPT_GET_FRAME_ADDRESS(1) will produce an incorrect value, but will not
// crash.
__attribute__((target("backchain")))
#endif
void __kmpc_omp_task_begin_if0(ident_t *loc_ref, kmp_int32 gtid,
                               kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    OMPT_STORE_RETURN_ADDRESS(gtid);
    __kmpc_omp_task_begin_if0_ompt(loc_ref, gtid, task,
                                   OMPT_GET_FRAME_ADDRESS(1),
                                   OMPT_LOAD_RETURN_ADDRESS(gtid));
    return;
  }
#endif
  __kmpc_omp_task_begin_if0_template<false>(loc_ref, gtid, task, NULL, NULL);
}
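
// For "#pragma omp task if(0)" the compiler runs the task body inline and
// brackets it with the two *_if0 entry points, roughly (sketch, not emitted
// verbatim; __kmpc_omp_task_complete_if0 is defined later in this file):
#if 0
kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, flags, sizeof_kmp_task_t,
                                      sizeof_shareds, &task_entry);
__kmpc_omp_task_begin_if0(&loc, gtid, t); // start bookkeeping, no deferral
task_entry(gtid, t);                      // execute the body inline
__kmpc_omp_task_complete_if0(&loc, gtid, t); // finish bookkeeping
#endif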

#ifdef TASK_UNUSED
// __kmpc_omp_task_begin: report that a given task has started execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_begin(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *task) {
  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;

  KA_TRACE(
      10,
      ("__kmpc_omp_task_begin(enter): T#%d loc=%p task=%p current_task=%p\n",
       gtid, loc_ref, KMP_TASK_TO_TASKDATA(task), current_task));

  __kmp_task_start(gtid, task, current_task);

  KA_TRACE(10, ("__kmpc_omp_task_begin(exit): T#%d loc=%p task=%p,\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_free_task: free the current task space and the space for shareds
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
                            kmp_info_t *thread) {
  KA_TRACE(30, ("__kmp_free_task: T#%d freeing data from task %p\n", gtid,
                taskdata));

  // Check to make sure all flags and counters have the correct values
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
  KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
  KMP_DEBUG_ASSERT(taskdata->td_allocated_child_tasks == 0 ||
                   taskdata->td_flags.task_serial == 1);
  KMP_DEBUG_ASSERT(taskdata->td_incomplete_child_tasks == 0);
  kmp_task_t *task = KMP_TASKDATA_TO_TASK(taskdata);
  // Clear data to not be re-used later by mistake.
  task->data1.destructors = NULL;
  task->data2.priority = 0;

  taskdata->td_flags.freed = 1;
#if OMPX_TASKGRAPH
  // do not free tasks in taskgraph
  if (!taskdata->is_taskgraph) {
#endif
// deallocate the taskdata and shared variable blocks associated with this task
#if USE_FAST_MEMORY
    __kmp_fast_free(thread, taskdata);
#else /* ! USE_FAST_MEMORY */
    __kmp_thread_free(thread, taskdata);
#endif
#if OMPX_TASKGRAPH
  } else {
    taskdata->td_flags.complete = 0;
    taskdata->td_flags.started = 0;
    taskdata->td_flags.freed = 0;
    taskdata->td_flags.executing = 0;
    taskdata->td_flags.task_serial =
        (taskdata->td_parent->td_flags.final ||
         taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);

    // taskdata->td_allow_completion_event.pending_events_count = 1;
    KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
    KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
    // start at one because counts current task and children
    KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
  }
#endif

  KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
}

// __kmp_free_task_and_ancestors: free the current task and ancestors without
// children
//
// gtid: Global thread ID of calling thread
// taskdata: task to free
// thread: thread data structure of caller
static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                                          kmp_taskdata_t *taskdata,
                                          kmp_info_t *thread) {
  // Proxy tasks must always be allowed to free their parents
  // because they can be run in background even in serial mode.
  kmp_int32 team_serial =
      (taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) &&
      !taskdata->td_flags.proxy;
  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

  kmp_int32 children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
  KMP_DEBUG_ASSERT(children >= 0);

  // Now, go up the ancestor tree to see if any ancestors can now be freed.
  while (children == 0) {
    kmp_taskdata_t *parent_taskdata = taskdata->td_parent;

    KA_TRACE(20, ("__kmp_free_task_and_ancestors(enter): T#%d task %p complete "
                  "and freeing itself\n",
                  gtid, taskdata));

    // --- Deallocate my ancestor task ---
    __kmp_free_task(gtid, taskdata, thread);

    taskdata = parent_taskdata;

    if (team_serial)
      return;
    // Stop checking ancestors at implicit task instead of walking up ancestor
    // tree to avoid premature deallocation of ancestors.
    if (taskdata->td_flags.tasktype == TASK_IMPLICIT) {
      if (taskdata->td_dephash) { // do we need to cleanup dephash?
        int children = KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks);
        kmp_tasking_flags_t flags_old = taskdata->td_flags;
        if (children == 0 && flags_old.complete == 1) {
          kmp_tasking_flags_t flags_new = flags_old;
          flags_new.complete = 0;
          if (KMP_COMPARE_AND_STORE_ACQ32(
                  RCAST(kmp_int32 *, &taskdata->td_flags),
                  *RCAST(kmp_int32 *, &flags_old),
                  *RCAST(kmp_int32 *, &flags_new))) {
            KA_TRACE(100, ("__kmp_free_task_and_ancestors: T#%d cleans "
                           "dephash of implicit task %p\n",
                           gtid, taskdata));
            // cleanup dephash of finished implicit task
            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
          }
        }
      }
      return;
    }
    // Predecrement simulated by "- 1" calculation
    children = KMP_ATOMIC_DEC(&taskdata->td_allocated_child_tasks) - 1;
    KMP_DEBUG_ASSERT(children >= 0);
  }

  KA_TRACE(
      20, ("__kmp_free_task_and_ancestors(exit): T#%d task %p has %d children; "
           "not freeing it yet\n",
           gtid, taskdata, children));
}
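
// td_allocated_child_tasks starts at one (the task's self-reference, see
// __kmp_task_alloc) and gains one per allocated child, so a task is reclaimed
// only after it has completed and every child has been freed; the loop above
// is the release half of that scheme. A stripped-down model of the cascade
// (illustrative sketch, hypothetical node_t, compiled out):
#if 0
static void release_ref(node_t *n) {
  // Called once when a task completes and once per freed child; when the
  // count hits zero the node is reclaimed and the release cascades upward.
  while (n && --n->allocated_refs == 0) {
    node_t *parent = n->parent;
    reclaim(n); // ~ __kmp_free_task
    n = parent;
  }
}
#endif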

// Only need to keep track of child task counts if any of the following:
// 1. team parallel and tasking not serialized;
// 2. it is a proxy or detachable or hidden helper task;
// 3. the children counter of its parent task is greater than 0.
// The reason for the 3rd case is a serialized team that encountered a detached
// or hidden helper task T. In this case, the execution of T is still deferred,
// and it is also possible that a regular task depends on T. If we don't track
// the children, task synchronization will be broken.
static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
  kmp_tasking_flags_t flags = taskdata->td_flags;
  bool ret = !(flags.team_serial || flags.tasking_ser);
  ret = ret || flags.proxy == TASK_PROXY ||
        flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
  ret = ret ||
        KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
#if OMPX_TASKGRAPH
  if (taskdata->td_taskgroup && taskdata->is_taskgraph)
    ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
#endif
  return ret;
}

// __kmp_task_finish: bookkeeping to do when a task finishes execution
//
// gtid: global thread ID for calling thread
// task: task to be finished
// resumed_task: task to be resumed. (may be NULL if task is serialized)
//
// template<ompt>: effectively ompt_enabled.enabled!=0
// the version with ompt=false is inlined, allowing to optimize away all ompt
// code in this case
template <bool ompt>
static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
                              kmp_taskdata_t *resumed_task) {
  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
  kmp_info_t *thread = __kmp_threads[gtid];
  kmp_task_team_t *task_team =
      thread->th.th_task_team; // might be NULL for serial teams...
#if OMPX_TASKGRAPH
  // to avoid a segfault when we need to access taskdata->td_flags after free
  // when using vanilla taskloop
  bool is_taskgraph;
#endif
#if KMP_DEBUG
  kmp_int32 children = 0;
#endif
  KA_TRACE(10, ("__kmp_task_finish(enter): T#%d finishing task %p and resuming "
                "task %p\n",
                gtid, taskdata, resumed_task));

  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);

#if OMPX_TASKGRAPH
  is_taskgraph = taskdata->is_taskgraph;
#endif

  if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
    // untied task needs to check the counter so that the task structure is not
    // freed prematurely
    kmp_int32 counter = KMP_ATOMIC_DEC(&taskdata->td_untied_count) - 1;
    KA_TRACE(
        20,
        ("__kmp_task_finish: T#%d untied_count (%d) decremented for task %p\n",
         gtid, counter, taskdata));
    if (counter > 0) {
      // untied task is not done, to be continued possibly by other thread, do
      // not free it now
      if (resumed_task == NULL) {
        KMP_DEBUG_ASSERT(taskdata->td_flags.task_serial);
        resumed_task = taskdata->td_parent; // In a serialized task, the resumed
        // task is the parent
      }
      thread->th.th_current_task = resumed_task; // restore current_task
      resumed_task->td_flags.executing = 1; // resume previous task
      KA_TRACE(10, ("__kmp_task_finish(exit): T#%d partially done task %p, "
                    "resuming task %p\n",
                    gtid, taskdata, resumed_task));
      return;
    }
  }

  // bookkeeping for resuming task:
  // GEH - note tasking_ser => task_serial
  KMP_DEBUG_ASSERT(
      (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
      taskdata->td_flags.task_serial);
  if (taskdata->td_flags.task_serial) {
    if (resumed_task == NULL) {
      resumed_task = taskdata->td_parent; // In a serialized task, the resumed
      // task is the parent
    }
  } else {
    KMP_DEBUG_ASSERT(resumed_task !=
                     NULL); // verify that resumed task is passed as argument
  }

  /* If the tasks' destructor thunk flag has been set, we need to invoke the
     destructor thunk that has been generated by the compiler. The code is
     placed here, since at this point other tasks might have been released
     hence overlapping the destructor invocations with some other work in the
     released tasks. The OpenMP spec is not specific on when the destructors
     are invoked, so we should be free to choose. */
  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
    kmp_routine_entry_t destr_thunk = task->data1.destructors;
    KMP_ASSERT(destr_thunk);
    destr_thunk(gtid, task);
  }

  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
  KMP_DEBUG_ASSERT(taskdata->td_flags.started == 1);
  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);

  bool completed = true;
  if (UNLIKELY(taskdata->td_flags.detachable == TASK_DETACHABLE)) {
    if (taskdata->td_allow_completion_event.type ==
        KMP_EVENT_ALLOW_COMPLETION) {
      // event hasn't been fulfilled yet. Try to detach task.
      __kmp_acquire_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
      if (taskdata->td_allow_completion_event.type ==
          KMP_EVENT_ALLOW_COMPLETION) {
        // task finished execution
        KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
        taskdata->td_flags.executing = 0; // suspend the finishing task

#if OMPT_SUPPORT
        // For a detached task, which is not completed, we report a task
        // switch back; the later omp_fulfill_event call signals completion.
        // Locking is necessary to avoid a race with ompt_task_late_fulfill.
        if (ompt)
          __ompt_task_finish(task, resumed_task, ompt_task_detach);
#endif

        // no access to taskdata after this point!
        // __kmp_fulfill_event might free taskdata at any time from now

        taskdata->td_flags.proxy = TASK_PROXY; // proxify!
        completed = false;
      }
      __kmp_release_tas_lock(&taskdata->td_allow_completion_event.lock, gtid);
    }
  }

  // Tasks with valid target async handles must be re-enqueued.
  if (taskdata->td_target_data.async_handle != NULL) {
    // Note: no need to translate gtid to its shadow. If the current thread is a
    // hidden helper one, then the gtid is already correct. Otherwise, hidden
    // helper threads are disabled, and gtid refers to an OpenMP thread.
#if OMPT_SUPPORT
    if (ompt) {
      __ompt_task_finish(task, resumed_task, ompt_task_switch);
    }
#endif
    __kmpc_give_task(task, __kmp_tid_from_gtid(gtid));
    if (KMP_HIDDEN_HELPER_THREAD(gtid))
      __kmp_hidden_helper_worker_thread_signal();
    completed = false;
  }

  if (completed) {
    taskdata->td_flags.complete = 1; // mark the task as completed
#if OMPX_TASKGRAPH
    taskdata->td_flags.onced = 1; // mark the task as ran once already
#endif

#if OMPT_SUPPORT
    // This is not a detached task, we are done here
    if (ompt)
      __ompt_task_finish(task, resumed_task, ompt_task_complete);
#endif
    // TODO: What would be the balance between the conditions in the function
    // and an atomic operation?
    if (__kmp_track_children_task(taskdata)) {
      __kmp_release_deps(gtid, taskdata);
      // Predecrement simulated by "- 1" calculation
#if KMP_DEBUG
      children = -1 +
#endif
          KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
      KMP_DEBUG_ASSERT(children >= 0);
#if OMPX_TASKGRAPH
      if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
#else
      if (taskdata->td_taskgroup)
#endif
        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
    } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
                             task_team->tt.tt_hidden_helper_task_encountered)) {
      // if we found proxy or hidden helper tasks there could exist a dependency
      // chain with the proxy task as origin
      __kmp_release_deps(gtid, taskdata);
    }
    // td_flags.executing must be marked as 0 after __kmp_release_deps has been
    // called. Otherwise, if a task is executed immediately from the
    // release_deps code, the flag will be reset to 1 again by this same
    // function
    KMP_DEBUG_ASSERT(taskdata->td_flags.executing == 1);
    taskdata->td_flags.executing = 0; // suspend the finishing task

    // Decrement the counter of hidden helper tasks to be executed.
    if (taskdata->td_flags.hidden_helper) {
      // Hidden helper tasks can only be executed by hidden helper threads.
      KMP_ASSERT(KMP_HIDDEN_HELPER_THREAD(gtid));
      KMP_ATOMIC_DEC(&__kmp_unexecuted_hidden_helper_tasks);
    }
  }

  KA_TRACE(
      20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
           gtid, taskdata, children));

  // Free this task and then ancestor tasks if they have no children.
  // Restore th_current_task first as suggested by John:
  // johnmc: if an asynchronous inquiry peers into the runtime system
  // it doesn't see the freed task as the current task.
  thread->th.th_current_task = resumed_task;
  if (completed)
    __kmp_free_task_and_ancestors(gtid, taskdata, thread);

  // TODO: GEH - make sure root team implicit task is initialized properly.
  // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
  resumed_task->td_flags.executing = 1; // resume previous task

#if OMPX_TASKGRAPH
  if (is_taskgraph && __kmp_track_children_task(taskdata) &&
      taskdata->td_taskgroup) {
    // TDG: we only release the taskgroup barrier here because
    // free_task_and_ancestors will call __kmp_free_task, which resets all task
    // parameters such as taskdata->started, etc. If we released the barrier
    // earlier, these parameters could be read before being reset. This is not
    // an issue for the non-TDG implementation because we never reuse a
    // task(data) structure
    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
  }
#endif

  KA_TRACE(
      10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
           gtid, taskdata, resumed_task));

  return;
}
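
// The detachable branch above is the runtime side of the OpenMP 5.0 "detach"
// clause; from user code the pattern looks roughly like this (sketch;
// start_async_io and notify are hypothetical):
#if 0
omp_event_handle_t ev;
#pragma omp task detach(ev)
{
  start_async_io(buf, /*done_cb=*/notify, /*arg=*/(void *)ev);
  // the task body returns here, but the task stays incomplete ...
}
// ... until some thread, even a non-OpenMP one, fulfills the event from the
// completion callback: notify() calls omp_fulfill_event(ev);
#pragma omp taskwait // waits for the fulfillment, not just the body
#endif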

template <bool ompt>
static void __kmpc_omp_task_complete_if0_template(ident_t *loc_ref,
                                                  kmp_int32 gtid,
                                                  kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(enter): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));
  KMP_DEBUG_ASSERT(gtid >= 0);
  // this routine will provide task to resume
  __kmp_task_finish<ompt>(gtid, task, NULL);

  KA_TRACE(10, ("__kmpc_omp_task_complete_if0(exit): T#%d loc=%p task=%p\n",
                gtid, loc_ref, KMP_TASK_TO_TASKDATA(task)));

#if OMPT_SUPPORT
  if (ompt) {
    ompt_frame_t *ompt_frame;
    __ompt_get_task_info_internal(0, NULL, NULL, &ompt_frame, NULL, NULL);
    ompt_frame->enter_frame = ompt_data_none;
    ompt_frame->enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
  }
#endif

  return;
}

#if OMPT_SUPPORT
OMPT_NOINLINE
void __kmpc_omp_task_complete_if0_ompt(ident_t *loc_ref, kmp_int32 gtid,
                                       kmp_task_t *task) {
  __kmpc_omp_task_complete_if0_template<true>(loc_ref, gtid, task);
}
#endif // OMPT_SUPPORT

// __kmpc_omp_task_complete_if0: report that a task has completed execution
//
// loc_ref: source location information; points to end of task block.
// gtid: global thread number.
// task: task thunk for the completed task.
void __kmpc_omp_task_complete_if0(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *task) {
#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled)) {
    __kmpc_omp_task_complete_if0_ompt(loc_ref, gtid, task);
    return;
  }
#endif
  __kmpc_omp_task_complete_if0_template<false>(loc_ref, gtid, task);
}

#ifdef TASK_UNUSED
// __kmpc_omp_task_complete: report that a task has completed execution
// NEVER GENERATED BY COMPILER, DEPRECATED!!!
void __kmpc_omp_task_complete(ident_t *loc_ref, kmp_int32 gtid,
                              kmp_task_t *task) {
  KA_TRACE(10, ("__kmpc_omp_task_complete(enter): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));

  __kmp_task_finish<false>(gtid, task,
                           NULL); // Not sure how to find task to resume

  KA_TRACE(10, ("__kmpc_omp_task_complete(exit): T#%d loc=%p task=%p\n", gtid,
                loc_ref, KMP_TASK_TO_TASKDATA(task)));
  return;
}
#endif // TASK_UNUSED

// __kmp_init_implicit_task: Initialize the appropriate fields in the implicit
// task for a given thread
//
// loc_ref: reference to source location of parallel region
// this_thr: thread data structure corresponding to implicit task
// team: team for this_thr
// tid: thread id of given thread within team
// set_curr_task: TRUE if need to push current task to thread
// NOTE: Routine does not set up the implicit task ICVs. This is assumed to
// have already been done elsewhere.
// TODO: Get better loc_ref. Value passed in may be NULL
void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
                              kmp_team_t *team, int tid, int set_curr_task) {
  kmp_taskdata_t *task = &team->t.t_implicit_task_taskdata[tid];

  KF_TRACE(
      10,
      ("__kmp_init_implicit_task(enter): T#:%d team=%p task=%p, reinit=%s\n",
       tid, team, task, set_curr_task ? "TRUE" : "FALSE"));

  task->td_task_id = KMP_GEN_TASK_ID();
  task->td_team = team;
  // task->td_parent = NULL; // fix for CQ230101 (broken parent task info
  // in debugger)
  task->td_ident = loc_ref;
  task->td_taskwait_ident = NULL;
  task->td_taskwait_counter = 0;
  task->td_taskwait_thread = 0;

  task->td_flags.tiedness = TASK_TIED;
  task->td_flags.tasktype = TASK_IMPLICIT;
  task->td_flags.proxy = TASK_FULL;

  // All implicit tasks are executed immediately, not deferred
  task->td_flags.task_serial = 1;
  task->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
  task->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;

  task->td_flags.started = 1;
  task->td_flags.executing = 1;
  task->td_flags.complete = 0;
  task->td_flags.freed = 0;
#if OMPX_TASKGRAPH
  task->td_flags.onced = 0;
#endif

  task->td_depnode = NULL;
  task->td_last_tied = task;
  task->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;

  if (set_curr_task) { // only do this init first time thread is created
    KMP_ATOMIC_ST_REL(&task->td_incomplete_child_tasks, 0);
    // Not used: don't need to deallocate implicit task
    KMP_ATOMIC_ST_REL(&task->td_allocated_child_tasks, 0);
    task->td_taskgroup = NULL; // An implicit task does not have taskgroup
    task->td_dephash = NULL;
    __kmp_push_current_task_to_thread(this_thr, team, tid);
  } else {
    KMP_DEBUG_ASSERT(task->td_incomplete_child_tasks == 0);
    KMP_DEBUG_ASSERT(task->td_allocated_child_tasks == 0);
  }

#if OMPT_SUPPORT
  if (UNLIKELY(ompt_enabled.enabled))
    __ompt_task_init(task, tid);
#endif

  KF_TRACE(10, ("__kmp_init_implicit_task(exit): T#:%d team=%p task=%p\n", tid,
                team, task));
}

// __kmp_finish_implicit_task: Release resources associated with implicit tasks
// at the end of parallel regions. Some resources are kept for reuse in the
// next parallel region.
//
// thread: thread data structure corresponding to implicit task
void __kmp_finish_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
#if ENABLE_LIBOMPTARGET
  // Give an opportunity to the offload runtime to synchronize any unfinished
  // target async regions before finishing the implicit task
  if (UNLIKELY(kmp_target_sync_cb != NULL))
    (*kmp_target_sync_cb)(NULL, thread->th.th_info.ds.ds_gtid,
                          KMP_TASKDATA_TO_TASK(task), NULL);
#endif // ENABLE_LIBOMPTARGET
  if (task->td_dephash) {
    int children;
    task->td_flags.complete = 1;
#if OMPX_TASKGRAPH
    task->td_flags.onced = 1;
#endif
    children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
    kmp_tasking_flags_t flags_old = task->td_flags;
    if (children == 0 && flags_old.complete == 1) {
      kmp_tasking_flags_t flags_new = flags_old;
      flags_new.complete = 0;
      if (KMP_COMPARE_AND_STORE_ACQ32(RCAST(kmp_int32 *, &task->td_flags),
                                      *RCAST(kmp_int32 *, &flags_old),
                                      *RCAST(kmp_int32 *, &flags_new))) {
        KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                       "dephash of implicit task %p\n",
                       thread->th.th_info.ds.ds_gtid, task));
        __kmp_dephash_free_entries(thread, task->td_dephash);
      }
    }
  }
}

// __kmp_free_implicit_task: Release resources associated with implicit tasks
// when the corresponding regions are destroyed
//
// thread: thread data structure corresponding to implicit task
void __kmp_free_implicit_task(kmp_info_t *thread) {
  kmp_taskdata_t *task = thread->th.th_current_task;
  if (task && task->td_dephash) {
    __kmp_dephash_free(thread, task->td_dephash);
    task->td_dephash = NULL;
  }
}

// Round up a size to a multiple of val, where val is a power of two: Used to
// insert padding between structures co-allocated using a single malloc() call
static size_t __kmp_round_up_to_val(size_t size, size_t val) {
  if (size & (val - 1)) {
    size &= ~(val - 1);
    if (size <= KMP_SIZE_T_MAX - val) {
      size += val; // Round up if there is no overflow.
    }
  }
  return size;
} // __kmp_round_up_to_val
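
// Worked example: __kmp_round_up_to_val(61, 8) sees 61 & 7 != 0, masks down to
// 61 & ~7 == 56, then adds 8 to get 64; an already-aligned size such as 64 is
// returned unchanged. (The overflow guard only matters for sizes near
// KMP_SIZE_T_MAX.)
#if 0
KMP_DEBUG_ASSERT(__kmp_round_up_to_val(61, 8) == 64);
KMP_DEBUG_ASSERT(__kmp_round_up_to_val(64, 8) == 64);
#endif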
1205 
1206 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
1207 //
1208 // loc_ref: source location information
1209 // gtid: global thread number.
1210 // flags: include tiedness & task type (explicit vs. implicit) of the ''new''
1211 // task encountered. Converted from kmp_int32 to kmp_tasking_flags_t in routine.
1212 // sizeof_kmp_task_t: Size in bytes of kmp_task_t data structure including
1213 // private vars accessed in task.
1214 // sizeof_shareds: Size in bytes of array of pointers to shared vars accessed
1215 // in task.
1216 // task_entry: Pointer to task code entry point generated by compiler.
1217 // returns: a pointer to the allocated kmp_task_t structure (task).
1218 kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1219  kmp_tasking_flags_t *flags,
1220  size_t sizeof_kmp_task_t, size_t sizeof_shareds,
1221  kmp_routine_entry_t task_entry) {
1222  kmp_task_t *task;
1223  kmp_taskdata_t *taskdata;
1224  kmp_info_t *thread = __kmp_threads[gtid];
1225  kmp_team_t *team = thread->th.th_team;
1226  kmp_taskdata_t *parent_task = thread->th.th_current_task;
1227  size_t shareds_offset;
1228 
1229  if (UNLIKELY(!TCR_4(__kmp_init_middle)))
1230  __kmp_middle_initialize();
1231 
1232  if (flags->hidden_helper) {
1233  if (__kmp_enable_hidden_helper) {
1234  if (!TCR_4(__kmp_init_hidden_helper))
1235  __kmp_hidden_helper_initialize();
1236  } else {
1237  // If the hidden helper task is not enabled, reset the flag to FALSE.
1238  flags->hidden_helper = FALSE;
1239  }
1240  }
1241 
1242  KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
1243  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1244  gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
1245  sizeof_shareds, task_entry));
1246 
1247  KMP_DEBUG_ASSERT(parent_task);
1248  if (parent_task->td_flags.final) {
1249  if (flags->merged_if0) {
1250  }
1251  flags->final = 1;
1252  }
1253 
1254  if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
1255  // Untied task encountered causes the TSC algorithm to check entire deque of
1256  // the victim thread. If no untied task encountered, then checking the head
1257  // of the deque should be enough.
1258  KMP_CHECK_UPDATE(thread->th.th_task_team->tt.tt_untied_task_encountered, 1);
1259  }
1260 
1261  // Detachable tasks are not proxy tasks yet but could be in the future. Doing
1262  // the tasking setup
1263  // when that happens is too late.
1264  if (UNLIKELY(flags->proxy == TASK_PROXY ||
1265  flags->detachable == TASK_DETACHABLE || flags->hidden_helper)) {
1266  if (flags->proxy == TASK_PROXY) {
1267  flags->tiedness = TASK_UNTIED;
1268  flags->merged_if0 = 1;
1269  }
1270  /* are we running in a sequential parallel or tskm_immediate_exec... we need
1271  tasking support enabled */
1272  if ((thread->th.th_task_team) == NULL) {
1273  /* This should only happen if the team is serialized
1274  setup a task team and propagate it to the thread */
1275  KMP_DEBUG_ASSERT(team->t.t_serialized);
1276  KA_TRACE(30,
1277  ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
1278  gtid));
1279  __kmp_task_team_setup(thread, team);
1280  thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
1281  }
1282  kmp_task_team_t *task_team = thread->th.th_task_team;
1283 
1284  /* tasking must be enabled now as the task might not be pushed */
1285  if (!KMP_TASKING_ENABLED(task_team)) {
1286  KA_TRACE(
1287  30,
1288  ("T#%d enabling tasking in __kmp_task_alloc for proxy task\n", gtid));
1289  __kmp_enable_tasking(task_team, thread);
1290  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
1291  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
1292  // No lock needed since only owner can allocate
1293  if (thread_data->td.td_deque == NULL) {
1294  __kmp_alloc_task_deque(thread, thread_data);
1295  }
1296  }
1297 
1298  if ((flags->proxy == TASK_PROXY || flags->detachable == TASK_DETACHABLE) &&
1299  task_team->tt.tt_found_proxy_tasks == FALSE)
1300  TCW_4(task_team->tt.tt_found_proxy_tasks, TRUE);
1301  if (flags->hidden_helper &&
1302  task_team->tt.tt_hidden_helper_task_encountered == FALSE)
1303  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, TRUE);
1304  }
1305 
1306  // Calculate shared structure offset including padding after kmp_task_t struct
1307  // to align pointers in shared struct
1308  shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
1309  shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));
1310 
1311  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
1312  KA_TRACE(30, ("__kmp_task_alloc: T#%d First malloc size: %ld\n", gtid,
1313  shareds_offset));
1314  KA_TRACE(30, ("__kmp_task_alloc: T#%d Second malloc size: %ld\n", gtid,
1315  sizeof_shareds));
1316 
1317  // Avoid double allocation here by combining shareds with taskdata
1318 #if USE_FAST_MEMORY
1319  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
1320  sizeof_shareds);
1321 #else /* ! USE_FAST_MEMORY */
1322  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
1323  sizeof_shareds);
1324 #endif /* USE_FAST_MEMORY */
1325 
1326  task = KMP_TASKDATA_TO_TASK(taskdata);
1327 
1328 // Make sure task & taskdata are aligned appropriately
1329 #if KMP_ARCH_X86 || KMP_ARCH_PPC64 || KMP_ARCH_S390X || !KMP_HAVE_QUAD
1330  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(double) - 1)) == 0);
1331  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(double) - 1)) == 0);
1332 #else
1333  KMP_DEBUG_ASSERT((((kmp_uintptr_t)taskdata) & (sizeof(_Quad) - 1)) == 0);
1334  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task) & (sizeof(_Quad) - 1)) == 0);
1335 #endif
1336  if (sizeof_shareds > 0) {
1337  // Avoid double allocation here by combining shareds with taskdata
1338  task->shareds = &((char *)taskdata)[shareds_offset];
1339  // Make sure shareds struct is aligned to pointer size
1340  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
1341  0);
1342  } else {
1343  task->shareds = NULL;
1344  }
1345  task->routine = task_entry;
1346  task->part_id = 0; // AC: Always start with 0 part id
1347 
1348  taskdata->td_task_id = KMP_GEN_TASK_ID();
1349  taskdata->td_team = thread->th.th_team;
1350  taskdata->td_alloc_thread = thread;
1351  taskdata->td_parent = parent_task;
1352  taskdata->td_level = parent_task->td_level + 1; // increment nesting level
1353  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
1354  taskdata->td_ident = loc_ref;
1355  taskdata->td_taskwait_ident = NULL;
1356  taskdata->td_taskwait_counter = 0;
1357  taskdata->td_taskwait_thread = 0;
1358  KMP_DEBUG_ASSERT(taskdata->td_parent != NULL);
1359  // avoid copying icvs for proxy tasks
1360  if (flags->proxy == TASK_FULL)
1361  copy_icvs(&taskdata->td_icvs, &taskdata->td_parent->td_icvs);
1362 
1363  taskdata->td_flags = *flags;
1364  taskdata->td_task_team = thread->th.th_task_team;
1365  taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
1366  taskdata->td_flags.tasktype = TASK_EXPLICIT;
1367  // If it is hidden helper task, we need to set the team and task team
1368  // correspondingly.
1369  if (flags->hidden_helper) {
1370  kmp_info_t *shadow_thread = __kmp_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
1371  taskdata->td_team = shadow_thread->th.th_team;
1372  taskdata->td_task_team = shadow_thread->th.th_task_team;
1373  }
1374 
1375  // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
1376  taskdata->td_flags.tasking_ser = (__kmp_tasking_mode == tskm_immediate_exec);
1377 
1378  // GEH - TODO: fix this to copy parent task's value of team_serial flag
1379  taskdata->td_flags.team_serial = (team->t.t_serialized) ? 1 : 0;
1380 
1381  // GEH - Note we serialize the task if the team is serialized to make sure
1382  // implicit parallel region tasks are not left until program termination to
1383  // execute. Also, it helps locality to execute immediately.
1384 
1385  taskdata->td_flags.task_serial =
1386  (parent_task->td_flags.final || taskdata->td_flags.team_serial ||
1387  taskdata->td_flags.tasking_ser || flags->merged_if0);
1388 
1389  taskdata->td_flags.started = 0;
1390  taskdata->td_flags.executing = 0;
1391  taskdata->td_flags.complete = 0;
1392  taskdata->td_flags.freed = 0;
1393 #if OMPX_TASKGRAPH
1394  taskdata->td_flags.onced = 0;
1395  taskdata->is_taskgraph = 0;
1396  taskdata->tdg = nullptr;
1397 #endif
1398  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
1399  // start at one because the count includes the current task and its children
1400  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
1401  taskdata->td_taskgroup =
1402  parent_task->td_taskgroup; // task inherits taskgroup from the parent task
1403  taskdata->td_dephash = NULL;
1404  taskdata->td_depnode = NULL;
1405  taskdata->td_target_data.async_handle = NULL;
1406  if (flags->tiedness == TASK_UNTIED)
1407  taskdata->td_last_tied = NULL; // will be set when the task is scheduled
1408  else
1409  taskdata->td_last_tied = taskdata;
1410  taskdata->td_allow_completion_event.type = KMP_EVENT_UNINITIALIZED;
1411 #if OMPT_SUPPORT
1412  if (UNLIKELY(ompt_enabled.enabled))
1413  __ompt_task_init(taskdata, gtid);
1414 #endif
1415  // TODO: What would be the balance between the conditions in the function and
1416  // an atomic operation?
1417  if (__kmp_track_children_task(taskdata)) {
1418  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
1419  if (parent_task->td_taskgroup)
1420  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
1421  // Only need to keep track of allocated child tasks for explicit tasks,
1422  // since implicit tasks are not deallocated
1423  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
1424  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
1425  }
1426  if (flags->hidden_helper) {
1427  taskdata->td_flags.task_serial = FALSE;
1428  // Increment the number of hidden helper tasks to be executed
1429  KMP_ATOMIC_INC(&__kmp_unexecuted_hidden_helper_tasks);
1430  }
1431  }
1432 
1433 #if OMPX_TASKGRAPH
1434  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
1435  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
1436  (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
1437  taskdata->is_taskgraph = 1;
1438  taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
1439  taskdata->td_task_id = KMP_GEN_TASK_ID();
1440  taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
1441  }
1442 #endif
1443  KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
1444  gtid, taskdata, taskdata->td_parent));
1445 
1446  return task;
1447 }
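
// Editor's note, illustrative sketch (not part of the runtime): the single
// allocation in __kmp_task_alloc produces the following layout, assuming
// shareds_offset was computed earlier in this function as
// sizeof(kmp_taskdata_t) plus the caller's sizeof_kmp_task_t (then rounded up):
//
//   taskdata                    task = KMP_TASKDATA_TO_TASK(taskdata)
//   |                           |
//   v                           v
//   [ kmp_taskdata_t ][ kmp_task_t + private data ][ shareds ]
//   |<------------- shareds_offset -------------->|
//
// which is why task->shareds is simply (char *)taskdata + shareds_offset.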
1448 
1449 kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1450  kmp_int32 flags, size_t sizeof_kmp_task_t,
1451  size_t sizeof_shareds,
1452  kmp_routine_entry_t task_entry) {
1453  kmp_task_t *retval;
1454  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
1455  __kmp_assert_valid_gtid(gtid);
1456  input_flags->native = FALSE;
1457  // __kmp_task_alloc() sets up all other runtime flags
1458  KA_TRACE(10, ("__kmpc_omp_task_alloc(enter): T#%d loc=%p, flags=(%s %s %s) "
1459  "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
1460  gtid, loc_ref, input_flags->tiedness ? "tied " : "untied",
1461  input_flags->proxy ? "proxy" : "",
1462  input_flags->detachable ? "detachable" : "", sizeof_kmp_task_t,
1463  sizeof_shareds, task_entry));
1464 
1465  retval = __kmp_task_alloc(loc_ref, gtid, input_flags, sizeof_kmp_task_t,
1466  sizeof_shareds, task_entry);
1467 
1468  KA_TRACE(20, ("__kmpc_omp_task_alloc(exit): T#%d retval %p\n", gtid, retval));
1469 
1470  return retval;
1471 }
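
// Editor's note, illustrative sketch: a compiler typically lowers
//   #pragma omp task
//   { body(); }
// into calls to the entry points in this file; the flag value and the entry
// signature below are simplified assumptions, not a normative lowering:
//
//   static kmp_int32 task_entry(kmp_int32 gtid, kmp_task_t *t) {
//     body();
//     return 0;
//   }
//   ...
//   kmp_task_t *t = __kmpc_omp_task_alloc(&loc, gtid, /*flags: tied*/ 1,
//                                         sizeof(kmp_task_t),
//                                         /*sizeof_shareds=*/0, &task_entry);
//   __kmpc_omp_task(&loc, gtid, t);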
1472 
1473 kmp_task_t *__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
1474  kmp_int32 flags,
1475  size_t sizeof_kmp_task_t,
1476  size_t sizeof_shareds,
1477  kmp_routine_entry_t task_entry,
1478  kmp_int64 device_id) {
1479  auto &input_flags = reinterpret_cast<kmp_tasking_flags_t &>(flags);
1480  // target tasks are untied, as defined in the OpenMP specification
1481  input_flags.tiedness = TASK_UNTIED;
1482  input_flags.target = 1;
1483 
1484  if (__kmp_enable_hidden_helper)
1485  input_flags.hidden_helper = TRUE;
1486 
1487  return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
1488  sizeof_shareds, task_entry);
1489 }
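
// Editor's note, illustrative sketch: the reinterpret_cast above relies on
// kmp_tasking_flags_t being a bitfield laid over the same 32 bits as the
// kmp_int32 flags argument, so a caller can set individual flags like this:
//
//   kmp_int32 flags = 0;
//   kmp_tasking_flags_t *f = (kmp_tasking_flags_t *)&flags;
//   f->tiedness = TASK_UNTIED; // toggles the corresponding bit in 'flags'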
1490 
1504 kmp_int32
1505 __kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
1506  kmp_task_t *new_task, kmp_int32 naffins,
1507  kmp_task_affinity_info_t *affin_list) {
1508  if (naffins > 0)
1509  KMP_DEBUG_ASSERT(affin_list != NULL);
1510 
1511  for (kmp_int32 i = 0; i < naffins; ++i) {
1512  KA_TRACE(30, ("__kmpc_omp_reg_task_with_affinity: T#%d aff[%d] "
1513  "base_addr=0x%llx len=%zu flags={%d,%d,%d}\n",
1514  gtid, i, (unsigned long long)affin_list[i].base_addr,
1515  affin_list[i].len, (int)affin_list[i].flags.flag1,
1516  (int)affin_list[i].flags.flag2,
1517  (int)affin_list[i].flags.reserved));
1518  }
1519 
1520  return 0;
1521 }
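
// Editor's note, illustrative sketch: a lowering of
//   #pragma omp task affinity(a[0:n])
// could fill one kmp_task_affinity_info_t per list item before this call;
// the field meanings below are inferred from the trace above, the affinity
// data is a hint only, and the call currently always returns 0:
//
//   kmp_task_affinity_info_t aff;
//   aff.base_addr = (kmp_intptr_t)&a[0]; // assumed integer address field
//   aff.len = n * sizeof(a[0]);
//   __kmpc_omp_reg_task_with_affinity(&loc, gtid, new_task, 1, &aff);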
1522 
1523 // __kmp_invoke_task: invoke the specified task
1524 //
1525 // gtid: global thread ID of caller
1526 // task: the task to invoke
1527 // current_task: the task to resume after task invocation
1528 #ifdef __s390x__
1529 __attribute__((target("backchain")))
1530 #endif
1531 static void
1532 __kmp_invoke_task(kmp_int32 gtid, kmp_task_t *task,
1533  kmp_taskdata_t *current_task) {
1534  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
1535  kmp_info_t *thread;
1536  int discard = 0 /* false */;
1537  KA_TRACE(
1538  30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
1539  gtid, taskdata, current_task));
1540  KMP_DEBUG_ASSERT(task);
1541  if (UNLIKELY(taskdata->td_flags.proxy == TASK_PROXY &&
1542  taskdata->td_flags.complete == 1)) {
1543  // This is a proxy task that was already completed but it needs to run
1544  // its bottom-half finish
1545  KA_TRACE(
1546  30,
1547  ("__kmp_invoke_task: T#%d running bottom finish for proxy task %p\n",
1548  gtid, taskdata));
1549 
1550  __kmp_bottom_half_finish_proxy(gtid, task);
1551 
1552  KA_TRACE(30, ("__kmp_invoke_task(exit): T#%d completed bottom finish for "
1553  "proxy task %p, resuming task %p\n",
1554  gtid, taskdata, current_task));
1555 
1556  return;
1557  }
1558 
1559 #if OMPT_SUPPORT
1560  // For untied tasks, the first invocation only calls __kmpc_omp_task and
1561  // does not execute the task body.
1562  ompt_thread_info_t oldInfo;
1563  if (UNLIKELY(ompt_enabled.enabled)) {
1564  // Store the threads states and restore them after the task
1565  thread = __kmp_threads[gtid];
1566  oldInfo = thread->th.ompt_thread_info;
1567  thread->th.ompt_thread_info.wait_id = 0;
1568  thread->th.ompt_thread_info.state = (thread->th.th_team_serialized)
1569  ? ompt_state_work_serial
1570  : ompt_state_work_parallel;
1571  taskdata->ompt_task_info.frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1572  }
1573 #endif
1574 
1575  // Proxy tasks are not handled by the runtime
1576  if (taskdata->td_flags.proxy != TASK_PROXY) {
1577  __kmp_task_start(gtid, task, current_task); // OMPT only if not discarded
1578  }
1579 
1580  // TODO: cancel tasks if the parallel region has also been cancelled
1581  // TODO: check if this sequence can be hoisted above __kmp_task_start
1582  // if cancellation has been enabled for this run ...
1583  if (UNLIKELY(__kmp_omp_cancellation)) {
1584  thread = __kmp_threads[gtid];
1585  kmp_team_t *this_team = thread->th.th_team;
1586  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
1587  if ((taskgroup && taskgroup->cancel_request) ||
1588  (this_team->t.t_cancel_request == cancel_parallel)) {
1589 #if OMPT_SUPPORT && OMPT_OPTIONAL
1590  ompt_data_t *task_data;
1591  if (UNLIKELY(ompt_enabled.ompt_callback_cancel)) {
1592  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL);
1593  ompt_callbacks.ompt_callback(ompt_callback_cancel)(
1594  task_data,
1595  ((taskgroup && taskgroup->cancel_request) ? ompt_cancel_taskgroup
1596  : ompt_cancel_parallel) |
1597  ompt_cancel_discarded_task,
1598  NULL);
1599  }
1600 #endif
1601  KMP_COUNT_BLOCK(TASK_cancelled);
1602  // this task belongs to a task group and we need to cancel it
1603  discard = 1 /* true */;
1604  }
1605  }
1606 
1607  // Invoke the task routine and pass in relevant data.
1608  // Thunks generated by gcc take a different argument list.
1609  if (!discard) {
1610  if (taskdata->td_flags.tiedness == TASK_UNTIED) {
1611  taskdata->td_last_tied = current_task->td_last_tied;
1612  KMP_DEBUG_ASSERT(taskdata->td_last_tied);
1613  }
1614 #if KMP_STATS_ENABLED
1615  KMP_COUNT_BLOCK(TASK_executed);
1616  switch (KMP_GET_THREAD_STATE()) {
1617  case FORK_JOIN_BARRIER:
1618  KMP_PUSH_PARTITIONED_TIMER(OMP_task_join_bar);
1619  break;
1620  case PLAIN_BARRIER:
1621  KMP_PUSH_PARTITIONED_TIMER(OMP_task_plain_bar);
1622  break;
1623  case TASKYIELD:
1624  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskyield);
1625  break;
1626  case TASKWAIT:
1627  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskwait);
1628  break;
1629  case TASKGROUP:
1630  KMP_PUSH_PARTITIONED_TIMER(OMP_task_taskgroup);
1631  break;
1632  default:
1633  KMP_PUSH_PARTITIONED_TIMER(OMP_task_immediate);
1634  break;
1635  }
1636 #endif // KMP_STATS_ENABLED
1637 
1638 // OMPT task begin
1639 #if OMPT_SUPPORT
1640  if (UNLIKELY(ompt_enabled.enabled))
1641  __ompt_task_start(task, current_task, gtid);
1642 #endif
1643 #if OMPT_SUPPORT && OMPT_OPTIONAL
1644  if (UNLIKELY(ompt_enabled.ompt_callback_dispatch &&
1645  taskdata->ompt_task_info.dispatch_chunk.iterations > 0)) {
1646  ompt_data_t instance = ompt_data_none;
1647  instance.ptr = &(taskdata->ompt_task_info.dispatch_chunk);
1648  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1649  ompt_callbacks.ompt_callback(ompt_callback_dispatch)(
1650  &(team_info->parallel_data), &(taskdata->ompt_task_info.task_data),
1651  ompt_dispatch_taskloop_chunk, instance);
1652  taskdata->ompt_task_info.dispatch_chunk = {0, 0};
1653  }
1654 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
1655 
1656 #if OMPD_SUPPORT
1657  if (ompd_state & OMPD_ENABLE_BP)
1658  ompd_bp_task_begin();
1659 #endif
1660 
1661 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1662  kmp_uint64 cur_time;
1663  kmp_int32 kmp_itt_count_task =
1664  __kmp_forkjoin_frames_mode == 3 && !taskdata->td_flags.task_serial &&
1665  current_task->td_flags.tasktype == TASK_IMPLICIT;
1666  if (kmp_itt_count_task) {
1667  thread = __kmp_threads[gtid];
1668  // Time outer level explicit task on barrier for adjusting imbalance time
1669  if (thread->th.th_bar_arrive_time)
1670  cur_time = __itt_get_timestamp();
1671  else
1672  kmp_itt_count_task = 0; // thread is not on a barrier - skip timing
1673  }
1674  KMP_FSYNC_ACQUIRED(taskdata); // acquired self (new task)
1675 #endif
1676 
1677 #if ENABLE_LIBOMPTARGET
1678  if (taskdata->td_target_data.async_handle != NULL) {
1679  // If we have a valid target async handle, that means that we have already
1680  // executed the task routine once. We must query for the handle completion
1681  // instead of re-executing the routine.
1682  KMP_ASSERT(tgt_target_nowait_query);
1683  tgt_target_nowait_query(&taskdata->td_target_data.async_handle);
1684  } else
1685 #endif
1686  if (task->routine != NULL) {
1687 #ifdef KMP_GOMP_COMPAT
1688  if (taskdata->td_flags.native) {
1689  ((void (*)(void *))(*(task->routine)))(task->shareds);
1690  } else
1691 #endif /* KMP_GOMP_COMPAT */
1692  {
1693  (*(task->routine))(gtid, task);
1694  }
1695  }
1696  KMP_POP_PARTITIONED_TIMER();
1697 
1698 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1699  if (kmp_itt_count_task) {
1700  // Barrier imbalance - adjust arrive time with the task duration
1701  thread->th.th_bar_arrive_time += (__itt_get_timestamp() - cur_time);
1702  }
1703  KMP_FSYNC_CANCEL(taskdata); // destroy self (just executed)
1704  KMP_FSYNC_RELEASING(taskdata->td_parent); // releasing parent
1705 #endif
1706  }
1707 
1708 #if OMPD_SUPPORT
1709  if (ompd_state & OMPD_ENABLE_BP)
1710  ompd_bp_task_end();
1711 #endif
1712 
1713  // Proxy tasks are not handled by the runtime
1714  if (taskdata->td_flags.proxy != TASK_PROXY) {
1715 #if OMPT_SUPPORT
1716  if (UNLIKELY(ompt_enabled.enabled)) {
1717  thread->th.ompt_thread_info = oldInfo;
1718  if (taskdata->td_flags.tiedness == TASK_TIED) {
1719  taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1720  }
1721  __kmp_task_finish<true>(gtid, task, current_task);
1722  } else
1723 #endif
1724  __kmp_task_finish<false>(gtid, task, current_task);
1725  }
1726 #if OMPT_SUPPORT
1727  else if (UNLIKELY(ompt_enabled.enabled && taskdata->td_flags.target)) {
1728  __ompt_task_finish(task, current_task, ompt_task_switch);
1729  }
1730 #endif
1731 
1732  KA_TRACE(
1733  30,
1734  ("__kmp_invoke_task(exit): T#%d completed task %p, resuming task %p\n",
1735  gtid, taskdata, current_task));
1736  return;
1737 }
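
// Editor's note: the two invocation conventions used by __kmp_invoke_task.
// A regular entry generated for this runtime has the signature
//   kmp_int32 entry(kmp_int32 gtid, kmp_task_t *task);
// and is called as (*(task->routine))(gtid, task), while a GOMP-compat
// ("native") thunk takes only the shareds pointer:
//   void native_thunk(void *shareds);
// called as ((void (*)(void *))(*(task->routine)))(task->shareds).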
1738 
1739 // __kmpc_omp_task_parts: Schedule a thread-switchable task for execution
1740 //
1741 // loc_ref: location of original task pragma (ignored)
1742 // gtid: Global Thread ID of encountering thread
1743 // new_task: task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
1744 // Returns:
1745 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1746 // be resumed later.
1747 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1748 // resumed later.
1749 kmp_int32 __kmpc_omp_task_parts(ident_t *loc_ref, kmp_int32 gtid,
1750  kmp_task_t *new_task) {
1751  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1752 
1753  KA_TRACE(10, ("__kmpc_omp_task_parts(enter): T#%d loc=%p task=%p\n", gtid,
1754  loc_ref, new_taskdata));
1755 
1756 #if OMPT_SUPPORT
1757  kmp_taskdata_t *parent;
1758  if (UNLIKELY(ompt_enabled.enabled)) {
1759  parent = new_taskdata->td_parent;
1760  if (ompt_enabled.ompt_callback_task_create) {
1761  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1762  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1763  &(new_taskdata->ompt_task_info.task_data),
1764  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1765  OMPT_GET_RETURN_ADDRESS(0));
1766  }
1767  }
1768 #endif
1769 
1770  /* Should we execute the new task or queue it? For now, let's just always try
1771  to queue it. If the queue fills up, then we'll execute it. */
1772 
1773  if (__kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1774  { // Execute this task immediately
1775  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1776  new_taskdata->td_flags.task_serial = 1;
1777  __kmp_invoke_task(gtid, new_task, current_task);
1778  }
1779 
1780  KA_TRACE(
1781  10,
1782  ("__kmpc_omp_task_parts(exit): T#%d returning "
1783  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1784  gtid, loc_ref, new_taskdata));
1785 
1786 #if OMPT_SUPPORT
1787  if (UNLIKELY(ompt_enabled.enabled)) {
1788  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1789  parent->ompt_task_info.frame.enter_frame_flags = OMPT_FRAME_FLAGS_RUNTIME;
1790  }
1791 #endif
1792  return TASK_CURRENT_NOT_QUEUED;
1793 }
1794 
1795 // __kmp_omp_task: Schedule a non-thread-switchable task for execution
1796 //
1797 // gtid: Global Thread ID of encountering thread
1798 // new_task:non-thread-switchable task thunk allocated by __kmp_omp_task_alloc()
1799 // serialize_immediate: if TRUE then if the task is executed immediately its
1800 // execution will be serialized
1801 // Returns:
1802 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1803 // be resumed later.
1804 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1805 // resumed later.
1806 kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
1807  bool serialize_immediate) {
1808  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1809 
1810 #if OMPX_TASKGRAPH
1811  if (new_taskdata->is_taskgraph &&
1812  __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
1813  kmp_tdg_info_t *tdg = new_taskdata->tdg;
1814  // extend the record_map if needed
1815  if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
1816  __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
1817  // map_size could have been updated by another thread if recursive
1818  // taskloop
1819  if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
1820  kmp_uint old_size = tdg->map_size;
1821  kmp_uint new_size = old_size * 2;
1822  kmp_node_info_t *old_record = tdg->record_map;
1823  kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
1824  new_size * sizeof(kmp_node_info_t));
1825 
1826  KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
1827  tdg->record_map = new_record;
1828 
1829  __kmp_free(old_record);
1830 
1831  for (kmp_int i = old_size; i < new_size; i++) {
1832  kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
1833  __kmp_successors_size * sizeof(kmp_int32));
1834  new_record[i].task = nullptr;
1835  new_record[i].successors = successorsList;
1836  new_record[i].nsuccessors = 0;
1837  new_record[i].npredecessors = 0;
1838  new_record[i].successors_size = __kmp_successors_size;
1839  KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
1840  }
1841  // update the size at the end, so that other threads cannot see the new
1842  // map_size while still using old_record
1843  tdg->map_size = new_size;
1844  }
1845  __kmp_release_bootstrap_lock(&tdg->graph_lock);
1846  }
1847  // record a task
1848  if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
1849  tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
1850  tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
1851  new_taskdata->td_parent;
1852  KMP_ATOMIC_INC(&tdg->num_tasks);
1853  }
1854  }
1855 #endif
1856 
1857  /* Should we execute the new task or queue it? For now, let's just always try
1858  to queue it. If the queue fills up, then we'll execute it. */
1859  if (new_taskdata->td_flags.proxy == TASK_PROXY ||
1860  __kmp_push_task(gtid, new_task) == TASK_NOT_PUSHED) // if cannot defer
1861  { // Execute this task immediately
1862  kmp_taskdata_t *current_task = __kmp_threads[gtid]->th.th_current_task;
1863  if (serialize_immediate)
1864  new_taskdata->td_flags.task_serial = 1;
1865  __kmp_invoke_task(gtid, new_task, current_task);
1866  } else if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1867  __kmp_wpolicy_passive) {
1868  kmp_info_t *this_thr = __kmp_threads[gtid];
1869  kmp_team_t *team = this_thr->th.th_team;
1870  kmp_int32 nthreads = this_thr->th.th_team_nproc;
1871  for (int i = 0; i < nthreads; ++i) {
1872  kmp_info_t *thread = team->t.t_threads[i];
1873  if (thread == this_thr)
1874  continue;
1875  if (thread->th.th_sleep_loc != NULL) {
1876  __kmp_null_resume_wrapper(thread);
1877  break; // awake one thread at a time
1878  }
1879  }
1880  }
1881  return TASK_CURRENT_NOT_QUEUED;
1882 }
1883 
1884 // __kmpc_omp_task: Wrapper around __kmp_omp_task to schedule a
1885 // non-thread-switchable task from the parent thread only!
1886 //
1887 // loc_ref: location of original task pragma (ignored)
1888 // gtid: Global Thread ID of encountering thread
1889 // new_task: non-thread-switchable task thunk allocated by
1890 // __kmp_omp_task_alloc()
1891 // Returns:
1892 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1893 // be resumed later.
1894 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1895 // resumed later.
1896 kmp_int32 __kmpc_omp_task(ident_t *loc_ref, kmp_int32 gtid,
1897  kmp_task_t *new_task) {
1898  kmp_int32 res;
1899  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1900 
1901 #if KMP_DEBUG || OMPT_SUPPORT
1902  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1903 #endif
1904  KA_TRACE(10, ("__kmpc_omp_task(enter): T#%d loc=%p task=%p\n", gtid, loc_ref,
1905  new_taskdata));
1906  __kmp_assert_valid_gtid(gtid);
1907 
1908 #if OMPT_SUPPORT
1909  kmp_taskdata_t *parent = NULL;
1910  if (UNLIKELY(ompt_enabled.enabled)) {
1911  if (!new_taskdata->td_flags.started) {
1912  OMPT_STORE_RETURN_ADDRESS(gtid);
1913  parent = new_taskdata->td_parent;
1914  if (!parent->ompt_task_info.frame.enter_frame.ptr) {
1915  parent->ompt_task_info.frame.enter_frame.ptr =
1916  OMPT_GET_FRAME_ADDRESS(0);
1917  }
1918  if (ompt_enabled.ompt_callback_task_create) {
1919  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1920  &(parent->ompt_task_info.task_data),
1921  &(parent->ompt_task_info.frame),
1922  &(new_taskdata->ompt_task_info.task_data),
1923  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0,
1924  OMPT_LOAD_RETURN_ADDRESS(gtid));
1925  }
1926  } else {
1927  // We are scheduling the continuation of an UNTIED task.
1928  // Scheduling back to the parent task.
1929  __ompt_task_finish(new_task,
1930  new_taskdata->ompt_task_info.scheduling_parent,
1931  ompt_task_switch);
1932  new_taskdata->ompt_task_info.frame.exit_frame = ompt_data_none;
1933  }
1934  }
1935 #endif
1936 
1937  res = __kmp_omp_task(gtid, new_task, true);
1938 
1939  KA_TRACE(10, ("__kmpc_omp_task(exit): T#%d returning "
1940  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1941  gtid, loc_ref, new_taskdata));
1942 #if OMPT_SUPPORT
1943  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1944  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1945  }
1946 #endif
1947  return res;
1948 }
1949 
1950 // __kmp_omp_taskloop_task: Wrapper around __kmp_omp_task to schedule
1951 // a taskloop task with the correct OMPT return address
1952 //
1953 // loc_ref: location of original task pragma (ignored)
1954 // gtid: Global Thread ID of encountering thread
1955 // new_task: non-thread-switchable task thunk allocated by
1956 // __kmp_omp_task_alloc()
1957 // codeptr_ra: return address for OMPT callback
1958 // Returns:
1959 // TASK_CURRENT_NOT_QUEUED (0) if did not suspend and queue current task to
1960 // be resumed later.
1961 // TASK_CURRENT_QUEUED (1) if suspended and queued the current task to be
1962 // resumed later.
1963 kmp_int32 __kmp_omp_taskloop_task(ident_t *loc_ref, kmp_int32 gtid,
1964  kmp_task_t *new_task, void *codeptr_ra) {
1965  kmp_int32 res;
1966  KMP_SET_THREAD_STATE_BLOCK(EXPLICIT_TASK);
1967 
1968 #if KMP_DEBUG || OMPT_SUPPORT
1969  kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
1970 #endif
1971  KA_TRACE(10, ("__kmp_omp_taskloop_task(enter): T#%d loc=%p task=%p\n",
1972  gtid, loc_ref, new_taskdata));
1973 
1974 #if OMPT_SUPPORT
1975  kmp_taskdata_t *parent = NULL;
1976  if (UNLIKELY(ompt_enabled.enabled && !new_taskdata->td_flags.started)) {
1977  parent = new_taskdata->td_parent;
1978  if (!parent->ompt_task_info.frame.enter_frame.ptr)
1979  parent->ompt_task_info.frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1980  if (ompt_enabled.ompt_callback_task_create) {
1981  ompt_callbacks.ompt_callback(ompt_callback_task_create)(
1982  &(parent->ompt_task_info.task_data), &(parent->ompt_task_info.frame),
1983  &(new_taskdata->ompt_task_info.task_data),
1984  TASK_TYPE_DETAILS_FORMAT(new_taskdata), 0, codeptr_ra);
1985  }
1986  }
1987 #endif
1988 
1989  res = __kmp_omp_task(gtid, new_task, true);
1990 
1991  KA_TRACE(10, ("__kmp_omp_taskloop_task(exit): T#%d returning "
1992  "TASK_CURRENT_NOT_QUEUED: loc=%p task=%p\n",
1993  gtid, loc_ref, new_taskdata));
1994 #if OMPT_SUPPORT
1995  if (UNLIKELY(ompt_enabled.enabled && parent != NULL)) {
1996  parent->ompt_task_info.frame.enter_frame = ompt_data_none;
1997  }
1998 #endif
1999  return res;
2000 }
2001 
2002 template <bool ompt>
2003 static kmp_int32 __kmpc_omp_taskwait_template(ident_t *loc_ref, kmp_int32 gtid,
2004  void *frame_address,
2005  void *return_address) {
2006  kmp_taskdata_t *taskdata = nullptr;
2007  kmp_info_t *thread;
2008  int thread_finished = FALSE;
2009  KMP_SET_THREAD_STATE_BLOCK(TASKWAIT);
2010 
2011  KA_TRACE(10, ("__kmpc_omp_taskwait(enter): T#%d loc=%p\n", gtid, loc_ref));
2012  KMP_DEBUG_ASSERT(gtid >= 0);
2013 
2014  if (__kmp_tasking_mode != tskm_immediate_exec) {
2015  thread = __kmp_threads[gtid];
2016  taskdata = thread->th.th_current_task;
2017 
2018 #if OMPT_SUPPORT && OMPT_OPTIONAL
2019  ompt_data_t *my_task_data;
2020  ompt_data_t *my_parallel_data;
2021 
2022  if (ompt) {
2023  my_task_data = &(taskdata->ompt_task_info.task_data);
2024  my_parallel_data = OMPT_CUR_TEAM_DATA(thread);
2025 
2026  taskdata->ompt_task_info.frame.enter_frame.ptr = frame_address;
2027 
2028  if (ompt_enabled.ompt_callback_sync_region) {
2029  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2030  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2031  my_task_data, return_address);
2032  }
2033 
2034  if (ompt_enabled.ompt_callback_sync_region_wait) {
2035  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2036  ompt_sync_region_taskwait, ompt_scope_begin, my_parallel_data,
2037  my_task_data, return_address);
2038  }
2039  }
2040 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2041 
2042 #if ENABLE_LIBOMPTARGET
2043  // Give an opportunity to the offload runtime to make progress and create
2044  // any necessary proxy tasks
2045  if (UNLIKELY(kmp_target_sync_cb))
2046  (*kmp_target_sync_cb)(loc_ref, gtid, KMP_TASKDATA_TO_TASK(taskdata),
2047  NULL);
2048 #endif // ENABLE_LIBOMPTARGET
2049 
2050 // Debugger: The taskwait is active. Store the location and the thread that
2051 // encountered the taskwait.
2052 #if USE_ITT_BUILD
2053 // Note: These values are used by ITT events as well.
2054 #endif /* USE_ITT_BUILD */
2055  taskdata->td_taskwait_counter += 1;
2056  taskdata->td_taskwait_ident = loc_ref;
2057  taskdata->td_taskwait_thread = gtid + 1;
2058 
2059 #if USE_ITT_BUILD
2060  void *itt_sync_obj = NULL;
2061 #if USE_ITT_NOTIFY
2062  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2063 #endif /* USE_ITT_NOTIFY */
2064 #endif /* USE_ITT_BUILD */
2065 
2066  bool must_wait =
2067  !taskdata->td_flags.team_serial && !taskdata->td_flags.final;
2068 
2069  must_wait = must_wait || (thread->th.th_task_team != NULL &&
2070  thread->th.th_task_team->tt.tt_found_proxy_tasks);
2071  // If a hidden helper task has been encountered, we must also wait here.
2072  must_wait =
2073  must_wait ||
2074  (__kmp_enable_hidden_helper && thread->th.th_task_team != NULL &&
2075  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered);
2076 
2077  if (must_wait) {
2078  kmp_flag_32<false, false> flag(
2079  RCAST(std::atomic<kmp_uint32> *,
2080  &(taskdata->td_incomplete_child_tasks)),
2081  0U);
2082  while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) != 0) {
2083  flag.execute_tasks(thread, gtid, FALSE,
2084  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2085  __kmp_task_stealing_constraint);
2086  }
2087  }
2088 #if USE_ITT_BUILD
2089  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2090  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with children
2091 #endif /* USE_ITT_BUILD */
2092 
2093  // Debugger: The taskwait is completed. Location remains, but thread is
2094  // negated.
2095  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2096 
2097 #if OMPT_SUPPORT && OMPT_OPTIONAL
2098  if (ompt) {
2099  if (ompt_enabled.ompt_callback_sync_region_wait) {
2100  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2101  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2102  my_task_data, return_address);
2103  }
2104  if (ompt_enabled.ompt_callback_sync_region) {
2105  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2106  ompt_sync_region_taskwait, ompt_scope_end, my_parallel_data,
2107  my_task_data, return_address);
2108  }
2109  taskdata->ompt_task_info.frame.enter_frame = ompt_data_none;
2110  }
2111 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2112  }
2113 
2114  KA_TRACE(10, ("__kmpc_omp_taskwait(exit): T#%d task %p finished waiting, "
2115  "returning TASK_CURRENT_NOT_QUEUED\n",
2116  gtid, taskdata));
2117 
2118  return TASK_CURRENT_NOT_QUEUED;
2119 }
2120 
2121 #if OMPT_SUPPORT && OMPT_OPTIONAL
2122 OMPT_NOINLINE
2123 static kmp_int32 __kmpc_omp_taskwait_ompt(ident_t *loc_ref, kmp_int32 gtid,
2124  void *frame_address,
2125  void *return_address) {
2126  return __kmpc_omp_taskwait_template<true>(loc_ref, gtid, frame_address,
2127  return_address);
2128 }
2129 #endif // OMPT_SUPPORT && OMPT_OPTIONAL
2130 
2131 // __kmpc_omp_taskwait: Wait until all tasks generated by the current task are
2132 // complete
2133 kmp_int32 __kmpc_omp_taskwait(ident_t *loc_ref, kmp_int32 gtid) {
2134 #if OMPT_SUPPORT && OMPT_OPTIONAL
2135  if (UNLIKELY(ompt_enabled.enabled)) {
2136  OMPT_STORE_RETURN_ADDRESS(gtid);
2137  return __kmpc_omp_taskwait_ompt(loc_ref, gtid, OMPT_GET_FRAME_ADDRESS(0),
2138  OMPT_LOAD_RETURN_ADDRESS(gtid));
2139  }
2140 #endif
2141  return __kmpc_omp_taskwait_template<false>(loc_ref, gtid, NULL, NULL);
2142 }
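
// Editor's note, illustrative sketch: a compiler lowers
//   #pragma omp taskwait
// directly to this entry point, e.g.
//   __kmpc_omp_taskwait(&loc, __kmpc_global_thread_num(&loc));
// Note that the waiting thread keeps executing queued child tasks via
// flag.execute_tasks() instead of blocking idly.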
2143 
2144 // __kmpc_omp_taskyield: switch to a different task
2145 kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
2146  kmp_taskdata_t *taskdata = NULL;
2147  kmp_info_t *thread;
2148  int thread_finished = FALSE;
2149 
2150  KMP_COUNT_BLOCK(OMP_TASKYIELD);
2151  KMP_SET_THREAD_STATE_BLOCK(TASKYIELD);
2152 
2153  KA_TRACE(10, ("__kmpc_omp_taskyield(enter): T#%d loc=%p end_part = %d\n",
2154  gtid, loc_ref, end_part));
2155  __kmp_assert_valid_gtid(gtid);
2156 
2157  if (__kmp_tasking_mode != tskm_immediate_exec && __kmp_init_parallel) {
2158  thread = __kmp_threads[gtid];
2159  taskdata = thread->th.th_current_task;
2160 // Should we model this as a task wait or not?
2161 // Debugger: The taskwait is active. Store the location and the thread that
2162 // encountered the taskwait.
2163 #if USE_ITT_BUILD
2164 // Note: These values are used by ITT events as well.
2165 #endif /* USE_ITT_BUILD */
2166  taskdata->td_taskwait_counter += 1;
2167  taskdata->td_taskwait_ident = loc_ref;
2168  taskdata->td_taskwait_thread = gtid + 1;
2169 
2170 #if USE_ITT_BUILD
2171  void *itt_sync_obj = NULL;
2172 #if USE_ITT_NOTIFY
2173  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2174 #endif /* USE_ITT_NOTIFY */
2175 #endif /* USE_ITT_BUILD */
2176  if (!taskdata->td_flags.team_serial) {
2177  kmp_task_team_t *task_team = thread->th.th_task_team;
2178  if (task_team != NULL) {
2179  if (KMP_TASKING_ENABLED(task_team)) {
2180 #if OMPT_SUPPORT
2181  if (UNLIKELY(ompt_enabled.enabled))
2182  thread->th.ompt_thread_info.ompt_task_yielded = 1;
2183 #endif
2184  __kmp_execute_tasks_32(
2185  thread, gtid, (kmp_flag_32<> *)NULL, FALSE,
2186  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2187  __kmp_task_stealing_constraint);
2188 #if OMPT_SUPPORT
2189  if (UNLIKELY(ompt_enabled.enabled))
2190  thread->th.ompt_thread_info.ompt_task_yielded = 0;
2191 #endif
2192  }
2193  }
2194  }
2195 #if USE_ITT_BUILD
2196  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2197 #endif /* USE_ITT_BUILD */
2198 
2199  // Debugger: The taskwait is completed. Location remains, but thread is
2200  // negated.
2201  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread;
2202  }
2203 
2204  KA_TRACE(10, ("__kmpc_omp_taskyield(exit): T#%d task %p resuming, "
2205  "returning TASK_CURRENT_NOT_QUEUED\n",
2206  gtid, taskdata));
2207 
2208  return TASK_CURRENT_NOT_QUEUED;
2209 }
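
// Editor's note, illustrative sketch: a compiler typically lowers
//   #pragma omp taskyield
// to
//   __kmpc_omp_taskyield(&loc, gtid, /*end_part=*/0);
// Within this routine end_part is only traced; the yield is realized by
// opportunistically executing other queued tasks.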
2210 
2211 // Task Reduction implementation
2212 //
2213 // Note: the initial implementation did not take into account the possibility
2214 // of specifying omp_orig for the initializer of a UDR (user defined reduction).
2215 // The corrected implementation takes the omp_orig object into account.
2216 // The compiler is free to use the old implementation if omp_orig is not specified.
2217 
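// Editor's note, illustrative sketch of the user-level pattern these
// structures support (directive names are standard OpenMP; the exact
// lowering is compiler-specific):
//
//   int sum = 0;
//   #pragma omp taskgroup task_reduction(+ : sum)
//   {
//     for (int i = 0; i < n; ++i) {
//       #pragma omp task in_reduction(+ : sum)
//       sum += a[i];
//     }
//   }
//
// Each task updates a thread-specific copy of 'sum' (see
// __kmpc_task_reduction_get_th_data below); the copies are combined when the
// taskgroup ends.
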
2226 typedef struct kmp_taskred_flags {
2227  // 1 - use lazy alloc/init (e.g. big objects, #tasks < #threads)
2228  unsigned lazy_priv : 1;
2229  unsigned reserved31 : 31;
2230 } kmp_taskred_flags_t;
2231 
2235 typedef struct kmp_task_red_input {
2236  void *reduce_shar; // shared between tasks item to reduce into
2237  size_t reduce_size; // size of data item in bytes
2238  // three compiler-generated routines (init, fini are optional):
2239  void *reduce_init; // data initialization routine (single parameter)
2240  void *reduce_fini; // data finalization routine
2241  void *reduce_comb; // data combiner routine
2242  kmp_taskred_flags_t flags; // flags for additional info from compiler
2243 } kmp_task_red_input_t;
2244 
2248 typedef struct kmp_taskred_data {
2249  void *reduce_shar; // shared between tasks item to reduce into
2250  size_t reduce_size; // size of data item
2251  kmp_taskred_flags_t flags; // flags for additional info from compiler
2252  void *reduce_priv; // array of thread-specific items
2253  void *reduce_pend; // end of private data for faster comparison op
2254  // three compiler-generated routines (init, fini are optional):
2255  void *reduce_comb; // data combiner routine
2256  void *reduce_init; // data initialization routine (two parameters)
2257  void *reduce_fini; // data finalization routine
2258  void *reduce_orig; // original item (can be used in UDR initializer)
2259 } kmp_taskred_data_t;
2260 
2266 typedef struct kmp_taskred_input {
2267  void *reduce_shar; // shared between tasks item to reduce into
2268  void *reduce_orig; // original reduction item used for initialization
2269  size_t reduce_size; // size of data item
2270  // three compiler-generated routines (init, fini are optional):
2271  void *reduce_init; // data initialization routine (two parameters)
2272  void *reduce_fini; // data finalization routine
2273  void *reduce_comb; // data combiner routine
2274  kmp_taskred_flags_t flags; // flags for additional info from compiler
2275 } kmp_taskred_input_t;
2276 
2280 template <typename T> void __kmp_assign_orig(kmp_taskred_data_t &item, T &src);
2281 template <>
2282 void __kmp_assign_orig<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2283  kmp_task_red_input_t &src) {
2284  item.reduce_orig = NULL;
2285 }
2286 template <>
2287 void __kmp_assign_orig<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2288  kmp_taskred_input_t &src) {
2289  if (src.reduce_orig != NULL) {
2290  item.reduce_orig = src.reduce_orig;
2291  } else {
2292  item.reduce_orig = src.reduce_shar;
2293  } // non-NULL reduce_orig means new interface used
2294 }
2295 
2296 template <typename T> void __kmp_call_init(kmp_taskred_data_t &item, size_t j);
2297 template <>
2298 void __kmp_call_init<kmp_task_red_input_t>(kmp_taskred_data_t &item,
2299  size_t offset) {
2300  ((void (*)(void *))item.reduce_init)((char *)(item.reduce_priv) + offset);
2301 }
2302 template <>
2303 void __kmp_call_init<kmp_taskred_input_t>(kmp_taskred_data_t &item,
2304  size_t offset) {
2305  ((void (*)(void *, void *))item.reduce_init)(
2306  (char *)(item.reduce_priv) + offset, item.reduce_orig);
2307 }
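
// Editor's note: the two initializer signatures the specializations above
// cast to; these are assumed shapes of the compiler-generated routines:
//
//   void old_init(void *priv);             // old interface, single parameter
//   void new_init(void *priv, void *orig); // new interface, may read omp_orig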
2308 
2309 template <typename T>
2310 void *__kmp_task_reduction_init(int gtid, int num, T *data) {
2311  __kmp_assert_valid_gtid(gtid);
2312  kmp_info_t *thread = __kmp_threads[gtid];
2313  kmp_taskgroup_t *tg = thread->th.th_current_task->td_taskgroup;
2314  kmp_uint32 nth = thread->th.th_team_nproc;
2315  kmp_taskred_data_t *arr;
2316 
2317  // check input data just in case
2318  KMP_ASSERT(tg != NULL);
2319  KMP_ASSERT(data != NULL);
2320  KMP_ASSERT(num > 0);
2321  if (nth == 1 && !__kmp_enable_hidden_helper) {
2322  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, tg %p, exiting nth=1\n",
2323  gtid, tg));
2324  return (void *)tg;
2325  }
2326  KA_TRACE(10, ("__kmpc_task_reduction_init: T#%d, taskgroup %p, #items %d\n",
2327  gtid, tg, num));
2328  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2329  thread, num * sizeof(kmp_taskred_data_t));
2330  for (int i = 0; i < num; ++i) {
2331  size_t size = data[i].reduce_size - 1;
2332  // round the size up to cache line per thread-specific item
2333  size += CACHE_LINE - size % CACHE_LINE;
2334  KMP_ASSERT(data[i].reduce_comb != NULL); // combiner is mandatory
2335  arr[i].reduce_shar = data[i].reduce_shar;
2336  arr[i].reduce_size = size;
2337  arr[i].flags = data[i].flags;
2338  arr[i].reduce_comb = data[i].reduce_comb;
2339  arr[i].reduce_init = data[i].reduce_init;
2340  arr[i].reduce_fini = data[i].reduce_fini;
2341  __kmp_assign_orig<T>(arr[i], data[i]);
2342  if (!arr[i].flags.lazy_priv) {
2343  // allocate cache-line aligned block and fill it with zeros
2344  arr[i].reduce_priv = __kmp_allocate(nth * size);
2345  arr[i].reduce_pend = (char *)(arr[i].reduce_priv) + nth * size;
2346  if (arr[i].reduce_init != NULL) {
2347  // initialize all thread-specific items
2348  for (size_t j = 0; j < nth; ++j) {
2349  __kmp_call_init<T>(arr[i], j * size);
2350  }
2351  }
2352  } else {
2353  // only allocate space for pointers now,
2354  // objects will be lazily allocated/initialized if/when requested
2355  // note that __kmp_allocate zeroes the allocated memory
2356  arr[i].reduce_priv = __kmp_allocate(nth * sizeof(void *));
2357  }
2358  }
2359  tg->reduce_data = (void *)arr;
2360  tg->reduce_num_data = num;
2361  return (void *)tg;
2362 }
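
// Editor's note, illustrative sketch: for a non-lazy item the code above
// carves one cache-line-rounded block per thread out of a single allocation,
// so thread 'tid' finds its copy with plain pointer arithmetic:
//
//   void *my_copy = (char *)arr[i].reduce_priv + tid * arr[i].reduce_size;
//   // valid iff arr[i].reduce_priv <= my_copy < arr[i].reduce_pend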
2363 
2378 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
2379 #if OMPX_TASKGRAPH
2380  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2381  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2382  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2383  this_tdg->rec_taskred_data =
2384  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2385  this_tdg->rec_num_taskred = num;
2386  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2387  sizeof(kmp_task_red_input_t) * num);
2388  }
2389 #endif
2390  return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
2391 }
2392 
2405 void *__kmpc_taskred_init(int gtid, int num, void *data) {
2406 #if OMPX_TASKGRAPH
2407  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
2408  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
2409  kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
2410  this_tdg->rec_taskred_data =
2411  __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
2412  this_tdg->rec_num_taskred = num;
2413  KMP_MEMCPY(this_tdg->rec_taskred_data, data,
2414  sizeof(kmp_task_red_input_t) * num);
2415  }
2416 #endif
2417  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
2418 }
2419 
2420 // Copy task reduction data (except for shared pointers).
2421 template <typename T>
2422 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
2423  kmp_taskgroup_t *tg, void *reduce_data) {
2424  kmp_taskred_data_t *arr;
2425  KA_TRACE(20, ("__kmp_task_reduction_init_copy: Th %p, init taskgroup %p,"
2426  " from data %p\n",
2427  thr, tg, reduce_data));
2428  arr = (kmp_taskred_data_t *)__kmp_thread_malloc(
2429  thr, num * sizeof(kmp_taskred_data_t));
2430  // threads will share private copies, thunk routines, sizes, flags, etc.:
2431  KMP_MEMCPY(arr, reduce_data, num * sizeof(kmp_taskred_data_t));
2432  for (int i = 0; i < num; ++i) {
2433  arr[i].reduce_shar = data[i].reduce_shar; // init unique shared pointers
2434  }
2435  tg->reduce_data = (void *)arr;
2436  tg->reduce_num_data = num;
2437 }
2438 
2448 void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
2449  __kmp_assert_valid_gtid(gtid);
2450  kmp_info_t *thread = __kmp_threads[gtid];
2451  kmp_int32 nth = thread->th.th_team_nproc;
2452  if (nth == 1)
2453  return data; // nothing to do
2454 
2455  kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
2456  if (tg == NULL)
2457  tg = thread->th.th_current_task->td_taskgroup;
2458  KMP_ASSERT(tg != NULL);
2459  kmp_taskred_data_t *arr;
2460  kmp_int32 num;
2461  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
2462 
2463 #if OMPX_TASKGRAPH
2464  if ((thread->th.th_current_task->is_taskgraph) &&
2465  (!__kmp_tdg_is_recording(
2466  __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
2467  tg = thread->th.th_current_task->td_taskgroup;
2468  KMP_ASSERT(tg != NULL);
2469  KMP_ASSERT(tg->reduce_data != NULL);
2470  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2471  num = tg->reduce_num_data;
2472  }
2473 #endif
2474 
2475  KMP_ASSERT(data != NULL);
2476  while (tg != NULL) {
2477  arr = (kmp_taskred_data_t *)(tg->reduce_data);
2478  num = tg->reduce_num_data;
2479  for (int i = 0; i < num; ++i) {
2480  if (!arr[i].flags.lazy_priv) {
2481  if (data == arr[i].reduce_shar ||
2482  (data >= arr[i].reduce_priv && data < arr[i].reduce_pend))
2483  return (char *)(arr[i].reduce_priv) + tid * arr[i].reduce_size;
2484  } else {
2485  // check shared location first
2486  void **p_priv = (void **)(arr[i].reduce_priv);
2487  if (data == arr[i].reduce_shar)
2488  goto found;
2489  // check whether a thread-specific location was passed as the parameter
2490  for (int j = 0; j < nth; ++j)
2491  if (data == p_priv[j])
2492  goto found;
2493  continue; // not found, continue search
2494  found:
2495  if (p_priv[tid] == NULL) {
2496  // allocate thread specific object lazily
2497  p_priv[tid] = __kmp_allocate(arr[i].reduce_size);
2498  if (arr[i].reduce_init != NULL) {
2499  if (arr[i].reduce_orig != NULL) { // new interface
2500  ((void (*)(void *, void *))arr[i].reduce_init)(
2501  p_priv[tid], arr[i].reduce_orig);
2502  } else { // old interface (single parameter)
2503  ((void (*)(void *))arr[i].reduce_init)(p_priv[tid]);
2504  }
2505  }
2506  }
2507  return p_priv[tid];
2508  }
2509  }
2510  KMP_ASSERT(tg->parent);
2511  tg = tg->parent;
2512  }
2513  KMP_ASSERT2(0, "Unknown task reduction item");
2514  return NULL; // ERROR, this line never executed
2515 }
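
// Editor's note, illustrative sketch: inside a compiler-generated task body,
// an in_reduction clause is lowered to a lookup like the following before
// each update ('sum' and the int type are assumptions from an example):
//
//   int *priv_sum =
//       (int *)__kmpc_task_reduction_get_th_data(gtid, /*tskgrp=*/NULL, &sum);
//   *priv_sum += a[i];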
2516 
2517 // Finalize task reduction.
2518 // Called from __kmpc_end_taskgroup()
2519 static void __kmp_task_reduction_fini(kmp_info_t *th, kmp_taskgroup_t *tg) {
2520  kmp_int32 nth = th->th.th_team_nproc;
2521  KMP_DEBUG_ASSERT(
2522  nth > 1 ||
2523  __kmp_enable_hidden_helper); // should not be called if nth == 1 unless we
2524  // are using hidden helper threads
2525  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)tg->reduce_data;
2526  kmp_int32 num = tg->reduce_num_data;
2527  for (int i = 0; i < num; ++i) {
2528  void *sh_data = arr[i].reduce_shar;
2529  void (*f_fini)(void *) = (void (*)(void *))(arr[i].reduce_fini);
2530  void (*f_comb)(void *, void *) =
2531  (void (*)(void *, void *))(arr[i].reduce_comb);
2532  if (!arr[i].flags.lazy_priv) {
2533  void *pr_data = arr[i].reduce_priv;
2534  size_t size = arr[i].reduce_size;
2535  for (int j = 0; j < nth; ++j) {
2536  void *priv_data = (char *)pr_data + j * size;
2537  f_comb(sh_data, priv_data); // combine results
2538  if (f_fini)
2539  f_fini(priv_data); // finalize if needed
2540  }
2541  } else {
2542  void **pr_data = (void **)(arr[i].reduce_priv);
2543  for (int j = 0; j < nth; ++j) {
2544  if (pr_data[j] != NULL) {
2545  f_comb(sh_data, pr_data[j]); // combine results
2546  if (f_fini)
2547  f_fini(pr_data[j]); // finalize if needed
2548  __kmp_free(pr_data[j]);
2549  }
2550  }
2551  }
2552  __kmp_free(arr[i].reduce_priv);
2553  }
2554  __kmp_thread_free(th, arr);
2555  tg->reduce_data = NULL;
2556  tg->reduce_num_data = 0;
2557 }
2558 
2559 // Cleanup task reduction data for parallel or worksharing,
2560 // do not touch task private data other threads still working with.
2561 // Called from __kmpc_end_taskgroup()
2562 static void __kmp_task_reduction_clean(kmp_info_t *th, kmp_taskgroup_t *tg) {
2563  __kmp_thread_free(th, tg->reduce_data);
2564  tg->reduce_data = NULL;
2565  tg->reduce_num_data = 0;
2566 }
2567 
2568 template <typename T>
2569 void *__kmp_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2570  int num, T *data) {
2571  __kmp_assert_valid_gtid(gtid);
2572  kmp_info_t *thr = __kmp_threads[gtid];
2573  kmp_int32 nth = thr->th.th_team_nproc;
2574  __kmpc_taskgroup(loc, gtid); // form new taskgroup first
2575  if (nth == 1) {
2576  KA_TRACE(10,
2577  ("__kmpc_reduction_modifier_init: T#%d, tg %p, exiting nth=1\n",
2578  gtid, thr->th.th_current_task->td_taskgroup));
2579  return (void *)thr->th.th_current_task->td_taskgroup;
2580  }
2581  kmp_team_t *team = thr->th.th_team;
2582  void *reduce_data;
2583  kmp_taskgroup_t *tg;
2584  reduce_data = KMP_ATOMIC_LD_RLX(&team->t.t_tg_reduce_data[is_ws]);
2585  if (reduce_data == NULL &&
2586  __kmp_atomic_compare_store(&team->t.t_tg_reduce_data[is_ws], reduce_data,
2587  (void *)1)) {
2588  // single thread enters this block to initialize common reduction data
2589  KMP_DEBUG_ASSERT(reduce_data == NULL);
2590  // first initialize own data, then make a copy other threads can use
2591  tg = (kmp_taskgroup_t *)__kmp_task_reduction_init<T>(gtid, num, data);
2592  reduce_data = __kmp_thread_malloc(thr, num * sizeof(kmp_taskred_data_t));
2593  KMP_MEMCPY(reduce_data, tg->reduce_data, num * sizeof(kmp_taskred_data_t));
2594  // fini counters should be 0 at this point
2595  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[0]) == 0);
2596  KMP_DEBUG_ASSERT(KMP_ATOMIC_LD_RLX(&team->t.t_tg_fini_counter[1]) == 0);
2597  KMP_ATOMIC_ST_REL(&team->t.t_tg_reduce_data[is_ws], reduce_data);
2598  } else {
2599  while (
2600  (reduce_data = KMP_ATOMIC_LD_ACQ(&team->t.t_tg_reduce_data[is_ws])) ==
2601  (void *)1) { // wait for task reduction initialization
2602  KMP_CPU_PAUSE();
2603  }
2604  KMP_DEBUG_ASSERT(reduce_data > (void *)1); // should be valid pointer here
2605  tg = thr->th.th_current_task->td_taskgroup;
2606  __kmp_task_reduction_init_copy<T>(thr, num, data, tg, reduce_data);
2607  }
2608  return tg;
2609 }
2610 
2627 void *__kmpc_task_reduction_modifier_init(ident_t *loc, int gtid, int is_ws,
2628  int num, void *data) {
2629  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2630  (kmp_task_red_input_t *)data);
2631 }
2632 
2647 void *__kmpc_taskred_modifier_init(ident_t *loc, int gtid, int is_ws, int num,
2648  void *data) {
2649  return __kmp_task_reduction_modifier_init(loc, gtid, is_ws, num,
2650  (kmp_taskred_input_t *)data);
2651 }
2652 
2661 void __kmpc_task_reduction_modifier_fini(ident_t *loc, int gtid, int is_ws) {
2662  __kmpc_end_taskgroup(loc, gtid);
2663 }
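
// Editor's note, illustrative sketch: the modifier entry points bracket a
// parallel or worksharing construct carrying a task reduction modifier, e.g.
//   #pragma omp parallel reduction(task, + : x)
// lowers roughly to the following (is_ws = 0 for parallel, 1 for
// worksharing; 'data' is the compiler-built kmp_taskred_input_t array):
//
//   void *tg = __kmpc_taskred_modifier_init(&loc, gtid, /*is_ws=*/0,
//                                           /*num=*/1, data);
//   /* ... region body; tasks may use in_reduction ... */
//   __kmpc_task_reduction_modifier_fini(&loc, gtid, /*is_ws=*/0);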
2664 
2665 // __kmpc_taskgroup: Start a new taskgroup
2666 void __kmpc_taskgroup(ident_t *loc, int gtid) {
2667  __kmp_assert_valid_gtid(gtid);
2668  kmp_info_t *thread = __kmp_threads[gtid];
2669  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2670  kmp_taskgroup_t *tg_new =
2671  (kmp_taskgroup_t *)__kmp_thread_malloc(thread, sizeof(kmp_taskgroup_t));
2672  KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new));
2673  KMP_ATOMIC_ST_RLX(&tg_new->count, 0);
2674  KMP_ATOMIC_ST_RLX(&tg_new->cancel_request, cancel_noreq);
2675  tg_new->parent = taskdata->td_taskgroup;
2676  tg_new->reduce_data = NULL;
2677  tg_new->reduce_num_data = 0;
2678  tg_new->gomp_data = NULL;
2679  taskdata->td_taskgroup = tg_new;
2680 
2681 #if OMPT_SUPPORT && OMPT_OPTIONAL
2682  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2683  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2684  if (!codeptr)
2685  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2686  kmp_team_t *team = thread->th.th_team;
2687  ompt_data_t my_task_data = taskdata->ompt_task_info.task_data;
2688  // FIXME: I think this is wrong for lwt!
2689  ompt_data_t my_parallel_data = team->t.ompt_team_info.parallel_data;
2690 
2691  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2692  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2693  &(my_task_data), codeptr);
2694  }
2695 #endif
2696 }
2697 
2698 // __kmpc_end_taskgroup: Wait until all tasks generated by the current task
2699 // and its descendants are complete
2700 void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
2701  __kmp_assert_valid_gtid(gtid);
2702  kmp_info_t *thread = __kmp_threads[gtid];
2703  kmp_taskdata_t *taskdata = thread->th.th_current_task;
2704  kmp_taskgroup_t *taskgroup = taskdata->td_taskgroup;
2705  int thread_finished = FALSE;
2706 
2707 #if OMPT_SUPPORT && OMPT_OPTIONAL
2708  kmp_team_t *team;
2709  ompt_data_t my_task_data;
2710  ompt_data_t my_parallel_data;
2711  void *codeptr = nullptr;
2712  if (UNLIKELY(ompt_enabled.enabled)) {
2713  team = thread->th.th_team;
2714  my_task_data = taskdata->ompt_task_info.task_data;
2715  // FIXME: I think this is wrong for lwt!
2716  my_parallel_data = team->t.ompt_team_info.parallel_data;
2717  codeptr = OMPT_LOAD_RETURN_ADDRESS(gtid);
2718  if (!codeptr)
2719  codeptr = OMPT_GET_RETURN_ADDRESS(0);
2720  }
2721 #endif
2722 
2723  KA_TRACE(10, ("__kmpc_end_taskgroup(enter): T#%d loc=%p\n", gtid, loc));
2724  KMP_DEBUG_ASSERT(taskgroup != NULL);
2725  KMP_SET_THREAD_STATE_BLOCK(TASKGROUP);
2726 
2727  if (__kmp_tasking_mode != tskm_immediate_exec) {
2728  // mark task as waiting not on a barrier
2729  taskdata->td_taskwait_counter += 1;
2730  taskdata->td_taskwait_ident = loc;
2731  taskdata->td_taskwait_thread = gtid + 1;
2732 #if USE_ITT_BUILD
2733  // For ITT the taskgroup wait is similar to taskwait until we need to
2734  // distinguish them
2735  void *itt_sync_obj = NULL;
2736 #if USE_ITT_NOTIFY
2737  KMP_ITT_TASKWAIT_STARTING(itt_sync_obj);
2738 #endif /* USE_ITT_NOTIFY */
2739 #endif /* USE_ITT_BUILD */
2740 
2741 #if OMPT_SUPPORT && OMPT_OPTIONAL
2742  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2743  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2744  ompt_sync_region_taskgroup, ompt_scope_begin, &(my_parallel_data),
2745  &(my_task_data), codeptr);
2746  }
2747 #endif
2748 
2749 #if ENABLE_LIBOMPTARGET
2750  // Give an opportunity to the offload runtime to make progress and create
2751  // any necessary proxy tasks
2752  if (UNLIKELY(kmp_target_sync_cb))
2753  (*kmp_target_sync_cb)(loc, gtid, KMP_TASKDATA_TO_TASK(taskdata), NULL);
2754 #endif // ENABLE_LIBOMPTARGET
2755 
2756  if (!taskdata->td_flags.team_serial ||
2757  (thread->th.th_task_team != NULL &&
2758  (thread->th.th_task_team->tt.tt_found_proxy_tasks ||
2759  thread->th.th_task_team->tt.tt_hidden_helper_task_encountered))) {
2760  kmp_flag_32<false, false> flag(
2761  RCAST(std::atomic<kmp_uint32> *, &(taskgroup->count)), 0U);
2762  while (KMP_ATOMIC_LD_ACQ(&taskgroup->count) != 0) {
2763  flag.execute_tasks(thread, gtid, FALSE,
2764  &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
2765  __kmp_task_stealing_constraint);
2766  }
2767  }
2768  taskdata->td_taskwait_thread = -taskdata->td_taskwait_thread; // end waiting
2769 
2770 #if OMPT_SUPPORT && OMPT_OPTIONAL
2771  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region_wait)) {
2772  ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2773  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2774  &(my_task_data), codeptr);
2775  }
2776 #endif
2777 
2778 #if USE_ITT_BUILD
2779  KMP_ITT_TASKWAIT_FINISHED(itt_sync_obj);
2780  KMP_FSYNC_ACQUIRED(taskdata); // acquire self - sync with descendants
2781 #endif /* USE_ITT_BUILD */
2782  }
2783  KMP_DEBUG_ASSERT(taskgroup->count == 0);
2784 
2785  if (taskgroup->reduce_data != NULL &&
2786  !taskgroup->gomp_data) { // need to reduce?
2787  int cnt;
2788  void *reduce_data;
2789  kmp_team_t *t = thread->th.th_team;
2790  kmp_taskred_data_t *arr = (kmp_taskred_data_t *)taskgroup->reduce_data;
2791  // check whether <priv> data of the first reduction variable is shared for the team
2792  void *priv0 = arr[0].reduce_priv;
2793  if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[0])) != NULL &&
2794  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2795  // finishing task reduction on parallel
2796  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[0]);
2797  if (cnt == thread->th.th_team_nproc - 1) {
2798  // we are the last thread passing __kmpc_reduction_modifier_fini()
2799  // finalize task reduction:
2800  __kmp_task_reduction_fini(thread, taskgroup);
2801  // cleanup fields in the team structure:
2802  // TODO: is relaxed store enough here (whole barrier should follow)?
2803  __kmp_thread_free(thread, reduce_data);
2804  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[0], NULL);
2805  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[0], 0);
2806  } else {
2807  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2808  // so do not finalize reduction, just clean own copy of the data
2809  __kmp_task_reduction_clean(thread, taskgroup);
2810  }
2811  } else if ((reduce_data = KMP_ATOMIC_LD_ACQ(&t->t.t_tg_reduce_data[1])) !=
2812  NULL &&
2813  ((kmp_taskred_data_t *)reduce_data)[0].reduce_priv == priv0) {
2814  // finishing task reduction on worksharing
2815  cnt = KMP_ATOMIC_INC(&t->t.t_tg_fini_counter[1]);
2816  if (cnt == thread->th.th_team_nproc - 1) {
2817  // we are the last thread passing __kmpc_reduction_modifier_fini()
2818  __kmp_task_reduction_fini(thread, taskgroup);
2819  // cleanup fields in team structure:
2820  // TODO: is relaxed store enough here (whole barrier should follow)?
2821  __kmp_thread_free(thread, reduce_data);
2822  KMP_ATOMIC_ST_REL(&t->t.t_tg_reduce_data[1], NULL);
2823  KMP_ATOMIC_ST_REL(&t->t.t_tg_fini_counter[1], 0);
2824  } else {
2825  // we are not the last thread passing __kmpc_reduction_modifier_fini(),
2826  // so do not finalize reduction, just clean own copy of the data
2827  __kmp_task_reduction_clean(thread, taskgroup);
2828  }
2829  } else {
2830  // finishing task reduction on taskgroup
2831  __kmp_task_reduction_fini(thread, taskgroup);
2832  }
2833  }
2834  // Restore parent taskgroup for the current task
2835  taskdata->td_taskgroup = taskgroup->parent;
2836  __kmp_thread_free(thread, taskgroup);
2837 
2838  KA_TRACE(10, ("__kmpc_end_taskgroup(exit): T#%d task %p finished waiting\n",
2839  gtid, taskdata));
2840 
2841 #if OMPT_SUPPORT && OMPT_OPTIONAL
2842  if (UNLIKELY(ompt_enabled.ompt_callback_sync_region)) {
2843  ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2844  ompt_sync_region_taskgroup, ompt_scope_end, &(my_parallel_data),
2845  &(my_task_data), codeptr);
2846  }
2847 #endif
2848 }
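
// Editor's note, illustrative sketch: a compiler brackets
//   #pragma omp taskgroup
//   { ... }
// with the pair of entry points above:
//
//   __kmpc_taskgroup(&loc, gtid);
//   /* ... generate tasks ... */
//   __kmpc_end_taskgroup(&loc, gtid); // waits for all descendant tasks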
2849 
2850 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
2851  kmp_task_team_t *task_team,
2852  kmp_int32 is_constrained) {
2853  kmp_task_t *task = NULL;
2854  kmp_taskdata_t *taskdata;
2855  kmp_taskdata_t *current;
2856  kmp_thread_data_t *thread_data;
2857  int ntasks = task_team->tt.tt_num_task_pri;
2858  if (ntasks == 0) {
2859  KA_TRACE(
2860  20, ("__kmp_get_priority_task(exit #1): T#%d No tasks to get\n", gtid));
2861  return NULL;
2862  }
2863  do {
2864  // decrement num_tasks to "reserve" one task for execution
2865  if (__kmp_atomic_compare_store(&task_team->tt.tt_num_task_pri, ntasks,
2866  ntasks - 1))
2867  break;
2868  ntasks = task_team->tt.tt_num_task_pri;
2869  } while (ntasks > 0);
2870  if (ntasks == 0) {
2871  KA_TRACE(20, ("__kmp_get_priority_task(exit #2): T#%d No tasks to get\n",
2872  __kmp_get_gtid()));
2873  return NULL;
2874  }
2875  // We got a "ticket" to get a "reserved" priority task
2876  int deque_ntasks;
2877  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
2878  do {
2879  KMP_ASSERT(list != NULL);
2880  thread_data = &list->td;
2881  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2882  deque_ntasks = thread_data->td.td_deque_ntasks;
2883  if (deque_ntasks == 0) {
2884  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2885  KA_TRACE(20, ("__kmp_get_priority_task: T#%d No tasks to get from %p\n",
2886  __kmp_get_gtid(), thread_data));
2887  list = list->next;
2888  }
2889  } while (deque_ntasks == 0);
2890  KMP_DEBUG_ASSERT(deque_ntasks);
2891  int target = thread_data->td.td_deque_head;
2892  current = __kmp_threads[gtid]->th.th_current_task;
2893  taskdata = thread_data->td.td_deque[target];
2894  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2895  // Bump head pointer and wrap.
2896  thread_data->td.td_deque_head =
2897  (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2898  } else {
2899  if (!task_team->tt.tt_untied_task_encountered) {
2900  // The TSC does not allow stealing the victim task
2901  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2902  KA_TRACE(20, ("__kmp_get_priority_task(exit #3): T#%d could not get task "
2903  "from %p: task_team=%p ntasks=%d head=%u tail=%u\n",
2904  gtid, thread_data, task_team, deque_ntasks, target,
2905  thread_data->td.td_deque_tail));
2906  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2907  return NULL;
2908  }
2909  int i;
2910  // walk through the deque trying to steal any task
2911  taskdata = NULL;
2912  for (i = 1; i < deque_ntasks; ++i) {
2913  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2914  taskdata = thread_data->td.td_deque[target];
2915  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
2916  break; // found task to execute
2917  } else {
2918  taskdata = NULL;
2919  }
2920  }
2921  if (taskdata == NULL) {
2922  // No appropriate candidate found to execute
2923  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2924  KA_TRACE(
2925  10, ("__kmp_get_priority_task(exit #4): T#%d could not get task from "
2926  "%p: task_team=%p ntasks=%d head=%u tail=%u\n",
2927  gtid, thread_data, task_team, deque_ntasks,
2928  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2929  task_team->tt.tt_num_task_pri++; // atomic inc, restore value
2930  return NULL;
2931  }
2932  int prev = target;
2933  for (i = i + 1; i < deque_ntasks; ++i) {
2934  // shift remaining tasks in the deque left by 1
2935  target = (target + 1) & TASK_DEQUE_MASK(thread_data->td);
2936  thread_data->td.td_deque[prev] = thread_data->td.td_deque[target];
2937  prev = target;
2938  }
2939  KMP_DEBUG_ASSERT(
2940  thread_data->td.td_deque_tail ==
2941  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(thread_data->td)));
2942  thread_data->td.td_deque_tail = target; // tail -= 1 (wrapped)
2943  }
2944  thread_data->td.td_deque_ntasks = deque_ntasks - 1;
2945  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2946  task = KMP_TASKDATA_TO_TASK(taskdata);
2947  return task;
2948 }
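
// Editor's note: the head/tail arithmetic above relies on the deque capacity
// being a power of two, so TASK_DEQUE_MASK(td) == capacity - 1 and indices
// wrap without a modulo, e.g.:
//
//   target = (target + 1) & TASK_DEQUE_MASK(thread_data->td); // == % capacity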
2949 
2950 // __kmp_remove_my_task: remove a task from my own deque
2951 static kmp_task_t *__kmp_remove_my_task(kmp_info_t *thread, kmp_int32 gtid,
2952  kmp_task_team_t *task_team,
2953  kmp_int32 is_constrained) {
2954  kmp_task_t *task;
2955  kmp_taskdata_t *taskdata;
2956  kmp_thread_data_t *thread_data;
2957  kmp_uint32 tail;
2958 
2959  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
2960  KMP_DEBUG_ASSERT(task_team->tt.tt_threads_data !=
2961  NULL); // Caller should check this condition
2962 
2963  thread_data = &task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)];
2964 
2965  KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
2966  gtid, thread_data->td.td_deque_ntasks,
2967  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2968 
2969  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2970  KA_TRACE(10,
2971  ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: "
2972  "ntasks=%d head=%u tail=%u\n",
2973  gtid, thread_data->td.td_deque_ntasks,
2974  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2975  return NULL;
2976  }
2977 
2978  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
2979 
2980  if (TCR_4(thread_data->td.td_deque_ntasks) == 0) {
2981  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2982  KA_TRACE(10,
2983  ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: "
2984  "ntasks=%d head=%u tail=%u\n",
2985  gtid, thread_data->td.td_deque_ntasks,
2986  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
2987  return NULL;
2988  }
2989 
2990  tail = (thread_data->td.td_deque_tail - 1) &
2991  TASK_DEQUE_MASK(thread_data->td); // Wrap index.
2992  taskdata = thread_data->td.td_deque[tail];
2993 
2994  if (!__kmp_task_is_allowed(gtid, is_constrained, taskdata,
2995  thread->th.th_current_task)) {
2996  // The TSC does not allow stealing the victim task
2997  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
2998  KA_TRACE(10,
2999  ("__kmp_remove_my_task(exit #3): T#%d TSC blocks tail task: "
3000  "ntasks=%d head=%u tail=%u\n",
3001  gtid, thread_data->td.td_deque_ntasks,
3002  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3003  return NULL;
3004  }
3005 
3006  thread_data->td.td_deque_tail = tail;
3007  TCW_4(thread_data->td.td_deque_ntasks, thread_data->td.td_deque_ntasks - 1);
3008 
3009  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3010 
3011  KA_TRACE(10, ("__kmp_remove_my_task(exit #4): T#%d task %p removed: "
3012  "ntasks=%d head=%u tail=%u\n",
3013  gtid, taskdata, thread_data->td.td_deque_ntasks,
3014  thread_data->td.td_deque_head, thread_data->td.td_deque_tail));
3015 
3016  task = KMP_TASKDATA_TO_TASK(taskdata);
3017  return task;
3018 }
3019 
3020 // __kmp_steal_task: remove a task from another thread's deque
3021 // Assume that calling thread has already checked existence of
3022 // task_team thread_data before calling this routine.
3023 static kmp_task_t *__kmp_steal_task(kmp_int32 victim_tid, kmp_int32 gtid,
3024  kmp_task_team_t *task_team,
3025  std::atomic<kmp_int32> *unfinished_threads,
3026  int *thread_finished,
3027  kmp_int32 is_constrained) {
3028  kmp_task_t *task;
3029  kmp_taskdata_t *taskdata;
3030  kmp_taskdata_t *current;
3031  kmp_thread_data_t *victim_td, *threads_data;
3032  kmp_int32 target;
3033  kmp_info_t *victim_thr;
3034 
3035  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3036 
3037  threads_data = task_team->tt.tt_threads_data;
3038  KMP_DEBUG_ASSERT(threads_data != NULL); // Caller should check this condition
3039  KMP_DEBUG_ASSERT(victim_tid >= 0);
3040  KMP_DEBUG_ASSERT(victim_tid < task_team->tt.tt_max_threads);
3041 
3042  victim_td = &threads_data[victim_tid];
3043  victim_thr = victim_td->td.td_thr;
3044  (void)victim_thr; // Use in TRACE messages which aren't always enabled.
3045 
3046  KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
3047  "task_team=%p ntasks=%d head=%u tail=%u\n",
3048  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3049  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3050  victim_td->td.td_deque_tail));
3051 
3052  if (TCR_4(victim_td->td.td_deque_ntasks) == 0) {
3053  KA_TRACE(10, ("__kmp_steal_task(exit #1): T#%d could not steal from T#%d: "
3054  "task_team=%p ntasks=%d head=%u tail=%u\n",
3055  gtid, __kmp_gtid_from_thread(victim_thr), task_team,
3056  victim_td->td.td_deque_ntasks, victim_td->td.td_deque_head,
3057  victim_td->td.td_deque_tail));
3058  return NULL;
3059  }
3060 
3061  __kmp_acquire_bootstrap_lock(&victim_td->td.td_deque_lock);
3062 
3063  int ntasks = TCR_4(victim_td->td.td_deque_ntasks);
3064  // Check again after we acquire the lock
3065  if (ntasks == 0) {
3066  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3067  KA_TRACE(10, ("__kmp_steal_task(exit #2): T#%d could not steal from T#%d: "
3068  "task_team=%p ntasks=%d head=%u tail=%u\n",
3069  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3070  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3071  return NULL;
3072  }
3073 
3074  KMP_DEBUG_ASSERT(victim_td->td.td_deque != NULL);
3075  current = __kmp_threads[gtid]->th.th_current_task;
3076  taskdata = victim_td->td.td_deque[victim_td->td.td_deque_head];
3077  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3078  // Bump head pointer and Wrap.
3079  victim_td->td.td_deque_head =
3080  (victim_td->td.td_deque_head + 1) & TASK_DEQUE_MASK(victim_td->td);
3081  } else {
3082  if (!task_team->tt.tt_untied_task_encountered) {
3083  // The TSC does not allow stealing the victim task
3084  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3085  KA_TRACE(10, ("__kmp_steal_task(exit #3): T#%d could not steal from "
3086  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3087  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3088  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3089  return NULL;
3090  }
3091  int i;
3092  // walk through victim's deque trying to steal any task
3093  target = victim_td->td.td_deque_head;
3094  taskdata = NULL;
3095  for (i = 1; i < ntasks; ++i) {
3096  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3097  taskdata = victim_td->td.td_deque[target];
3098  if (__kmp_task_is_allowed(gtid, is_constrained, taskdata, current)) {
3099  break; // found victim task
3100  } else {
3101  taskdata = NULL;
3102  }
3103  }
3104  if (taskdata == NULL) {
3105  // No appropriate candidate to steal found
3106  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3107  KA_TRACE(10, ("__kmp_steal_task(exit #4): T#%d could not steal from "
3108  "T#%d: task_team=%p ntasks=%d head=%u tail=%u\n",
3109  gtid, __kmp_gtid_from_thread(victim_thr), task_team, ntasks,
3110  victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3111  return NULL;
3112  }
3113  int prev = target;
3114  for (i = i + 1; i < ntasks; ++i) {
3115  // shift remaining tasks in the deque left by 1
3116  target = (target + 1) & TASK_DEQUE_MASK(victim_td->td);
3117  victim_td->td.td_deque[prev] = victim_td->td.td_deque[target];
3118  prev = target;
3119  }
3120  KMP_DEBUG_ASSERT(
3121  victim_td->td.td_deque_tail ==
3122  (kmp_uint32)((target + 1) & TASK_DEQUE_MASK(victim_td->td)));
3123  victim_td->td.td_deque_tail = target; // tail -= 1 (wrapped)
3124  }
3125  if (*thread_finished) {
3126  // We need to un-mark this victim as a finished victim. This must be done
3127  // before releasing the lock, or else other threads (starting with the
3128  // primary thread victim) might be prematurely released from the barrier!!!
3129 #if KMP_DEBUG
3130  kmp_int32 count =
3131 #endif
3132  KMP_ATOMIC_INC(unfinished_threads);
3133  KA_TRACE(
3134  20,
3135  ("__kmp_steal_task: T#%d inc unfinished_threads to %d: task_team=%p\n",
3136  gtid, count + 1, task_team));
3137  *thread_finished = FALSE;
3138  }
3139  TCW_4(victim_td->td.td_deque_ntasks, ntasks - 1);
3140 
3141  __kmp_release_bootstrap_lock(&victim_td->td.td_deque_lock);
3142 
3143  KMP_COUNT_BLOCK(TASK_stolen);
3144  KA_TRACE(10,
3145  ("__kmp_steal_task(exit #5): T#%d stole task %p from T#%d: "
3146  "task_team=%p ntasks=%d head=%u tail=%u\n",
3147  gtid, taskdata, __kmp_gtid_from_thread(victim_thr), task_team,
3148  ntasks, victim_td->td.td_deque_head, victim_td->td.td_deque_tail));
3149 
3150  task = KMP_TASKDATA_TO_TASK(taskdata);
3151  return task;
3152 }
3153 
3154 // __kmp_execute_tasks_template: Choose and execute tasks until either the
3155  // condition is satisfied (return true) or there are none left (return false).
3156 //
3157 // final_spin is TRUE if this is the spin at the release barrier.
3158 // thread_finished indicates whether the thread is finished executing all
3159 // the tasks it has on its deque, and is at the release barrier.
3160 // spinner is the location on which to spin.
3161 // spinner == NULL means only execute a single task and return.
3162 // checker is the value to check to terminate the spin.
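//
// An illustrative (not normative) call shape, assuming the flag template's
// default parameters and the barrier fields used elsewhere in the runtime:
//   kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
//   __kmp_execute_tasks_64(this_thr, gtid, &flag, final_spin,
//                          &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
//                          is_constrained);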
3163 template <class C>
3164 static inline int __kmp_execute_tasks_template(
3165  kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
3166  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3167  kmp_int32 is_constrained) {
3168  kmp_task_team_t *task_team = thread->th.th_task_team;
3169  kmp_thread_data_t *threads_data;
3170  kmp_task_t *task;
3171  kmp_info_t *other_thread;
3172  kmp_taskdata_t *current_task = thread->th.th_current_task;
3173  std::atomic<kmp_int32> *unfinished_threads;
3174  kmp_int32 nthreads, victim_tid = -2, use_own_tasks = 1, new_victim = 0,
3175  tid = thread->th.th_info.ds.ds_tid;
3176 
3177  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3178  KMP_DEBUG_ASSERT(thread == __kmp_threads[gtid]);
3179 
3180  if (task_team == NULL || current_task == NULL)
3181  return FALSE;
3182 
3183  KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d "
3184  "*thread_finished=%d\n",
3185  gtid, final_spin, *thread_finished));
3186 
3187  thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
3188  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3189 
3190  KMP_DEBUG_ASSERT(threads_data != NULL);
3191 
3192  nthreads = task_team->tt.tt_nproc;
3193  unfinished_threads = &(task_team->tt.tt_unfinished_threads);
3194  KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
3195 
3196  while (1) { // Outer loop keeps trying to find tasks in case of single thread
3197  // getting tasks from target constructs
3198  while (1) { // Inner loop to find a task and execute it
3199 #if ENABLE_LIBOMPTARGET
3200  // Give an opportunity to the offload runtime to make progress
3201  if (UNLIKELY(kmp_target_sync_cb))
3202  (*kmp_target_sync_cb)(NULL, gtid, KMP_TASKDATA_TO_TASK(current_task),
3203  NULL);
3204 #endif // ENABLE_LIBOMPTARGET
3205 
3206  task = NULL;
3207  if (task_team->tt.tt_num_task_pri) { // get priority task first
3208  task = __kmp_get_priority_task(gtid, task_team, is_constrained);
3209  }
3210  if (task == NULL && use_own_tasks) { // check own queue next
3211  task = __kmp_remove_my_task(thread, gtid, task_team, is_constrained);
3212  }
3213  if ((task == NULL) && (nthreads > 1)) { // Steal a task finally
3214  int asleep = 1;
3215  use_own_tasks = 0;
3216  // Try to steal from the last place I stole from successfully.
3217  if (victim_tid == -2) { // haven't stolen anything yet
3218  victim_tid = threads_data[tid].td.td_deque_last_stolen;
3219  if (victim_tid !=
3220  -1) // if we have a last stolen from victim, get the thread
3221  other_thread = threads_data[victim_tid].td.td_thr;
3222  }
3223  if (victim_tid != -1) { // found last victim
3224  asleep = 0;
3225  } else if (!new_victim) { // no recent steals and we haven't already
3226  // used a new victim; select a random thread
3227  do { // Find a different thread to steal work from.
3228  // Pick a random thread. Initial plan was to cycle through all the
3229  // threads, and only return if we tried to steal from every thread,
3230  // and failed. Arch says that's not such a great idea.
3231  victim_tid = __kmp_get_random(thread) % (nthreads - 1);
3232  if (victim_tid >= tid) {
3233  ++victim_tid; // Adjusts random distribution to exclude self
3234  }
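// For example, with nthreads == 4 and tid == 2, rand % 3 yields
// {0, 1, 2}, which maps to victims {0, 1, 3} with equal probability,
// so a thread never selects itself.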
3235  // Found a potential victim
3236  other_thread = threads_data[victim_tid].td.td_thr;
3237  // There is a slight chance that __kmp_enable_tasking() did not wake
3238  // up all threads waiting at the barrier. If the victim is sleeping,
3239  // then wake it up. Since we were going to pay the cache miss
3240  // penalty for referencing another thread's kmp_info_t struct
3241  // anyway, the check shouldn't cost too much performance at this
3242  // point.
3243  // In extra barrier mode, tasks do not sleep at the separate tasking
3244  // barrier, so this isn't a problem.
3245  asleep = 0;
3246  if ((__kmp_tasking_mode == tskm_task_teams) &&
3247  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
3248  (TCR_PTR(CCAST(void *, other_thread->th.th_sleep_loc)) !=
3249  NULL)) {
3250  asleep = 1;
3251  __kmp_null_resume_wrapper(other_thread);
3252  // A sleeping thread should not have any tasks on its queue.
3253  // There is a slight possibility that it resumes, steals a task
3254  // from another thread, which spawns more tasks, all in the time
3255  // that it takes this thread to check => don't write an assertion
3256  // that the victim's queue is empty. Try stealing from a
3257  // different thread.
3258  }
3259  } while (asleep);
3260  }
3261 
3262  if (!asleep) {
3263  // We have a victim to try to steal from
3264  task =
3265  __kmp_steal_task(victim_tid, gtid, task_team, unfinished_threads,
3266  thread_finished, is_constrained);
3267  }
3268  if (task != NULL) { // set last stolen to victim
3269  if (threads_data[tid].td.td_deque_last_stolen != victim_tid) {
3270  threads_data[tid].td.td_deque_last_stolen = victim_tid;
3271  // The pre-refactored code did not try more than 1 successful new
3272  // victim, unless the last one generated more local tasks;
3273  // new_victim keeps track of this
3274  new_victim = 1;
3275  }
3276  } else { // No tasks found; unset last_stolen
3277  KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
3278  victim_tid = -2; // no successful victim found
3279  }
3280  }
3281 
3282  if (task == NULL)
3283  break; // break out of tasking loop
3284 
3285 // Found a task; execute it
3286 #if USE_ITT_BUILD && USE_ITT_NOTIFY
3287  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
3288  if (itt_sync_obj == NULL) { // we are at fork barrier where we could not
3289  // get the object reliably
3290  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
3291  }
3292  __kmp_itt_task_starting(itt_sync_obj);
3293  }
3294 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
3295  __kmp_invoke_task(gtid, task, current_task);
3296 #if USE_ITT_BUILD
3297  if (itt_sync_obj != NULL)
3298  __kmp_itt_task_finished(itt_sync_obj);
3299 #endif /* USE_ITT_BUILD */
3300  // If this thread is only partway through the barrier and the condition is
3301  // met, then return now, so that the barrier gather/release pattern can
3302  // proceed. If this thread is in the last spin loop in the barrier,
3303  // waiting to be released, we know that the termination condition will not
3304  // be satisfied, so don't waste any cycles checking it.
3305  if (flag == NULL || (!final_spin && flag->done_check())) {
3306  KA_TRACE(
3307  15,
3308  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3309  gtid));
3310  return TRUE;
3311  }
3312  if (thread->th.th_task_team == NULL) {
3313  break;
3314  }
3315  KMP_YIELD(__kmp_library == library_throughput); // Yield before next task
3316  // If execution of a stolen task results in more tasks being placed on our
3317  // run queue, reset use_own_tasks
3318  if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
3319  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned "
3320  "other tasks, restart\n",
3321  gtid));
3322  use_own_tasks = 1;
3323  new_victim = 0;
3324  }
3325  }
3326 
3327  // The task source has been exhausted. If in final spin loop of barrier,
3328  // check if termination condition is satisfied. The work queue may be empty
3329  // but there might be proxy tasks still executing.
3330  if (final_spin &&
3331  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks) == 0) {
3332  // First, decrement the #unfinished threads, if that has not already been
3333  // done. This decrement might be to the spin location, and result in the
3334  // termination condition being satisfied.
3335  if (!*thread_finished) {
3336 #if KMP_DEBUG
3337  kmp_int32 count = -1 +
3338 #endif
3339  KMP_ATOMIC_DEC(unfinished_threads);
3340  KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec "
3341  "unfinished_threads to %d task_team=%p\n",
3342  gtid, count, task_team));
3343  *thread_finished = TRUE;
3344  }
3345 
3346  // It is now unsafe to reference thread->th.th_team !!!
3347  // Decrementing task_team->tt.tt_unfinished_threads can allow the primary
3348  // thread to pass through the barrier, where it might reset each thread's
3349  // th.th_team field for the next parallel region. If we can steal more
3350  // work, we know that this has not happened yet.
3351  if (flag != NULL && flag->done_check()) {
3352  KA_TRACE(
3353  15,
3354  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3355  gtid));
3356  return TRUE;
3357  }
3358  }
3359 
3360  // If this thread's task team is NULL, primary thread has recognized that
3361  // there are no more tasks; bail out
3362  if (thread->th.th_task_team == NULL) {
3363  KA_TRACE(15,
3364  ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid));
3365  return FALSE;
3366  }
3367 
3368  // Check the flag again to see if it is already done, to avoid being
3369  // trapped in an infinite loop when an if0 task depends on a hidden helper
3370  // task outside any parallel region. Detached tasks are not impacted in this
3371  // case because the only thread executing this function has to execute the
3372  // proxy task, so it is in another code path that has the same check.
3373  if (flag == NULL || (!final_spin && flag->done_check())) {
3374  KA_TRACE(15,
3375  ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n",
3376  gtid));
3377  return TRUE;
3378  }
3379 
3380  // We could be getting tasks from target constructs; if this is the only
3381  // thread, keep trying to execute tasks from own queue
3382  if (nthreads == 1 &&
3383  KMP_ATOMIC_LD_ACQ(&current_task->td_incomplete_child_tasks))
3384  use_own_tasks = 1;
3385  else {
3386  KA_TRACE(15,
3387  ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid));
3388  return FALSE;
3389  }
3390  }
3391 }
3392 
3393 template <bool C, bool S>
3394 int __kmp_execute_tasks_32(
3395  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32<C, S> *flag, int final_spin,
3396  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3397  kmp_int32 is_constrained) {
3398  return __kmp_execute_tasks_template(
3399  thread, gtid, flag, final_spin,
3400  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3401 }
3402 
3403 template <bool C, bool S>
3404 int __kmp_execute_tasks_64(
3405  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_64<C, S> *flag, int final_spin,
3406  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3407  kmp_int32 is_constrained) {
3408  return __kmp_execute_tasks_template(
3409  thread, gtid, flag, final_spin,
3410  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3411 }
3412 
3413 template <bool C, bool S>
3414 int __kmp_atomic_execute_tasks_64(
3415  kmp_info_t *thread, kmp_int32 gtid, kmp_atomic_flag_64<C, S> *flag,
3416  int final_spin, int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3417  kmp_int32 is_constrained) {
3418  return __kmp_execute_tasks_template(
3419  thread, gtid, flag, final_spin,
3420  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3421 }
3422 
3423 int __kmp_execute_tasks_oncore(
3424  kmp_info_t *thread, kmp_int32 gtid, kmp_flag_oncore *flag, int final_spin,
3425  int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
3426  kmp_int32 is_constrained) {
3427  return __kmp_execute_tasks_template(
3428  thread, gtid, flag, final_spin,
3429  thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
3430 }
3431 
3432 template int
3433 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
3434  kmp_flag_32<false, false> *, int,
3435  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3436 
3437 template int __kmp_execute_tasks_64<false, true>(kmp_info_t *, kmp_int32,
3438  kmp_flag_64<false, true> *,
3439  int,
3440  int *USE_ITT_BUILD_ARG(void *),
3441  kmp_int32);
3442 
3443 template int __kmp_execute_tasks_64<true, false>(kmp_info_t *, kmp_int32,
3444  kmp_flag_64<true, false> *,
3445  int,
3446  int *USE_ITT_BUILD_ARG(void *),
3447  kmp_int32);
3448 
3449 template int __kmp_atomic_execute_tasks_64<false, true>(
3450  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<false, true> *, int,
3451  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3452 
3453 template int __kmp_atomic_execute_tasks_64<true, false>(
3454  kmp_info_t *, kmp_int32, kmp_atomic_flag_64<true, false> *, int,
3455  int *USE_ITT_BUILD_ARG(void *), kmp_int32);
3456 
3457  // __kmp_enable_tasking: Allocate the task team's threads_data array and
3458  // resume threads sleeping at the next barrier so they can assist in
3459  // executing enqueued tasks. The first thread in sets up the array atomically.
3460 static void __kmp_enable_tasking(kmp_task_team_t *task_team,
3461  kmp_info_t *this_thr) {
3462  kmp_thread_data_t *threads_data;
3463  int nthreads, i, is_init_thread;
3464 
3465  KA_TRACE(10, ("__kmp_enable_tasking(enter): T#%d\n",
3466  __kmp_gtid_from_thread(this_thr)));
3467 
3468  KMP_DEBUG_ASSERT(task_team != NULL);
3469  KMP_DEBUG_ASSERT(this_thr->th.th_team != NULL);
3470 
3471  nthreads = task_team->tt.tt_nproc;
3472  KMP_DEBUG_ASSERT(nthreads > 0);
3473  KMP_DEBUG_ASSERT(nthreads == this_thr->th.th_team->t.t_nproc);
3474 
3475  // Allocate or increase the size of threads_data if necessary
3476  is_init_thread = __kmp_realloc_task_threads_data(this_thr, task_team);
3477 
3478  if (!is_init_thread) {
3479  // Some other thread already set up the array.
3480  KA_TRACE(
3481  20,
3482  ("__kmp_enable_tasking(exit): T#%d: threads array already set up.\n",
3483  __kmp_gtid_from_thread(this_thr)));
3484  return;
3485  }
3486  threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
3487  KMP_DEBUG_ASSERT(threads_data != NULL);
3488 
3489  if (__kmp_tasking_mode == tskm_task_teams &&
3490  (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME)) {
3491  // Release any threads sleeping at the barrier, so that they can steal
3492  // tasks and execute them. In extra barrier mode, tasks do not sleep
3493  // at the separate tasking barrier, so this isn't a problem.
3494  for (i = 0; i < nthreads; i++) {
3495  void *sleep_loc;
3496  kmp_info_t *thread = threads_data[i].td.td_thr;
3497 
3498  if (i == this_thr->th.th_info.ds.ds_tid) {
3499  continue;
3500  }
3501  // Since we haven't locked the thread's suspend mutex lock at this
3502  // point, there is a small window where a thread might be putting
3503  // itself to sleep, but hasn't set the th_sleep_loc field yet.
3504  // To work around this, __kmp_execute_tasks_template() periodically checks
3505  // to see if other threads are sleeping (using the same random mechanism
3506  // that is used for task stealing) and awakens them if they are.
3507  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3508  NULL) {
3509  KF_TRACE(50, ("__kmp_enable_tasking: T#%d waking up thread T#%d\n",
3510  __kmp_gtid_from_thread(this_thr),
3511  __kmp_gtid_from_thread(thread)));
3512  __kmp_null_resume_wrapper(thread);
3513  } else {
3514  KF_TRACE(50, ("__kmp_enable_tasking: T#%d don't wake up thread T#%d\n",
3515  __kmp_gtid_from_thread(this_thr),
3516  __kmp_gtid_from_thread(thread)));
3517  }
3518  }
3519  }
3520 
3521  KA_TRACE(10, ("__kmp_enable_tasking(exit): T#%d\n",
3522  __kmp_gtid_from_thread(this_thr)));
3523 }
3524 
3525 /*
3526  * Utility routines for "task teams". A task team (kmp_task_team_t) is kind
3527  * of like a shadow of the kmp_team_t data struct, with a different lifetime.
3528  * After a child thread checks into a barrier and calls __kmp_release() from
3529  * the particular variant of __kmp_<barrier_kind>_barrier_gather(), it can no
3530  * longer assume that the kmp_team_t structure is intact (at any moment, the
3531  * primary thread may exit the barrier code and free the team data structure,
3532  * and return the threads to the thread pool).
3533  *
3534  * This does not work with the tasking code, as the thread is still
3535  * expected to participate in the execution of any tasks that may have been
3536  * spawned by a member of the team, and the thread still needs access to
3537  * each thread in the team, so that it can steal work from it.
3538  *
3539  * Enter the existence of the kmp_task_team_t struct. It employs a reference
3540  * counting mechanism, and is allocated by the primary thread before calling
3541  * __kmp_<barrier_kind>_release, and then is released by the last thread to
3542  * exit __kmp_<barrier_kind>_release at the next barrier. I.e. the lifetimes
3543  * of the kmp_task_team_t structs for consecutive barriers can overlap
3544  * (and will, unless the primary thread is the last thread to exit the barrier
3545  * release phase, which is not typical). Such a struct could also be
3546  * useful outside the context of tasking.
3547  *
3548  * We currently use the existence of the threads array as an indicator that
3549  * tasks were spawned since the last barrier. If the structure is to be
3550  * useful outside the context of tasking, then this will have to change, but
3551  * not setting the field minimizes the performance impact of tasking on
3552  * barriers, when no explicit tasks were spawned (pushed, actually).
3553  */
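/*
 * A rough sketch of the resulting double buffering (illustrative, not
 * normative): each kmp_team_t carries two task team pointers,
 * t_task_team[0..1], and each thread carries a parity bit, th_task_state,
 * selecting between them:
 *
 *   kmp_task_team_t *tt = team->t.t_task_team[thr->th.th_task_state];
 *   // ... barrier release phase completes ...
 *   thr->th.th_task_state = (kmp_uint8)(1 - thr->th.th_task_state);
 *   tt = team->t.t_task_team[thr->th.th_task_state]; // next region's team
 *
 * __kmp_task_team_setup() below keeps both slots allocated, and
 * __kmp_task_team_sync() performs the parity toggle after each barrier.
 */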
3554 
3555 static kmp_task_team_t *__kmp_free_task_teams =
3556  NULL; // Free list for task_team data structures
3557 // Lock for task team data structures
3558 kmp_bootstrap_lock_t __kmp_task_team_lock =
3559  KMP_BOOTSTRAP_LOCK_INITIALIZER(__kmp_task_team_lock);
3560 
3561 // __kmp_alloc_task_deque:
3562  // Allocates a task deque for a particular thread, and initializes the necessary
3563 // data structures relating to the deque. This only happens once per thread
3564 // per task team since task teams are recycled. No lock is needed during
3565 // allocation since each thread allocates its own deque.
3566 static void __kmp_alloc_task_deque(kmp_info_t *thread,
3567  kmp_thread_data_t *thread_data) {
3568  __kmp_init_bootstrap_lock(&thread_data->td.td_deque_lock);
3569  KMP_DEBUG_ASSERT(thread_data->td.td_deque == NULL);
3570 
3571  // Initialize last stolen task field to "none"
3572  thread_data->td.td_deque_last_stolen = -1;
3573 
3574  KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) == 0);
3575  KMP_DEBUG_ASSERT(thread_data->td.td_deque_head == 0);
3576  KMP_DEBUG_ASSERT(thread_data->td.td_deque_tail == 0);
3577 
3578  KE_TRACE(
3579  10,
3580  ("__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
3581  __kmp_gtid_from_thread(thread), INITIAL_TASK_DEQUE_SIZE, thread_data));
3582  // Allocate space for task deque, and zero the deque
3583  // Cannot use __kmp_thread_calloc() because threads not around for
3584  // kmp_reap_task_team( ).
3585  thread_data->td.td_deque = (kmp_taskdata_t **)__kmp_allocate(
3586  INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
3587  thread_data->td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
3588 }
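// Keeping the deque size a power of two lets head/tail wrap-around be done
// with a mask instead of a modulo: TASK_DEQUE_MASK(td) is size - 1, so, for
// example, (tail + 1) & TASK_DEQUE_MASK(td) == (tail + 1) % size.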
3589 
3590 // __kmp_free_task_deque:
3591 // Deallocates a task deque for a particular thread. Happens at library
3592 // deallocation so don't need to reset all thread data fields.
3593 static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) {
3594  if (thread_data->td.td_deque != NULL) {
3595  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
3596  TCW_4(thread_data->td.td_deque_ntasks, 0);
3597  __kmp_free(thread_data->td.td_deque);
3598  thread_data->td.td_deque = NULL;
3599  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
3600  }
3601 }
3602 
3603 // __kmp_realloc_task_threads_data:
3604 // Allocates a threads_data array for a task team, either by allocating an
3605 // initial array or enlarging an existing array. Only the first thread to get
3606 // the lock allocs or enlarges the array and re-initializes the array elements.
3607 // That thread returns "TRUE", the rest return "FALSE".
3608 // Assumes that the new array size is given by task_team -> tt.tt_nproc.
3609 // The current size is given by task_team -> tt.tt_max_threads.
3610 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
3611  kmp_task_team_t *task_team) {
3612  kmp_thread_data_t **threads_data_p;
3613  kmp_int32 nthreads, maxthreads;
3614  int is_init_thread = FALSE;
3615 
3616  if (TCR_4(task_team->tt.tt_found_tasks)) {
3617  // Already reallocated and initialized.
3618  return FALSE;
3619  }
3620 
3621  threads_data_p = &task_team->tt.tt_threads_data;
3622  nthreads = task_team->tt.tt_nproc;
3623  maxthreads = task_team->tt.tt_max_threads;
3624 
3625  // All threads must lock when they encounter the first task of the implicit
3626  // task region to make sure threads_data fields are (re)initialized before
3627  // used.
3628  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3629 
3630  if (!TCR_4(task_team->tt.tt_found_tasks)) {
3631  // first thread to enable tasking
3632  kmp_team_t *team = thread->th.th_team;
3633  int i;
3634 
3635  is_init_thread = TRUE;
3636  if (maxthreads < nthreads) {
3637 
3638  if (*threads_data_p != NULL) {
3639  kmp_thread_data_t *old_data = *threads_data_p;
3640  kmp_thread_data_t *new_data = NULL;
3641 
3642  KE_TRACE(
3643  10,
3644  ("__kmp_realloc_task_threads_data: T#%d reallocating "
3645  "threads data for task_team %p, new_size = %d, old_size = %d\n",
3646  __kmp_gtid_from_thread(thread), task_team, nthreads, maxthreads));
3647  // Reallocate threads_data to have more elements than current array
3648  // Cannot use __kmp_thread_realloc() because threads not around for
3649  // kmp_reap_task_team( ). Note all new array entries are initialized
3650  // to zero by __kmp_allocate().
3651  new_data = (kmp_thread_data_t *)__kmp_allocate(
3652  nthreads * sizeof(kmp_thread_data_t));
3653  // copy old data to new data
3654  KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t),
3655  (void *)old_data, maxthreads * sizeof(kmp_thread_data_t));
3656 
3657  // Install the new data and free the old data
3658  (*threads_data_p) = new_data;
3659  __kmp_free(old_data);
3660  } else {
3661  KE_TRACE(10, ("__kmp_realloc_task_threads_data: T#%d allocating "
3662  "threads data for task_team %p, size = %d\n",
3663  __kmp_gtid_from_thread(thread), task_team, nthreads));
3664  // Make the initial allocate for threads_data array, and zero entries
3665  // Cannot use __kmp_thread_calloc() because threads not around for
3666  // kmp_reap_task_team( ).
3667  *threads_data_p = (kmp_thread_data_t *)__kmp_allocate(
3668  nthreads * sizeof(kmp_thread_data_t));
3669  }
3670  task_team->tt.tt_max_threads = nthreads;
3671  } else {
3672  // If array has (more than) enough elements, go ahead and use it
3673  KMP_DEBUG_ASSERT(*threads_data_p != NULL);
3674  }
3675 
3676  // initialize threads_data pointers back to thread_info structures
3677  for (i = 0; i < nthreads; i++) {
3678  kmp_thread_data_t *thread_data = &(*threads_data_p)[i];
3679  thread_data->td.td_thr = team->t.t_threads[i];
3680 
3681  if (thread_data->td.td_deque_last_stolen >= nthreads) {
3682  // The last stolen field survives across teams / barrier, and the number
3683  // of threads may have changed. It's possible (likely?) that a new
3684  // parallel region will exhibit the same behavior as previous region.
3685  thread_data->td.td_deque_last_stolen = -1;
3686  }
3687  }
3688 
3689  KMP_MB();
3690  TCW_SYNC_4(task_team->tt.tt_found_tasks, TRUE);
3691  }
3692 
3693  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3694  return is_init_thread;
3695 }
3696 
3697 // __kmp_free_task_threads_data:
3698 // Deallocates a threads_data array for a task team, including any attached
3699 // tasking deques. Only occurs at library shutdown.
3700 static void __kmp_free_task_threads_data(kmp_task_team_t *task_team) {
3701  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_threads_lock);
3702  if (task_team->tt.tt_threads_data != NULL) {
3703  int i;
3704  for (i = 0; i < task_team->tt.tt_max_threads; i++) {
3705  __kmp_free_task_deque(&task_team->tt.tt_threads_data[i]);
3706  }
3707  __kmp_free(task_team->tt.tt_threads_data);
3708  task_team->tt.tt_threads_data = NULL;
3709  }
3710  __kmp_release_bootstrap_lock(&task_team->tt.tt_threads_lock);
3711 }
3712 
3713 // __kmp_free_task_pri_list:
3714 // Deallocates tasking deques used for priority tasks.
3715 // Only occurs at library shutdown.
3716 static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
3717  __kmp_acquire_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3718  if (task_team->tt.tt_task_pri_list != NULL) {
3719  kmp_task_pri_t *list = task_team->tt.tt_task_pri_list;
3720  while (list != NULL) {
3721  kmp_task_pri_t *next = list->next;
3722  __kmp_free_task_deque(&list->td);
3723  __kmp_free(list);
3724  list = next;
3725  }
3726  task_team->tt.tt_task_pri_list = NULL;
3727  }
3728  __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3729 }
3730 
3731 static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
3732  kmp_team_t *team) {
3733  int team_nth = team->t.t_nproc;
3734  // Only need to init if task team isn't active or team size changed
3735  if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
3736  TCW_4(task_team->tt.tt_found_tasks, FALSE);
3737  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
3738  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
3739  TCW_4(task_team->tt.tt_nproc, team_nth);
3740  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
3741  TCW_4(task_team->tt.tt_active, TRUE);
3742  }
3743 }
3744 
3745 // __kmp_allocate_task_team:
3746 // Allocates a task team associated with a specific team, taking it from
3747 // the global task team free list if possible. Also initializes data
3748 // structures.
3749 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
3750  kmp_team_t *team) {
3751  kmp_task_team_t *task_team = NULL;
3752 
3753  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
3754  (thread ? __kmp_gtid_from_thread(thread) : -1), team));
3755 
3756  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3757  // Take a task team from the task team pool
3758  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3759  if (__kmp_free_task_teams != NULL) {
3760  task_team = __kmp_free_task_teams;
3761  TCW_PTR(__kmp_free_task_teams, task_team->tt.tt_next);
3762  task_team->tt.tt_next = NULL;
3763  }
3764  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3765  }
3766 
3767  if (task_team == NULL) {
3768  KE_TRACE(10, ("__kmp_allocate_task_team: T#%d allocating "
3769  "task team for team %p\n",
3770  __kmp_gtid_from_thread(thread), team));
3771  // Allocate a new task team if one is not available. Cannot use
3772  // __kmp_thread_malloc because threads not around for kmp_reap_task_team.
3773  task_team = (kmp_task_team_t *)__kmp_allocate(sizeof(kmp_task_team_t));
3774  __kmp_init_bootstrap_lock(&task_team->tt.tt_threads_lock);
3775  __kmp_init_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
3776 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
3777  // suppress race conditions detection on synchronization flags in debug mode
3778  // this helps to analyze library internals eliminating false positives
3779  __itt_suppress_mark_range(
3780  __itt_suppress_range, __itt_suppress_threading_errors,
3781  &task_team->tt.tt_found_tasks, sizeof(task_team->tt.tt_found_tasks));
3782  __itt_suppress_mark_range(__itt_suppress_range,
3783  __itt_suppress_threading_errors,
3784  CCAST(kmp_uint32 *, &task_team->tt.tt_active),
3785  sizeof(task_team->tt.tt_active));
3786 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
3787  // Note: __kmp_allocate zeroes returned memory, otherwise we would need:
3788  // task_team->tt.tt_threads_data = NULL;
3789  // task_team->tt.tt_max_threads = 0;
3790  // task_team->tt.tt_next = NULL;
3791  }
3792 
3793  __kmp_task_team_init(task_team, team);
3794 
3795  KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
3796  "unfinished_threads init'd to %d\n",
3797  (thread ? __kmp_gtid_from_thread(thread) : -1), task_team,
3798  KMP_ATOMIC_LD_RLX(&task_team->tt.tt_unfinished_threads)));
3799  return task_team;
3800 }
3801 
3802 // __kmp_free_task_team:
3803 // Frees the task team associated with a specific thread, and adds it
3804 // to the global task team free list.
3805 void __kmp_free_task_team(kmp_info_t *thread, kmp_task_team_t *task_team) {
3806  KA_TRACE(20, ("__kmp_free_task_team: T#%d task_team = %p\n",
3807  thread ? __kmp_gtid_from_thread(thread) : -1, task_team));
3808 
3809  // Put task team back on free list
3810  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3811 
3812  KMP_DEBUG_ASSERT(task_team->tt.tt_next == NULL);
3813  task_team->tt.tt_next = __kmp_free_task_teams;
3814  TCW_PTR(__kmp_free_task_teams, task_team);
3815 
3816  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3817 }
3818 
3819 // __kmp_reap_task_teams:
3820 // Free all the task teams on the task team free list.
3821 // Should only be done during library shutdown.
3822 // Cannot do anything that needs a thread structure or gtid since they are
3823 // already gone.
3824 void __kmp_reap_task_teams(void) {
3825  kmp_task_team_t *task_team;
3826 
3827  if (TCR_PTR(__kmp_free_task_teams) != NULL) {
3828  // Free all task_teams on the free list
3829  __kmp_acquire_bootstrap_lock(&__kmp_task_team_lock);
3830  while ((task_team = __kmp_free_task_teams) != NULL) {
3831  __kmp_free_task_teams = task_team->tt.tt_next;
3832  task_team->tt.tt_next = NULL;
3833 
3834  // Free threads_data if necessary
3835  if (task_team->tt.tt_threads_data != NULL) {
3836  __kmp_free_task_threads_data(task_team);
3837  }
3838  if (task_team->tt.tt_task_pri_list != NULL) {
3839  __kmp_free_task_pri_list(task_team);
3840  }
3841  __kmp_free(task_team);
3842  }
3843  __kmp_release_bootstrap_lock(&__kmp_task_team_lock);
3844  }
3845 }
3846 
3847 // View the array of two task team pointers as a pair of pointers:
3848 // 1) a single task_team pointer
3849 // 2) next pointer for stack
3850 // Serial teams can create a stack of task teams for nested serial teams.
3851 void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
3852  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3853  kmp_task_team_list_t *current =
3854  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3855  kmp_task_team_list_t *node =
3856  (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
3857  node->task_team = current->task_team;
3858  node->next = current->next;
3859  thread->th.th_task_team = current->task_team = NULL;
3860  current->next = node;
3861 }
3862 
3863 // Serial team pops a task team off the stack
3864 void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
3865  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3866  kmp_task_team_list_t *current =
3867  (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
3868  if (current->task_team) {
3869  __kmp_free_task_team(thread, current->task_team);
3870  }
3871  kmp_task_team_list_t *next = current->next;
3872  if (next) {
3873  current->task_team = next->task_team;
3874  current->next = next->next;
3875  KMP_DEBUG_ASSERT(next != current);
3876  __kmp_free(next);
3877  thread->th.th_task_team = current->task_team;
3878  }
3879 }
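// An illustrative trigger for the push/pop pair above (assuming nested
// serialized regions; the exact entry points live in the fork/join code):
//   #pragma omp parallel num_threads(1) // serialized team
//   {
//     #pragma omp parallel num_threads(1) // nested: pushes a task team node
//     { /* ... */ }                       // join: pops, restoring the outer
//   }                                     // level's task team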
3880 
3881 // __kmp_wait_to_unref_task_teams:
3882 // Some threads could still be in the fork barrier release code, possibly
3883 // trying to steal tasks. Wait for each thread to unreference its task team.
3884 void __kmp_wait_to_unref_task_teams(void) {
3885  kmp_info_t *thread;
3886  kmp_uint32 spins;
3887  kmp_uint64 time;
3888  int done;
3889 
3890  KMP_INIT_YIELD(spins);
3891  KMP_INIT_BACKOFF(time);
3892 
3893  for (;;) {
3894  done = TRUE;
3895 
3896  // TODO: GEH - this may be wrong because some sync would be necessary
3897  // in case threads are added to the pool during the traversal. Need to
3898  // verify that lock for thread pool is held when calling this routine.
3899  for (thread = CCAST(kmp_info_t *, __kmp_thread_pool); thread != NULL;
3900  thread = thread->th.th_next_pool) {
3901 #if KMP_OS_WINDOWS
3902  DWORD exit_val;
3903 #endif
3904  if (TCR_PTR(thread->th.th_task_team) == NULL) {
3905  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: T#%d task_team == NULL\n",
3906  __kmp_gtid_from_thread(thread)));
3907  continue;
3908  }
3909 #if KMP_OS_WINDOWS
3910  // TODO: GEH - add this check for Linux* OS / OS X* as well?
3911  if (!__kmp_is_thread_alive(thread, &exit_val)) {
3912  thread->th.th_task_team = NULL;
3913  continue;
3914  }
3915 #endif
3916 
3917  done = FALSE; // Because th_task_team pointer is not NULL for this thread
3918 
3919  KA_TRACE(10, ("__kmp_wait_to_unref_task_team: Waiting for T#%d to "
3920  "unreference task_team\n",
3921  __kmp_gtid_from_thread(thread)));
3922 
3923  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
3924  void *sleep_loc;
3925  // If the thread is sleeping, awaken it.
3926  if ((sleep_loc = TCR_PTR(CCAST(void *, thread->th.th_sleep_loc))) !=
3927  NULL) {
3928  KA_TRACE(
3929  10,
3930  ("__kmp_wait_to_unref_task_team: T#%d waking up thread T#%d\n",
3931  __kmp_get_gtid(), __kmp_gtid_from_thread(thread)));
3932  __kmp_null_resume_wrapper(thread);
3933  }
3934  }
3935  }
3936  if (done) {
3937  break;
3938  }
3939 
3940  // If oversubscribed or have waited a bit, yield.
3941  KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
3942  }
3943 }
3944 
3945 // __kmp_task_team_setup: Create a task_team for the current team, but use
3946 // an already created, unused one if it already exists.
3947 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
3948  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
3949 
3950  // For the serial and root teams, set up the first task team pointer to
3951  // point to the task team. The other pointer is a stack of task teams from
3952  // previous serial levels.
3953  if (team == this_thr->th.th_serial_team ||
3954  team == this_thr->th.th_root->r.r_root_team) {
3955  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
3956  if (team->t.t_task_team[0] == NULL) {
3957  team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
3958  KA_TRACE(
3959  20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3960  " for serial/root team %p\n",
3961  __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
3962 
3963  } else
3964  __kmp_task_team_init(team->t.t_task_team[0], team);
3965  return;
3966  }
3967 
3968  // If this task_team hasn't been created yet, allocate it. It will be used in
3969  // the region after the next.
3970  // If it exists, it is the current task team and shouldn't be touched yet as
3971  // it may still be in use.
3972  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
3973  team->t.t_task_team[this_thr->th.th_task_state] =
3974  __kmp_allocate_task_team(this_thr, team);
3975  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
3976  " for team %d at parity=%d\n",
3977  __kmp_gtid_from_thread(this_thr),
3978  team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
3979  this_thr->th.th_task_state));
3980  }
3981 
3982  // After threads exit the release, they will call sync, and then point to this
3983  // other task_team; make sure it is allocated and properly initialized. As
3984  // threads spin in the barrier release phase, they will continue to use the
3985  // previous task_team struct(above), until they receive the signal to stop
3986  // checking for tasks (they can't safely reference the kmp_team_t struct,
3987  // which could be reallocated by the primary thread).
3988  int other_team = 1 - this_thr->th.th_task_state;
3989  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
3990  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
3991  team->t.t_task_team[other_team] = __kmp_allocate_task_team(this_thr, team);
3992  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
3993  "task_team %p for team %d at parity=%d\n",
3994  __kmp_gtid_from_thread(this_thr),
3995  team->t.t_task_team[other_team], team->t.t_id, other_team));
3996  } else { // Leave the old task team struct in place for the upcoming region;
3997  // adjust as needed
3998  kmp_task_team_t *task_team = team->t.t_task_team[other_team];
3999  __kmp_task_team_init(task_team, team);
4000  // if team size has changed, the first thread to enable tasking will
4001  // realloc threads_data if necessary
4002  KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
4003  "%p for team %d at parity=%d\n",
4004  __kmp_gtid_from_thread(this_thr),
4005  team->t.t_task_team[other_team], team->t.t_id, other_team));
4006  }
4007 
4008  // For a regular thread, task enabling should be called when the task is
4009  // going to be pushed to a deque. However, for the hidden helper thread, we
4010  // need it ahead of time so that some operations can be performed without
4011  // race conditions.
4012  if (this_thr == __kmp_hidden_helper_main_thread) {
4013  for (int i = 0; i < 2; ++i) {
4014  kmp_task_team_t *task_team = team->t.t_task_team[i];
4015  if (KMP_TASKING_ENABLED(task_team)) {
4016  continue;
4017  }
4018  __kmp_enable_tasking(task_team, this_thr);
4019  for (int j = 0; j < task_team->tt.tt_nproc; ++j) {
4020  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[j];
4021  if (thread_data->td.td_deque == NULL) {
4022  __kmp_alloc_task_deque(__kmp_hidden_helper_threads[j], thread_data);
4023  }
4024  }
4025  }
4026  }
4027 }
4028 
4029 // __kmp_task_team_sync: Propagation of task team data from team to threads
4030 // which happens just after the release phase of a team barrier. This may be
4031 // called by any thread. This is not called for serial or root teams.
4032 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
4033  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4034  KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
4035  KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
4036 
4037  // Toggle the th_task_state field, to switch which task_team this thread
4038  // refers to
4039  this_thr->th.th_task_state = (kmp_uint8)(1 - this_thr->th.th_task_state);
4040 
4041  // It is now safe to propagate the task team pointer from the team struct to
4042  // the current thread.
4043  TCW_PTR(this_thr->th.th_task_team,
4044  team->t.t_task_team[this_thr->th.th_task_state]);
4045  KA_TRACE(20,
4046  ("__kmp_task_team_sync: Thread T#%d task team switched to task_team "
4047  "%p from Team #%d (parity=%d)\n",
4048  __kmp_gtid_from_thread(this_thr), this_thr->th.th_task_team,
4049  team->t.t_id, this_thr->th.th_task_state));
4050 }
4051 
4052 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
4053 // barrier gather phase. Only called by the primary thread.
4054 //
4055 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
4056 // by passing in 0 optionally as the last argument. When wait is zero, primary
4057 // thread does not wait for unfinished_threads to reach 0.
4058 void __kmp_task_team_wait(
4059  kmp_info_t *this_thr,
4060  kmp_team_t *team USE_ITT_BUILD_ARG(void *itt_sync_obj), int wait) {
4061  kmp_task_team_t *task_team = team->t.t_task_team[this_thr->th.th_task_state];
4062 
4063  KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
4064  KMP_DEBUG_ASSERT(task_team == this_thr->th.th_task_team);
4065 
4066  if ((task_team != NULL) && KMP_TASKING_ENABLED(task_team)) {
4067  if (wait) {
4068  KA_TRACE(20, ("__kmp_task_team_wait: Primary T#%d waiting for all tasks "
4069  "(for unfinished_threads to reach 0) on task_team = %p\n",
4070  __kmp_gtid_from_thread(this_thr), task_team));
4071  // Worker threads may have dropped through to release phase, but could
4072  // still be executing tasks. Wait here for tasks to complete. To avoid
4073  // memory contention, only primary thread checks termination condition.
4074  kmp_flag_32<false, false> flag(
4075  RCAST(std::atomic<kmp_uint32> *,
4076  &task_team->tt.tt_unfinished_threads),
4077  0U);
4078  flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
4079  }
4080  // Deactivate the old task team, so that the worker threads will stop
4081  // referencing it while spinning.
4082  KA_TRACE(
4083  20,
4084  ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
4085  "setting active to false, setting local and team's pointer to NULL\n",
4086  __kmp_gtid_from_thread(this_thr), task_team));
4087  TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
4088  TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
4089  KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
4090  TCW_SYNC_4(task_team->tt.tt_active, FALSE);
4091  KMP_MB();
4092 
4093  TCW_PTR(this_thr->th.th_task_team, NULL);
4094  }
4095 }
4096 
4097 // __kmp_tasking_barrier:
4098 // This routine is called only when __kmp_tasking_mode == tskm_extra_barrier.
4099 // Internal function to execute all tasks prior to a regular barrier or a join
4100 // barrier. It is a full barrier itself, which unfortunately turns regular
4101 // barriers into double barriers and join barriers into 1 1/2 barriers.
4102 void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread, int gtid) {
4103  std::atomic<kmp_uint32> *spin = RCAST(
4104  std::atomic<kmp_uint32> *,
4105  &team->t.t_task_team[thread->th.th_task_state]->tt.tt_unfinished_threads);
4106  int flag = FALSE;
4107  KMP_DEBUG_ASSERT(__kmp_tasking_mode == tskm_extra_barrier);
4108 
4109 #if USE_ITT_BUILD
4110  KMP_FSYNC_SPIN_INIT(spin, NULL);
4111 #endif /* USE_ITT_BUILD */
4112  kmp_flag_32<false, false> spin_flag(spin, 0U);
4113  while (!spin_flag.execute_tasks(thread, gtid, TRUE,
4114  &flag USE_ITT_BUILD_ARG(NULL), 0)) {
4115 #if USE_ITT_BUILD
4116  // TODO: What about itt_sync_obj??
4117  KMP_FSYNC_SPIN_PREPARE(RCAST(void *, spin));
4118 #endif /* USE_ITT_BUILD */
4119 
4120  if (TCR_4(__kmp_global.g.g_done)) {
4121  if (__kmp_global.g.g_abort)
4122  __kmp_abort_thread();
4123  break;
4124  }
4125  KMP_YIELD(TRUE);
4126  }
4127 #if USE_ITT_BUILD
4128  KMP_FSYNC_SPIN_ACQUIRED(RCAST(void *, spin));
4129 #endif /* USE_ITT_BUILD */
4130 }
4131 
4132 // __kmp_give_task puts a task into a given thread queue if:
4133 // - the queue for that thread was created
4134 // - there's space in that queue
4135 // Because of this, __kmp_push_task needs to check if there's space after
4136 // getting the lock
4137 static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
4138  kmp_int32 pass) {
4139  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4140  kmp_task_team_t *task_team = taskdata->td_task_team;
4141 
4142  KA_TRACE(20, ("__kmp_give_task: trying to give task %p to thread %d.\n",
4143  taskdata, tid));
4144 
4145  // If task_team is NULL something went really bad...
4146  KMP_DEBUG_ASSERT(task_team != NULL);
4147 
4148  bool result = false;
4149  kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[tid];
4150 
4151  if (thread_data->td.td_deque == NULL) {
4152  // There's no queue in this thread, go find another one
4153  // We're guaranteed that at least one thread has a queue
4154  KA_TRACE(30,
4155  ("__kmp_give_task: thread %d has no queue while giving task %p.\n",
4156  tid, taskdata));
4157  return result;
4158  }
4159 
4160  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4161  TASK_DEQUE_SIZE(thread_data->td)) {
4162  KA_TRACE(
4163  30,
4164  ("__kmp_give_task: queue is full while giving task %p to thread %d.\n",
4165  taskdata, tid));
4166 
4167  // if this deque has already grown beyond the pass ratio, give another
4168  // thread a chance
4169  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4170  return result;
4171 
4172  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4173  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4174  TASK_DEQUE_SIZE(thread_data->td)) {
4175  // expand deque to push the task which is not allowed to execute
4176  __kmp_realloc_task_deque(thread, thread_data);
4177  }
4178 
4179  } else {
4180 
4181  __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
4182 
4183  if (TCR_4(thread_data->td.td_deque_ntasks) >=
4184  TASK_DEQUE_SIZE(thread_data->td)) {
4185  KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to "
4186  "thread %d.\n",
4187  taskdata, tid));
4188 
4189  // if this deque has already grown beyond the pass ratio, give another
4190  // thread a chance
4191  if (TASK_DEQUE_SIZE(thread_data->td) / INITIAL_TASK_DEQUE_SIZE >= pass)
4192  goto release_and_exit;
4193 
4194  __kmp_realloc_task_deque(thread, thread_data);
4195  }
4196  }
4197 
4198  // lock is held here, and there is space in the deque
4199 
4200  thread_data->td.td_deque[thread_data->td.td_deque_tail] = taskdata;
4201  // Wrap index.
4202  thread_data->td.td_deque_tail =
4203  (thread_data->td.td_deque_tail + 1) & TASK_DEQUE_MASK(thread_data->td);
4204  TCW_4(thread_data->td.td_deque_ntasks,
4205  TCR_4(thread_data->td.td_deque_ntasks) + 1);
4206 
4207  result = true;
4208  KA_TRACE(30, ("__kmp_give_task: successfully gave task %p to thread %d.\n",
4209  taskdata, tid));
4210 
4211 release_and_exit:
4212  __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
4213 
4214  return result;
4215 }
4216 
4217 #define PROXY_TASK_FLAG 0x40000000
4218 /* The finish of the proxy tasks is divided in two pieces:
4219  - the top half is the one that can be done from a thread outside the team
4220  - the bottom half must be run from a thread within the team
4221 
4222  In order to run the bottom half, the task gets queued back into one of the
4223  threads of the team. Once the td_incomplete_child_tasks counter of the
4224  parent is decremented, the threads can leave the barriers. So, the bottom
4225  half needs to be queued before the counter is decremented. The top half is
4226  divided in two parts:
4227  - things that can be run before queuing the bottom half
4228  - things that must be run after queuing the bottom half
4229 
4230  This creates a second race as the bottom half can free the task before the
4231  second top half is executed. To avoid this we use the
4232  td_incomplete_child_task of the proxy task to synchronize the top and bottom
4233  half. */
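/* Putting the pieces together, the order of operations on the out-of-team
   completion path (an illustrative summary of the routines below) is:
     __kmp_first_top_half_finish_proxy:  mark the task complete, decrement the
                                         taskgroup count, set PROXY_TASK_FLAG
     __kmpc_give_task:                   queue the bottom half into a thread
                                         of the team
     __kmp_second_top_half_finish_proxy: decrement the parent's incomplete
                                         child count, clear PROXY_TASK_FLAG
     __kmp_bottom_half_finish_proxy:     spin until PROXY_TASK_FLAG clears,
                                         then release deps and free the task */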
4234 static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4235  KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
4236  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4237  KMP_DEBUG_ASSERT(taskdata->td_flags.complete == 0);
4238  KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
4239 
4240  taskdata->td_flags.complete = 1; // mark the task as completed
4241 #if OMPX_TASKGRAPH
4242  taskdata->td_flags.onced = 1;
4243 #endif
4244 
4245  if (taskdata->td_taskgroup)
4246  KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
4247 
4248  // Create an imaginary child for this task so the bottom half cannot
4249  // release the task before we have completed the second top half
4250  KMP_ATOMIC_OR(&taskdata->td_incomplete_child_tasks, PROXY_TASK_FLAG);
4251 }
4252 
4253 static void __kmp_second_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
4254 #if KMP_DEBUG
4255  kmp_int32 children = 0;
4256  // Predecrement simulated by "- 1" calculation
4257  children = -1 +
4258 #endif
4259  KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
4260  KMP_DEBUG_ASSERT(children >= 0);
4261 
4262  // Remove the imaginary child
4263  KMP_ATOMIC_AND(&taskdata->td_incomplete_child_tasks, ~PROXY_TASK_FLAG);
4264 }
4265 
4266 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask) {
4267  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4268  kmp_info_t *thread = __kmp_threads[gtid];
4269 
4270  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4271  KMP_DEBUG_ASSERT(taskdata->td_flags.complete ==
4272  1); // top half must run before bottom half
4273 
4274  // We need to wait to make sure the top half is finished
4275  // Spinning here should be ok as this should happen quickly
4276  while ((KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) &
4277  PROXY_TASK_FLAG) > 0)
4278  ;
4279 
4280  __kmp_release_deps(gtid, taskdata);
4281  __kmp_free_task_and_ancestors(gtid, taskdata, thread);
4282 }
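
// The ordering is made concrete by the two entry points below (a sketch
// distilled from them, not additional runtime logic):
//
//   in-team completion:                 out-of-team completion:
//     first_top_half(taskdata)            first_top_half(taskdata)
//     second_top_half(taskdata)           __kmpc_give_task(ptask)
//     bottom_half(gtid, ptask)            second_top_half(taskdata)
//
// The imaginary child added by the first top half keeps the bottom half,
// which may already be running on a team thread, from freeing the task until
// the second top half is done with it.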
4283 
4292 void __kmpc_proxy_task_completed(kmp_int32 gtid, kmp_task_t *ptask) {
4293  KMP_DEBUG_ASSERT(ptask != NULL);
4294  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4295  KA_TRACE(
4296  10, ("__kmp_proxy_task_completed(enter): T#%d proxy task %p completing\n",
4297  gtid, taskdata));
4298  __kmp_assert_valid_gtid(gtid);
4299  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4300 
4301  __kmp_first_top_half_finish_proxy(taskdata);
4302  __kmp_second_top_half_finish_proxy(taskdata);
4303  __kmp_bottom_half_finish_proxy(gtid, ptask);
4304 
4305  KA_TRACE(10,
4306  ("__kmp_proxy_task_completed(exit): T#%d proxy task %p completing\n",
4307  gtid, taskdata));
4308 }
4309 
4310 void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start = 0) {
4311  KMP_DEBUG_ASSERT(ptask != NULL);
4312  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4313 
4314  // Enqueue the task so that the bottom half of its completion runs on a
4315  // thread within the corresponding team
4316  kmp_team_t *team = taskdata->td_team;
4317  kmp_int32 nthreads = team->t.t_nproc;
4318  kmp_info_t *thread;
4319 
4320  // This should be similar to start_k = __kmp_get_random( thread ) % nthreads
4321  // but we cannot use __kmp_get_random here
4322  kmp_int32 start_k = start % nthreads;
4323  kmp_int32 pass = 1;
4324  kmp_int32 k = start_k;
4325 
4326  do {
4327  // For now we're just linearly trying to find a thread
4328  thread = team->t.t_threads[k];
4329  k = (k + 1) % nthreads;
4330 
4331  // we did a full pass through all the threads
4332  if (k == start_k)
4333  pass = pass << 1;
4334 
4335  } while (!__kmp_give_task(thread, k, ptask, pass));
4336 
4337  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && __kmp_wpolicy_passive) {
4338  // wake at least one thread to execute the given task
4339  for (int i = 0; i < nthreads; ++i) {
4340  thread = team->t.t_threads[i];
4341  if (thread->th.th_sleep_loc != NULL) {
4342  __kmp_null_resume_wrapper(thread);
4343  break;
4344  }
4345  }
4346  }
4347 }
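
// Note on the retry loop above (intent only; __kmp_give_task is defined
// earlier in this file): `pass` doubles after every full, unsuccessful sweep
// over the team, and __kmp_give_task uses it as a growth budget, declining to
// grow a full deque once its size has reached pass * INITIAL_TASK_DEQUE_SIZE.
// The first sweep therefore only uses deques with free slots, while later
// sweeps become progressively more willing to reallocate a full deque rather
// than spin forever.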
4348 
4356 void __kmpc_proxy_task_completed_ooo(kmp_task_t *ptask) {
4357  KMP_DEBUG_ASSERT(ptask != NULL);
4358  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4359 
4360  KA_TRACE(
4361  10,
4362  ("__kmp_proxy_task_completed_ooo(enter): proxy task completing ooo %p\n",
4363  taskdata));
4364 
4365  KMP_DEBUG_ASSERT(taskdata->td_flags.proxy == TASK_PROXY);
4366 
4367  __kmp_first_top_half_finish_proxy(taskdata);
4368 
4369  __kmpc_give_task(ptask);
4370 
4371  __kmp_second_top_half_finish_proxy(taskdata);
4372 
4373  KA_TRACE(
4374  10,
4375  ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n",
4376  taskdata));
4377 }
4378 
4379 kmp_event_t *__kmpc_task_allow_completion_event(ident_t *loc_ref, int gtid,
4380  kmp_task_t *task) {
4381  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
4382  if (td->td_allow_completion_event.type == KMP_EVENT_UNINITIALIZED) {
4383  td->td_allow_completion_event.type = KMP_EVENT_ALLOW_COMPLETION;
4384  td->td_allow_completion_event.ed.task = task;
4385  __kmp_init_tas_lock(&td->td_allow_completion_event.lock);
4386  }
4387  return &td->td_allow_completion_event;
4388 }
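
// How this entry point surfaces to users (a sketch: a compiler implementing
// the OpenMP 5.0 detach clause lowers it to
// __kmpc_task_allow_completion_event, and omp_fulfill_event reaches
// __kmp_fulfill_event below; async_begin is a hypothetical helper that hands
// the event to whatever will complete the work):
#if 0
#include <omp.h>
extern void async_begin(omp_event_handle_t ev); // hypothetical

void spawn_detached(void) {
  omp_event_handle_t ev;
#pragma omp task detach(ev)
  { async_begin(ev); } // body returns, but the task remains incomplete
  // The task only completes, releasing dependences and unblocking
  // taskwait/barriers, once some thread calls omp_fulfill_event(ev).
}
#endif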
4389 
4390 void __kmp_fulfill_event(kmp_event_t *event) {
4391  if (event->type == KMP_EVENT_ALLOW_COMPLETION) {
4392  kmp_task_t *ptask = event->ed.task;
4393  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(ptask);
4394  bool detached = false;
4395  int gtid = __kmp_get_gtid();
4396 
4397  // The associated task might have completed or could be completing at this
4398  // point.
4399  // We need to take the lock to avoid races
4400  __kmp_acquire_tas_lock(&event->lock, gtid);
4401  if (taskdata->td_flags.proxy == TASK_PROXY) {
4402  detached = true;
4403  } else {
4404 #if OMPT_SUPPORT
4405  // The OMPT event must occur under mutual exclusion,
4406  // otherwise the tool might access ptask after free
4407  if (UNLIKELY(ompt_enabled.enabled))
4408  __ompt_task_finish(ptask, NULL, ompt_task_early_fulfill);
4409 #endif
4410  }
4411  event->type = KMP_EVENT_UNINITIALIZED;
4412  __kmp_release_tas_lock(&event->lock, gtid);
4413 
4414  if (detached) {
4415 #if OMPT_SUPPORT
4416  // We free ptask afterwards and know the task is finished,
4417  // so locking is not necessary
4418  if (UNLIKELY(ompt_enabled.enabled))
4419  __ompt_task_finish(ptask, NULL, ompt_task_late_fulfill);
4420 #endif
4421  // If the task detached, complete the proxy task
4422  if (gtid >= 0) {
4423  kmp_team_t *team = taskdata->td_team;
4424  kmp_info_t *thread = __kmp_get_thread();
4425  if (thread->th.th_team == team) {
4426  __kmpc_proxy_task_completed(gtid, ptask);
4427  return;
4428  }
4429  }
4430 
4431  // fallback
4432  __kmpc_proxy_task_completed_ooo(ptask);
4433  }
4434  }
4435 }
4436 
4437 // __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task
4438 // for taskloop
4439 //
4440 // thread: allocating thread
4441 // task_src: pointer to source task to be duplicated
4442 // taskloop_recur: used only when dealing with taskgraph,
4443 // indicating whether we need to update task->td_task_id
4444 // returns: a pointer to the allocated kmp_task_t structure (task).
4445 kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
4446 #if OMPX_TASKGRAPH
4447  , int taskloop_recur
4448 #endif
4449 ) {
4450  kmp_task_t *task;
4451  kmp_taskdata_t *taskdata;
4452  kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
4453  kmp_taskdata_t *parent_task = taskdata_src->td_parent; // same parent task
4454  size_t shareds_offset;
4455  size_t task_size;
4456 
4457  KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread,
4458  task_src));
4459  KMP_DEBUG_ASSERT(taskdata_src->td_flags.proxy ==
4460  TASK_FULL); // it should not be proxy task
4461  KMP_DEBUG_ASSERT(taskdata_src->td_flags.tasktype == TASK_EXPLICIT);
4462  task_size = taskdata_src->td_size_alloc;
4463 
4464  // Allocate a kmp_taskdata_t block and a kmp_task_t block.
4465  KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread,
4466  task_size));
4467 #if USE_FAST_MEMORY
4468  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, task_size);
4469 #else
4470  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, task_size);
4471 #endif /* USE_FAST_MEMORY */
4472  KMP_MEMCPY(taskdata, taskdata_src, task_size);
4473 
4474  task = KMP_TASKDATA_TO_TASK(taskdata);
4475 
4476  // Initialize new task (only specific fields not affected by memcpy)
4477 #if OMPX_TASKGRAPH
4478  if (taskdata->is_taskgraph && !taskloop_recur &&
4479  __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
4480  taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
4481 #endif
4482  taskdata->td_task_id = KMP_GEN_TASK_ID();
4483  if (task->shareds != NULL) { // need to set up the shareds pointer
4484  shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
4485  task->shareds = &((char *)taskdata)[shareds_offset];
4486  KMP_DEBUG_ASSERT((((kmp_uintptr_t)task->shareds) & (sizeof(void *) - 1)) ==
4487  0);
4488  }
4489  taskdata->td_alloc_thread = thread;
4490  taskdata->td_parent = parent_task;
4491  // task inherits the taskgroup from the parent task
4492  taskdata->td_taskgroup = parent_task->td_taskgroup;
4493  // tied task needs to initialize the td_last_tied at creation,
4494  // untied one does this when it is scheduled for execution
4495  if (taskdata->td_flags.tiedness == TASK_TIED)
4496  taskdata->td_last_tied = taskdata;
4497 
4498  // Only need to keep track of child task counts if team parallel and tasking
4499  // not serialized
4500  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
4501  KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
4502  if (parent_task->td_taskgroup)
4503  KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
4504  // Only need to keep track of allocated child tasks for explicit tasks
4505  // since implicit tasks are never deallocated
4506  if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT)
4507  KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
4508  }
4509 
4510  KA_TRACE(20,
4511  ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
4512  thread, taskdata, taskdata->td_parent));
4513 #if OMPT_SUPPORT
4514  if (UNLIKELY(ompt_enabled.enabled))
4515  __ompt_task_init(taskdata, thread->th.th_info.ds.ds_gtid);
4516 #endif
4517  return task;
4518 }
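
// Memory layout assumed by the block copy above (a sketch; the exact padding
// is established at allocation time): a task is one contiguous allocation of
// td_size_alloc bytes,
//
//   [ kmp_taskdata_t | kmp_task_t | private/shared data ... ]
//   ^ taskdata        ^ task        ^ taskdata + shareds_offset points here
//
// so a single KMP_MEMCPY duplicates everything at once, and only the interior
// pointer task->shareds needs re-basing onto the new block, which is what the
// shareds_offset arithmetic does.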
4519 
4520 // Routine optionally generated by the compiler for setting the lastprivate flag
4521 // and calling needed constructors for private/firstprivate objects
4522 // (used to form taskloop tasks from pattern task)
4523 // Parameters: dest task, src task, lastprivate flag.
4524 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
4525 
4526 KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8);
4527 
4528 // class to encapsulate manipulating loop bounds in a taskloop task.
4529 // this abstracts away the Intel vs GOMP taskloop interface for setting/getting
4530 // the loop bound variables.
4531 class kmp_taskloop_bounds_t {
4532  kmp_task_t *task;
4533  const kmp_taskdata_t *taskdata;
4534  size_t lower_offset;
4535  size_t upper_offset;
4536 
4537 public:
4538  kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub)
4539  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)),
4540  lower_offset((char *)lb - (char *)task),
4541  upper_offset((char *)ub - (char *)task) {
4542  KMP_DEBUG_ASSERT((char *)lb > (char *)_task);
4543  KMP_DEBUG_ASSERT((char *)ub > (char *)_task);
4544  }
4545  kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds)
4546  : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)),
4547  lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {}
4548  size_t get_lower_offset() const { return lower_offset; }
4549  size_t get_upper_offset() const { return upper_offset; }
4550  kmp_uint64 get_lb() const {
4551  kmp_int64 retval;
4552 #if defined(KMP_GOMP_COMPAT)
4553  // Intel task just returns the lower bound normally
4554  if (!taskdata->td_flags.native) {
4555  retval = *(kmp_int64 *)((char *)task + lower_offset);
4556  } else {
4557  // GOMP task has to take into account the sizeof(long)
4558  if (taskdata->td_size_loop_bounds == 4) {
4559  kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds);
4560  retval = (kmp_int64)*lb;
4561  } else {
4562  kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds);
4563  retval = (kmp_int64)*lb;
4564  }
4565  }
4566 #else
4567  (void)taskdata;
4568  retval = *(kmp_int64 *)((char *)task + lower_offset);
4569 #endif // defined(KMP_GOMP_COMPAT)
4570  return retval;
4571  }
4572  kmp_uint64 get_ub() const {
4573  kmp_int64 retval;
4574 #if defined(KMP_GOMP_COMPAT)
4575  // Intel task just returns the upper bound normally
4576  if (!taskdata->td_flags.native) {
4577  retval = *(kmp_int64 *)((char *)task + upper_offset);
4578  } else {
4579  // GOMP task has to take into account the sizeof(long)
4580  if (taskdata->td_size_loop_bounds == 4) {
4581  kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1;
4582  retval = (kmp_int64)*ub;
4583  } else {
4584  kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1;
4585  retval = (kmp_int64)*ub;
4586  }
4587  }
4588 #else
4589  retval = *(kmp_int64 *)((char *)task + upper_offset);
4590 #endif // defined(KMP_GOMP_COMPAT)
4591  return retval;
4592  }
4593  void set_lb(kmp_uint64 lb) {
4594 #if defined(KMP_GOMP_COMPAT)
4595  // Intel task just sets the lower bound normally
4596  if (!taskdata->td_flags.native) {
4597  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4598  } else {
4599  // GOMP task has to take into account the sizeof(long)
4600  if (taskdata->td_size_loop_bounds == 4) {
4601  kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds);
4602  *lower = (kmp_uint32)lb;
4603  } else {
4604  kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds);
4605  *lower = (kmp_uint64)lb;
4606  }
4607  }
4608 #else
4609  *(kmp_uint64 *)((char *)task + lower_offset) = lb;
4610 #endif // defined(KMP_GOMP_COMPAT)
4611  }
4612  void set_ub(kmp_uint64 ub) {
4613 #if defined(KMP_GOMP_COMPAT)
4614  // Intel task just sets the upper bound normally
4615  if (!taskdata->td_flags.native) {
4616  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4617  } else {
4618  // GOMP task has to take into account the sizeof(long)
4619  if (taskdata->td_size_loop_bounds == 4) {
4620  kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1;
4621  *upper = (kmp_uint32)ub;
4622  } else {
4623  kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1;
4624  *upper = (kmp_uint64)ub;
4625  }
4626  }
4627 #else
4628  *(kmp_uint64 *)((char *)task + upper_offset) = ub;
4629 #endif // defined(KMP_GOMP_COMPAT)
4630  }
4631 };
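
// The two layouts this class hides (sketch): a non-native (Intel-style) task
// keeps 64-bit bounds inside the task block at lower_offset/upper_offset,
// while a native (GOMP) task keeps them as the first two longs of
// task->shareds, whose width is recorded in td_size_loop_bounds:
//
//   non-native: *(kmp_uint64 *)((char *)task + lower_offset)  // lb
//               *(kmp_uint64 *)((char *)task + upper_offset)  // ub
//   native:     ((long *)task->shareds)[0]                    // lb
//               ((long *)task->shareds)[1]                    // ub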
4632 
4633 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
4634 //
4635 // loc Source location information
4636 // gtid Global thread ID
4637 // task Pattern task, exposes the loop iteration range
4638 // lb Pointer to loop lower bound in task structure
4639 // ub Pointer to loop upper bound in task structure
4640 // st Loop stride
4641 // ub_glob Global upper bound (used for lastprivate check)
4642 // num_tasks Number of tasks to execute
4643 // grainsize Number of loop iterations per task
4644 // extras Number of chunks with grainsize+1 iterations
4645 // last_chunk Reduction of grainsize for last task
4646 // tc Iterations count
4647 // task_dup Tasks duplication routine
4648 // codeptr_ra Return address for OMPT events
4649 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
4650  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4651  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4652  kmp_uint64 grainsize, kmp_uint64 extras,
4653  kmp_int64 last_chunk, kmp_uint64 tc,
4654 #if OMPT_SUPPORT
4655  void *codeptr_ra,
4656 #endif
4657  void *task_dup) {
4658  KMP_COUNT_BLOCK(OMP_TASKLOOP);
4659  KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
4660  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4661  // compiler provides global bounds here
4662  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
4663  kmp_uint64 lower = task_bounds.get_lb();
4664  kmp_uint64 upper = task_bounds.get_ub();
4665  kmp_uint64 i;
4666  kmp_info_t *thread = __kmp_threads[gtid];
4667  kmp_taskdata_t *current_task = thread->th.th_current_task;
4668  kmp_task_t *next_task;
4669  kmp_int32 lastpriv = 0;
4670 
4671  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4672  (last_chunk < 0 ? last_chunk : extras));
4673  KMP_DEBUG_ASSERT(num_tasks > extras);
4674  KMP_DEBUG_ASSERT(num_tasks > 0);
4675  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
4676  "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n",
4677  gtid, num_tasks, grainsize, extras, last_chunk, lower, upper,
4678  ub_glob, st, task_dup));
4679 
4680  // Launch num_tasks tasks, assign grainsize iterations each task
4681  for (i = 0; i < num_tasks; ++i) {
4682  kmp_uint64 chunk_minus_1;
4683  if (extras == 0) {
4684  chunk_minus_1 = grainsize - 1;
4685  } else {
4686  chunk_minus_1 = grainsize;
4687  --extras; // first extras iterations get bigger chunk (grainsize+1)
4688  }
4689  upper = lower + st * chunk_minus_1;
4690  if (upper > *ub) {
4691  upper = *ub;
4692  }
4693  if (i == num_tasks - 1) {
4694  // schedule the last task, set lastprivate flag if needed
4695  if (st == 1) { // most common case
4696  KMP_DEBUG_ASSERT(upper == *ub);
4697  if (upper == ub_glob)
4698  lastpriv = 1;
4699  } else if (st > 0) { // positive loop stride
4700  KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
4701  if ((kmp_uint64)st > ub_glob - upper)
4702  lastpriv = 1;
4703  } else { // negative loop stride
4704  KMP_DEBUG_ASSERT(upper + st < *ub);
4705  if (upper - ub_glob < (kmp_uint64)(-st))
4706  lastpriv = 1;
4707  }
4708  }
4709 
4710 #if OMPX_TASKGRAPH
4711  next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
4712 #else
4713  next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
4714 #endif
4715 
4716  kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
4717  kmp_taskloop_bounds_t next_task_bounds =
4718  kmp_taskloop_bounds_t(next_task, task_bounds);
4719 
4720  // adjust task-specific bounds
4721  next_task_bounds.set_lb(lower);
4722  if (next_taskdata->td_flags.native) {
4723  next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1));
4724  } else {
4725  next_task_bounds.set_ub(upper);
4726  }
4727  if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates,
4728  // etc.
4729  ptask_dup(next_task, task, lastpriv);
4730  KA_TRACE(40,
4731  ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, "
4732  "upper %lld stride %lld, (offsets %p %p)\n",
4733  gtid, i, next_task, lower, upper, st,
4734  next_task_bounds.get_lower_offset(),
4735  next_task_bounds.get_upper_offset()));
4736 #if OMPT_SUPPORT
4737  __kmp_omp_taskloop_task(NULL, gtid, next_task,
4738  codeptr_ra); // schedule new task
4739 #if OMPT_OPTIONAL
4740  if (ompt_enabled.ompt_callback_dispatch) {
4741  OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
4742  lower, upper, st);
4743  }
4744 #endif // OMPT_OPTIONAL
4745 #else
4746  __kmp_omp_task(gtid, next_task, true); // schedule new task
4747 #endif
4748  lower = upper + st; // adjust lower bound for the next iteration
4749  }
4750  // free the pattern task and exit
4751  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
4752  // do not execute the pattern task, just do internal bookkeeping
4753  __kmp_task_finish<false>(gtid, task, current_task);
4754 }
4755 
4756 // Structure to keep taskloop parameters for auxiliary task
4757 // kept in the shareds of the task structure.
4758 typedef struct __taskloop_params {
4759  kmp_task_t *task;
4760  kmp_uint64 *lb;
4761  kmp_uint64 *ub;
4762  void *task_dup;
4763  kmp_int64 st;
4764  kmp_uint64 ub_glob;
4765  kmp_uint64 num_tasks;
4766  kmp_uint64 grainsize;
4767  kmp_uint64 extras;
4768  kmp_int64 last_chunk;
4769  kmp_uint64 tc;
4770  kmp_uint64 num_t_min;
4771 #if OMPT_SUPPORT
4772  void *codeptr_ra;
4773 #endif
4774 } __taskloop_params_t;
4775 
4776 void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
4777  kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
4778  kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64,
4779  kmp_uint64,
4780 #if OMPT_SUPPORT
4781  void *,
4782 #endif
4783  void *);
4784 
4785 // Execute part of the taskloop submitted as a task.
4786 int __kmp_taskloop_task(int gtid, void *ptask) {
4787  __taskloop_params_t *p =
4788  (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
4789  kmp_task_t *task = p->task;
4790  kmp_uint64 *lb = p->lb;
4791  kmp_uint64 *ub = p->ub;
4792  void *task_dup = p->task_dup;
4793  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4794  kmp_int64 st = p->st;
4795  kmp_uint64 ub_glob = p->ub_glob;
4796  kmp_uint64 num_tasks = p->num_tasks;
4797  kmp_uint64 grainsize = p->grainsize;
4798  kmp_uint64 extras = p->extras;
4799  kmp_int64 last_chunk = p->last_chunk;
4800  kmp_uint64 tc = p->tc;
4801  kmp_uint64 num_t_min = p->num_t_min;
4802 #if OMPT_SUPPORT
4803  void *codeptr_ra = p->codeptr_ra;
4804 #endif
4805 #if KMP_DEBUG
4806  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4807  KMP_DEBUG_ASSERT(task != NULL);
4808  KA_TRACE(20,
4809  ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
4810  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4811  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4812  st, task_dup));
4813 #endif
4814  KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min);
4815  if (num_tasks > num_t_min)
4816  __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4817  grainsize, extras, last_chunk, tc, num_t_min,
4818 #if OMPT_SUPPORT
4819  codeptr_ra,
4820 #endif
4821  task_dup);
4822  else
4823  __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
4824  grainsize, extras, last_chunk, tc,
4825 #if OMPT_SUPPORT
4826  codeptr_ra,
4827 #endif
4828  task_dup);
4829 
4830  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
4831  return 0;
4832 }
4833 
4834 // Schedule part of the taskloop as a task,
4835 // execute the rest of the taskloop.
4836 //
4837 // loc Source location information
4838 // gtid Global thread ID
4839 // task Pattern task, exposes the loop iteration range
4840 // lb Pointer to loop lower bound in task structure
4841 // ub Pointer to loop upper bound in task structure
4842 // st Loop stride
4843 // ub_glob Global upper bound (used for lastprivate check)
4844 // num_tasks Number of tasks to execute
4845 // grainsize Number of loop iterations per task
4846 // extras Number of chunks with grainsize+1 iterations
4847 // last_chunk Reduction of grainsize for last task
4848 // tc Iterations count
4849 // num_t_min Threshold to launch tasks recursively
4850 // task_dup Tasks duplication routine
4851 // codeptr_ra Return address for OMPT events
4852 void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
4853  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4854  kmp_uint64 ub_glob, kmp_uint64 num_tasks,
4855  kmp_uint64 grainsize, kmp_uint64 extras,
4856  kmp_int64 last_chunk, kmp_uint64 tc,
4857  kmp_uint64 num_t_min,
4858 #if OMPT_SUPPORT
4859  void *codeptr_ra,
4860 #endif
4861  void *task_dup) {
4862  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4863  KMP_DEBUG_ASSERT(task != NULL);
4864  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
4865  KA_TRACE(20,
4866  ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
4867  " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n",
4868  gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub,
4869  st, task_dup));
4870  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
4871  kmp_uint64 lower = *lb;
4872  kmp_info_t *thread = __kmp_threads[gtid];
4873  // kmp_taskdata_t *current_task = thread->th.th_current_task;
4874  kmp_task_t *next_task;
4875  size_t lower_offset =
4876  (char *)lb - (char *)task; // remember offset of lb in the task structure
4877  size_t upper_offset =
4878  (char *)ub - (char *)task; // remember offset of ub in the task structure
4879 
4880  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
4881  (last_chunk < 0 ? last_chunk : extras));
4882  KMP_DEBUG_ASSERT(num_tasks > extras);
4883  KMP_DEBUG_ASSERT(num_tasks > 0);
4884 
4885  // split the loop in two halves
4886  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
4887  kmp_int64 last_chunk0 = 0, last_chunk1 = 0;
4888  kmp_uint64 gr_size0 = grainsize;
4889  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
4890  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
4891  if (last_chunk < 0) {
4892  ext0 = ext1 = 0;
4893  last_chunk1 = last_chunk;
4894  tc0 = grainsize * n_tsk0;
4895  tc1 = tc - tc0;
4896  } else if (n_tsk0 <= extras) {
4897  gr_size0++; // integrate extras into grainsize
4898  ext0 = 0; // no extra iters in 1st half
4899  ext1 = extras - n_tsk0; // remaining extras
4900  tc0 = gr_size0 * n_tsk0;
4901  tc1 = tc - tc0;
4902  } else { // n_tsk0 > extras
4903  ext1 = 0; // no extra iters in 2nd half
4904  ext0 = extras;
4905  tc1 = grainsize * n_tsk1;
4906  tc0 = tc - tc1;
4907  }
4908  ub0 = lower + st * (tc0 - 1);
4909  lb1 = ub0 + st;
4910 
4911  // create pattern task for 2nd half of the loop
4912 #if OMPX_TASKGRAPH
4913  next_task = __kmp_task_dup_alloc(thread, task,
4914  /* taskloop_recur */ 1);
4915 #else
4916  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
4917 #endif
4918  // adjust lower bound (upper bound is not changed) for the 2nd half
4919  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
4920  if (ptask_dup != NULL) // construct firstprivates, etc.
4921  ptask_dup(next_task, task, 0);
4922  *ub = ub0; // adjust upper bound for the 1st half
4923 
4924  // create auxiliary task for 2nd half of the loop
4925  // make sure the new task has the same parent task as the pattern task
4926  kmp_taskdata_t *current_task = thread->th.th_current_task;
4927  thread->th.th_current_task = taskdata->td_parent;
4928  kmp_task_t *new_task =
4929  __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
4930  sizeof(__taskloop_params_t), &__kmp_taskloop_task);
4931  // restore current task
4932  thread->th.th_current_task = current_task;
4933  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
4934  p->task = next_task;
4935  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
4936  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
4937  p->task_dup = task_dup;
4938  p->st = st;
4939  p->ub_glob = ub_glob;
4940  p->num_tasks = n_tsk1;
4941  p->grainsize = grainsize;
4942  p->extras = ext1;
4943  p->last_chunk = last_chunk1;
4944  p->tc = tc1;
4945  p->num_t_min = num_t_min;
4946 #if OMPT_SUPPORT
4947  p->codeptr_ra = codeptr_ra;
4948 #endif
4949 
4950 #if OMPX_TASKGRAPH
4951  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
4952  new_task_data->tdg = taskdata->tdg;
4953  new_task_data->is_taskgraph = 0;
4954 #endif
4955 
4956 #if OMPT_SUPPORT
4957  // schedule new task with correct return address for OMPT events
4958  __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
4959 #else
4960  __kmp_omp_task(gtid, new_task, true); // schedule new task
4961 #endif
4962 
4963  // execute the 1st half of current subrange
4964  if (n_tsk0 > num_t_min)
4965  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0,
4966  ext0, last_chunk0, tc0, num_t_min,
4967 #if OMPT_SUPPORT
4968  codeptr_ra,
4969 #endif
4970  task_dup);
4971  else
4972  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
4973  gr_size0, ext0, last_chunk0, tc0,
4974 #if OMPT_SUPPORT
4975  codeptr_ra,
4976 #endif
4977  task_dup);
4978 
4979  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
4980 }
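
// Worked split (illustrative numbers): num_tasks = 5, grainsize = 3,
// extras = 2, tc = 17, last_chunk = 0. Then n_tsk0 = 2 and n_tsk1 = 3; since
// n_tsk0 <= extras, the first half folds its extras into the grain:
// gr_size0 = 4, ext0 = 0, tc0 = 2 * 4 = 8, leaving ext1 = 0 and
// tc1 = 17 - 8 = 9 = 3 * 3 for the second half. Both halves re-establish the
// invariant tc == num_tasks * grainsize + extras for the next level.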
4981 
4982 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
4983  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
4984  int nogroup, int sched, kmp_uint64 grainsize,
4985  int modifier, void *task_dup) {
4986  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
4987  KMP_DEBUG_ASSERT(task != NULL);
4988  if (nogroup == 0) {
4989 #if OMPT_SUPPORT && OMPT_OPTIONAL
4990  OMPT_STORE_RETURN_ADDRESS(gtid);
4991 #endif
4992  __kmpc_taskgroup(loc, gtid);
4993  }
4994 
4995 #if OMPX_TASKGRAPH
4996  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
4997 #endif
4998  // =========================================================================
4999  // calculate loop parameters
5000  kmp_taskloop_bounds_t task_bounds(task, lb, ub);
5001  kmp_uint64 tc;
5002  // compiler provides global bounds here
5003  kmp_uint64 lower = task_bounds.get_lb();
5004  kmp_uint64 upper = task_bounds.get_ub();
5005  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
5006  kmp_uint64 num_tasks = 0, extras = 0;
5007  kmp_int64 last_chunk =
5008  0; // reduce grainsize of last task by last_chunk in strict mode
5009  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
5010  kmp_info_t *thread = __kmp_threads[gtid];
5011  kmp_taskdata_t *current_task = thread->th.th_current_task;
5012 
5013  KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
5014  "grain %llu(%d, %d), dup %p\n",
5015  gtid, taskdata, lower, upper, st, grainsize, sched, modifier,
5016  task_dup));
5017 
5018  // compute trip count
5019  if (st == 1) { // most common case
5020  tc = upper - lower + 1;
5021  } else if (st < 0) {
5022  tc = (lower - upper) / (-st) + 1;
5023  } else { // st > 0
5024  tc = (upper - lower) / st + 1;
5025  }
5026  if (tc == 0) {
5027  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
5028  // free the pattern task and exit
5029  __kmp_task_start(gtid, task, current_task);
5030  // do not execute anything for zero-trip loop
5031  __kmp_task_finish<false>(gtid, task, current_task);
5032  return;
5033  }
5034 
5035 #if OMPT_SUPPORT && OMPT_OPTIONAL
5036  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
5037  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
5038  if (ompt_enabled.ompt_callback_work) {
5039  ompt_callbacks.ompt_callback(ompt_callback_work)(
5040  ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
5041  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5042  }
5043 #endif
5044 
5045  if (num_tasks_min == 0)
5046  // TODO: can we choose a better default heuristic?
5047  num_tasks_min =
5048  KMP_MIN(thread->th.th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE);
5049 
5050  // compute num_tasks/grainsize based on the input provided
5051  switch (sched) {
5052  case 0: // no schedule clause specified, we can choose the default
5053  // let's try to schedule (team_size*10) tasks
5054  grainsize = thread->th.th_team_nproc * static_cast<kmp_uint64>(10);
5055  KMP_FALLTHROUGH();
5056  case 2: // num_tasks provided
5057  if (grainsize > tc) {
5058  num_tasks = tc; // too big num_tasks requested, adjust values
5059  grainsize = 1;
5060  extras = 0;
5061  } else {
5062  num_tasks = grainsize;
5063  grainsize = tc / num_tasks;
5064  extras = tc % num_tasks;
5065  }
5066  break;
5067  case 1: // grainsize provided
5068  if (grainsize > tc) {
5069  num_tasks = 1;
5070  grainsize = tc; // too big grainsize requested, adjust values
5071  extras = 0;
5072  } else {
5073  if (modifier) {
5074  num_tasks = (tc + grainsize - 1) / grainsize;
5075  last_chunk = tc - (num_tasks * grainsize);
5076  extras = 0;
5077  } else {
5078  num_tasks = tc / grainsize;
5079  // adjust grainsize for balanced distribution of iterations
5080  grainsize = tc / num_tasks;
5081  extras = tc % num_tasks;
5082  }
5083  }
5084  break;
5085  default:
5086  KMP_ASSERT2(0, "unknown scheduling of taskloop");
5087  }
5088 
5089  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
5090  (last_chunk < 0 ? last_chunk : extras));
5091  KMP_DEBUG_ASSERT(num_tasks > extras);
5092  KMP_DEBUG_ASSERT(num_tasks > 0);
5093  // =========================================================================
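
  // Worked example for the block above (illustrative): lb = 0, ub = 9, st = 1
  // gives tc = 10. grainsize(3) without the strict modifier (sched == 1)
  // yields num_tasks = 10 / 3 = 3, rebalanced grainsize = 10 / 3 = 3 and
  // extras = 10 % 3 = 1, i.e. chunks of 4, 3, 3. num_tasks(4) (sched == 2)
  // yields num_tasks = 4, grainsize = 10 / 4 = 2, extras = 10 % 4 = 2, i.e.
  // chunks of 3, 3, 2, 2. Both satisfy the asserted invariant
  // tc == num_tasks * grainsize + extras.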
5094 
5095  // check the if clause value first
5096  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
5097  if (if_val == 0) { // if(0) specified, mark task as serial
5098  taskdata->td_flags.task_serial = 1;
5099  taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
5100  // always start serial tasks linearly
5101  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5102  grainsize, extras, last_chunk, tc,
5103 #if OMPT_SUPPORT
5104  OMPT_GET_RETURN_ADDRESS(0),
5105 #endif
5106  task_dup);
5107  // !taskdata->td_flags.native => currently force linear spawning of tasks
5108  // for GOMP_taskloop
5109  } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) {
5110  KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
5111  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5112  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5113  last_chunk));
5114  __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5115  grainsize, extras, last_chunk, tc, num_tasks_min,
5116 #if OMPT_SUPPORT
5117  OMPT_GET_RETURN_ADDRESS(0),
5118 #endif
5119  task_dup);
5120  } else {
5121  KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
5122  "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
5123  gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
5124  last_chunk));
5125  __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
5126  grainsize, extras, last_chunk, tc,
5127 #if OMPT_SUPPORT
5128  OMPT_GET_RETURN_ADDRESS(0),
5129 #endif
5130  task_dup);
5131  }
5132 
5133 #if OMPT_SUPPORT && OMPT_OPTIONAL
5134  if (ompt_enabled.ompt_callback_work) {
5135  ompt_callbacks.ompt_callback(ompt_callback_work)(
5136  ompt_work_taskloop, ompt_scope_end, &(team_info->parallel_data),
5137  &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
5138  }
5139 #endif
5140 
5141  if (nogroup == 0) {
5142 #if OMPT_SUPPORT && OMPT_OPTIONAL
5143  OMPT_STORE_RETURN_ADDRESS(gtid);
5144 #endif
5145  __kmpc_end_taskgroup(loc, gtid);
5146  }
5147  KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
5148 }
5149 
5166 void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5167  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
5168  int sched, kmp_uint64 grainsize, void *task_dup) {
5169  __kmp_assert_valid_gtid(gtid);
5170  KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid));
5171  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5172  0, task_dup);
5173  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
5174 }
5175 
5193 void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
5194  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
5195  int nogroup, int sched, kmp_uint64 grainsize,
5196  int modifier, void *task_dup) {
5197  __kmp_assert_valid_gtid(gtid);
5198  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
5199  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
5200  modifier, task_dup);
5201  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
5202 }
5203 
5212 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid) {
5213  if (gtid == KMP_GTID_DNE)
5214  return NULL;
5215 
5216  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5217  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5218 
5219  if (!taskdata)
5220  return NULL;
5221 
5222  return &taskdata->td_target_data.async_handle;
5223 }
5224 
5233 bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
5234  if (gtid == KMP_GTID_DNE)
5235  return FALSE;
5236 
5237  kmp_info_t *thread = __kmp_thread_from_gtid(gtid);
5238  kmp_taskdata_t *taskdata = thread->th.th_current_task;
5239 
5240  if (!taskdata)
5241  return FALSE;
5242 
5243  return taskdata->td_task_team != NULL;
5244 }
5245 
5246 #if OMPX_TASKGRAPH
5247 // __kmp_find_tdg: identify a TDG through its ID
5248 // tdg_id: ID of the TDG
5249 // returns: a pointer to the TDG if one with this ID exists and is not in
5250 // its initial state, otherwise nullptr
5251 static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
5252  kmp_tdg_info_t *res = nullptr;
5253  if (__kmp_max_tdgs == 0)
5254  return res;
5255 
5256  if (__kmp_global_tdgs == NULL)
5257  __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
5258  sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
5259 
5260  if ((__kmp_global_tdgs[tdg_id]) &&
5261  (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
5262  res = __kmp_global_tdgs[tdg_id];
5263  return res;
5264 }
5265 
5266 // __kmp_print_tdg_dot: prints the TDG to a dot file
5267 // tdg: Pointer to the TDG to print
5268 // gtid: Global Thread ID
5269 void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
5270  kmp_int32 tdg_id = tdg->tdg_id;
5271  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
5272 
5273  char file_name[20];
5274  sprintf(file_name, "tdg_%d.dot", tdg_id);
5275  kmp_safe_raii_file_t tdg_file(file_name, "w");
5276 
5277  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5278  fprintf(tdg_file,
5279  "digraph TDG {\n"
5280  " compound=true\n"
5281  " subgraph cluster {\n"
5282  " label=TDG_%d\n",
5283  tdg_id);
5284  for (kmp_int32 i = 0; i < num_tasks; i++) {
5285  fprintf(tdg_file, " %d[style=bold]\n", i);
5286  }
5287  fprintf(tdg_file, " }\n");
5288  for (kmp_int32 i = 0; i < num_tasks; i++) {
5289  kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
5290  kmp_int32 *successors = tdg->record_map[i].successors;
5291  if (nsuccessors > 0) {
5292  for (kmp_int32 j = 0; j < nsuccessors; j++)
5293  fprintf(tdg_file, " %d -> %d \n", i, successors[j]);
5294  }
5295  }
5296  fprintf(tdg_file, "}");
5297  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
5298 }
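
// For a three-task TDG in which task 0 precedes tasks 1 and 2, the routine
// above would emit roughly the following (illustrative):
//
//   digraph TDG {
//    compound=true
//    subgraph cluster {
//     label=TDG_0
//     0[style=bold]
//     1[style=bold]
//     2[style=bold]
//    }
//    0 -> 1
//    0 -> 2
//   }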
5299 
5300 // __kmp_exec_tdg: launch the execution of a previously
5301 // recorded TDG
5302 // gtid: Global Thread ID
5303 // tdg: Pointer to the TDG to execute
5304 void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5305  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
5306  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
5307  tdg->tdg_id, tdg->num_roots));
5308  kmp_node_info_t *this_record_map = tdg->record_map;
5309  kmp_int32 *this_root_tasks = tdg->root_tasks;
5310  kmp_int32 this_num_roots = tdg->num_roots;
5311  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5312 
5313  kmp_info_t *thread = __kmp_threads[gtid];
5314  kmp_taskdata_t *parent_task = thread->th.th_current_task;
5315 
5316  if (tdg->rec_taskred_data) {
5317  __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
5318  }
5319 
5320  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
5321  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
5322 
5323  td->td_parent = parent_task;
5324  this_record_map[j].parent_task = parent_task;
5325 
5326  kmp_taskgroup_t *parent_taskgroup =
5327  this_record_map[j].parent_task->td_taskgroup;
5328 
5329  KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
5330  this_record_map[j].npredecessors);
5331  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
5332 
5333  if (parent_taskgroup) {
5334  KMP_ATOMIC_INC(&parent_taskgroup->count);
5335  // The taskgroup is different so we must update it
5336  td->td_taskgroup = parent_taskgroup;
5337  } else if (td->td_taskgroup != nullptr) {
5338  // If the parent doesn't have a taskgroup, remove it from the task
5339  td->td_taskgroup = nullptr;
5340  }
5341  if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
5342  KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
5343  }
5344 
5345  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
5346  __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
5347  }
5348  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
5349  tdg->tdg_id, tdg->num_roots));
5350 }
5351 
5352 // __kmp_start_record: set up a TDG structure and turn the
5353 // recording flag to true
5354 // gtid: Global Thread ID of the encountering thread
5355 // input_flags: Flags associated with the TDG
5356 // tdg_id: ID of the TDG to record
5357 static inline void __kmp_start_record(kmp_int32 gtid,
5358  kmp_taskgraph_flags_t *flags,
5359  kmp_int32 tdg_id) {
5360  kmp_tdg_info_t *tdg =
5361  (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
5362  __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
5363  // Initializing the TDG structure
5364  tdg->tdg_id = tdg_id;
5365  tdg->map_size = INIT_MAPSIZE;
5366  tdg->num_roots = -1;
5367  tdg->root_tasks = nullptr;
5368  tdg->tdg_status = KMP_TDG_RECORDING;
5369  tdg->rec_num_taskred = 0;
5370  tdg->rec_taskred_data = nullptr;
5371  KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
5372 
5373  // Initializing the list of nodes in this TDG
5374  kmp_node_info_t *this_record_map =
5375  (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
5376  for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
5377  kmp_int32 *successorsList =
5378  (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
5379  this_record_map[i].task = nullptr;
5380  this_record_map[i].successors = successorsList;
5381  this_record_map[i].nsuccessors = 0;
5382  this_record_map[i].npredecessors = 0;
5383  this_record_map[i].successors_size = __kmp_successors_size;
5384  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
5385  }
5386 
5387  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
5388 }
5389 
5390 // __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
5391 // the beginning of the record process of a task region
5392 // loc_ref: Location of TDG, not used yet
5393 // gtid: Global Thread ID of the encountering thread
5394 // input_flags: Flags associated with the TDG
5395 // tdg_id: ID of the TDG to record, for now, incremental integer
5396 // returns: 1 if we record, otherwise, 0
5397 kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
5398  kmp_int32 input_flags, kmp_int32 tdg_id) {
5399 
5400  kmp_int32 res;
5401  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
5402  KA_TRACE(10,
5403  ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
5404  gtid, loc_ref, input_flags, tdg_id));
5405 
5406  if (__kmp_max_tdgs == 0) {
5407  KA_TRACE(
5408  10,
5409  ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
5410  "__kmp_max_tdgs = 0\n",
5411  gtid, loc_ref, input_flags, tdg_id));
5412  return 1;
5413  }
5414 
5415  __kmpc_taskgroup(loc_ref, gtid);
5416  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
5417  // TODO: use re_record flag
5418  __kmp_exec_tdg(gtid, tdg);
5419  res = 0;
5420  } else {
5421  __kmp_curr_tdg_idx = tdg_id;
5422  KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
5423  __kmp_start_record(gtid, flags, tdg_id);
5424  __kmp_num_tdg++;
5425  res = 1;
5426  }
5427  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
5428  gtid, tdg_id, res ? "record" : "execute"));
5429  return res;
5430 }
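
// Expected shape of the code a compiler emits around these entry points
// (a sketch; loc/gtid/flags/tdg_id come from the codegen context and
// run_task_region is a hypothetical placeholder for the user's region):
#if 0
if (__kmpc_start_record_task(loc, gtid, flags, tdg_id)) {
  // First encounter (or TDGs disabled): run the region; while a TDG is
  // recording, tasks created here are added to its record_map.
  run_task_region();
}
// If a READY TDG was found instead, __kmp_exec_tdg already replayed it and
// the region body is skipped.
__kmpc_end_record_task(loc, gtid, flags, tdg_id);
#endif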
5431 
5432 // __kmp_end_record: set up a TDG after recording it
5433 // gtid: Global thread ID
5434 // tdg: Pointer to the TDG
5435 void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
5436  // Store roots
5437  kmp_node_info_t *this_record_map = tdg->record_map;
5438  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
5439  kmp_int32 *this_root_tasks =
5440  (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
5441  kmp_int32 this_map_size = tdg->map_size;
5442  kmp_int32 this_num_roots = 0;
5443  kmp_info_t *thread = __kmp_threads[gtid];
5444 
5445  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5446  if (this_record_map[i].npredecessors == 0) {
5447  this_root_tasks[this_num_roots++] = i;
5448  }
5449  }
5450 
5451  // Update with roots info and mapsize
5452  tdg->map_size = this_map_size;
5453  tdg->num_roots = this_num_roots;
5454  tdg->root_tasks = this_root_tasks;
5455  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
5456  tdg->tdg_status = KMP_TDG_READY;
5457 
5458  if (thread->th.th_current_task->td_dephash) {
5459  __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
5460  thread->th.th_current_task->td_dephash = NULL;
5461  }
5462 
5463  // Reset predecessor counter
5464  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
5465  KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
5466  this_record_map[i].npredecessors);
5467  }
5468  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
5469 
5470  if (__kmp_tdg_dot)
5471  __kmp_print_tdg_dot(tdg, gtid);
5472 }
5473 
5474 // __kmpc_end_record_task: wrapper around __kmp_end_record to mark
5475 // the end of recording phase
5476 //
5477 // loc_ref: Source location information
5478 // gtid: Global thread ID
5479 // input_flags: Flags attached to the graph
5480 // tdg_id: ID of the TDG just finished recording
5481 void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
5482  kmp_int32 input_flags, kmp_int32 tdg_id) {
5483  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
5484 
5485  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
5486  " tdg=%d with flags=%d\n",
5487  gtid, loc_ref, tdg_id, input_flags));
5488  if (__kmp_max_tdgs) {
5489  // TODO: use input_flags->nowait
5490  __kmpc_end_taskgroup(loc_ref, gtid);
5491  if (__kmp_tdg_is_recording(tdg->tdg_status))
5492  __kmp_end_record(gtid, tdg);
5493  }
5494  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
5495  " tdg=%d, its status is now READY\n",
5496  gtid, loc_ref, tdg_id));
5497 }
5498 #endif