LLVM OpenMP* Runtime Library
kmp_taskq.cpp
1 /*
2  * kmp_taskq.cpp -- TASKQ support for OpenMP.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_error.h"
15 #include "kmp_i18n.h"
16 #include "kmp_io.h"
17 
18 #define MAX_MESSAGE 512
19 
20 /* Taskq routines and global variables */
21 
22 #define KMP_DEBUG_REF_CTS(x) KF_TRACE(1, x);
23 
24 #define THREAD_ALLOC_FOR_TASKQ
25 
26 static int in_parallel_context(kmp_team_t *team) {
27  return !team->t.t_serialized;
28 }
29 
30 static void __kmp_taskq_eo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
31  int gtid = *gtid_ref;
32  int tid = __kmp_tid_from_gtid(gtid);
33  kmp_uint32 my_token;
34  kmpc_task_queue_t *taskq;
35  kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
36 
37  if (__kmp_env_consistency_check)
38 #if KMP_USE_DYNAMIC_LOCK
39  __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL, 0);
40 #else
41  __kmp_push_sync(gtid, ct_ordered_in_taskq, loc_ref, NULL);
42 #endif
43 
44  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
45  KMP_MB(); /* Flush all pending memory write invalidates. */
46 
47  /* GEH - need a check here under stats to make sure we are */
48  /* inside a task (curr_thunk[tid] != NULL) */
49 
50  my_token = tq->tq_curr_thunk[tid]->th_tasknum;
51 
52  taskq = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
53 
54  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_EQ, NULL);
55  KMP_MB();
56  }
57 }
58 
59 static void __kmp_taskq_xo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
60  int gtid = *gtid_ref;
61  int tid = __kmp_tid_from_gtid(gtid);
62  kmp_uint32 my_token;
63  kmp_taskq_t *tq = &__kmp_threads[gtid]->th.th_team->t.t_taskq;
64 
65  if (__kmp_env_consistency_check)
66  __kmp_pop_sync(gtid, ct_ordered_in_taskq, loc_ref);
67 
68  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) {
69  KMP_MB(); /* Flush all pending memory write invalidates. */
70 
71  /* GEH - need a check here under stats to make sure we are */
72  /* inside a task (curr_thunk[tid] != NULL) */
73 
74  my_token = tq->tq_curr_thunk[tid]->th_tasknum;
75 
76  KMP_MB(); /* Flush all pending memory write invalidates. */
77 
78  tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue->tq_tasknum_serving =
79  my_token + 1;
80 
81  KMP_MB(); /* Flush all pending memory write invalidates. */
82  }
83 }
84 
85 static void __kmp_taskq_check_ordered(kmp_int32 gtid, kmpc_thunk_t *thunk) {
86  kmp_uint32 my_token;
87  kmpc_task_queue_t *taskq;
88 
89  /* assume we are always called from an active parallel context */
90 
91  KMP_MB(); /* Flush all pending memory write invalidates. */
92 
93  my_token = thunk->th_tasknum;
94 
95  taskq = thunk->th.th_shareds->sv_queue;
96 
97  if (taskq->tq_tasknum_serving <= my_token) {
98  KMP_WAIT_YIELD(&taskq->tq_tasknum_serving, my_token, KMP_GE, NULL);
99  KMP_MB();
100  taskq->tq_tasknum_serving = my_token + 1;
101  KMP_MB();
102  }
103 }
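/* Illustrative sketch only (an addition, not part of the original runtime):
   the ordered-token handshake above can be modeled with a monotonically
   increasing "serving" counter.  Each task is stamped with a ticket
   (th_tasknum) when it is enqueued; before its ORDERED region runs, a
   consumer waits until the counter reaches its ticket, and afterwards it
   publishes ticket + 1.  The toy names below are assumptions standing in for
   kmp_uint32, tq_tasknum_serving and KMP_WAIT_YIELD; a real implementation
   also needs the KMP_MB() fences shown in the code above. */
#if 0 /* example only; never compiled into the library */
typedef struct {
  volatile unsigned serving; /* plays the role of tq_tasknum_serving */
} toy_ordered_queue_t;

static void toy_ordered_enter(toy_ordered_queue_t *q, unsigned my_token) {
  while (q->serving < my_token)
    ; /* spin; the runtime yields via KMP_WAIT_YIELD instead of busy-waiting */
}

static void toy_ordered_exit(toy_ordered_queue_t *q, unsigned my_token) {
  q->serving = my_token + 1; /* hand the ORDERED region to the next ticket */
}
#endif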
104 
105 #ifdef KMP_DEBUG
106 
107 static void __kmp_dump_TQF(kmp_int32 flags) {
108  if (flags & TQF_IS_ORDERED)
109  __kmp_printf("ORDERED ");
110  if (flags & TQF_IS_LASTPRIVATE)
111  __kmp_printf("LAST_PRIV ");
112  if (flags & TQF_IS_NOWAIT)
113  __kmp_printf("NOWAIT ");
114  if (flags & TQF_HEURISTICS)
115  __kmp_printf("HEURIST ");
116  if (flags & TQF_INTERFACE_RESERVED1)
117  __kmp_printf("RESERV1 ");
118  if (flags & TQF_INTERFACE_RESERVED2)
119  __kmp_printf("RESERV2 ");
120  if (flags & TQF_INTERFACE_RESERVED3)
121  __kmp_printf("RESERV3 ");
122  if (flags & TQF_INTERFACE_RESERVED4)
123  __kmp_printf("RESERV4 ");
124  if (flags & TQF_IS_LAST_TASK)
125  __kmp_printf("LAST_TASK ");
126  if (flags & TQF_TASKQ_TASK)
127  __kmp_printf("TASKQ_TASK ");
128  if (flags & TQF_RELEASE_WORKERS)
129  __kmp_printf("RELEASE ");
130  if (flags & TQF_ALL_TASKS_QUEUED)
131  __kmp_printf("ALL_QUEUED ");
132  if (flags & TQF_PARALLEL_CONTEXT)
133  __kmp_printf("PARALLEL ");
134  if (flags & TQF_DEALLOCATED)
135  __kmp_printf("DEALLOC ");
136  if (!(flags & (TQF_INTERNAL_FLAGS | TQF_INTERFACE_FLAGS)))
137  __kmp_printf("(NONE)");
138 }
139 
140 static void __kmp_dump_thunk(kmp_taskq_t *tq, kmpc_thunk_t *thunk,
141  kmp_int32 global_tid) {
142  int i;
143  int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
144 
145  __kmp_printf("\tThunk at %p on (%d): ", thunk, global_tid);
146 
147  if (thunk != NULL) {
148  for (i = 0; i < nproc; i++) {
149  if (tq->tq_curr_thunk[i] == thunk) {
150  __kmp_printf("[%i] ", i);
151  }
152  }
153  __kmp_printf("th_shareds=%p, ", thunk->th.th_shareds);
154  __kmp_printf("th_task=%p, ", thunk->th_task);
155  __kmp_printf("th_encl_thunk=%p, ", thunk->th_encl_thunk);
156  __kmp_printf("th_status=%d, ", thunk->th_status);
157  __kmp_printf("th_tasknum=%u, ", thunk->th_tasknum);
158  __kmp_printf("th_flags=");
159  __kmp_dump_TQF(thunk->th_flags);
160  }
161 
162  __kmp_printf("\n");
163 }
164 
165 static void __kmp_dump_thunk_stack(kmpc_thunk_t *thunk, kmp_int32 thread_num) {
166  kmpc_thunk_t *th;
167 
168  __kmp_printf(" Thunk stack for T#%d: ", thread_num);
169 
170  for (th = thunk; th != NULL; th = th->th_encl_thunk)
171  __kmp_printf("%p ", th);
172 
173  __kmp_printf("\n");
174 }
175 
176 static void __kmp_dump_task_queue(kmp_taskq_t *tq, kmpc_task_queue_t *queue,
177  kmp_int32 global_tid) {
178  int qs, count, i;
179  kmpc_thunk_t *thunk;
180  kmpc_task_queue_t *taskq;
181 
182  __kmp_printf("Task Queue at %p on (%d):\n", queue, global_tid);
183 
184  if (queue != NULL) {
185  int in_parallel = queue->tq_flags & TQF_PARALLEL_CONTEXT;
186 
187  if (__kmp_env_consistency_check) {
188  __kmp_printf(" tq_loc : ");
189  }
190  if (in_parallel) {
191 
192  // if (queue->tq.tq_parent != 0)
193  //__kmp_acquire_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
194 
195  //__kmp_acquire_lock(& queue->tq_link_lck, global_tid);
196 
197  // Make sure data structures are in consistent state before querying them
198  // Seems to work without this for digital/alpha, needed for IBM/RS6000
199  KMP_MB();
200 
201  __kmp_printf(" tq_parent : %p\n", queue->tq.tq_parent);
202  __kmp_printf(" tq_first_child : %p\n", queue->tq_first_child);
203  __kmp_printf(" tq_next_child : %p\n", queue->tq_next_child);
204  __kmp_printf(" tq_prev_child : %p\n", queue->tq_prev_child);
205  __kmp_printf(" tq_ref_count : %d\n", queue->tq_ref_count);
206 
207  //__kmp_release_lock(& queue->tq_link_lck, global_tid);
208 
209  // if (queue->tq.tq_parent != 0)
210  //__kmp_release_lock(& queue->tq.tq_parent->tq_link_lck, global_tid);
211 
212  //__kmp_acquire_lock(& queue->tq_free_thunks_lck, global_tid);
213  //__kmp_acquire_lock(& queue->tq_queue_lck, global_tid);
214 
215  // Make sure data structures are in consistent state before querying them
216  // Seems to work without this for digital/alpha, needed for IBM/RS6000
217  KMP_MB();
218  }
219 
220  __kmp_printf(" tq_shareds : ");
221  for (i = 0; i < ((queue == tq->tq_root) ? queue->tq_nproc : 1); i++)
222  __kmp_printf("%p ", queue->tq_shareds[i].ai_data);
223  __kmp_printf("\n");
224 
225  if (in_parallel) {
226  __kmp_printf(" tq_tasknum_queuing : %u\n", queue->tq_tasknum_queuing);
227  __kmp_printf(" tq_tasknum_serving : %u\n", queue->tq_tasknum_serving);
228  }
229 
230  __kmp_printf(" tq_queue : %p\n", queue->tq_queue);
231  __kmp_printf(" tq_thunk_space : %p\n", queue->tq_thunk_space);
232  __kmp_printf(" tq_taskq_slot : %p\n", queue->tq_taskq_slot);
233 
234  __kmp_printf(" tq_free_thunks : ");
235  for (thunk = queue->tq_free_thunks; thunk != NULL;
236  thunk = thunk->th.th_next_free)
237  __kmp_printf("%p ", thunk);
238  __kmp_printf("\n");
239 
240  __kmp_printf(" tq_nslots : %d\n", queue->tq_nslots);
241  __kmp_printf(" tq_head : %d\n", queue->tq_head);
242  __kmp_printf(" tq_tail : %d\n", queue->tq_tail);
243  __kmp_printf(" tq_nfull : %d\n", queue->tq_nfull);
244  __kmp_printf(" tq_hiwat : %d\n", queue->tq_hiwat);
245  __kmp_printf(" tq_flags : ");
246  __kmp_dump_TQF(queue->tq_flags);
247  __kmp_printf("\n");
248 
249  if (in_parallel) {
250  __kmp_printf(" tq_th_thunks : ");
251  for (i = 0; i < queue->tq_nproc; i++) {
252  __kmp_printf("%d ", queue->tq_th_thunks[i].ai_data);
253  }
254  __kmp_printf("\n");
255  }
256 
257  __kmp_printf("\n");
258  __kmp_printf(" Queue slots:\n");
259 
260  qs = queue->tq_tail;
261  for (count = 0; count < queue->tq_nfull; ++count) {
262  __kmp_printf("(%d)", qs);
263  __kmp_dump_thunk(tq, queue->tq_queue[qs].qs_thunk, global_tid);
264  qs = (qs + 1) % queue->tq_nslots;
265  }
266 
267  __kmp_printf("\n");
268 
269  if (in_parallel) {
270  if (queue->tq_taskq_slot != NULL) {
271  __kmp_printf(" TaskQ slot:\n");
272  __kmp_dump_thunk(tq, CCAST(kmpc_thunk_t *, queue->tq_taskq_slot),
273  global_tid);
274  __kmp_printf("\n");
275  }
276  //__kmp_release_lock(& queue->tq_queue_lck, global_tid);
277  //__kmp_release_lock(& queue->tq_free_thunks_lck, global_tid);
278  }
279  }
280 
281  __kmp_printf(" Taskq freelist: ");
282 
283  //__kmp_acquire_lock( & tq->tq_freelist_lck, global_tid );
284 
285  // Make sure data structures are in consistent state before querying them
286  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
287  KMP_MB();
288 
289  for (taskq = tq->tq_freelist; taskq != NULL; taskq = taskq->tq.tq_next_free)
290  __kmp_printf("%p ", taskq);
291 
292  //__kmp_release_lock( & tq->tq_freelist_lck, global_tid );
293 
294  __kmp_printf("\n\n");
295 }
296 
297 static void __kmp_aux_dump_task_queue_tree(kmp_taskq_t *tq,
298  kmpc_task_queue_t *curr_queue,
299  kmp_int32 level,
300  kmp_int32 global_tid) {
301  int i, count, qs;
302  int nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
303  kmpc_task_queue_t *queue = curr_queue;
304 
305  if (curr_queue == NULL)
306  return;
307 
308  __kmp_printf(" ");
309 
310  for (i = 0; i < level; i++)
311  __kmp_printf(" ");
312 
313  __kmp_printf("%p", curr_queue);
314 
315  for (i = 0; i < nproc; i++) {
316  if (tq->tq_curr_thunk[i] &&
317  tq->tq_curr_thunk[i]->th.th_shareds->sv_queue == curr_queue) {
318  __kmp_printf(" [%i]", i);
319  }
320  }
321 
322  __kmp_printf(":");
323 
324  //__kmp_acquire_lock(& curr_queue->tq_queue_lck, global_tid);
325 
326  // Make sure data structures are in consistent state before querying them
327  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
328  KMP_MB();
329 
330  qs = curr_queue->tq_tail;
331 
332  for (count = 0; count < curr_queue->tq_nfull; ++count) {
333  __kmp_printf("%p ", curr_queue->tq_queue[qs].qs_thunk);
334  qs = (qs + 1) % curr_queue->tq_nslots;
335  }
336 
337  //__kmp_release_lock(& curr_queue->tq_queue_lck, global_tid);
338 
339  __kmp_printf("\n");
340 
341  if (curr_queue->tq_first_child) {
342  //__kmp_acquire_lock(& curr_queue->tq_link_lck, global_tid);
343 
344  // Make sure data structures are in consistent state before querying them
345  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
346  KMP_MB();
347 
348  if (curr_queue->tq_first_child) {
349  for (queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
350  queue != NULL; queue = queue->tq_next_child) {
351  __kmp_aux_dump_task_queue_tree(tq, queue, level + 1, global_tid);
352  }
353  }
354 
355  //__kmp_release_lock(& curr_queue->tq_link_lck, global_tid);
356  }
357 }
358 
359 static void __kmp_dump_task_queue_tree(kmp_taskq_t *tq,
360  kmpc_task_queue_t *tqroot,
361  kmp_int32 global_tid) {
362  __kmp_printf("TaskQ Tree at root %p on (%d):\n", tqroot, global_tid);
363 
364  __kmp_aux_dump_task_queue_tree(tq, tqroot, 0, global_tid);
365 
366  __kmp_printf("\n");
367 }
368 #endif
369 
370 /* New taskq storage routines that try to minimize overhead of mallocs but
371  still provide cache line alignment. */
372 static void *__kmp_taskq_allocate(size_t size, kmp_int32 global_tid) {
373  void *addr, *orig_addr;
374  size_t bytes;
375 
376  KB_TRACE(5, ("__kmp_taskq_allocate: called size=%d, gtid=%d\n", (int)size,
377  global_tid));
378 
379  bytes = sizeof(void *) + CACHE_LINE + size;
380 
381 #ifdef THREAD_ALLOC_FOR_TASKQ
382  orig_addr =
383  (void *)__kmp_thread_malloc(__kmp_thread_from_gtid(global_tid), bytes);
384 #else
385  KE_TRACE(10, ("%%%%%% MALLOC( %d )\n", bytes));
386  orig_addr = (void *)KMP_INTERNAL_MALLOC(bytes);
387 #endif /* THREAD_ALLOC_FOR_TASKQ */
388 
389  if (orig_addr == 0)
390  KMP_FATAL(OutOfHeapMemory);
391 
392  addr = orig_addr;
393 
394  if (((kmp_uintptr_t)addr & (CACHE_LINE - 1)) != 0) {
395  KB_TRACE(50, ("__kmp_taskq_allocate: adjust for cache alignment\n"));
396  addr = (void *)(((kmp_uintptr_t)addr + CACHE_LINE) & ~(CACHE_LINE - 1));
397  }
398 
399  (*(void **)addr) = orig_addr;
400 
401  KB_TRACE(10,
402  ("__kmp_taskq_allocate: allocate: %p, use: %p - %p, size: %d, "
403  "gtid: %d\n",
404  orig_addr, ((void **)addr) + 1,
405  ((char *)(((void **)addr) + 1)) + size - 1, (int)size, global_tid));
406 
407  return (((void **)addr) + 1);
408 }
409 
410 static void __kmpc_taskq_free(void *p, kmp_int32 global_tid) {
411  KB_TRACE(5, ("__kmpc_taskq_free: called addr=%p, gtid=%d\n", p, global_tid));
412 
413  KB_TRACE(10, ("__kmpc_taskq_free: freeing: %p, gtid: %d\n",
414  (*(((void **)p) - 1)), global_tid));
415 
416 #ifdef THREAD_ALLOC_FOR_TASKQ
417  __kmp_thread_free(__kmp_thread_from_gtid(global_tid), *(((void **)p) - 1));
418 #else
419  KMP_INTERNAL_FREE(*(((void **)p) - 1));
420 #endif /* THREAD_ALLOC_FOR_TASKQ */
421 }
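/* Illustrative sketch only (an addition, not part of the original runtime):
   __kmp_taskq_allocate/__kmpc_taskq_free implement the common
   "over-allocate, align, stash the original pointer just before the returned
   block" idiom.  The standalone version below uses plain malloc/free and a
   hypothetical ALIGN constant in place of CACHE_LINE and the thread-aware
   allocators; it is an assumption-level example, not the runtime's code. */
#if 0 /* example only; never compiled into the library */
#include <stdint.h>
#include <stdlib.h>

#define ALIGN 64 /* stands in for CACHE_LINE */

static void *toy_aligned_alloc(size_t size) {
  /* room for the back-pointer plus worst-case alignment padding */
  char *orig = (char *)malloc(sizeof(void *) + ALIGN + size);
  if (orig == NULL)
    return NULL;
  uintptr_t p = (uintptr_t)(orig + sizeof(void *));
  p = (p + ALIGN - 1) & ~(uintptr_t)(ALIGN - 1); /* round up to ALIGN */
  ((void **)p)[-1] = orig; /* remember where malloc's block really starts */
  return (void *)p;
}

static void toy_aligned_free(void *p) {
  free(((void **)p)[-1]); /* recover the original pointer and release it */
}
#endif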
422 
423 /* Keep freed kmpc_task_queue_t on an internal freelist and recycle since
424  they're of constant size. */
425 
426 static kmpc_task_queue_t *
427 __kmp_alloc_taskq(kmp_taskq_t *tq, int in_parallel, kmp_int32 nslots,
428  kmp_int32 nthunks, kmp_int32 nshareds, kmp_int32 nproc,
429  size_t sizeof_thunk, size_t sizeof_shareds,
430  kmpc_thunk_t **new_taskq_thunk, kmp_int32 global_tid) {
431  kmp_int32 i;
432  size_t bytes;
433  kmpc_task_queue_t *new_queue;
434  kmpc_aligned_shared_vars_t *shared_var_array;
435  char *shared_var_storage;
436  char *pt; /* for doing byte-adjusted address computations */
437 
438  __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid);
439 
440  // Make sure data structures are in consistent state before querying them
441  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
442  KMP_MB();
443 
444  if (tq->tq_freelist) {
445  new_queue = tq->tq_freelist;
446  tq->tq_freelist = tq->tq_freelist->tq.tq_next_free;
447 
448  KMP_DEBUG_ASSERT(new_queue->tq_flags & TQF_DEALLOCATED);
449 
450  new_queue->tq_flags = 0;
451 
452  __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
453  } else {
454  __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
455 
456  new_queue = (kmpc_task_queue_t *)__kmp_taskq_allocate(
457  sizeof(kmpc_task_queue_t), global_tid);
458  new_queue->tq_flags = 0;
459  }
460 
461  /* space in the task queue for queue slots (allocate as one big chunk */
462  /* of storage including new_taskq_task space) */
463 
464  sizeof_thunk +=
465  (CACHE_LINE - (sizeof_thunk % CACHE_LINE)); /* pad to cache line size */
466  pt = (char *)__kmp_taskq_allocate(nthunks * sizeof_thunk, global_tid);
467  new_queue->tq_thunk_space = (kmpc_thunk_t *)pt;
468  *new_taskq_thunk = (kmpc_thunk_t *)(pt + (nthunks - 1) * sizeof_thunk);
469 
470  /* chain the allocated thunks into a freelist for this queue */
471 
472  new_queue->tq_free_thunks = (kmpc_thunk_t *)pt;
473 
474  for (i = 0; i < (nthunks - 2); i++) {
475  ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th.th_next_free =
476  (kmpc_thunk_t *)(pt + (i + 1) * sizeof_thunk);
477 #ifdef KMP_DEBUG
478  ((kmpc_thunk_t *)(pt + i * sizeof_thunk))->th_flags = TQF_DEALLOCATED;
479 #endif
480  }
481 
482  ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th.th_next_free = NULL;
483 #ifdef KMP_DEBUG
484  ((kmpc_thunk_t *)(pt + (nthunks - 2) * sizeof_thunk))->th_flags =
485  TQF_DEALLOCATED;
486 #endif
487 
488  /* initialize the locks */
489 
490  if (in_parallel) {
491  __kmp_init_lock(&new_queue->tq_link_lck);
492  __kmp_init_lock(&new_queue->tq_free_thunks_lck);
493  __kmp_init_lock(&new_queue->tq_queue_lck);
494  }
495 
496  /* now allocate the slots */
497 
498  bytes = nslots * sizeof(kmpc_aligned_queue_slot_t);
499  new_queue->tq_queue =
500  (kmpc_aligned_queue_slot_t *)__kmp_taskq_allocate(bytes, global_tid);
501 
502  /* space for array of pointers to shared variable structures */
503  sizeof_shareds += sizeof(kmpc_task_queue_t *);
504  sizeof_shareds +=
505  (CACHE_LINE - (sizeof_shareds % CACHE_LINE)); /* pad to cache line size */
506 
507  bytes = nshareds * sizeof(kmpc_aligned_shared_vars_t);
508  shared_var_array =
509  (kmpc_aligned_shared_vars_t *)__kmp_taskq_allocate(bytes, global_tid);
510 
511  bytes = nshareds * sizeof_shareds;
512  shared_var_storage = (char *)__kmp_taskq_allocate(bytes, global_tid);
513 
514  for (i = 0; i < nshareds; i++) {
515  shared_var_array[i].ai_data =
516  (kmpc_shared_vars_t *)(shared_var_storage + i * sizeof_shareds);
517  shared_var_array[i].ai_data->sv_queue = new_queue;
518  }
519  new_queue->tq_shareds = shared_var_array;
520 
521  /* array for number of outstanding thunks per thread */
522 
523  if (in_parallel) {
524  bytes = nproc * sizeof(kmpc_aligned_int32_t);
525  new_queue->tq_th_thunks =
526  (kmpc_aligned_int32_t *)__kmp_taskq_allocate(bytes, global_tid);
527  new_queue->tq_nproc = nproc;
528 
529  for (i = 0; i < nproc; i++)
530  new_queue->tq_th_thunks[i].ai_data = 0;
531  }
532 
533  return new_queue;
534 }
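/* Illustrative sketch only (an addition, not part of the original runtime):
   __kmp_alloc_taskq carves one large allocation into nthunks records of
   sizeof_thunk bytes, keeps the last record as the taskq-task thunk, and
   threads the rest onto a singly linked freelist via th.th_next_free.  The
   simplified helper below shows the same byte arithmetic with a hypothetical
   record type. */
#if 0 /* example only; never compiled into the library */
#include <stddef.h>

typedef struct toy_node {
  struct toy_node *next_free; /* like th.th_next_free */
  /* payload bytes would follow in each record */
} toy_node_t;

/* Thread `count` records of `stride` bytes each (count >= 1,
   stride >= sizeof(toy_node_t)) starting at `base` into a NULL-terminated
   freelist and return its head. */
static toy_node_t *toy_build_freelist(char *base, size_t stride, int count) {
  for (int i = 0; i < count - 1; i++)
    ((toy_node_t *)(base + i * stride))->next_free =
        (toy_node_t *)(base + (i + 1) * stride);
  ((toy_node_t *)(base + (count - 1) * stride))->next_free = NULL;
  return (toy_node_t *)base;
}
#endif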
535 
536 static void __kmp_free_taskq(kmp_taskq_t *tq, kmpc_task_queue_t *p,
537  int in_parallel, kmp_int32 global_tid) {
538  __kmpc_taskq_free(p->tq_thunk_space, global_tid);
539  __kmpc_taskq_free(p->tq_queue, global_tid);
540 
541  /* free shared var structure storage */
542  __kmpc_taskq_free(CCAST(kmpc_shared_vars_t *, p->tq_shareds[0].ai_data),
543  global_tid);
544  /* free array of pointers to shared vars storage */
545  __kmpc_taskq_free(p->tq_shareds, global_tid);
546 
547 #ifdef KMP_DEBUG
548  p->tq_first_child = NULL;
549  p->tq_next_child = NULL;
550  p->tq_prev_child = NULL;
551  p->tq_ref_count = -10;
552  p->tq_shareds = NULL;
553  p->tq_tasknum_queuing = 0;
554  p->tq_tasknum_serving = 0;
555  p->tq_queue = NULL;
556  p->tq_thunk_space = NULL;
557  p->tq_taskq_slot = NULL;
558  p->tq_free_thunks = NULL;
559  p->tq_nslots = 0;
560  p->tq_head = 0;
561  p->tq_tail = 0;
562  p->tq_nfull = 0;
563  p->tq_hiwat = 0;
564 
565  if (in_parallel) {
566  int i;
567 
568  for (i = 0; i < p->tq_nproc; i++)
569  p->tq_th_thunks[i].ai_data = 0;
570  }
571  if (__kmp_env_consistency_check)
572  p->tq_loc = NULL;
573  KMP_DEBUG_ASSERT(p->tq_flags & TQF_DEALLOCATED);
574  p->tq_flags = TQF_DEALLOCATED;
575 #endif /* KMP_DEBUG */
576 
577  if (in_parallel) {
578  __kmpc_taskq_free(p->tq_th_thunks, global_tid);
579  __kmp_destroy_lock(&p->tq_link_lck);
580  __kmp_destroy_lock(&p->tq_queue_lck);
581  __kmp_destroy_lock(&p->tq_free_thunks_lck);
582  }
583 #ifdef KMP_DEBUG
584  p->tq_th_thunks = NULL;
585 #endif /* KMP_DEBUG */
586 
587  // Make sure data structures are in consistent state before querying them
588  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
589  KMP_MB();
590 
591  __kmp_acquire_lock(&tq->tq_freelist_lck, global_tid);
592  p->tq.tq_next_free = tq->tq_freelist;
593 
594  tq->tq_freelist = p;
595  __kmp_release_lock(&tq->tq_freelist_lck, global_tid);
596 }
597 
598 /* Once a group of thunks has been allocated for use in a particular queue,
599  these are managed via a per-queue freelist.
600  We force a check that there's always a thunk free if we need one. */
601 
602 static kmpc_thunk_t *__kmp_alloc_thunk(kmpc_task_queue_t *queue,
603  int in_parallel, kmp_int32 global_tid) {
604  kmpc_thunk_t *fl;
605 
606  if (in_parallel) {
607  __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid);
608  // Make sure data structures are in consistent state before querying them
609  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
610  KMP_MB();
611  }
612 
613  fl = queue->tq_free_thunks;
614 
615  KMP_DEBUG_ASSERT(fl != NULL);
616 
617  queue->tq_free_thunks = fl->th.th_next_free;
618  fl->th_flags = 0;
619 
620  if (in_parallel)
621  __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid);
622 
623  return fl;
624 }
625 
626 static void __kmp_free_thunk(kmpc_task_queue_t *queue, kmpc_thunk_t *p,
627  int in_parallel, kmp_int32 global_tid) {
628 #ifdef KMP_DEBUG
629  p->th_task = 0;
630  p->th_encl_thunk = 0;
631  p->th_status = 0;
632  p->th_tasknum = 0;
633 /* Also could zero pointers to private vars */
634 #endif
635 
636  if (in_parallel) {
637  __kmp_acquire_lock(&queue->tq_free_thunks_lck, global_tid);
638  // Make sure data structures are in consistent state before querying them
639  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
640  KMP_MB();
641  }
642 
643  p->th.th_next_free = queue->tq_free_thunks;
644  queue->tq_free_thunks = p;
645 
646 #ifdef KMP_DEBUG
647  p->th_flags = TQF_DEALLOCATED;
648 #endif
649 
650  if (in_parallel)
651  __kmp_release_lock(&queue->tq_free_thunks_lck, global_tid);
652 }
653 
654 /* returns nonzero if the queue just became full after the enqueue */
655 static kmp_int32 __kmp_enqueue_task(kmp_taskq_t *tq, kmp_int32 global_tid,
656  kmpc_task_queue_t *queue,
657  kmpc_thunk_t *thunk, int in_parallel) {
658  kmp_int32 ret;
659 
660  /* dkp: can we get around the lock in the TQF_RELEASE_WORKERS case (only the
661  * master is executing then) */
662  if (in_parallel) {
663  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
664  // Make sure data structures are in consistent state before querying them
665  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
666  KMP_MB();
667  }
668 
669  KMP_DEBUG_ASSERT(queue->tq_nfull < queue->tq_nslots); // check queue not full
670 
671  queue->tq_queue[(queue->tq_head)++].qs_thunk = thunk;
672 
673  if (queue->tq_head >= queue->tq_nslots)
674  queue->tq_head = 0;
675 
676  (queue->tq_nfull)++;
677 
678  KMP_MB(); /* to assure that nfull is seen to increase before
679  TQF_ALL_TASKS_QUEUED is set */
680 
681  ret = (in_parallel) ? (queue->tq_nfull == queue->tq_nslots) : FALSE;
682 
683  if (in_parallel) {
684  /* don't need to wait until workers are released before unlocking */
685  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
686 
687  if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
688  // If just creating the root queue, the worker threads are waiting at a
689  // join barrier until now, when there's something in the queue for them to
690  // do; release them now to do work. This should only be done when this is
691  // the first task enqueued, so reset the flag here also.
692  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS; /* no lock needed, workers
693  are still in spin mode */
694  // avoid releasing barrier twice if taskq_task switches threads
695  KMP_MB();
696 
697  __kmpc_end_barrier_master(NULL, global_tid);
698  }
699  }
700 
701  return ret;
702 }
703 
704 static kmpc_thunk_t *__kmp_dequeue_task(kmp_int32 global_tid,
705  kmpc_task_queue_t *queue,
706  int in_parallel) {
707  kmpc_thunk_t *pt;
708  int tid = __kmp_tid_from_gtid(global_tid);
709 
710  KMP_DEBUG_ASSERT(queue->tq_nfull > 0); /* check queue not empty */
711 
712  if (queue->tq.tq_parent != NULL && in_parallel) {
713  int ct;
714  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
715  ct = ++(queue->tq_ref_count);
716  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
717  KMP_DEBUG_REF_CTS(
718  ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
719  }
720 
721  pt = queue->tq_queue[(queue->tq_tail)++].qs_thunk;
722 
723  if (queue->tq_tail >= queue->tq_nslots)
724  queue->tq_tail = 0;
725 
726  if (in_parallel) {
727  queue->tq_th_thunks[tid].ai_data++;
728 
729  KMP_MB(); /* necessary so ai_data increment is propagated to other threads
730  immediately (digital) */
731 
732  KF_TRACE(200, ("__kmp_dequeue_task: T#%d(:%d) now has %d outstanding "
733  "thunks from queue %p\n",
734  global_tid, tid, queue->tq_th_thunks[tid].ai_data, queue));
735  }
736 
737  (queue->tq_nfull)--;
738 
739 #ifdef KMP_DEBUG
740  KMP_MB();
741 
742  /* necessary so (queue->tq_nfull > 0) above succeeds after tq_nfull is
743  * decremented */
744 
745  KMP_DEBUG_ASSERT(queue->tq_nfull >= 0);
746 
747  if (in_parallel) {
748  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data <=
749  __KMP_TASKQ_THUNKS_PER_TH);
750  }
751 #endif
752 
753  return pt;
754 }
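/* Illustrative sketch only (an addition, not part of the original runtime):
   tq_queue is a fixed-capacity ring buffer indexed by tq_head (producer side)
   and tq_tail (consumer side), with tq_nfull tracking occupancy so that full
   and empty states are unambiguous.  The toy ring below shows the same
   wrap-around arithmetic with hypothetical names; the locking, KMP_MB()
   fences, reference counting and per-thread thunk accounting of the real
   enqueue/dequeue paths are deliberately omitted. */
#if 0 /* example only; never compiled into the library */
typedef struct {
  void **slots; /* `capacity` entries, like tq_queue */
  int capacity; /* like tq_nslots */
  int head;     /* next slot to fill, like tq_head */
  int tail;     /* next slot to drain, like tq_tail */
  int nfull;    /* like tq_nfull */
} toy_ring_t;

static int toy_ring_push(toy_ring_t *r, void *item) {
  if (r->nfull == r->capacity)
    return 0; /* full; the runtime asserts this never happens */
  r->slots[r->head] = item;
  if (++r->head >= r->capacity)
    r->head = 0;
  r->nfull++;
  return 1;
}

static void *toy_ring_pop(toy_ring_t *r) {
  if (r->nfull == 0)
    return NULL; /* empty; the runtime asserts this never happens */
  void *item = r->slots[r->tail];
  if (++r->tail >= r->capacity)
    r->tail = 0;
  r->nfull--;
  return item;
}
#endif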
755 
756 /* Find the next (non-null) task to dequeue and return it.
757  * This is never called unless in_parallel=TRUE
758  *
759  * Here are the rules for deciding which queue to take the task from:
760  * 1. Walk up the task queue tree from the current queue's parent and look
761  * on the way up (for loop, below).
762  * 2. Do a depth-first search back down the tree from the root and
763  * look (find_task_in_descendant_queue()).
764  *
765  * Here are the rules for deciding which task to take from a queue
766  * (__kmp_find_task_in_queue ()):
767  * 1. Never take the last task from a queue if TQF_IS_LASTPRIVATE; this task
768  * must be staged to make sure we execute the last one with
769  * TQF_IS_LAST_TASK at the end of task queue execution.
770  * 2. If the queue length is below some high water mark and the taskq task
771  * is enqueued, prefer running the taskq task.
772  * 3. Otherwise, take a (normal) task from the queue.
773  *
774  * If we do all this and return pt == NULL at the bottom of this routine,
775  * this means there are no more tasks to execute (except possibly for
776  * TQF_IS_LASTPRIVATE).
777  */
778 
779 static kmpc_thunk_t *__kmp_find_task_in_queue(kmp_int32 global_tid,
780  kmpc_task_queue_t *queue) {
781  kmpc_thunk_t *pt = NULL;
782  int tid = __kmp_tid_from_gtid(global_tid);
783 
784  /* To prevent deadlock from tq_queue_lck if queue already deallocated */
785  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
786 
787  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
788 
789  /* Check again to avoid race in __kmpc_end_taskq() */
790  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
791  // Make sure data structures are in consistent state before querying them
792  // Seems to work without this for digital/alpha, needed for IBM/RS6000
793  KMP_MB();
794 
795  if ((queue->tq_taskq_slot != NULL) &&
796  (queue->tq_nfull <= queue->tq_hiwat)) {
797  /* if there's enough room in the queue and the dispatcher */
798  /* (taskq task) is available, schedule more tasks */
799  pt = CCAST(kmpc_thunk_t *, queue->tq_taskq_slot);
800  queue->tq_taskq_slot = NULL;
801  } else if (queue->tq_nfull == 0 ||
802  queue->tq_th_thunks[tid].ai_data >=
803  __KMP_TASKQ_THUNKS_PER_TH) {
804  /* do nothing if no thunks available or this thread can't */
805  /* run any because it already is executing too many */
806  pt = NULL;
807  } else if (queue->tq_nfull > 1) {
808  /* always safe to schedule a task even if TQF_IS_LASTPRIVATE */
809 
810  pt = __kmp_dequeue_task(global_tid, queue, TRUE);
811  } else if (!(queue->tq_flags & TQF_IS_LASTPRIVATE)) {
812  // one thing in queue, always safe to schedule if !TQF_IS_LASTPRIVATE
813  pt = __kmp_dequeue_task(global_tid, queue, TRUE);
814  } else if (queue->tq_flags & TQF_IS_LAST_TASK) {
815  /* TQF_IS_LASTPRIVATE, one thing in queue, kmpc_end_taskq_task() */
816  /* has been run so this is last task, run with TQF_IS_LAST_TASK so */
817  /* instrumentation does copy-out. */
818  pt = __kmp_dequeue_task(global_tid, queue, TRUE);
819  pt->th_flags |=
820  TQF_IS_LAST_TASK; /* don't need test_then_or since already locked */
821  }
822  }
823 
824  /* GEH - What happens here if is lastprivate, but not last task? */
825  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
826  }
827 
828  return pt;
829 }
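/* Illustrative sketch only (an addition, not part of the original runtime):
   the selection policy implemented above can be read as a priority cascade.
   The hypothetical helper below restates that ordering with simplified
   parameters; the locking, dispatcher hand-off and lastprivate staging
   details live in the real function. */
#if 0 /* example only; never compiled into the library */
enum toy_pick { PICK_TASKQ_TASK, PICK_NORMAL_TASK, PICK_LAST_TASK, PICK_NONE };

static enum toy_pick toy_pick_from_queue(int has_taskq_task, int nfull,
                                         int hiwat, int my_outstanding,
                                         int max_outstanding,
                                         int is_lastprivate, int is_last_task) {
  if (has_taskq_task && nfull <= hiwat)
    return PICK_TASKQ_TASK; /* room in the queue: run the dispatcher first */
  if (nfull == 0 || my_outstanding >= max_outstanding)
    return PICK_NONE; /* nothing available, or this thread is saturated */
  if (nfull > 1 || !is_lastprivate)
    return PICK_NORMAL_TASK; /* safe to take an ordinary task */
  if (is_last_task)
    return PICK_LAST_TASK; /* the staged lastprivate finale */
  return PICK_NONE; /* hold the last task back until it is known to be last */
}
#endif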
830 
831 /* Walk a tree of queues starting at queue's first child and return a non-NULL
832  thunk if one can be scheduled. Must only be called when in_parallel=TRUE */
833 
834 static kmpc_thunk_t *
835 __kmp_find_task_in_descendant_queue(kmp_int32 global_tid,
836  kmpc_task_queue_t *curr_queue) {
837  kmpc_thunk_t *pt = NULL;
838  kmpc_task_queue_t *queue = curr_queue;
839 
840  if (curr_queue->tq_first_child != NULL) {
841  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
842  // Make sure data structures are in consistent state before querying them
843  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
844  KMP_MB();
845 
846  queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
847  if (queue == NULL) {
848  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
849  return NULL;
850  }
851 
852  while (queue != NULL) {
853  int ct;
854  kmpc_task_queue_t *next;
855 
856  ct = ++(queue->tq_ref_count);
857  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
858  KMP_DEBUG_REF_CTS(
859  ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
860 
861  pt = __kmp_find_task_in_queue(global_tid, queue);
862 
863  if (pt != NULL) {
864  int ct;
865 
866  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
867  // Make sure data structures in consistent state before querying them
868  // Seems to work without this for digital/alpha, needed for IBM/RS6000
869  KMP_MB();
870 
871  ct = --(queue->tq_ref_count);
872  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
873  global_tid, queue, ct));
874  KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
875 
876  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
877 
878  return pt;
879  }
880 
881  /* although reference count stays active during descendant walk, shouldn't
882  matter since if children still exist, reference counts aren't being
883  monitored anyway */
884 
885  pt = __kmp_find_task_in_descendant_queue(global_tid, queue);
886 
887  if (pt != NULL) {
888  int ct;
889 
890  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
891  // Make sure data structures in consistent state before querying them
892  // Seems to work without this for digital/alpha, needed for IBM/RS6000
893  KMP_MB();
894 
895  ct = --(queue->tq_ref_count);
896  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
897  global_tid, queue, ct));
898  KMP_DEBUG_ASSERT(ct >= 0);
899 
900  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
901 
902  return pt;
903  }
904 
905  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
906  // Make sure data structures in consistent state before querying them
907  // Seems to work without this for digital/alpha, needed for IBM/RS6000
908  KMP_MB();
909 
910  next = queue->tq_next_child;
911 
912  ct = --(queue->tq_ref_count);
913  KMP_DEBUG_REF_CTS(
914  ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
915  KMP_DEBUG_ASSERT(ct >= 0);
916 
917  queue = next;
918  }
919 
920  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
921  }
922 
923  return pt;
924 }
925 
926 /* Walk up the taskq tree looking for a task to execute. If we get to the root,
927  search the tree for a descendent queue task. Must only be called when
928  in_parallel=TRUE */
929 static kmpc_thunk_t *
930 __kmp_find_task_in_ancestor_queue(kmp_taskq_t *tq, kmp_int32 global_tid,
931  kmpc_task_queue_t *curr_queue) {
932  kmpc_task_queue_t *queue;
933  kmpc_thunk_t *pt;
934 
935  pt = NULL;
936 
937  if (curr_queue->tq.tq_parent != NULL) {
938  queue = curr_queue->tq.tq_parent;
939 
940  while (queue != NULL) {
941  if (queue->tq.tq_parent != NULL) {
942  int ct;
943  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
944  // Make sure data structures in consistent state before querying them
945  // Seems to work without this for digital/alpha, needed for IBM/RS6000
946  KMP_MB();
947 
948  ct = ++(queue->tq_ref_count);
949  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
950  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p inc %d\n", __LINE__,
951  global_tid, queue, ct));
952  }
953 
954  pt = __kmp_find_task_in_queue(global_tid, queue);
955  if (pt != NULL) {
956  if (queue->tq.tq_parent != NULL) {
957  int ct;
958  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
959  // Make sure data structures in consistent state before querying them
960  // Seems to work without this for digital/alpha, needed for IBM/RS6000
961  KMP_MB();
962 
963  ct = --(queue->tq_ref_count);
964  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
965  global_tid, queue, ct));
966  KMP_DEBUG_ASSERT(ct >= 0);
967 
968  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
969  }
970 
971  return pt;
972  }
973 
974  if (queue->tq.tq_parent != NULL) {
975  int ct;
976  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
977  // Make sure data structures in consistent state before querying them
978  // Seems to work without this for digital/alpha, needed for IBM/RS6000
979  KMP_MB();
980 
981  ct = --(queue->tq_ref_count);
982  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p dec %d\n", __LINE__,
983  global_tid, queue, ct));
984  KMP_DEBUG_ASSERT(ct >= 0);
985  }
986  queue = queue->tq.tq_parent;
987 
988  if (queue != NULL)
989  __kmp_release_lock(&queue->tq_link_lck, global_tid);
990  }
991  }
992 
993  pt = __kmp_find_task_in_descendant_queue(global_tid, tq->tq_root);
994 
995  return pt;
996 }
997 
998 static int __kmp_taskq_tasks_finished(kmpc_task_queue_t *queue) {
999  int i;
1000 
1001  /* KMP_MB(); */ /* is this really necessary? */
1002 
1003  for (i = 0; i < queue->tq_nproc; i++) {
1004  if (queue->tq_th_thunks[i].ai_data != 0)
1005  return FALSE;
1006  }
1007 
1008  return TRUE;
1009 }
1010 
1011 static int __kmp_taskq_has_any_children(kmpc_task_queue_t *queue) {
1012  return (queue->tq_first_child != NULL);
1013 }
1014 
1015 static void __kmp_remove_queue_from_tree(kmp_taskq_t *tq, kmp_int32 global_tid,
1016  kmpc_task_queue_t *queue,
1017  int in_parallel) {
1018 #ifdef KMP_DEBUG
1019  kmp_int32 i;
1020  kmpc_thunk_t *thunk;
1021 #endif
1022 
1023  KF_TRACE(50,
1024  ("Before Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1025  KF_DUMP(50, __kmp_dump_task_queue(tq, queue, global_tid));
1026 
1027  /* sub-queue in a recursion, not the root task queue */
1028  KMP_DEBUG_ASSERT(queue->tq.tq_parent != NULL);
1029 
1030  if (in_parallel) {
1031  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1032  // Make sure data structures are in consistent state before querying them
1033  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
1034  KMP_MB();
1035  }
1036 
1037  KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);
1038 
1039  /* unlink queue from its siblings if any at this level */
1040  if (queue->tq_prev_child != NULL)
1041  queue->tq_prev_child->tq_next_child = queue->tq_next_child;
1042  if (queue->tq_next_child != NULL)
1043  queue->tq_next_child->tq_prev_child = queue->tq_prev_child;
1044  if (queue->tq.tq_parent->tq_first_child == queue)
1045  queue->tq.tq_parent->tq_first_child = queue->tq_next_child;
1046 
1047  queue->tq_prev_child = NULL;
1048  queue->tq_next_child = NULL;
1049 
1050  if (in_parallel) {
1051  KMP_DEBUG_REF_CTS(
1052  ("line %d gtid %d: Q %p waiting for ref_count of %d to reach 1\n",
1053  __LINE__, global_tid, queue, queue->tq_ref_count));
1054 
1055  /* wait until all other threads have stopped accessing this queue */
1056  while (queue->tq_ref_count > 1) {
1057  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1058 
1059  KMP_WAIT_YIELD((volatile kmp_uint32 *)&queue->tq_ref_count, 1, KMP_LE,
1060  NULL);
1061 
1062  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1063  // Make sure data structures are in consistent state before querying them
1064  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1065  KMP_MB();
1066  }
1067 
1068  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1069  }
1070 
1071  KMP_DEBUG_REF_CTS(
1072  ("line %d gtid %d: Q %p freeing queue\n", __LINE__, global_tid, queue));
1073 
1074 #ifdef KMP_DEBUG
1075  KMP_DEBUG_ASSERT(queue->tq_flags & TQF_ALL_TASKS_QUEUED);
1076  KMP_DEBUG_ASSERT(queue->tq_nfull == 0);
1077 
1078  for (i = 0; i < queue->tq_nproc; i++) {
1079  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1080  }
1081 
1082  i = 0;
1083  for (thunk = queue->tq_free_thunks; thunk != NULL;
1084  thunk = thunk->th.th_next_free)
1085  ++i;
1086 
1087  KMP_ASSERT(i ==
1088  queue->tq_nslots + (queue->tq_nproc * __KMP_TASKQ_THUNKS_PER_TH));
1089 #endif
1090 
1091  /* release storage for queue entry */
1092  __kmp_free_taskq(tq, queue, TRUE, global_tid);
1093 
1094  KF_TRACE(50, ("After Deletion of TaskQ at %p on (%d):\n", queue, global_tid));
1095  KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1096 }
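/* Illustrative sketch only (an addition, not part of the original runtime):
   the unlink step above removes a queue from its parent's doubly linked list
   of children (tq_first_child / tq_next_child / tq_prev_child).  The generic
   version below uses hypothetical node and parent types; the reference-count
   wait, locking and storage release are omitted. */
#if 0 /* example only; never compiled into the library */
typedef struct toy_child {
  struct toy_child *next, *prev; /* like tq_next_child / tq_prev_child */
  struct toy_parent *parent;
} toy_child_t;

typedef struct toy_parent {
  struct toy_child *first_child; /* like tq_first_child */
} toy_parent_t;

static void toy_unlink_child(toy_child_t *c) {
  if (c->prev != NULL)
    c->prev->next = c->next;
  if (c->next != NULL)
    c->next->prev = c->prev;
  if (c->parent->first_child == c)
    c->parent->first_child = c->next;
  c->prev = NULL;
  c->next = NULL;
}
#endif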
1097 
1098 /* Starting from indicated queue, proceed downward through tree and remove all
1099  taskqs which are finished, but only go down to taskqs which have the "nowait"
1100  clause present. Assume this is only called when in_parallel=TRUE. */
1101 
1102 static void __kmp_find_and_remove_finished_child_taskq(
1103  kmp_taskq_t *tq, kmp_int32 global_tid, kmpc_task_queue_t *curr_queue) {
1104  kmpc_task_queue_t *queue = curr_queue;
1105 
1106  if (curr_queue->tq_first_child != NULL) {
1107  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1108  // Make sure data structures are in consistent state before querying them
1109  // Seems to work without this call for digital/alpha, needed for IBM/RS6000
1110  KMP_MB();
1111 
1112  queue = CCAST(kmpc_task_queue_t *, curr_queue->tq_first_child);
1113  if (queue == NULL) {
1114  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1115  return;
1116  }
1117 
1118  while (queue != NULL) {
1119  kmpc_task_queue_t *next;
1120  int ct = ++(queue->tq_ref_count);
1121  KMP_DEBUG_REF_CTS(
1122  ("line %d gtid %d: Q %p inc %d\n", __LINE__, global_tid, queue, ct));
1123 
1124  /* although reference count stays active during descendant walk, */
1125  /* shouldn't matter since if children still exist, reference */
1126  /* counts aren't being monitored anyway */
1127 
1128  if (queue->tq_flags & TQF_IS_NOWAIT) {
1129  __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1130 
1131  if ((queue->tq_flags & TQF_ALL_TASKS_QUEUED) &&
1132  (queue->tq_nfull == 0) && __kmp_taskq_tasks_finished(queue) &&
1133  !__kmp_taskq_has_any_children(queue)) {
1134 
1135  /* Only remove this if we have not already marked it for deallocation.
1136  This should prevent multiple threads from trying to free this. */
1137 
1138  if (__kmp_test_lock(&queue->tq_queue_lck, global_tid)) {
1139  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
1140  queue->tq_flags |= TQF_DEALLOCATED;
1141  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1142 
1143  __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
1144 
1145  /* Can't do any more here since we can't be sure where the sibling
1146  * queue is, so just exit this level */
1147  return;
1148  } else {
1149  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1150  }
1151  }
1152  /* otherwise, just fall through and decrement reference count */
1153  }
1154  }
1155 
1156  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1157  // Make sure data structures are in consistent state before querying them
1158  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1159  KMP_MB();
1160 
1161  next = queue->tq_next_child;
1162 
1163  ct = --(queue->tq_ref_count);
1164  KMP_DEBUG_REF_CTS(
1165  ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
1166  KMP_DEBUG_ASSERT(ct >= 0);
1167 
1168  queue = next;
1169  }
1170 
1171  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1172  }
1173 }
1174 
1175 /* Starting from indicated queue, proceed downward through tree and remove all
1176  taskqs, assuming all are finished and assuming NO other threads are executing
1177  at this point. */
1178 static void __kmp_remove_all_child_taskq(kmp_taskq_t *tq, kmp_int32 global_tid,
1179  kmpc_task_queue_t *queue) {
1180  kmpc_task_queue_t *next_child;
1181 
1182  queue = CCAST(kmpc_task_queue_t *, queue->tq_first_child);
1183 
1184  while (queue != NULL) {
1185  __kmp_remove_all_child_taskq(tq, global_tid, queue);
1186 
1187  next_child = queue->tq_next_child;
1188  queue->tq_flags |= TQF_DEALLOCATED;
1189  __kmp_remove_queue_from_tree(tq, global_tid, queue, FALSE);
1190  queue = next_child;
1191  }
1192 }
1193 
1194 static void __kmp_execute_task_from_queue(kmp_taskq_t *tq, ident_t *loc,
1195  kmp_int32 global_tid,
1196  kmpc_thunk_t *thunk,
1197  int in_parallel) {
1198  kmpc_task_queue_t *queue = thunk->th.th_shareds->sv_queue;
1199  kmp_int32 tid = __kmp_tid_from_gtid(global_tid);
1200 
1201  KF_TRACE(100, ("After dequeueing this Task on (%d):\n", global_tid));
1202  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1203  KF_TRACE(100, ("Task Queue: %p looks like this (%d):\n", queue, global_tid));
1204  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1205 
1206  /* For the taskq task, the curr_thunk pushes and pop pairs are set up as
1207  * follows:
1208  *
1209  * happens exactly once:
1210  * 1) __kmpc_taskq : push (if returning thunk only)
1211  * 4) __kmpc_end_taskq_task : pop
1212  *
1213  * optionally happens *each* time taskq task is dequeued/enqueued:
1214  * 2) __kmpc_taskq_task : pop
1215  * 3) __kmp_execute_task_from_queue : push
1216  *
1217  * execution ordering: 1,(2,3)*,4
1218  */
1219 
1220  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1221  kmp_int32 index = (queue == tq->tq_root) ? tid : 0;
1222  thunk->th.th_shareds =
1223  CCAST(kmpc_shared_vars_t *, queue->tq_shareds[index].ai_data);
1224 
1225  if (__kmp_env_consistency_check) {
1226  __kmp_push_workshare(global_tid,
1227  (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered
1228  : ct_task,
1229  queue->tq_loc);
1230  }
1231  } else {
1232  if (__kmp_env_consistency_check)
1233  __kmp_push_workshare(global_tid, ct_taskq, queue->tq_loc);
1234  }
1235 
1236  if (in_parallel) {
1237  thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1238  tq->tq_curr_thunk[tid] = thunk;
1239 
1240  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1241  }
1242 
1243  KF_TRACE(50, ("Begin Executing Thunk %p from queue %p on (%d)\n", thunk,
1244  queue, global_tid));
1245  thunk->th_task(global_tid, thunk);
1246  KF_TRACE(50, ("End Executing Thunk %p from queue %p on (%d)\n", thunk, queue,
1247  global_tid));
1248 
1249  if (!(thunk->th_flags & TQF_TASKQ_TASK)) {
1250  if (__kmp_env_consistency_check)
1251  __kmp_pop_workshare(global_tid,
1252  (queue->tq_flags & TQF_IS_ORDERED) ? ct_task_ordered
1253  : ct_task,
1254  queue->tq_loc);
1255 
1256  if (in_parallel) {
1257  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1258  thunk->th_encl_thunk = NULL;
1259  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1260  }
1261 
1262  if ((thunk->th_flags & TQF_IS_ORDERED) && in_parallel) {
1263  __kmp_taskq_check_ordered(global_tid, thunk);
1264  }
1265 
1266  __kmp_free_thunk(queue, thunk, in_parallel, global_tid);
1267 
1268  KF_TRACE(100, ("T#%d After freeing thunk: %p, TaskQ looks like this:\n",
1269  global_tid, thunk));
1270  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1271 
1272  if (in_parallel) {
1273  KMP_MB(); /* needed so thunk put on free list before outstanding thunk
1274  count is decremented */
1275 
1276  KMP_DEBUG_ASSERT(queue->tq_th_thunks[tid].ai_data >= 1);
1277 
1278  KF_TRACE(
1279  200,
1280  ("__kmp_execute_task_from_queue: T#%d has %d thunks in queue %p\n",
1281  global_tid, queue->tq_th_thunks[tid].ai_data - 1, queue));
1282 
1283  queue->tq_th_thunks[tid].ai_data--;
1284 
1285  /* KMP_MB(); */ /* is MB really necessary ? */
1286  }
1287 
1288  if (queue->tq.tq_parent != NULL && in_parallel) {
1289  int ct;
1290  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1291  ct = --(queue->tq_ref_count);
1292  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1293  KMP_DEBUG_REF_CTS(
1294  ("line %d gtid %d: Q %p dec %d\n", __LINE__, global_tid, queue, ct));
1295  KMP_DEBUG_ASSERT(ct >= 0);
1296  }
1297  }
1298 }
1299 
1300 /* starts a taskq; creates and returns a thunk for the taskq_task */
1301 /* also, returns pointer to shared vars for this thread in "shareds" arg */
1302 kmpc_thunk_t *__kmpc_taskq(ident_t *loc, kmp_int32 global_tid,
1303  kmpc_task_t taskq_task, size_t sizeof_thunk,
1304  size_t sizeof_shareds, kmp_int32 flags,
1305  kmpc_shared_vars_t **shareds) {
1306  int in_parallel;
1307  kmp_int32 nslots, nthunks, nshareds, nproc;
1308  kmpc_task_queue_t *new_queue, *curr_queue;
1309  kmpc_thunk_t *new_taskq_thunk;
1310  kmp_info_t *th;
1311  kmp_team_t *team;
1312  kmp_taskq_t *tq;
1313  kmp_int32 tid;
1314 
1315  KE_TRACE(10, ("__kmpc_taskq called (%d)\n", global_tid));
1316 
1317  th = __kmp_threads[global_tid];
1318  team = th->th.th_team;
1319  tq = &team->t.t_taskq;
1320  nproc = team->t.t_nproc;
1321  tid = __kmp_tid_from_gtid(global_tid);
1322 
1323  /* find out whether this is a parallel taskq or serialized one. */
1324  in_parallel = in_parallel_context(team);
1325 
1326  if (!tq->tq_root) {
1327  if (in_parallel) {
1328  /* Vector ORDERED SECTION to taskq version */
1329  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1330 
1331  /* Vector ORDERED SECTION to taskq version */
1332  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1333  }
1334 
1335  if (in_parallel) {
1336  // This shouldn't be a barrier region boundary; it would confuse the user.
1337  /* Need the boundary to be at the end of the taskq instead. */
1338  if (__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
1339  /* Creating the active root queue, and we are not the master thread. */
1340  /* The master thread below created the queue and tasks have been */
1341  /* enqueued, and the master thread released this barrier. This */
1342  /* worker thread can now proceed and execute tasks. See also the */
1343  /* TQF_RELEASE_WORKERS which is used to handle this case. */
1344  *shareds =
1345  CCAST(kmpc_shared_vars_t *, tq->tq_root->tq_shareds[tid].ai_data);
1346  KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));
1347 
1348  return NULL;
1349  }
1350  }
1351 
1352  /* master thread only executes this code */
1353  if (tq->tq_curr_thunk_capacity < nproc) {
1354  if (tq->tq_curr_thunk)
1355  __kmp_free(tq->tq_curr_thunk);
1356  else {
1357  /* only need to do this once at outer level, i.e. when tq_curr_thunk is
1358  * still NULL */
1359  __kmp_init_lock(&tq->tq_freelist_lck);
1360  }
1361 
1362  tq->tq_curr_thunk =
1363  (kmpc_thunk_t **)__kmp_allocate(nproc * sizeof(kmpc_thunk_t *));
1364  tq->tq_curr_thunk_capacity = nproc;
1365  }
1366 
1367  if (in_parallel)
1368  tq->tq_global_flags = TQF_RELEASE_WORKERS;
1369  }
1370 
1371  /* dkp: in future, if flags & TQF_HEURISTICS, will choose nslots based */
1372  /* on some heuristics (e.g., depth of queue nesting?). */
1373  nslots = (in_parallel) ? (2 * nproc) : 1;
1374 
1375  /* There must be nproc * __KMP_TASKQ_THUNKS_PER_TH extra slots for pending */
1376  /* jobs being executed by other threads, and one extra for taskq slot */
1377  nthunks = (in_parallel) ? (nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH) + 1)
1378  : nslots + 2;
1379 
1380  /* Only the root taskq gets a per-thread array of shareds. */
1381  /* The rest of the taskqs only get one copy of the shared vars. */
1382  nshareds = (!tq->tq_root && in_parallel) ? nproc : 1;
1383 
1384  /* create overall queue data structure and its components that require
1385  * allocation */
1386  new_queue = __kmp_alloc_taskq(tq, in_parallel, nslots, nthunks, nshareds,
1387  nproc, sizeof_thunk, sizeof_shareds,
1388  &new_taskq_thunk, global_tid);
1389 
1390  /* rest of new_queue initializations */
1391  new_queue->tq_flags = flags & TQF_INTERFACE_FLAGS;
1392 
1393  if (in_parallel) {
1394  new_queue->tq_tasknum_queuing = 0;
1395  new_queue->tq_tasknum_serving = 0;
1396  new_queue->tq_flags |= TQF_PARALLEL_CONTEXT;
1397  }
1398 
1399  new_queue->tq_taskq_slot = NULL;
1400  new_queue->tq_nslots = nslots;
1401  new_queue->tq_hiwat = HIGH_WATER_MARK(nslots);
1402  new_queue->tq_nfull = 0;
1403  new_queue->tq_head = 0;
1404  new_queue->tq_tail = 0;
1405  new_queue->tq_loc = loc;
1406 
1407  if ((new_queue->tq_flags & TQF_IS_ORDERED) && in_parallel) {
1408  /* prepare to serve the first-queued task's ORDERED directive */
1409  new_queue->tq_tasknum_serving = 1;
1410 
1411  /* Vector ORDERED SECTION to taskq version */
1412  th->th.th_dispatch->th_deo_fcn = __kmp_taskq_eo;
1413 
1414  /* Vector ORDERED SECTION to taskq version */
1415  th->th.th_dispatch->th_dxo_fcn = __kmp_taskq_xo;
1416  }
1417 
1418  /* create a new thunk for the taskq_task in the new_queue */
1419  *shareds = CCAST(kmpc_shared_vars_t *, new_queue->tq_shareds[0].ai_data);
1420 
1421  new_taskq_thunk->th.th_shareds = *shareds;
1422  new_taskq_thunk->th_task = taskq_task;
1423  new_taskq_thunk->th_flags = new_queue->tq_flags | TQF_TASKQ_TASK;
1424  new_taskq_thunk->th_status = 0;
1425 
1426  KMP_DEBUG_ASSERT(new_taskq_thunk->th_flags & TQF_TASKQ_TASK);
1427 
1428  // Make sure these inits complete before threads start using this queue
1429  /* KMP_MB(); */ // (necessary?)
1430 
1431  /* insert the new task queue into the tree, but only after all fields
1432  * initialized */
1433 
1434  if (in_parallel) {
1435  if (!tq->tq_root) {
1436  new_queue->tq.tq_parent = NULL;
1437  new_queue->tq_first_child = NULL;
1438  new_queue->tq_next_child = NULL;
1439  new_queue->tq_prev_child = NULL;
1440  new_queue->tq_ref_count = 1;
1441  tq->tq_root = new_queue;
1442  } else {
1443  curr_queue = tq->tq_curr_thunk[tid]->th.th_shareds->sv_queue;
1444  new_queue->tq.tq_parent = curr_queue;
1445  new_queue->tq_first_child = NULL;
1446  new_queue->tq_prev_child = NULL;
1447  new_queue->tq_ref_count =
1448  1; /* for the thread that built the queue */
1449 
1450  KMP_DEBUG_REF_CTS(("line %d gtid %d: Q %p alloc %d\n", __LINE__,
1451  global_tid, new_queue, new_queue->tq_ref_count));
1452 
1453  __kmp_acquire_lock(&curr_queue->tq_link_lck, global_tid);
1454 
1455  // Make sure data structures are in consistent state before querying them
1456  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1457  KMP_MB();
1458 
1459  new_queue->tq_next_child =
1460  CCAST(struct kmpc_task_queue_t *, curr_queue->tq_first_child);
1461 
1462  if (curr_queue->tq_first_child != NULL)
1463  curr_queue->tq_first_child->tq_prev_child = new_queue;
1464 
1465  curr_queue->tq_first_child = new_queue;
1466 
1467  __kmp_release_lock(&curr_queue->tq_link_lck, global_tid);
1468  }
1469 
1470  /* set up thunk stack only after code that determines curr_queue above */
1471  new_taskq_thunk->th_encl_thunk = tq->tq_curr_thunk[tid];
1472  tq->tq_curr_thunk[tid] = new_taskq_thunk;
1473 
1474  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1475  } else {
1476  new_taskq_thunk->th_encl_thunk = 0;
1477  new_queue->tq.tq_parent = NULL;
1478  new_queue->tq_first_child = NULL;
1479  new_queue->tq_next_child = NULL;
1480  new_queue->tq_prev_child = NULL;
1481  new_queue->tq_ref_count = 1;
1482  }
1483 
1484 #ifdef KMP_DEBUG
1485  KF_TRACE(150, ("Creating TaskQ Task on (%d):\n", global_tid));
1486  KF_DUMP(150, __kmp_dump_thunk(tq, new_taskq_thunk, global_tid));
1487 
1488  if (in_parallel) {
1489  KF_TRACE(25,
1490  ("After TaskQ at %p Creation on (%d):\n", new_queue, global_tid));
1491  } else {
1492  KF_TRACE(25, ("After Serial TaskQ at %p Creation on (%d):\n", new_queue,
1493  global_tid));
1494  }
1495 
1496  KF_DUMP(25, __kmp_dump_task_queue(tq, new_queue, global_tid));
1497 
1498  if (in_parallel) {
1499  KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1500  }
1501 #endif /* KMP_DEBUG */
1502 
1503  if (__kmp_env_consistency_check)
1504  __kmp_push_workshare(global_tid, ct_taskq, new_queue->tq_loc);
1505 
1506  KE_TRACE(10, ("__kmpc_taskq return (%d)\n", global_tid));
1507 
1508  return new_taskq_thunk;
1509 }
1510 
1511 /* ends a taskq; last thread out destroys the queue */
1512 
1513 void __kmpc_end_taskq(ident_t *loc, kmp_int32 global_tid,
1514  kmpc_thunk_t *taskq_thunk) {
1515 #ifdef KMP_DEBUG
1516  kmp_int32 i;
1517 #endif
1518  kmp_taskq_t *tq;
1519  int in_parallel;
1520  kmp_info_t *th;
1521  kmp_int32 is_outermost;
1522  kmpc_task_queue_t *queue;
1523  kmpc_thunk_t *thunk;
1524  int nproc;
1525 
1526  KE_TRACE(10, ("__kmpc_end_taskq called (%d)\n", global_tid));
1527 
1528  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1529  nproc = __kmp_threads[global_tid]->th.th_team->t.t_nproc;
1530 
1531  /* For the outermost taskq only, all but one thread will have taskq_thunk ==
1532  * NULL */
1533  queue = (taskq_thunk == NULL) ? tq->tq_root
1534  : taskq_thunk->th.th_shareds->sv_queue;
1535 
1536  KE_TRACE(50, ("__kmpc_end_taskq queue=%p (%d) \n", queue, global_tid));
1537  is_outermost = (queue == tq->tq_root);
1538  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1539 
1540  if (in_parallel) {
1541  kmp_uint32 spins;
1542 
1543  /* this is just a safeguard to release the waiting threads if */
1544  /* the outermost taskq never queues a task */
1545 
1546  if (is_outermost && (KMP_MASTER_GTID(global_tid))) {
1547  if (tq->tq_global_flags & TQF_RELEASE_WORKERS) {
1548  /* no lock needed, workers are still in spin mode */
1549  tq->tq_global_flags &= ~TQF_RELEASE_WORKERS;
1550 
1551  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1552  }
1553  }
1554 
1555  /* keep dequeueing work until all tasks are queued and dequeued */
1556 
1557  do {
1558  /* wait until something is available to dequeue */
1559  KMP_INIT_YIELD(spins);
1560 
1561  while ((queue->tq_nfull == 0) && (queue->tq_taskq_slot == NULL) &&
1562  (!__kmp_taskq_has_any_children(queue)) &&
1563  (!(queue->tq_flags & TQF_ALL_TASKS_QUEUED))) {
1564  KMP_YIELD_WHEN(TRUE, spins);
1565  }
1566 
1567  /* check to see if we can execute tasks in the queue */
1568  while (((queue->tq_nfull != 0) || (queue->tq_taskq_slot != NULL)) &&
1569  (thunk = __kmp_find_task_in_queue(global_tid, queue)) != NULL) {
1570  KF_TRACE(50, ("Found thunk: %p in primary queue %p (%d)\n", thunk,
1571  queue, global_tid));
1572  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1573  }
1574 
1575  /* see if work can be found in a descendant queue */
1576  if ((__kmp_taskq_has_any_children(queue)) &&
1577  (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
1578  NULL) {
1579 
1580  KF_TRACE(50,
1581  ("Stole thunk: %p in descendant queue: %p while waiting in "
1582  "queue: %p (%d)\n",
1583  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1584 
1585  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1586  }
1587 
1588  } while ((!(queue->tq_flags & TQF_ALL_TASKS_QUEUED)) ||
1589  (queue->tq_nfull != 0));
1590 
1591  KF_TRACE(50, ("All tasks queued and dequeued in queue: %p (%d)\n", queue,
1592  global_tid));
1593 
1594  /* keep executing stolen work while not all tasks are finished and more
1595  work can be found in descendant queues */
1596 
1597  while ((!__kmp_taskq_tasks_finished(queue)) &&
1598  (thunk = __kmp_find_task_in_descendant_queue(global_tid, queue)) !=
1599  NULL) {
1600 
1601  KF_TRACE(50, ("Stole thunk: %p in descendant queue: %p while waiting in "
1602  "queue: %p (%d)\n",
1603  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1604 
1605  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1606  }
1607 
1608  KF_TRACE(50, ("No work found in descendant queues or all work finished in "
1609  "queue: %p (%d)\n",
1610  queue, global_tid));
1611 
1612  if (!is_outermost) {
1613  /* need to return if NOWAIT present and not outermost taskq */
1614 
1615  if (queue->tq_flags & TQF_IS_NOWAIT) {
1616  __kmp_acquire_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1617  queue->tq_ref_count--;
1618  KMP_DEBUG_ASSERT(queue->tq_ref_count >= 0);
1619  __kmp_release_lock(&queue->tq.tq_parent->tq_link_lck, global_tid);
1620 
1621  KE_TRACE(
1622  10, ("__kmpc_end_taskq return for nowait case (%d)\n", global_tid));
1623 
1624  return;
1625  }
1626 
1627  __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1628 
1629  /* WAIT until all tasks are finished and no child queues exist before
1630  * proceeding */
1631  KMP_INIT_YIELD(spins);
1632 
1633  while (!__kmp_taskq_tasks_finished(queue) ||
1634  __kmp_taskq_has_any_children(queue)) {
1635  thunk = __kmp_find_task_in_ancestor_queue(tq, global_tid, queue);
1636 
1637  if (thunk != NULL) {
1638  KF_TRACE(50,
1639  ("Stole thunk: %p in ancestor queue: %p while waiting in "
1640  "queue: %p (%d)\n",
1641  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1642  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk,
1643  in_parallel);
1644  }
1645 
1646  KMP_YIELD_WHEN(thunk == NULL, spins);
1647 
1648  __kmp_find_and_remove_finished_child_taskq(tq, global_tid, queue);
1649  }
1650 
1651  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1652  if (!(queue->tq_flags & TQF_DEALLOCATED)) {
1653  queue->tq_flags |= TQF_DEALLOCATED;
1654  }
1655  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1656 
1657  /* only the allocating thread can deallocate the queue */
1658  if (taskq_thunk != NULL) {
1659  __kmp_remove_queue_from_tree(tq, global_tid, queue, TRUE);
1660  }
1661 
1662  KE_TRACE(
1663  10,
1664  ("__kmpc_end_taskq return for non_outermost queue, wait case (%d)\n",
1665  global_tid));
1666 
1667  return;
1668  }
1669 
1670  // Outermost Queue: steal work from descendants until all tasks are finished
1671 
1672  KMP_INIT_YIELD(spins);
1673 
1674  while (!__kmp_taskq_tasks_finished(queue)) {
1675  thunk = __kmp_find_task_in_descendant_queue(global_tid, queue);
1676 
1677  if (thunk != NULL) {
1678  KF_TRACE(50,
1679  ("Stole thunk: %p in descendant queue: %p while waiting in "
1680  "queue: %p (%d)\n",
1681  thunk, thunk->th.th_shareds->sv_queue, queue, global_tid));
1682 
1683  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1684  }
1685 
1686  KMP_YIELD_WHEN(thunk == NULL, spins);
1687  }
1688 
1689  /* Need this barrier to prevent destruction of queue before threads have all
1690  * executed above code */
1691  /* This may need to be done earlier when NOWAIT is implemented for the
1692  * outermost level */
1693 
1694  if (!__kmp_barrier(bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL)) {
1695  /* the queue->tq_flags & TQF_IS_NOWAIT case is not yet handled here; */
1696  /* for right now, everybody waits, and the master thread destroys the */
1697  /* remaining queues. */
1698 
1699  __kmp_remove_all_child_taskq(tq, global_tid, queue);
1700 
1701  /* Now destroy the root queue */
1702  KF_TRACE(100, ("T#%d Before Deletion of top-level TaskQ at %p:\n",
1703  global_tid, queue));
1704  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1705 
1706 #ifdef KMP_DEBUG
1707  /* the root queue entry */
1708  KMP_DEBUG_ASSERT((queue->tq.tq_parent == NULL) &&
1709  (queue->tq_next_child == NULL));
1710 
1711  /* children must all be gone by now because of barrier above */
1712  KMP_DEBUG_ASSERT(queue->tq_first_child == NULL);
1713 
1714  for (i = 0; i < nproc; i++) {
1715  KMP_DEBUG_ASSERT(queue->tq_th_thunks[i].ai_data == 0);
1716  }
1717 
1718  for (i = 0, thunk = queue->tq_free_thunks; thunk != NULL;
1719  i++, thunk = thunk->th.th_next_free)
1720  ;
1721 
1722  KMP_DEBUG_ASSERT(i ==
1723  queue->tq_nslots + (nproc * __KMP_TASKQ_THUNKS_PER_TH));
1724 
1725  for (i = 0; i < nproc; i++) {
1726  KMP_DEBUG_ASSERT(!tq->tq_curr_thunk[i]);
1727  }
1728 #endif
1729  /* unlink the root queue entry */
1730  tq->tq_root = NULL;
1731 
1732  /* release storage for root queue entry */
1733  KF_TRACE(50, ("After Deletion of top-level TaskQ at %p on (%d):\n", queue,
1734  global_tid));
1735 
1736  queue->tq_flags |= TQF_DEALLOCATED;
1737  __kmp_free_taskq(tq, queue, in_parallel, global_tid);
1738 
1739  KF_DUMP(50, __kmp_dump_task_queue_tree(tq, tq->tq_root, global_tid));
1740 
1741  /* release the workers now that the data structures are up to date */
1742  __kmp_end_split_barrier(bs_plain_barrier, global_tid);
1743  }
1744 
1745  th = __kmp_threads[global_tid];
1746 
1747  /* Reset ORDERED SECTION entry handling to the parallel version */
1748  th->th.th_dispatch->th_deo_fcn = 0;
1749 
1750  /* Reset ORDERED SECTION exit handling to the parallel version */
1751  th->th.th_dispatch->th_dxo_fcn = 0;
1752  } else {
1753  /* in serial execution context, dequeue the last task */
1754  /* and execute it, if there were any tasks encountered */
1755 
1756  if (queue->tq_nfull > 0) {
1757  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1758 
1759  thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1760 
1761  if (queue->tq_flags & TQF_IS_LAST_TASK) {
1762  /* The taskq was TQF_IS_LASTPRIVATE, only one task is left in the queue, and */
1763  /* __kmpc_end_taskq_task() has already run, so this is the last task; run it */
1764  /* with TQF_IS_LAST_TASK set so the instrumentation does the copy-out. */
1765 
1766  /* no need for test_then_or call since already locked */
1767  thunk->th_flags |= TQF_IS_LAST_TASK;
1768  }
1769 
1770  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
1771  thunk, queue));
1772 
1773  __kmp_execute_task_from_queue(tq, loc, global_tid, thunk, in_parallel);
1774  }
1775 
1776  // destroy the unattached serial queue now that there is no more work to do
1777  KF_TRACE(100, ("Before Deletion of Serialized TaskQ at %p on (%d):\n",
1778  queue, global_tid));
1779  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1780 
1781 #ifdef KMP_DEBUG
1782  i = 0;
1783  for (thunk = queue->tq_free_thunks; thunk != NULL;
1784  thunk = thunk->th.th_next_free)
1785  ++i;
1786  KMP_DEBUG_ASSERT(i == queue->tq_nslots + 1);
1787 #endif
1788  /* release storage for unattached serial queue */
1789  KF_TRACE(50,
1790  ("Serialized TaskQ at %p deleted on (%d).\n", queue, global_tid));
1791 
1792  queue->tq_flags |= TQF_DEALLOCATED;
1793  __kmp_free_taskq(tq, queue, in_parallel, global_tid);
1794  }
1795 
1796  KE_TRACE(10, ("__kmpc_end_taskq return (%d)\n", global_tid));
1797 }
1798 
1799 /* Enqueues a task for thunk previously created by __kmpc_task_buffer. */
1800 /* Returns nonzero if the enqueue just filled up the queue. */
1801 
1802 kmp_int32 __kmpc_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk) {
1803  kmp_int32 ret;
1804  kmpc_task_queue_t *queue;
1805  int in_parallel;
1806  kmp_taskq_t *tq;
1807 
1808  KE_TRACE(10, ("__kmpc_task called (%d)\n", global_tid));
1809 
1810  KMP_DEBUG_ASSERT(!(thunk->th_flags &
1811  TQF_TASKQ_TASK)); /* thunk->th_task is a regular task */
1812 
1813  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1814  queue = thunk->th.th_shareds->sv_queue;
1815  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1816 
1817  if (in_parallel && (thunk->th_flags & TQF_IS_ORDERED))
1818  thunk->th_tasknum = ++queue->tq_tasknum_queuing;
1819 
1820  /* For serial execution dequeue the preceding task and execute it, if one
1821  * exists */
1822  /* This cannot be the last task. That one is handled in __kmpc_end_taskq */
1823 
1824  if (!in_parallel && queue->tq_nfull > 0) {
1825  kmpc_thunk_t *prev_thunk;
1826 
1827  KMP_DEBUG_ASSERT(queue->tq_nfull == 1);
1828 
1829  prev_thunk = __kmp_dequeue_task(global_tid, queue, in_parallel);
1830 
1831  KF_TRACE(50, ("T#%d found thunk: %p in serial queue: %p\n", global_tid,
1832  prev_thunk, queue));
1833 
1834  __kmp_execute_task_from_queue(tq, loc, global_tid, prev_thunk, in_parallel);
1835  }
1836 
1837  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private
1838  variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the
1839  task queue is not full and allocates a thunk (which is then passed to
1840  __kmpc_task()). So, the enqueue below should never fail due to a full
1841  queue. */
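  /* A minimal sketch (illustrative only, not code from this library) of the
     sequence described above, as outlined code might issue it for one task;
     task_body, taskq_thunk, and the private-variable initialization are
     hypothetical placeholders here:

       kmpc_thunk_t *t = __kmpc_task_buffer(loc, global_tid, taskq_thunk, task_body);
       // ... initialize the task's private variables held in the thunk ...
       (void)__kmpc_task(loc, global_tid, t);  // enqueue; the queue cannot be full here
  */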
1842 
1843  KF_TRACE(100, ("After enqueueing this Task on (%d):\n", global_tid));
1844  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1845 
1846  ret = __kmp_enqueue_task(tq, global_tid, queue, thunk, in_parallel);
1847 
1848  KF_TRACE(100, ("Task Queue looks like this on (%d):\n", global_tid));
1849  KF_DUMP(100, __kmp_dump_task_queue(tq, queue, global_tid));
1850 
1851  KE_TRACE(10, ("__kmpc_task return (%d)\n", global_tid));
1852 
1853  return ret;
1854 }
1855 
1856 /* enqueues a taskq_task for thunk previously created by __kmpc_taskq */
1857 /* this should never be called unless in a parallel context */
1858 
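/* A minimal, illustrative sketch (assumed lowering, not defined in this file)
   of how outlined code might hand the task-generating thunk back to the
   runtime; tq_thunk stands for the thunk produced earlier by __kmpc_taskq,
   and the status argument is simply recorded in thunk->th_status:

     kmpc_thunk_t *tq_thunk;               // hypothetical: result of __kmpc_taskq
     kmp_int32 status = 0;                 // hypothetical value from the construct
     if (tq_thunk != NULL)                 // all but one thread see NULL here
       __kmpc_taskq_task(loc, global_tid, tq_thunk, status);
*/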
1859 void __kmpc_taskq_task(ident_t *loc, kmp_int32 global_tid, kmpc_thunk_t *thunk,
1860  kmp_int32 status) {
1861  kmpc_task_queue_t *queue;
1862  kmp_taskq_t *tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1863  int tid = __kmp_tid_from_gtid(global_tid);
1864 
1865  KE_TRACE(10, ("__kmpc_taskq_task called (%d)\n", global_tid));
1866  KF_TRACE(100, ("TaskQ Task argument thunk on (%d):\n", global_tid));
1867  KF_DUMP(100, __kmp_dump_thunk(tq, thunk, global_tid));
1868 
1869  queue = thunk->th.th_shareds->sv_queue;
1870 
1871  if (__kmp_env_consistency_check)
1872  __kmp_pop_workshare(global_tid, ct_taskq, loc);
1873 
1874  /* thunk->th_task is the taskq_task */
1875  KMP_DEBUG_ASSERT(thunk->th_flags & TQF_TASKQ_TASK);
1876 
1877  /* not supposed to call __kmpc_taskq_task if it's already enqueued */
1878  KMP_DEBUG_ASSERT(queue->tq_taskq_slot == NULL);
1879 
1880  /* dequeue taskq thunk from curr_thunk stack */
1881  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1882  thunk->th_encl_thunk = NULL;
1883 
1884  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1885 
1886  thunk->th_status = status;
1887 
1888  // Flush thunk->th_status before taskq_task enqueued to avoid race condition
1889  KMP_MB();
1890 
1891  /* enqueue taskq_task in thunk into special slot in queue */
1892  /* GEH - probably don't need to lock taskq slot since only one */
1893  /* thread enqueues & already a lock set at dequeue point */
1894 
1895  queue->tq_taskq_slot = thunk;
1896 
1897  KE_TRACE(10, ("__kmpc_taskq_task return (%d)\n", global_tid));
1898 }
1899 
1900 /* ends a taskq_task; done generating tasks */
1901 
1902 void __kmpc_end_taskq_task(ident_t *loc, kmp_int32 global_tid,
1903  kmpc_thunk_t *thunk) {
1904  kmp_taskq_t *tq;
1905  kmpc_task_queue_t *queue;
1906  int in_parallel;
1907  int tid;
1908 
1909  KE_TRACE(10, ("__kmpc_end_taskq_task called (%d)\n", global_tid));
1910 
1911  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
1912  queue = thunk->th.th_shareds->sv_queue;
1913  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
1914  tid = __kmp_tid_from_gtid(global_tid);
1915 
1916  if (__kmp_env_consistency_check)
1917  __kmp_pop_workshare(global_tid, ct_taskq, loc);
1918 
1919  if (in_parallel) {
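    /* Publish "all tasks have been queued": use an atomic fetch-OR where one
       is available (x86/x86_64); otherwise set the flag under the queue lock. */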
1920 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1921  KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
1922  TQF_ALL_TASKS_QUEUED);
1923 #else
1924  {
1925  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1926 
1927  // Make sure data structures are in a consistent state before querying them
1928  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1929  KMP_MB();
1930 
1931  queue->tq_flags |= TQF_ALL_TASKS_QUEUED;
1932  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1933  }
1934 #endif
1935  }
1936 
1937  if (thunk->th_flags & TQF_IS_LASTPRIVATE) {
1938  /* Normally, __kmp_find_task_in_queue() refuses to schedule the last task in
1939  the queue if TQF_IS_LASTPRIVATE so we can positively identify that last
1940  task and run it with its TQF_IS_LAST_TASK bit turned on in th_flags.
1941  When __kmpc_end_taskq_task() is called we are done generating all the
1942  tasks, so we know the last one in the queue is the lastprivate task.
1943  Mark the queue as having gotten to this state via tq_flags &
1944  TQF_IS_LAST_TASK; when that task actually executes mark it via th_flags &
1945  TQF_IS_LAST_TASK (this th_flags bit signals the instrumented code to do
1946  copy-outs after execution). */
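    /* For intuition only, a source-level sketch in the old Intel work-queuing
       style this path serves (syntax assumed for illustration, not verified):

         #pragma intel omp taskq lastprivate(x)
         for (i = 0; i < n; i++) {
           #pragma intel omp task
           x = f(i);              // only the final task's x must be copied out
         }
    */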
1947  if (!in_parallel) {
1948  /* No synchronization needed for serial context */
1949  queue->tq_flags |= TQF_IS_LAST_TASK;
1950  } else {
1951 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1952  KMP_TEST_THEN_OR32(RCAST(volatile kmp_uint32 *, &queue->tq_flags),
1953  TQF_IS_LAST_TASK);
1954 #else
1955  {
1956  __kmp_acquire_lock(&queue->tq_queue_lck, global_tid);
1957 
1958  // Make sure data structures are in a consistent state before querying them
1959  // Seems to work without this for digital/alpha, needed for IBM/RS6000
1960  KMP_MB();
1961 
1962  queue->tq_flags |= TQF_IS_LAST_TASK;
1963  __kmp_release_lock(&queue->tq_queue_lck, global_tid);
1964  }
1965 #endif
1966  /* to prevent race condition where last task is dequeued but */
1967  /* flag isn't visible yet (not sure about this) */
1968  KMP_MB();
1969  }
1970  }
1971 
1972  /* dequeue taskq thunk from curr_thunk stack */
1973  if (in_parallel) {
1974  tq->tq_curr_thunk[tid] = thunk->th_encl_thunk;
1975  thunk->th_encl_thunk = NULL;
1976 
1977  KF_DUMP(200, __kmp_dump_thunk_stack(tq->tq_curr_thunk[tid], global_tid));
1978  }
1979 
1980  KE_TRACE(10, ("__kmpc_end_taskq_task return (%d)\n", global_tid));
1981 }
1982 
1983 /* returns thunk for a regular task based on taskq_thunk */
1984 /* (__kmpc_taskq_task does the analogous thing for a TQF_TASKQ_TASK) */
1985 
1986 kmpc_thunk_t *__kmpc_task_buffer(ident_t *loc, kmp_int32 global_tid,
1987  kmpc_thunk_t *taskq_thunk, kmpc_task_t task) {
1988  kmp_taskq_t *tq;
1989  kmpc_task_queue_t *queue;
1990  kmpc_thunk_t *new_thunk;
1991  int in_parallel;
1992 
1993  KE_TRACE(10, ("__kmpc_task_buffer called (%d)\n", global_tid));
1994 
1995  KMP_DEBUG_ASSERT(
1996  taskq_thunk->th_flags &
1997  TQF_TASKQ_TASK); /* taskq_thunk->th_task is the taskq_task */
1998 
1999  tq = &__kmp_threads[global_tid]->th.th_team->t.t_taskq;
2000  queue = taskq_thunk->th.th_shareds->sv_queue;
2001  in_parallel = (queue->tq_flags & TQF_PARALLEL_CONTEXT);
2002 
2003  /* The instrumentation sequence is: __kmpc_task_buffer(), initialize private
2004  variables, __kmpc_task(). The __kmpc_task_buffer routine checks that the
2005  task queue is not full and allocates a thunk (which is then passed to
2006  __kmpc_task()). So, we can pre-allocate a thunk here assuming it will be
2007  the next to be enqueued in __kmpc_task(). */
2008 
2009  new_thunk = __kmp_alloc_thunk(queue, in_parallel, global_tid);
2010  new_thunk->th.th_shareds =
2011  CCAST(kmpc_shared_vars_t *, queue->tq_shareds[0].ai_data);
2012  new_thunk->th_encl_thunk = NULL;
2013  new_thunk->th_task = task;
2014 
2015  /* GEH - shouldn't need to lock the read of tq_flags here */
2016  new_thunk->th_flags = queue->tq_flags & TQF_INTERFACE_FLAGS;
2017 
2018  new_thunk->th_status = 0;
2019 
2020  KMP_DEBUG_ASSERT(!(new_thunk->th_flags & TQF_TASKQ_TASK));
2021 
2022  KF_TRACE(100, ("Creating Regular Task on (%d):\n", global_tid));
2023  KF_DUMP(100, __kmp_dump_thunk(tq, new_thunk, global_tid));
2024 
2025  KE_TRACE(10, ("__kmpc_task_buffer return (%d)\n", global_tid));
2026 
2027  return new_thunk;
2028 }
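/* For orientation, a hedged sketch of the order in which the entry points in
   this file are expected to be used by outlined code (the exact protocol is
   driven by the compiler and is not shown here):

     __kmpc_taskq(...)           produces the taskq thunk (NULL on all but one thread)
     __kmpc_taskq_task(...)      enqueues that task-generating thunk
     per task:   __kmpc_task_buffer(...)   then   __kmpc_task(...)
     __kmpc_end_taskq_task(...)  no more tasks will be generated
     __kmpc_end_taskq(...)       drain the queues and tear the taskq down
*/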