1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16  * it may change values between parallel regions. __kmp_max_nth
17  * is the largest value __kmp_nth may take, 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43  kmp_info_t *th;
44 
45  KMP_DEBUG_ASSERT(gtid_ref);
46 
47  if (__kmp_env_consistency_check) {
48  th = __kmp_threads[*gtid_ref];
49  if (th->th.th_root->r.r_active &&
50  (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54  __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56  }
57  }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61  kmp_info_t *th;
62 
63  if (__kmp_env_consistency_check) {
64  th = __kmp_threads[*gtid_ref];
65  if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66  __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67  }
68  }
69 }
70 
71 // Initialize a dispatch_private_info_template<T> buffer for a particular
72 // type of schedule and chunk. The loop description is found in lb (lower bound),
73 // ub (upper bound), and st (stride). nproc is the number of threads relevant
74 // to the scheduling (often the number of threads in a team, but not always if
75 // hierarchical scheduling is used). tid is the id of the thread calling
76 // the function within the group of nproc threads. It will have a value
77 // between 0 and nproc - 1. This is often just the thread id within a team, but
78 // is not necessarily the case when using hierarchical scheduling.
79 // loc is the source file location of the corresponding loop
80 // gtid is the global thread id
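// Illustrative example: for a loop such as
//   #pragma omp for schedule(dynamic, 4)
//   for (int i = 0; i < 100; ++i) { ... }
// the compiler-generated call would typically reach this routine with
// lb=0, ub=99, st=1, chunk=4 and schedule=kmp_sch_dynamic_chunked, while
// nproc and tid describe the scheduling group and this thread's position
// within it.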
81 template <typename T>
82 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
83  dispatch_private_info_template<T> *pr,
84  enum sched_type schedule, T lb, T ub,
85  typename traits_t<T>::signed_t st,
86 #if USE_ITT_BUILD
87  kmp_uint64 *cur_chunk,
88 #endif
89  typename traits_t<T>::signed_t chunk,
90  T nproc, T tid) {
91  typedef typename traits_t<T>::unsigned_t UT;
92  typedef typename traits_t<T>::floating_t DBL;
93 
94  int active;
95  T tc;
96  kmp_info_t *th;
97  kmp_team_t *team;
98 
99 #ifdef KMP_DEBUG
100  typedef typename traits_t<T>::signed_t ST;
101  {
102  char *buff;
103  // create format specifiers before the debug output
104  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
105  "pr:%%p lb:%%%s ub:%%%s st:%%%s "
106  "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
107  traits_t<T>::spec, traits_t<T>::spec,
108  traits_t<ST>::spec, traits_t<ST>::spec,
109  traits_t<T>::spec, traits_t<T>::spec);
110  KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
111  __kmp_str_free(&buff);
112  }
113 #endif
114  /* setup data */
115  th = __kmp_threads[gtid];
116  team = th->th.th_team;
117  active = !team->t.t_serialized;
118 
119 #if USE_ITT_BUILD
120  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
121  __kmp_forkjoin_frames_mode == 3 &&
122  KMP_MASTER_GTID(gtid) &&
123 #if OMP_40_ENABLED
124  th->th.th_teams_microtask == NULL &&
125 #endif
126  team->t.t_active_level == 1;
127 #endif
128 #if (KMP_STATIC_STEAL_ENABLED)
129  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
130  // AC: we now have only one implementation of stealing, so use it
131  schedule = kmp_sch_static_steal;
132  else
133 #endif
134  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
135 
136  /* Pick up the nomerge/ordered bits from the scheduling type */
137  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
138  pr->flags.nomerge = TRUE;
139  schedule =
140  (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
141  } else {
142  pr->flags.nomerge = FALSE;
143  }
144  pr->type_size = traits_t<T>::type_size; // remember the size of variables
145  if (kmp_ord_lower & schedule) {
146  pr->flags.ordered = TRUE;
147  schedule =
148  (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
149  } else {
150  pr->flags.ordered = FALSE;
151  }
152 
153  if (schedule == kmp_sch_static) {
154  schedule = __kmp_static;
155  } else {
156  if (schedule == kmp_sch_runtime) {
157  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
158  // not specified)
159  schedule = team->t.t_sched.r_sched_type;
160  // Detail the schedule if needed (global controls are differentiated
161  // appropriately)
162  if (schedule == kmp_sch_guided_chunked) {
163  schedule = __kmp_guided;
164  } else if (schedule == kmp_sch_static) {
165  schedule = __kmp_static;
166  }
167  // Use the chunk size specified by OMP_SCHEDULE (or default if not
168  // specified)
169  chunk = team->t.t_sched.chunk;
170 #if USE_ITT_BUILD
171  if (cur_chunk)
172  *cur_chunk = chunk;
173 #endif
174 #ifdef KMP_DEBUG
175  {
176  char *buff;
177  // create format specifiers before the debug output
178  buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
179  "schedule:%%d chunk:%%%s\n",
180  traits_t<ST>::spec);
181  KD_TRACE(10, (buff, gtid, schedule, chunk));
182  __kmp_str_free(&buff);
183  }
184 #endif
185  } else {
186  if (schedule == kmp_sch_guided_chunked) {
187  schedule = __kmp_guided;
188  }
189  if (chunk <= 0) {
190  chunk = KMP_DEFAULT_CHUNK;
191  }
192  }
193 
194  if (schedule == kmp_sch_auto) {
195  // mapping and differentiation: in the __kmp_do_serial_initialize()
196  schedule = __kmp_auto;
197 #ifdef KMP_DEBUG
198  {
199  char *buff;
200  // create format specifiers before the debug output
201  buff = __kmp_str_format(
202  "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
203  "schedule:%%d chunk:%%%s\n",
204  traits_t<ST>::spec);
205  KD_TRACE(10, (buff, gtid, schedule, chunk));
206  __kmp_str_free(&buff);
207  }
208 #endif
209  }
210 
211  /* guided analytical not safe for too many threads */
212  if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
213  schedule = kmp_sch_guided_iterative_chunked;
214  KMP_WARNING(DispatchManyThreads);
215  }
216 #if OMP_45_ENABLED
217  if (schedule == kmp_sch_runtime_simd) {
218  // compiler provides simd_width in the chunk parameter
219  schedule = team->t.t_sched.r_sched_type;
220  // Detail the schedule if needed (global controls are differentiated
221  // appropriately)
222  if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
223  schedule == __kmp_static) {
224  schedule = kmp_sch_static_balanced_chunked;
225  } else {
226  if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
227  schedule = kmp_sch_guided_simd;
228  }
229  chunk = team->t.t_sched.chunk * chunk;
230  }
231 #if USE_ITT_BUILD
232  if (cur_chunk)
233  *cur_chunk = chunk;
234 #endif
235 #ifdef KMP_DEBUG
236  {
237  char *buff;
238  // create format specifiers before the debug output
239  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
240  " chunk:%%%s\n",
241  traits_t<ST>::spec);
242  KD_TRACE(10, (buff, gtid, schedule, chunk));
243  __kmp_str_free(&buff);
244  }
245 #endif
246  }
247 #endif // OMP_45_ENABLED
248  pr->u.p.parm1 = chunk;
249  }
250  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
251  "unknown scheduling type");
252 
253  pr->u.p.count = 0;
254 
255  if (__kmp_env_consistency_check) {
256  if (st == 0) {
257  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
258  (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
259  }
260  }
261  // compute trip count
262  if (st == 1) { // most common case
263  if (ub >= lb) {
264  tc = ub - lb + 1;
265  } else { // ub < lb
266  tc = 0; // zero-trip
267  }
268  } else if (st < 0) {
269  if (lb >= ub) {
270  // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
271  // where the division needs to be unsigned regardless of the result type
272  tc = (UT)(lb - ub) / (-st) + 1;
273  } else { // lb < ub
274  tc = 0; // zero-trip
275  }
276  } else { // st > 0
277  if (ub >= lb) {
278  // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
279  // where the division needs to be unsigned regardless of the result type
280  tc = (UT)(ub - lb) / st + 1;
281  } else { // ub < lb
282  tc = 0; // zero-trip
283  }
284  }
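// Worked example: for lb=0, ub=9, st=3 the loop visits 0,3,6,9 and the
// formula gives tc = (9 - 0) / 3 + 1 = 4; for lb=10, ub=1, st=-4 it visits
// 10,6,2 and tc = (10 - 1) / 4 + 1 = 3 (integer division throughout).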
285 
286  pr->u.p.lb = lb;
287  pr->u.p.ub = ub;
288  pr->u.p.st = st;
289  pr->u.p.tc = tc;
290 
291 #if KMP_OS_WINDOWS
292  pr->u.p.last_upper = ub + st;
293 #endif /* KMP_OS_WINDOWS */
294 
295  /* NOTE: only the active parallel region(s) has active ordered sections */
296 
297  if (active) {
298  if (pr->flags.ordered) {
299  pr->ordered_bumped = 0;
300  pr->u.p.ordered_lower = 1;
301  pr->u.p.ordered_upper = 0;
302  }
303  }
304 
305  switch (schedule) {
306 #if (KMP_STATIC_STEAL_ENABLED)
307  case kmp_sch_static_steal: {
308  T ntc, init;
309 
310  KD_TRACE(100,
311  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
312  gtid));
313 
314  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
315  if (nproc > 1 && ntc >= nproc) {
316  KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
317  T id = tid;
318  T small_chunk, extras;
319 
320  small_chunk = ntc / nproc;
321  extras = ntc % nproc;
322 
323  init = id * small_chunk + (id < extras ? id : extras);
324  pr->u.p.count = init;
325  pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
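// Illustrative example: tc=25, chunk=3 gives ntc=9 chunks; with nproc=4,
// small_chunk=2 and extras=1, so the initial chunk ranges are
// thread 0: [0,3), thread 1: [3,5), thread 2: [5,7), thread 3: [7,9).
// count is the next chunk index to execute and ub is the exclusive end of
// the owned range; other threads may later steal from its top.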
326 
327  pr->u.p.parm2 = lb;
328  // pr->pfields.parm3 = 0; // it's not used in static_steal
329  pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
330  pr->u.p.st = st;
331  if (traits_t<T>::type_size > 4) {
332  // AC: TODO: check if 16-byte CAS available and use it to
333  // improve performance (probably wait for explicit request
334  // before spending time on this).
335  // For now use dynamically allocated per-thread lock,
336  // free memory in __kmp_dispatch_next when status==0.
337  KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
338  th->th.th_dispatch->th_steal_lock =
339  (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
340  __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
341  }
342  break;
343  } else {
344  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
345  "kmp_sch_static_balanced\n",
346  gtid));
347  schedule = kmp_sch_static_balanced;
348  /* too few iterations: fall-through to kmp_sch_static_balanced */
349  } // if
350  /* FALL-THROUGH to static balanced */
351  KMP_FALLTHROUGH();
352  } // case
353 #endif
354  case kmp_sch_static_balanced: {
355  T init, limit;
356 
357  KD_TRACE(
358  100,
359  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
360  gtid));
361 
362  if (nproc > 1) {
363  T id = tid;
364 
365  if (tc < nproc) {
366  if (id < tc) {
367  init = id;
368  limit = id;
369  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
370  } else {
371  pr->u.p.count = 1; /* means no more chunks to execute */
372  pr->u.p.parm1 = FALSE;
373  break;
374  }
375  } else {
376  T small_chunk = tc / nproc;
377  T extras = tc % nproc;
378  init = id * small_chunk + (id < extras ? id : extras);
379  limit = init + small_chunk - (id < extras ? 0 : 1);
380  pr->u.p.parm1 = (id == nproc - 1);
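// Illustrative example: tc=10, nproc=4 gives small_chunk=2, extras=2,
// so the per-thread iteration ranges are [0,2], [3,5], [6,7], [8,9];
// the first `extras` threads each take one extra iteration.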
381  }
382  } else {
383  if (tc > 0) {
384  init = 0;
385  limit = tc - 1;
386  pr->u.p.parm1 = TRUE;
387  } else {
388  // zero trip count
389  pr->u.p.count = 1; /* means no more chunks to execute */
390  pr->u.p.parm1 = FALSE;
391  break;
392  }
393  }
394 #if USE_ITT_BUILD
395  // Calculate chunk for metadata report
396  if (itt_need_metadata_reporting)
397  if (cur_chunk)
398  *cur_chunk = limit - init + 1;
399 #endif
400  if (st == 1) {
401  pr->u.p.lb = lb + init;
402  pr->u.p.ub = lb + limit;
403  } else {
404  // ub_tmp is the calculated upper bound; "ub" is the user-defined upper bound
405  T ub_tmp = lb + limit * st;
406  pr->u.p.lb = lb + init * st;
407  // adjust upper bound to "ub" if needed, so that MS lastprivate will match
408  // it exactly
409  if (st > 0) {
410  pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
411  } else {
412  pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
413  }
414  }
415  if (pr->flags.ordered) {
416  pr->u.p.ordered_lower = init;
417  pr->u.p.ordered_upper = limit;
418  }
419  break;
420  } // case
421 #if OMP_45_ENABLED
422  case kmp_sch_static_balanced_chunked: {
423  // similar to balanced, but chunk adjusted to multiple of simd width
424  T nth = nproc;
425  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
426  " -> falling-through to static_greedy\n",
427  gtid));
428  schedule = kmp_sch_static_greedy;
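// The expression below rounds each thread's share, ceil(tc/nth), up to a
// multiple of chunk; the bit trick relies on chunk (here the simd width)
// being a power of two. Illustrative example: tc=100, nth=8, chunk=8 gives
// ceil(100/8)=13, rounded up to parm1=16.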
429  if (nth > 1)
430  pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
431  else
432  pr->u.p.parm1 = tc;
433  break;
434  } // case
435  case kmp_sch_guided_simd:
436 #endif // OMP_45_ENABLED
437  case kmp_sch_guided_iterative_chunked: {
438  KD_TRACE(
439  100,
440  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
441  " case\n",
442  gtid));
443 
444  if (nproc > 1) {
445  if ((2L * chunk + 1) * nproc >= tc) {
446  /* chunk size too large, switch to dynamic */
447  schedule = kmp_sch_dynamic_chunked;
448  } else {
449  // when remaining iters become less than parm2 - switch to dynamic
450  pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
451  *(double *)&pr->u.p.parm3 =
452  guided_flt_param / nproc; // may occupy parm3 and parm4
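// Roughly speaking (with the default tuning constants), parm2 is about
// 2*nproc*(chunk+1) remaining iterations -- the point below which the
// schedule degenerates to plain dynamic chunks -- and the double stored in
// parm3 is the per-grab shrink factor of about 0.5/nproc, used as
// limit = init + remaining*parm3 in __kmp_dispatch_next_algorithm.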
453  }
454  } else {
455  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
456  "kmp_sch_static_greedy\n",
457  gtid));
458  schedule = kmp_sch_static_greedy;
459  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
460  KD_TRACE(
461  100,
462  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
463  gtid));
464  pr->u.p.parm1 = tc;
465  } // if
466  } // case
467  break;
468  case kmp_sch_guided_analytical_chunked: {
469  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
470  "kmp_sch_guided_analytical_chunked case\n",
471  gtid));
472 
473  if (nproc > 1) {
474  if ((2L * chunk + 1) * nproc >= tc) {
475  /* chunk size too large, switch to dynamic */
476  schedule = kmp_sch_dynamic_chunked;
477  } else {
478  /* commonly used term: (2 nproc - 1)/(2 nproc) */
479  DBL x;
480 
481 #if KMP_USE_X87CONTROL
482  /* Linux* OS already has 64-bit computation by default for long double,
483  and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
484  Windows* OS on IA-32 architecture, we need to set precision to 64-bit
485  instead of the default 53-bit. Even though long double doesn't work
486  on Windows* OS on Intel(R) 64, the resulting lack of precision is not
487  expected to impact the correctness of the algorithm, but this has not
488  been mathematically proven. */
489  // save original FPCW and set precision to 64-bit, as
490  // Windows* OS on IA-32 architecture defaults to 53-bit
491  unsigned int oldFpcw = _control87(0, 0);
492  _control87(_PC_64, _MCW_PC); // 0,0x30000
493 #endif
494  /* value used for comparison in solver for cross-over point */
495  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
496 
497  /* crossover point--chunk indexes equal to or greater than
498  this point switch to dynamic-style scheduling */
499  UT cross;
500 
501  /* commonly used term: (2 nproc - 1)/(2 nproc) */
502  x = (long double)1.0 - (long double)0.5 / nproc;
503 
504 #ifdef KMP_DEBUG
505  { // test natural alignment
506  struct _test_a {
507  char a;
508  union {
509  char b;
510  DBL d;
511  };
512  } t;
513  ptrdiff_t natural_alignment =
514  (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
515  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
516  // long)natural_alignment );
517  KMP_DEBUG_ASSERT(
518  (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
519  }
520 #endif // KMP_DEBUG
521 
522  /* save the term in thread private dispatch structure */
523  *(DBL *)&pr->u.p.parm3 = x;
524 
525  /* solve for the crossover point to the nearest integer i for which C_i
526  <= chunk */
527  {
528  UT left, right, mid;
529  long double p;
530 
531  /* estimate initial upper and lower bound */
532 
533  /* doesn't matter what value right is as long as it is positive, but
534  it affects performance of the solver */
535  right = 229;
536  p = __kmp_pow<UT>(x, right);
537  if (p > target) {
538  do {
539  p *= p;
540  right <<= 1;
541  } while (p > target && right < (1 << 27));
542  /* lower bound is previous (failed) estimate of upper bound */
543  left = right >> 1;
544  } else {
545  left = 0;
546  }
547 
548  /* bisection root-finding method */
549  while (left + 1 < right) {
550  mid = (left + right) / 2;
551  if (__kmp_pow<UT>(x, mid) > target) {
552  left = mid;
553  } else {
554  right = mid;
555  }
556  } // while
557  cross = right;
558  }
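// In other words, cross is the smallest chunk index i for which
// __kmp_pow<UT>(x, i) <= target; since x < 1 the function is monotonically
// decreasing, so bisection applies, and the assertion below checks exactly
// this property.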
559  /* assert sanity of computed crossover point */
560  KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
561  __kmp_pow<UT>(x, cross) <= target);
562 
563  /* save the crossover point in thread private dispatch structure */
564  pr->u.p.parm2 = cross;
565 
566 // C75803
567 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
568 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
569 #else
570 #define GUIDED_ANALYTICAL_WORKAROUND (x)
571 #endif
572  /* dynamic-style scheduling offset */
573  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
574  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
575  cross * chunk;
576 #if KMP_USE_X87CONTROL
577  // restore FPCW
578  _control87(oldFpcw, _MCW_PC);
579 #endif
580  } // if
581  } else {
582  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
583  "kmp_sch_static_greedy\n",
584  gtid));
585  schedule = kmp_sch_static_greedy;
586  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
587  pr->u.p.parm1 = tc;
588  } // if
589  } // case
590  break;
591  case kmp_sch_static_greedy:
592  KD_TRACE(
593  100,
594  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
595  gtid));
596  pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
597  break;
598  case kmp_sch_static_chunked:
599  case kmp_sch_dynamic_chunked:
600  if (pr->u.p.parm1 <= 0) {
601  pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
602  }
603  KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
604  "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
605  gtid));
606  break;
607  case kmp_sch_trapezoidal: {
608  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
609 
610  T parm1, parm2, parm3, parm4;
611  KD_TRACE(100,
612  ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
613  gtid));
614 
615  parm1 = chunk;
616 
617  /* F : size of the first cycle */
618  parm2 = (tc / (2 * nproc));
619 
620  if (parm2 < 1) {
621  parm2 = 1;
622  }
623 
624  /* L : size of the last cycle. Make sure the last cycle is not larger
625  than the first cycle. */
626  if (parm1 < 1) {
627  parm1 = 1;
628  } else if (parm1 > parm2) {
629  parm1 = parm2;
630  }
631 
632  /* N : number of cycles */
633  parm3 = (parm2 + parm1);
634  parm3 = (2 * tc + parm3 - 1) / parm3;
635 
636  if (parm3 < 2) {
637  parm3 = 2;
638  }
639 
640  /* sigma : decreasing incr of the trapezoid */
641  parm4 = (parm3 - 1);
642  parm4 = (parm2 - parm1) / parm4;
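// Illustrative example: tc=1000, nproc=4, chunk=1 gives a first-cycle size
// parm2=125, a minimum cycle size parm1=1, parm3=16 cycles and a per-cycle
// decrement parm4=8 (integer arithmetic throughout), so the dispatched
// chunk sizes decrease from 125 by 8 per cycle.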
643 
644  // pointless check, because parm4 >= 0 always
645  // if ( parm4 < 0 ) {
646  // parm4 = 0;
647  //}
648 
649  pr->u.p.parm1 = parm1;
650  pr->u.p.parm2 = parm2;
651  pr->u.p.parm3 = parm3;
652  pr->u.p.parm4 = parm4;
653  } // case
654  break;
655 
656  default: {
657  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
658  KMP_HNT(GetNewerLibrary), // Hint
659  __kmp_msg_null // Variadic argument list terminator
660  );
661  } break;
662  } // switch
663  pr->schedule = schedule;
664 }
665 
666 #if KMP_USE_HIER_SCHED
667 template <typename T>
668 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
669  typename traits_t<T>::signed_t st);
670 template <>
671 inline void
672 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
673  kmp_int32 ub, kmp_int32 st) {
674  __kmp_dispatch_init_hierarchy<kmp_int32>(
675  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
676  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
677 }
678 template <>
679 inline void
680 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
681  kmp_uint32 ub, kmp_int32 st) {
682  __kmp_dispatch_init_hierarchy<kmp_uint32>(
683  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
684  __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
685 }
686 template <>
687 inline void
688 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
689  kmp_int64 ub, kmp_int64 st) {
690  __kmp_dispatch_init_hierarchy<kmp_int64>(
691  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
692  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
693 }
694 template <>
695 inline void
696 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
697  kmp_uint64 ub, kmp_int64 st) {
698  __kmp_dispatch_init_hierarchy<kmp_uint64>(
699  loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
700  __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
701 }
702 
703 // free all the hierarchy scheduling memory associated with the team
704 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
705  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
706  for (int i = 0; i < num_disp_buff; ++i) {
707  // type does not matter here so use kmp_int32
708  auto sh =
709  reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
710  &team->t.t_disp_buffer[i]);
711  if (sh->hier) {
712  sh->hier->deallocate();
713  __kmp_free(sh->hier);
714  }
715  }
716 }
717 #endif
718 
719 // UT - unsigned flavor of T, ST - signed flavor of T,
720 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
721 template <typename T>
722 static void
723 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
724  T ub, typename traits_t<T>::signed_t st,
725  typename traits_t<T>::signed_t chunk, int push_ws) {
726  typedef typename traits_t<T>::unsigned_t UT;
727 
728  int active;
729  kmp_info_t *th;
730  kmp_team_t *team;
731  kmp_uint32 my_buffer_index;
732  dispatch_private_info_template<T> *pr;
733  dispatch_shared_info_template<T> volatile *sh;
734 
735  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
736  sizeof(dispatch_private_info));
737  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
738  sizeof(dispatch_shared_info));
739 
740  if (!TCR_4(__kmp_init_parallel))
741  __kmp_parallel_initialize();
742 
743 #if OMP_50_ENABLED
744  __kmp_resume_if_soft_paused();
745 #endif
746 
747 #if INCLUDE_SSC_MARKS
748  SSC_MARK_DISPATCH_INIT();
749 #endif
750 #ifdef KMP_DEBUG
751  typedef typename traits_t<T>::signed_t ST;
752  {
753  char *buff;
754  // create format specifiers before the debug output
755  buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
756  "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
757  traits_t<ST>::spec, traits_t<T>::spec,
758  traits_t<T>::spec, traits_t<ST>::spec);
759  KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
760  __kmp_str_free(&buff);
761  }
762 #endif
763  /* setup data */
764  th = __kmp_threads[gtid];
765  team = th->th.th_team;
766  active = !team->t.t_serialized;
767  th->th.th_ident = loc;
768 
769  // Any half-decent optimizer will remove this test when the blocks are empty
770  // since the macros expand to nothing
771  // when statistics are disabled.
772  if (schedule == __kmp_static) {
773  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
774  } else {
775  KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
776  }
777 
778 #if KMP_USE_HIER_SCHED
779  // Initialize the scheduling hierarchy if requested via the OMP_SCHEDULE
780  // environment variable. Hierarchical scheduling does not work with ordered,
781  // so if ordered is detected, revert to standard threaded scheduling.
782  bool ordered;
783  enum sched_type my_sched = schedule;
784  my_buffer_index = th->th.th_dispatch->th_disp_index;
785  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
786  &th->th.th_dispatch
787  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
788  my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
789  if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
790  my_sched =
791  (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
792  ordered = (kmp_ord_lower & my_sched);
793  if (pr->flags.use_hier) {
794  if (ordered) {
795  KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
796  "Disabling hierarchical scheduling.\n",
797  gtid));
798  pr->flags.use_hier = FALSE;
799  }
800  }
801  if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
802  // Don't use hierarchical for ordered parallel loops and don't
803  // use the runtime hierarchy if one was specified in the program
804  if (!ordered && !pr->flags.use_hier)
805  __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
806  }
807 #endif // KMP_USE_HIER_SCHED
808 
809 #if USE_ITT_BUILD
810  kmp_uint64 cur_chunk = chunk;
811  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
812  __kmp_forkjoin_frames_mode == 3 &&
813  KMP_MASTER_GTID(gtid) &&
814 #if OMP_40_ENABLED
815  th->th.th_teams_microtask == NULL &&
816 #endif
817  team->t.t_active_level == 1;
818 #endif
819  if (!active) {
820  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
821  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
822  } else {
823  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
824  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
825 
826  my_buffer_index = th->th.th_dispatch->th_disp_index++;
827 
828  /* What happens when number of threads changes, need to resize buffer? */
829  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
830  &th->th.th_dispatch
831  ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
832  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
833  &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
834  KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
835  my_buffer_index));
836  }
837 
838  __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
839 #if USE_ITT_BUILD
840  &cur_chunk,
841 #endif
842  chunk, (T)th->th.th_team_nproc,
843  (T)th->th.th_info.ds.ds_tid);
844  if (active) {
845  if (pr->flags.ordered == 0) {
846  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
847  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
848  } else {
849  th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
850  th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
851  }
852  }
853 
854  if (active) {
855  /* Wait until the shared buffer's index reaches my_buffer_index, i.e. until
856  this dispatch buffer is free for this thread to use */
857 
858  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
859  "sh->buffer_index:%d\n",
860  gtid, my_buffer_index, sh->buffer_index));
861  __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
862  __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
863  // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and
864  // my_buffer_index are *always* 32-bit integers.
865  KMP_MB(); /* is this necessary? */
866  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
867  "sh->buffer_index:%d\n",
868  gtid, my_buffer_index, sh->buffer_index));
869 
870  th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
871  th->th.th_dispatch->th_dispatch_sh_current =
872  CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
873 #if USE_ITT_BUILD
874  if (pr->flags.ordered) {
875  __kmp_itt_ordered_init(gtid);
876  }
877  // Report loop metadata
878  if (itt_need_metadata_reporting) {
879  // Only report metadata by master of active team at level 1
880  kmp_uint64 schedtype = 0;
881  switch (schedule) {
882  case kmp_sch_static_chunked:
883  case kmp_sch_static_balanced: // Chunk is calculated in the switch above
884  break;
885  case kmp_sch_static_greedy:
886  cur_chunk = pr->u.p.parm1;
887  break;
888  case kmp_sch_dynamic_chunked:
889  schedtype = 1;
890  break;
891  case kmp_sch_guided_iterative_chunked:
892  case kmp_sch_guided_analytical_chunked:
893 #if OMP_45_ENABLED
894  case kmp_sch_guided_simd:
895 #endif
896  schedtype = 2;
897  break;
898  default:
899  // Should we put this case under "static"?
900  // case kmp_sch_static_steal:
901  schedtype = 3;
902  break;
903  }
904  __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
905  }
906 #if KMP_USE_HIER_SCHED
907  if (pr->flags.use_hier) {
908  pr->u.p.count = 0;
909  pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
910  }
911 #endif // KMP_USE_HIER_SCHED
912 #endif /* USE_ITT_BUILD */
913  }
914 
915 #ifdef KMP_DEBUG
916  {
917  char *buff;
918  // create format specifiers before the debug output
919  buff = __kmp_str_format(
920  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
921  "lb:%%%s ub:%%%s"
922  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
923  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
924  traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
925  traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
926  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
927  traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
928  KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
929  pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
930  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
931  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
932  __kmp_str_free(&buff);
933  }
934 #endif
935 #if (KMP_STATIC_STEAL_ENABLED)
936 // It cannot be guaranteed that after execution of a loop with some other
937 // schedule kind all the parm3 variables will contain the same value. Even if
938 // all parm3 values were the same, a bad case could still occur, e.g. toggling
939 // between 0 and 1 rather than incrementing over the program lifetime. So a
940 // dedicated variable is required; 'static_steal_counter' is used.
941  if (schedule == kmp_sch_static_steal) {
942  // Other threads will inspect this variable when searching for a victim.
943  // This is a flag showing that other threads may steal from this thread
944  // since then.
945  volatile T *p = &pr->u.p.static_steal_counter;
946  *p = *p + 1;
947  }
948 #endif // ( KMP_STATIC_STEAL_ENABLED )
949 
950 #if OMPT_SUPPORT && OMPT_OPTIONAL
951  if (ompt_enabled.ompt_callback_work) {
952  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
953  ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
954  ompt_callbacks.ompt_callback(ompt_callback_work)(
955  ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
956  &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
957  }
958 #endif
959  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
960 }
961 
962 /* For ordered loops, either __kmp_dispatch_finish() should be called after
963  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
964  * every chunk of iterations. If the ordered section(s) were not executed
965  * for this iteration (or every iteration in this chunk), we need to set the
966  * ordered iteration counters so that the next thread can proceed. */
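/* Sketch of the protocol implemented below: the thread first waits until the
 * shared sh->u.s.ordered_iteration counter reaches its own ordered_lower
 * value, then atomically bumps the counter so the next thread in iteration
 * order may enter its ordered region. */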
967 template <typename UT>
968 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
969  typedef typename traits_t<UT>::signed_t ST;
970  kmp_info_t *th = __kmp_threads[gtid];
971 
972  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
973  if (!th->th.th_team->t.t_serialized) {
974 
975  dispatch_private_info_template<UT> *pr =
976  reinterpret_cast<dispatch_private_info_template<UT> *>(
977  th->th.th_dispatch->th_dispatch_pr_current);
978  dispatch_shared_info_template<UT> volatile *sh =
979  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
980  th->th.th_dispatch->th_dispatch_sh_current);
981  KMP_DEBUG_ASSERT(pr);
982  KMP_DEBUG_ASSERT(sh);
983  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
984  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
985 
986  if (pr->ordered_bumped) {
987  KD_TRACE(
988  1000,
989  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
990  gtid));
991  pr->ordered_bumped = 0;
992  } else {
993  UT lower = pr->u.p.ordered_lower;
994 
995 #ifdef KMP_DEBUG
996  {
997  char *buff;
998  // create format specifiers before the debug output
999  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1000  "ordered_iteration:%%%s lower:%%%s\n",
1001  traits_t<UT>::spec, traits_t<UT>::spec);
1002  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1003  __kmp_str_free(&buff);
1004  }
1005 #endif
1006 
1007  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1008  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1009  KMP_MB(); /* is this necessary? */
1010 #ifdef KMP_DEBUG
1011  {
1012  char *buff;
1013  // create format specifiers before the debug output
1014  buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1015  "ordered_iteration:%%%s lower:%%%s\n",
1016  traits_t<UT>::spec, traits_t<UT>::spec);
1017  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1018  __kmp_str_free(&buff);
1019  }
1020 #endif
1021 
1022  test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1023  } // if
1024  } // if
1025  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1026 }
1027 
1028 #ifdef KMP_GOMP_COMPAT
1029 
1030 template <typename UT>
1031 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1032  typedef typename traits_t<UT>::signed_t ST;
1033  kmp_info_t *th = __kmp_threads[gtid];
1034 
1035  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1036  if (!th->th.th_team->t.t_serialized) {
1037  // int cid;
1038  dispatch_private_info_template<UT> *pr =
1039  reinterpret_cast<dispatch_private_info_template<UT> *>(
1040  th->th.th_dispatch->th_dispatch_pr_current);
1041  dispatch_shared_info_template<UT> volatile *sh =
1042  reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1043  th->th.th_dispatch->th_dispatch_sh_current);
1044  KMP_DEBUG_ASSERT(pr);
1045  KMP_DEBUG_ASSERT(sh);
1046  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1047  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1048 
1049  // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1050  UT lower = pr->u.p.ordered_lower;
1051  UT upper = pr->u.p.ordered_upper;
1052  UT inc = upper - lower + 1;
1053 
1054  if (pr->ordered_bumped == inc) {
1055  KD_TRACE(
1056  1000,
1057  ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1058  gtid));
1059  pr->ordered_bumped = 0;
1060  } else {
1061  inc -= pr->ordered_bumped;
1062 
1063 #ifdef KMP_DEBUG
1064  {
1065  char *buff;
1066  // create format specifiers before the debug output
1067  buff = __kmp_str_format(
1068  "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1069  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1070  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1071  KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1072  __kmp_str_free(&buff);
1073  }
1074 #endif
1075 
1076  __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
1077  __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1078 
1079  KMP_MB(); /* is this necessary? */
1080  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1081  "ordered_bumped to zero\n",
1082  gtid));
1083  pr->ordered_bumped = 0;
1085 #ifdef KMP_DEBUG
1086  {
1087  char *buff;
1088  // create format specifiers before the debug output
1089  buff = __kmp_str_format(
1090  "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1091  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1092  traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1093  traits_t<UT>::spec);
1094  KD_TRACE(1000,
1095  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1096  __kmp_str_free(&buff);
1097  }
1098 #endif
1099 
1100  test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1101  }
1102  // }
1103  }
1104  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1105 }
1106 
1107 #endif /* KMP_GOMP_COMPAT */
1108 
1109 template <typename T>
1110 int __kmp_dispatch_next_algorithm(int gtid,
1111  dispatch_private_info_template<T> *pr,
1112  dispatch_shared_info_template<T> volatile *sh,
1113  kmp_int32 *p_last, T *p_lb, T *p_ub,
1114  typename traits_t<T>::signed_t *p_st, T nproc,
1115  T tid) {
1116  typedef typename traits_t<T>::unsigned_t UT;
1117  typedef typename traits_t<T>::signed_t ST;
1118  typedef typename traits_t<T>::floating_t DBL;
1119  int status = 0;
1120  kmp_int32 last = 0;
1121  T start;
1122  ST incr;
1123  UT limit, trip, init;
1124  kmp_info_t *th = __kmp_threads[gtid];
1125  kmp_team_t *team = th->th.th_team;
1126 
1127  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1128  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1129  KMP_DEBUG_ASSERT(pr);
1130  KMP_DEBUG_ASSERT(sh);
1131  KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1132 #ifdef KMP_DEBUG
1133  {
1134  char *buff;
1135  // create format specifiers before the debug output
1136  buff =
1137  __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1138  "sh:%%p nproc:%%%s tid:%%%s\n",
1139  traits_t<T>::spec, traits_t<T>::spec);
1140  KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1141  __kmp_str_free(&buff);
1142  }
1143 #endif
1144 
1145  // zero trip count
1146  if (pr->u.p.tc == 0) {
1147  KD_TRACE(10,
1148  ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1149  "zero status:%d\n",
1150  gtid, status));
1151  return 0;
1152  }
1153 
1154  switch (pr->schedule) {
1155 #if (KMP_STATIC_STEAL_ENABLED)
1156  case kmp_sch_static_steal: {
1157  T chunk = pr->u.p.parm1;
1158 
1159  KD_TRACE(100,
1160  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1161  gtid));
1162 
1163  trip = pr->u.p.tc - 1;
1164 
1165  if (traits_t<T>::type_size > 4) {
1166  // use lock for 8-byte and CAS for 4-byte induction
1167  // variable. TODO (optional): check and use 16-byte CAS
1168  kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1169  KMP_DEBUG_ASSERT(lck != NULL);
1170  if (pr->u.p.count < (UT)pr->u.p.ub) {
1171  __kmp_acquire_lock(lck, gtid);
1172  // try to get own chunk of iterations
1173  init = (pr->u.p.count)++;
1174  status = (init < (UT)pr->u.p.ub);
1175  __kmp_release_lock(lck, gtid);
1176  } else {
1177  status = 0; // no own chunks
1178  }
1179  if (!status) { // try to steal
1180  kmp_info_t **other_threads = team->t.t_threads;
1181  int while_limit = nproc; // nproc attempts to find a victim
1182  int while_index = 0;
1183  // TODO: the victim-search algorithm should be
1184  // cleaned up and measured
1185  while ((!status) && (while_limit != ++while_index)) {
1186  T remaining;
1187  T victimIdx = pr->u.p.parm4;
1188  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1189  dispatch_private_info_template<T> *victim =
1190  reinterpret_cast<dispatch_private_info_template<T> *>(
1191  other_threads[victimIdx]
1192  ->th.th_dispatch->th_dispatch_pr_current);
1193  while ((victim == NULL || victim == pr ||
1194  (*(volatile T *)&victim->u.p.static_steal_counter !=
1195  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1196  oldVictimIdx != victimIdx) {
1197  victimIdx = (victimIdx + 1) % nproc;
1198  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1199  other_threads[victimIdx]
1200  ->th.th_dispatch->th_dispatch_pr_current);
1201  }
1202  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1203  *(volatile T *)&pr->u.p.static_steal_counter)) {
1204  continue; // try once more (nproc attempts in total)
1205  // no victim is ready yet to participate in stealing
1206  // because all victims are still in kmp_init_dispatch
1207  }
1208  if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1209  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1210  continue; // not enough chunks to steal, goto next victim
1211  }
1212 
1213  lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1214  KMP_ASSERT(lck != NULL);
1215  __kmp_acquire_lock(lck, gtid);
1216  limit = victim->u.p.ub; // keep initial ub
1217  if (victim->u.p.count >= limit ||
1218  (remaining = limit - victim->u.p.count) < 2) {
1219  __kmp_release_lock(lck, gtid);
1220  pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1221  continue; // not enough chunks to steal
1222  }
1223  // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1224  // by 1
1225  if (remaining > 3) {
1226  // steal 1/4 of remaining
1227  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1228  init = (victim->u.p.ub -= (remaining >> 2));
1229  } else {
1230  // steal 1 chunk of 2 or 3 remaining
1231  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1232  init = (victim->u.p.ub -= 1);
1233  }
1234  __kmp_release_lock(lck, gtid);
1235 
1236  KMP_DEBUG_ASSERT(init + 1 <= limit);
1237  pr->u.p.parm4 = victimIdx; // remember victim to steal from
1238  status = 1;
1239  while_index = 0;
1240  // now update own count and ub with the stolen range, excluding the init chunk (returned now)
1241  __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1242  pr->u.p.count = init + 1;
1243  pr->u.p.ub = limit;
1244  __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1245  } // while (search for victim)
1246  } // if (try to find victim and steal)
1247  } else {
1248  // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1249  typedef union {
1250  struct {
1251  UT count;
1252  T ub;
1253  } p;
1254  kmp_int64 b;
1255  } union_i4;
1256  // All operations on 'count' or 'ub' must be combined atomically
1257  // together.
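// Note: because count and ub are both 32-bit here, the pair fits into a
// single kmp_int64, so one 64-bit CAS can claim a chunk (count++) or shrink
// the range (ub reduced by a thief) without a lock, and a torn (count, ub)
// pair can never be observed.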
1258  {
1259  union_i4 vold, vnew;
1260  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1261  vnew = vold;
1262  vnew.p.count++;
1263  while (!KMP_COMPARE_AND_STORE_ACQ64(
1264  (volatile kmp_int64 *)&pr->u.p.count,
1265  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1266  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1267  KMP_CPU_PAUSE();
1268  vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1269  vnew = vold;
1270  vnew.p.count++;
1271  }
1272  vnew = vold;
1273  init = vnew.p.count;
1274  status = (init < (UT)vnew.p.ub);
1275  }
1276 
1277  if (!status) {
1278  kmp_info_t **other_threads = team->t.t_threads;
1279  int while_limit = nproc; // nproc attempts to find a victim
1280  int while_index = 0;
1281 
1282  // TODO: the victim-search algorithm should be
1283  // cleaned up and measured
1284  while ((!status) && (while_limit != ++while_index)) {
1285  union_i4 vold, vnew;
1286  kmp_int32 remaining;
1287  T victimIdx = pr->u.p.parm4;
1288  T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1289  dispatch_private_info_template<T> *victim =
1290  reinterpret_cast<dispatch_private_info_template<T> *>(
1291  other_threads[victimIdx]
1292  ->th.th_dispatch->th_dispatch_pr_current);
1293  while ((victim == NULL || victim == pr ||
1294  (*(volatile T *)&victim->u.p.static_steal_counter !=
1295  *(volatile T *)&pr->u.p.static_steal_counter)) &&
1296  oldVictimIdx != victimIdx) {
1297  victimIdx = (victimIdx + 1) % nproc;
1298  victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1299  other_threads[victimIdx]
1300  ->th.th_dispatch->th_dispatch_pr_current);
1301  }
1302  if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
1303  *(volatile T *)&pr->u.p.static_steal_counter)) {
1304  continue; // try once more (nproc attempts in total)
1305  // no victim is ready yet to participate in stealing
1306  // because all victims are still in kmp_init_dispatch
1307  }
1308  pr->u.p.parm4 = victimIdx; // new victim found
1309  while (1) { // CAS loop if victim has enough chunks to steal
1310  vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1311  vnew = vold;
1312 
1313  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1314  if (vnew.p.count >= (UT)vnew.p.ub ||
1315  (remaining = vnew.p.ub - vnew.p.count) < 2) {
1316  pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1317  break; // not enough chunks to steal, goto next victim
1318  }
1319  if (remaining > 3) {
1320  vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1321  } else {
1322  vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1323  }
1324  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1325  // TODO: Should this be acquire or release?
1326  if (KMP_COMPARE_AND_STORE_ACQ64(
1327  (volatile kmp_int64 *)&victim->u.p.count,
1328  *VOLATILE_CAST(kmp_int64 *) & vold.b,
1329  *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1330  // stealing succeeded
1331  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1332  vold.p.ub - vnew.p.ub);
1333  status = 1;
1334  while_index = 0;
1335  // now update own count and ub
1336  init = vnew.p.ub;
1337  vold.p.count = init + 1;
1338 #if KMP_ARCH_X86
1339  KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1340 #else
1341  *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1342 #endif
1343  break;
1344  } // if (check CAS result)
1345  KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1346  } // while (try to steal from particular victim)
1347  } // while (search for victim)
1348  } // if (try to find victim and steal)
1349  } // if (4-byte induction variable)
1350  if (!status) {
1351  *p_lb = 0;
1352  *p_ub = 0;
1353  if (p_st != NULL)
1354  *p_st = 0;
1355  } else {
1356  start = pr->u.p.parm2;
1357  init *= chunk;
1358  limit = chunk + init - 1;
1359  incr = pr->u.p.st;
1360  KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1361 
1362  KMP_DEBUG_ASSERT(init <= trip);
1363  if ((last = (limit >= trip)) != 0)
1364  limit = trip;
1365  if (p_st != NULL)
1366  *p_st = incr;
1367 
1368  if (incr == 1) {
1369  *p_lb = start + init;
1370  *p_ub = start + limit;
1371  } else {
1372  *p_lb = start + init * incr;
1373  *p_ub = start + limit * incr;
1374  }
1375 
1376  if (pr->flags.ordered) {
1377  pr->u.p.ordered_lower = init;
1378  pr->u.p.ordered_upper = limit;
1379  } // if
1380  } // if
1381  break;
1382  } // case
1383 #endif // ( KMP_STATIC_STEAL_ENABLED )
1384  case kmp_sch_static_balanced: {
1385  KD_TRACE(
1386  10,
1387  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1388  gtid));
1389  /* check if thread has any iteration to do */
1390  if ((status = !pr->u.p.count) != 0) {
1391  pr->u.p.count = 1;
1392  *p_lb = pr->u.p.lb;
1393  *p_ub = pr->u.p.ub;
1394  last = pr->u.p.parm1;
1395  if (p_st != NULL)
1396  *p_st = pr->u.p.st;
1397  } else { /* no iterations to do */
1398  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1399  }
1400  } // case
1401  break;
1402  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1403  merged here */
1404  case kmp_sch_static_chunked: {
1405  T parm1;
1406 
1407  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1408  "kmp_sch_static_[affinity|chunked] case\n",
1409  gtid));
1410  parm1 = pr->u.p.parm1;
1411 
1412  trip = pr->u.p.tc - 1;
1413  init = parm1 * (pr->u.p.count + tid);
1414 
1415  if ((status = (init <= trip)) != 0) {
1416  start = pr->u.p.lb;
1417  incr = pr->u.p.st;
1418  limit = parm1 + init - 1;
1419 
1420  if ((last = (limit >= trip)) != 0)
1421  limit = trip;
1422 
1423  if (p_st != NULL)
1424  *p_st = incr;
1425 
1426  pr->u.p.count += nproc;
1427 
1428  if (incr == 1) {
1429  *p_lb = start + init;
1430  *p_ub = start + limit;
1431  } else {
1432  *p_lb = start + init * incr;
1433  *p_ub = start + limit * incr;
1434  }
1435 
1436  if (pr->flags.ordered) {
1437  pr->u.p.ordered_lower = init;
1438  pr->u.p.ordered_upper = limit;
1439  } // if
1440  } // if
1441  } // case
1442  break;
1443 
1444  case kmp_sch_dynamic_chunked: {
1445  T chunk = pr->u.p.parm1;
1446 
1447  KD_TRACE(
1448  100,
1449  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1450  gtid));
1451 
1452  init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
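// Illustrative example: with chunk=4 the shared counter sh->u.s.iteration
// hands out chunk indices 0,1,2,... in arrival order, so successive callers
// receive init = 0, 4, 8, ... and each executes at most chunk iterations
// starting at that offset.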
1453  trip = pr->u.p.tc - 1;
1454 
1455  if ((status = (init <= trip)) == 0) {
1456  *p_lb = 0;
1457  *p_ub = 0;
1458  if (p_st != NULL)
1459  *p_st = 0;
1460  } else {
1461  start = pr->u.p.lb;
1462  limit = chunk + init - 1;
1463  incr = pr->u.p.st;
1464 
1465  if ((last = (limit >= trip)) != 0)
1466  limit = trip;
1467 
1468  if (p_st != NULL)
1469  *p_st = incr;
1470 
1471  if (incr == 1) {
1472  *p_lb = start + init;
1473  *p_ub = start + limit;
1474  } else {
1475  *p_lb = start + init * incr;
1476  *p_ub = start + limit * incr;
1477  }
1478 
1479  if (pr->flags.ordered) {
1480  pr->u.p.ordered_lower = init;
1481  pr->u.p.ordered_upper = limit;
1482  } // if
1483  } // if
1484  } // case
1485  break;
1486 
1487  case kmp_sch_guided_iterative_chunked: {
1488  T chunkspec = pr->u.p.parm1;
1489  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1490  "iterative case\n",
1491  gtid));
1492  trip = pr->u.p.tc;
1493  // Start atomic part of calculations
1494  while (1) {
1495  ST remaining; // signed, because can be < 0
1496  init = sh->u.s.iteration; // shared value
1497  remaining = trip - init;
1498  if (remaining <= 0) { // AC: need to compare with 0 first
1499  // nothing to do, don't try atomic op
1500  status = 0;
1501  break;
1502  }
1503  if ((T)remaining <
1504  pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1505  // use dynamic-style schedule
1506  // atomically increment iterations, get old value
1507  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1508  (ST)chunkspec);
1509  remaining = trip - init;
1510  if (remaining <= 0) {
1511  status = 0; // all iterations got by other threads
1512  } else {
1513  // got some iterations to work on
1514  status = 1;
1515  if ((T)remaining > chunkspec) {
1516  limit = init + chunkspec - 1;
1517  } else {
1518  last = 1; // the last chunk
1519  limit = init + remaining - 1;
1520  } // if
1521  } // if
1522  break;
1523  } // if
1524  limit = init +
1525  (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1526  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1527  (ST)init, (ST)limit)) {
1528  // CAS was successful, chunk obtained
1529  status = 1;
1530  --limit;
1531  break;
1532  } // if
1533  } // while
1534  if (status != 0) {
1535  start = pr->u.p.lb;
1536  incr = pr->u.p.st;
1537  if (p_st != NULL)
1538  *p_st = incr;
1539  *p_lb = start + init * incr;
1540  *p_ub = start + limit * incr;
1541  if (pr->flags.ordered) {
1542  pr->u.p.ordered_lower = init;
1543  pr->u.p.ordered_upper = limit;
1544  } // if
1545  } else {
1546  *p_lb = 0;
1547  *p_ub = 0;
1548  if (p_st != NULL)
1549  *p_st = 0;
1550  } // if
1551  } // case
1552  break;
1553 
1554 #if OMP_45_ENABLED
1555  case kmp_sch_guided_simd: {
1556  // same as iterative but curr-chunk adjusted to be multiple of given
1557  // chunk
1558  T chunk = pr->u.p.parm1;
1559  KD_TRACE(100,
1560  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1561  gtid));
1562  trip = pr->u.p.tc;
1563  // Start atomic part of calculations
1564  while (1) {
1565  ST remaining; // signed, because can be < 0
1566  init = sh->u.s.iteration; // shared value
1567  remaining = trip - init;
1568  if (remaining <= 0) { // AC: need to compare with 0 first
1569  status = 0; // nothing to do, don't try atomic op
1570  break;
1571  }
1572  KMP_DEBUG_ASSERT(init % chunk == 0);
1573  // compare with K*nproc*(chunk+1), K=2 by default
1574  if ((T)remaining < pr->u.p.parm2) {
1575  // use dynamic-style schedule
1576  // atomically increment iterations, get old value
1577  init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1578  (ST)chunk);
1579  remaining = trip - init;
1580  if (remaining <= 0) {
1581  status = 0; // all iterations got by other threads
1582  } else {
1583  // got some iterations to work on
1584  status = 1;
1585  if ((T)remaining > chunk) {
1586  limit = init + chunk - 1;
1587  } else {
1588  last = 1; // the last chunk
1589  limit = init + remaining - 1;
1590  } // if
1591  } // if
1592  break;
1593  } // if
1594  // divide by K*nproc
1595  UT span = remaining * (*(double *)&pr->u.p.parm3);
1596  UT rem = span % chunk;
1597  if (rem) // adjust so that span%chunk == 0
1598  span += chunk - rem;
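// e.g. remaining=1000, parm3=0.125, chunk=8: span=125 is rounded up to 128
// so the grabbed range stays a multiple of the simd width.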
1599  limit = init + span;
1600  if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1601  (ST)init, (ST)limit)) {
1602  // CAS was successful, chunk obtained
1603  status = 1;
1604  --limit;
1605  break;
1606  } // if
1607  } // while
1608  if (status != 0) {
1609  start = pr->u.p.lb;
1610  incr = pr->u.p.st;
1611  if (p_st != NULL)
1612  *p_st = incr;
1613  *p_lb = start + init * incr;
1614  *p_ub = start + limit * incr;
1615  if (pr->flags.ordered) {
1616  pr->u.p.ordered_lower = init;
1617  pr->u.p.ordered_upper = limit;
1618  } // if
1619  } else {
1620  *p_lb = 0;
1621  *p_ub = 0;
1622  if (p_st != NULL)
1623  *p_st = 0;
1624  } // if
1625  } // case
1626  break;
1627 #endif // OMP_45_ENABLED
1628 
1629  case kmp_sch_guided_analytical_chunked: {
1630  T chunkspec = pr->u.p.parm1;
1631  UT chunkIdx;
1632 #if KMP_USE_X87CONTROL
1633  /* for storing original FPCW value for Windows* OS on
1634  IA-32 architecture 8-byte version */
1635  unsigned int oldFpcw;
1636  unsigned int fpcwSet = 0;
1637 #endif
1638  KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1639  "kmp_sch_guided_analytical_chunked case\n",
1640  gtid));
1641 
1642  trip = pr->u.p.tc;
1643 
1644  KMP_DEBUG_ASSERT(nproc > 1);
1645  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1646 
1647  while (1) { /* this while loop is a safeguard against unexpected zero
1648  chunk sizes */
1649  chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1650  if (chunkIdx >= (UT)pr->u.p.parm2) {
1651  --trip;
1652  /* use dynamic-style scheduling */
1653  init = chunkIdx * chunkspec + pr->u.p.count;
1654  /* need to verify init > 0 in case of overflow in the above
1655  * calculation */
1656  if ((status = (init > 0 && init <= trip)) != 0) {
1657  limit = init + chunkspec - 1;
1658 
1659  if ((last = (limit >= trip)) != 0)
1660  limit = trip;
1661  }
1662  break;
1663  } else {
1664 /* use exponential-style scheduling */
1665 /* The following check is to workaround the lack of long double precision on
1666  Windows* OS.
1667  This check works around the possible effect that init != 0 for chunkIdx == 0.
1668  */
1669 #if KMP_USE_X87CONTROL
1670  /* If we haven't already done so, save original
1671  FPCW and set precision to 64-bit, as Windows* OS
1672  on IA-32 architecture defaults to 53-bit */
1673  if (!fpcwSet) {
1674  oldFpcw = _control87(0, 0);
1675  _control87(_PC_64, _MCW_PC);
1676  fpcwSet = 0x30000;
1677  }
1678 #endif
1679  if (chunkIdx) {
1680  init = __kmp_dispatch_guided_remaining<T>(
1681  trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1682  KMP_DEBUG_ASSERT(init);
1683  init = trip - init;
1684  } else
1685  init = 0;
1686  limit = trip - __kmp_dispatch_guided_remaining<T>(
1687  trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1688  KMP_ASSERT(init <= limit);
1689  if (init < limit) {
1690  KMP_DEBUG_ASSERT(limit <= trip);
1691  --limit;
1692  status = 1;
1693  break;
1694  } // if
1695  } // if
1696  } // while (1)
1697 #if KMP_USE_X87CONTROL
1698  /* restore FPCW if necessary
1699  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1700  */
1701  if (fpcwSet && (oldFpcw & fpcwSet))
1702  _control87(oldFpcw, _MCW_PC);
1703 #endif
1704  if (status != 0) {
1705  start = pr->u.p.lb;
1706  incr = pr->u.p.st;
1707  if (p_st != NULL)
1708  *p_st = incr;
1709  *p_lb = start + init * incr;
1710  *p_ub = start + limit * incr;
1711  if (pr->flags.ordered) {
1712  pr->u.p.ordered_lower = init;
1713  pr->u.p.ordered_upper = limit;
1714  }
1715  } else {
1716  *p_lb = 0;
1717  *p_ub = 0;
1718  if (p_st != NULL)
1719  *p_st = 0;
1720  }
1721  } // case
1722  break;
1723 
1724  case kmp_sch_trapezoidal: {
1725  UT index;
1726  T parm2 = pr->u.p.parm2;
1727  T parm3 = pr->u.p.parm3;
1728  T parm4 = pr->u.p.parm4;
1729  KD_TRACE(100,
1730  ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1731  gtid));
1732 
1733  index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1734 
1735  init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
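  /* Derivation of the expression above: the trapezoidal schedule hands out
     chunks whose sizes shrink linearly, chunk k having size parm2 - k*parm4.
     The first iteration of chunk `index` is therefore the partial sum
       sum_{k=0}^{index-1} (parm2 - k*parm4)
         = index*parm2 - parm4*index*(index-1)/2
         = (index * (2*parm2 - (index - 1)*parm4)) / 2,
     and `limit` below is the same sum taken over index+1 chunks, minus one. */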
1736  trip = pr->u.p.tc - 1;
1737 
1738  if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1739  *p_lb = 0;
1740  *p_ub = 0;
1741  if (p_st != NULL)
1742  *p_st = 0;
1743  } else {
1744  start = pr->u.p.lb;
1745  limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1746  incr = pr->u.p.st;
1747 
1748  if ((last = (limit >= trip)) != 0)
1749  limit = trip;
1750 
1751  if (p_st != NULL)
1752  *p_st = incr;
1753 
1754  if (incr == 1) {
1755  *p_lb = start + init;
1756  *p_ub = start + limit;
1757  } else {
1758  *p_lb = start + init * incr;
1759  *p_ub = start + limit * incr;
1760  }
1761 
1762  if (pr->flags.ordered) {
1763  pr->u.p.ordered_lower = init;
1764  pr->u.p.ordered_upper = limit;
1765  } // if
1766  } // if
1767  } // case
1768  break;
1769  default: {
1770  status = 0; // to avoid complaints on uninitialized variable use
1771  __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1772  KMP_HNT(GetNewerLibrary), // Hint
1773  __kmp_msg_null // Variadic argument list terminator
1774  );
1775  } break;
1776  } // switch
1777  if (p_last)
1778  *p_last = last;
1779 #ifdef KMP_DEBUG
1780  if (pr->flags.ordered) {
1781  char *buff;
1782  // create format specifiers before the debug output
1783  buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1784  "ordered_lower:%%%s ordered_upper:%%%s\n",
1785  traits_t<UT>::spec, traits_t<UT>::spec);
1786  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1787  __kmp_str_free(&buff);
1788  }
1789  {
1790  char *buff;
1791  // create format specifiers before the debug output
1792  buff = __kmp_str_format(
1793  "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1794  "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1795  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1796  KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1797  __kmp_str_free(&buff);
1798  }
1799 #endif
1800  return status;
1801 }
1802 
1803 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1804  work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1805  is not called. */
1806 #if OMPT_SUPPORT && OMPT_OPTIONAL
1807 #define OMPT_LOOP_END \
1808  if (status == 0) { \
1809  if (ompt_enabled.ompt_callback_work) { \
1810  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1811  ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1812  ompt_callbacks.ompt_callback(ompt_callback_work)( \
1813  ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1814  &(task_info->task_data), 0, codeptr); \
1815  } \
1816  }
1817 // TODO: implement count
1818 #else
1819 #define OMPT_LOOP_END // no-op
1820 #endif
1821 
1822 #if KMP_STATS_ENABLED
1823 #define KMP_STATS_LOOP_END \
1824  { \
1825  kmp_int64 u, l, t, i; \
1826  l = (kmp_int64)(*p_lb); \
1827  u = (kmp_int64)(*p_ub); \
1828  i = (kmp_int64)(pr->u.p.st); \
1829  if (status == 0) { \
1830  t = 0; \
1831  KMP_POP_PARTITIONED_TIMER(); \
1832  } else if (i == 1) { \
1833  if (u >= l) \
1834  t = u - l + 1; \
1835  else \
1836  t = 0; \
1837  } else if (i < 0) { \
1838  if (l >= u) \
1839  t = (l - u) / (-i) + 1; \
1840  else \
1841  t = 0; \
1842  } else { \
1843  if (u >= l) \
1844  t = (u - l) / i + 1; \
1845  else \
1846  t = 0; \
1847  } \
1848  KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1849  }
1850 #else
1851 #define KMP_STATS_LOOP_END /* Nothing */
1852 #endif
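// For illustration of the trip-count arithmetic above: a chunk with
// l = 0, u = 8 and stride i = 2 counts t = (8 - 0) / 2 + 1 = 5 iterations
// (0, 2, 4, 6, 8); with a negative stride, l = 9, u = 1, i = -2 likewise
// gives t = (9 - 1) / 2 + 1 = 5 (9, 7, 5, 3, 1).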
1853 
1854 template <typename T>
1855 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1856  T *p_lb, T *p_ub,
1857  typename traits_t<T>::signed_t *p_st
1858 #if OMPT_SUPPORT && OMPT_OPTIONAL
1859  ,
1860  void *codeptr
1861 #endif
1862  ) {
1863 
1864  typedef typename traits_t<T>::unsigned_t UT;
1865  typedef typename traits_t<T>::signed_t ST;
1866  // This is potentially slightly misleading: schedule(runtime) will appear here
1867  // even if the actual runtime schedule is static. (Which points out a
1868  // disadvantage of schedule(runtime): even when static scheduling is used, it
1869  // costs more than a compile-time choice to use static scheduling would.)
1870  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1871 
1872  int status;
1873  dispatch_private_info_template<T> *pr;
1874  kmp_info_t *th = __kmp_threads[gtid];
1875  kmp_team_t *team = th->th.th_team;
1876 
1877  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1878  KD_TRACE(
1879  1000,
1880  ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1881  gtid, p_lb, p_ub, p_st, p_last));
1882 
1883  if (team->t.t_serialized) {
1884  /* NOTE: serialize this dispatch because we are not at the active level */
1885  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1886  th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1887  KMP_DEBUG_ASSERT(pr);
1888 
1889  if ((status = (pr->u.p.tc != 0)) == 0) {
1890  *p_lb = 0;
1891  *p_ub = 0;
1892  // if ( p_last != NULL )
1893  // *p_last = 0;
1894  if (p_st != NULL)
1895  *p_st = 0;
1896  if (__kmp_env_consistency_check) {
1897  if (pr->pushed_ws != ct_none) {
1898  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1899  }
1900  }
1901  } else if (pr->flags.nomerge) {
1902  kmp_int32 last;
1903  T start;
1904  UT limit, trip, init;
1905  ST incr;
1906  T chunk = pr->u.p.parm1;
1907 
1908  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1909  gtid));
1910 
1911  init = chunk * pr->u.p.count++;
1912  trip = pr->u.p.tc - 1;
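  /* For illustration: with chunk = 4 the serialized thread claims logical
     iterations [0,3] on the first call (count goes 0 -> 1), [4,7] on the
     second, and so on, until init = chunk*count exceeds trip and status
     becomes 0; the bounds returned below map these logical indices back
     through lb and st. */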
1913 
1914  if ((status = (init <= trip)) == 0) {
1915  *p_lb = 0;
1916  *p_ub = 0;
1917  // if ( p_last != NULL )
1918  // *p_last = 0;
1919  if (p_st != NULL)
1920  *p_st = 0;
1921  if (__kmp_env_consistency_check) {
1922  if (pr->pushed_ws != ct_none) {
1923  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1924  }
1925  }
1926  } else {
1927  start = pr->u.p.lb;
1928  limit = chunk + init - 1;
1929  incr = pr->u.p.st;
1930 
1931  if ((last = (limit >= trip)) != 0) {
1932  limit = trip;
1933 #if KMP_OS_WINDOWS
1934  pr->u.p.last_upper = pr->u.p.ub;
1935 #endif /* KMP_OS_WINDOWS */
1936  }
1937  if (p_last != NULL)
1938  *p_last = last;
1939  if (p_st != NULL)
1940  *p_st = incr;
1941  if (incr == 1) {
1942  *p_lb = start + init;
1943  *p_ub = start + limit;
1944  } else {
1945  *p_lb = start + init * incr;
1946  *p_ub = start + limit * incr;
1947  }
1948 
1949  if (pr->flags.ordered) {
1950  pr->u.p.ordered_lower = init;
1951  pr->u.p.ordered_upper = limit;
1952 #ifdef KMP_DEBUG
1953  {
1954  char *buff;
1955  // create format specifiers before the debug output
1956  buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1957  "ordered_lower:%%%s ordered_upper:%%%s\n",
1958  traits_t<UT>::spec, traits_t<UT>::spec);
1959  KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1960  pr->u.p.ordered_upper));
1961  __kmp_str_free(&buff);
1962  }
1963 #endif
1964  } // if
1965  } // if
1966  } else {
1967  pr->u.p.tc = 0;
1968  *p_lb = pr->u.p.lb;
1969  *p_ub = pr->u.p.ub;
1970 #if KMP_OS_WINDOWS
1971  pr->u.p.last_upper = *p_ub;
1972 #endif /* KMP_OS_WINDOWS */
1973  if (p_last != NULL)
1974  *p_last = TRUE;
1975  if (p_st != NULL)
1976  *p_st = pr->u.p.st;
1977  } // if
1978 #ifdef KMP_DEBUG
1979  {
1980  char *buff;
1981  // create format specifiers before the debug output
1982  buff = __kmp_str_format(
1983  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1984  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1985  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1986  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, (p_last ? *p_last : 0), status));
1987  __kmp_str_free(&buff);
1988  }
1989 #endif
1990 #if INCLUDE_SSC_MARKS
1991  SSC_MARK_DISPATCH_NEXT();
1992 #endif
1993  OMPT_LOOP_END;
1994  KMP_STATS_LOOP_END;
1995  return status;
1996  } else {
1997  kmp_int32 last = 0;
1998  dispatch_shared_info_template<T> volatile *sh;
1999 
2000  KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2001  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2002 
2003  pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2004  th->th.th_dispatch->th_dispatch_pr_current);
2005  KMP_DEBUG_ASSERT(pr);
2006  sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2007  th->th.th_dispatch->th_dispatch_sh_current);
2008  KMP_DEBUG_ASSERT(sh);
2009 
2010 #if KMP_USE_HIER_SCHED
2011  if (pr->flags.use_hier)
2012  status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2013  else
2014 #endif // KMP_USE_HIER_SCHED
2015  status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2016  p_st, th->th.th_team_nproc,
2017  th->th.th_info.ds.ds_tid);
2018  // status == 0: no more iterations to execute
2019  if (status == 0) {
2020  UT num_done;
2021 
2022  num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2023 #ifdef KMP_DEBUG
2024  {
2025  char *buff;
2026  // create format specifiers before the debug output
2027  buff = __kmp_str_format(
2028  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2029  traits_t<UT>::spec);
2030  KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2031  __kmp_str_free(&buff);
2032  }
2033 #endif
2034 
2035 #if KMP_USE_HIER_SCHED
2036  pr->flags.use_hier = FALSE;
2037 #endif
2038  if ((ST)num_done == th->th.th_team_nproc - 1) {
2039 #if (KMP_STATIC_STEAL_ENABLED)
2040  if (pr->schedule == kmp_sch_static_steal &&
2041  traits_t<T>::type_size > 4) {
2042  int i;
2043  kmp_info_t **other_threads = team->t.t_threads;
2044  // loop complete, safe to destroy locks used for stealing
2045  for (i = 0; i < th->th.th_team_nproc; ++i) {
2046  kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2047  KMP_ASSERT(lck != NULL);
2048  __kmp_destroy_lock(lck);
2049  __kmp_free(lck);
2050  other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2051  }
2052  }
2053 #endif
2054  /* NOTE: release this buffer to be reused */
2055 
2056  KMP_MB(); /* Flush all pending memory write invalidates. */
2057 
2058  sh->u.s.num_done = 0;
2059  sh->u.s.iteration = 0;
2060 
2061  /* TODO replace with general release procedure? */
2062  if (pr->flags.ordered) {
2063  sh->u.s.ordered_iteration = 0;
2064  }
2065 
2066  KMP_MB(); /* Flush all pending memory write invalidates. */
2067 
2068  sh->buffer_index += __kmp_dispatch_num_buffers;
2069  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2070  gtid, sh->buffer_index));
2071 
2072  KMP_MB(); /* Flush all pending memory write invalidates. */
2073 
2074  } // if
2075  if (__kmp_env_consistency_check) {
2076  if (pr->pushed_ws != ct_none) {
2077  pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2078  }
2079  }
2080 
2081  th->th.th_dispatch->th_deo_fcn = NULL;
2082  th->th.th_dispatch->th_dxo_fcn = NULL;
2083  th->th.th_dispatch->th_dispatch_sh_current = NULL;
2084  th->th.th_dispatch->th_dispatch_pr_current = NULL;
2085  } // if (status == 0)
2086 #if KMP_OS_WINDOWS
2087  else if (last) {
2088  pr->u.p.last_upper = pr->u.p.ub;
2089  }
2090 #endif /* KMP_OS_WINDOWS */
2091  if (p_last != NULL && status != 0)
2092  *p_last = last;
2093  } // if
2094 
2095 #ifdef KMP_DEBUG
2096  {
2097  char *buff;
2098  // create format specifiers before the debug output
2099  buff = __kmp_str_format(
2100  "__kmp_dispatch_next: T#%%d normal case: "
2101  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2102  traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2103  KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2104  (p_last ? *p_last : 0), status));
2105  __kmp_str_free(&buff);
2106  }
2107 #endif
2108 #if INCLUDE_SSC_MARKS
2109  SSC_MARK_DISPATCH_NEXT();
2110 #endif
2111  OMPT_LOOP_END;
2112  KMP_STATS_LOOP_END;
2113  return status;
2114 }
2115 
2116 template <typename T>
2117 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2118  kmp_int32 *plastiter, T *plower, T *pupper,
2119  typename traits_t<T>::signed_t incr) {
2120  typedef typename traits_t<T>::unsigned_t UT;
2121  kmp_uint32 team_id;
2122  kmp_uint32 nteams;
2123  UT trip_count;
2124  kmp_team_t *team;
2125  kmp_info_t *th;
2126 
2127  KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2128  KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2129 #ifdef KMP_DEBUG
2130  typedef typename traits_t<T>::signed_t ST;
2131  {
2132  char *buff;
2133  // create format specifiers before the debug output
2134  buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2135  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2136  traits_t<T>::spec, traits_t<T>::spec,
2137  traits_t<ST>::spec, traits_t<T>::spec);
2138  KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2139  __kmp_str_free(&buff);
2140  }
2141 #endif
2142 
2143  if (__kmp_env_consistency_check) {
2144  if (incr == 0) {
2145  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2146  loc);
2147  }
2148  if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2149  // The loop is illegal.
2150  // Some zero-trip loops are maintained by the compiler, e.g.:
2151  // for(i=10;i<0;++i) // lower >= upper - run-time check
2152  // for(i=0;i>10;--i) // lower <= upper - run-time check
2153  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2154  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2155  // Compiler does not check the following illegal loops:
2156  // for(i=0;i<10;i+=incr) // where incr<0
2157  // for(i=10;i>0;i-=incr) // where incr<0
2158  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2159  }
2160  }
2161  th = __kmp_threads[gtid];
2162  team = th->th.th_team;
2163 #if OMP_40_ENABLED
2164  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2165  nteams = th->th.th_teams_size.nteams;
2166 #endif
2167  team_id = team->t.t_master_tid;
2168  KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2169 
2170  // compute global trip count
2171  if (incr == 1) {
2172  trip_count = *pupper - *plower + 1;
2173  } else if (incr == -1) {
2174  trip_count = *plower - *pupper + 1;
2175  } else if (incr > 0) {
2176  // upper-lower can exceed the limit of signed type
2177  trip_count = (UT)(*pupper - *plower) / incr + 1;
2178  } else {
2179  trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2180  }
2181 
2182  if (trip_count <= nteams) {
2183  KMP_DEBUG_ASSERT(
2184  __kmp_static == kmp_sch_static_greedy ||
2185  __kmp_static ==
2186  kmp_sch_static_balanced); // Unknown static scheduling type.
2187  // only some teams get a single iteration, others get nothing
2188  if (team_id < trip_count) {
2189  *pupper = *plower = *plower + team_id * incr;
2190  } else {
2191  *plower = *pupper + incr; // zero-trip loop
2192  }
2193  if (plastiter != NULL)
2194  *plastiter = (team_id == trip_count - 1);
2195  } else {
2196  if (__kmp_static == kmp_sch_static_balanced) {
2197  UT chunk = trip_count / nteams;
2198  UT extras = trip_count % nteams;
2199  *plower +=
2200  incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2201  *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2202  if (plastiter != NULL)
2203  *plastiter = (team_id == nteams - 1);
2204  } else {
2205  T chunk_inc_count =
2206  (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2207  T upper = *pupper;
2208  KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2209  // Unknown static scheduling type.
2210  *plower += team_id * chunk_inc_count;
2211  *pupper = *plower + chunk_inc_count - incr;
2212  // Check/correct bounds if needed
2213  if (incr > 0) {
2214  if (*pupper < *plower)
2215  *pupper = traits_t<T>::max_value;
2216  if (plastiter != NULL)
2217  *plastiter = *plower <= upper && *pupper > upper - incr;
2218  if (*pupper > upper)
2219  *pupper = upper; // tracker C73258
2220  } else {
2221  if (*pupper > *plower)
2222  *pupper = traits_t<T>::min_value;
2223  if (plastiter != NULL)
2224  *plastiter = *plower >= upper && *pupper < upper - incr;
2225  if (*pupper < upper)
2226  *pupper = upper; // tracker C73258
2227  }
2228  }
2229  }
2230 }
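/* For illustration of the balanced split above: with trip_count = 10,
   nteams = 3, incr = 1 and an initial *plower of 0, chunk = 3 and
   extras = 1, so team 0 gets [0,3] (4 iterations) while teams 1 and 2 get
   [4,6] and [7,9] (3 iterations each); only team 2 sees *plastiter set. */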
2231 
2232 //-----------------------------------------------------------------------------
2233 // Dispatch routines
2234 // Transfer call to template< type T >
2235 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2236 // T lb, T ub, ST st, ST chunk )
2237 extern "C" {
2238 
2255 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2256  enum sched_type schedule, kmp_int32 lb,
2257  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2258  KMP_DEBUG_ASSERT(__kmp_init_serial);
2259 #if OMPT_SUPPORT && OMPT_OPTIONAL
2260  OMPT_STORE_RETURN_ADDRESS(gtid);
2261 #endif
2262  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2263 }
2267 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2268  enum sched_type schedule, kmp_uint32 lb,
2269  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2270  KMP_DEBUG_ASSERT(__kmp_init_serial);
2271 #if OMPT_SUPPORT && OMPT_OPTIONAL
2272  OMPT_STORE_RETURN_ADDRESS(gtid);
2273 #endif
2274  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2275 }
2276 
2280 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2281  enum sched_type schedule, kmp_int64 lb,
2282  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2283  KMP_DEBUG_ASSERT(__kmp_init_serial);
2284 #if OMPT_SUPPORT && OMPT_OPTIONAL
2285  OMPT_STORE_RETURN_ADDRESS(gtid);
2286 #endif
2287  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2288 }
2289 
2293 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2294  enum sched_type schedule, kmp_uint64 lb,
2295  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2296  KMP_DEBUG_ASSERT(__kmp_init_serial);
2297 #if OMPT_SUPPORT && OMPT_OPTIONAL
2298  OMPT_STORE_RETURN_ADDRESS(gtid);
2299 #endif
2300  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2301 }
2302 
2312 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2313  enum sched_type schedule, kmp_int32 *p_last,
2314  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2315  kmp_int32 chunk) {
2316  KMP_DEBUG_ASSERT(__kmp_init_serial);
2317 #if OMPT_SUPPORT && OMPT_OPTIONAL
2318  OMPT_STORE_RETURN_ADDRESS(gtid);
2319 #endif
2320  __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2321  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2322 }
2323 
2324 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2325  enum sched_type schedule, kmp_int32 *p_last,
2326  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2327  kmp_int32 chunk) {
2328  KMP_DEBUG_ASSERT(__kmp_init_serial);
2329 #if OMPT_SUPPORT && OMPT_OPTIONAL
2330  OMPT_STORE_RETURN_ADDRESS(gtid);
2331 #endif
2332  __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2333  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2334 }
2335 
2336 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2337  enum sched_type schedule, kmp_int32 *p_last,
2338  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2339  kmp_int64 chunk) {
2340  KMP_DEBUG_ASSERT(__kmp_init_serial);
2341 #if OMPT_SUPPORT && OMPT_OPTIONAL
2342  OMPT_STORE_RETURN_ADDRESS(gtid);
2343 #endif
2344  __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2345  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2346 }
2347 
2348 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2349  enum sched_type schedule, kmp_int32 *p_last,
2350  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2351  kmp_int64 chunk) {
2352  KMP_DEBUG_ASSERT(__kmp_init_serial);
2353 #if OMPT_SUPPORT && OMPT_OPTIONAL
2354  OMPT_STORE_RETURN_ADDRESS(gtid);
2355 #endif
2356  __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2357  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2358 }
2359 
2373 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2374  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2375 #if OMPT_SUPPORT && OMPT_OPTIONAL
2376  OMPT_STORE_RETURN_ADDRESS(gtid);
2377 #endif
2378  return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2379 #if OMPT_SUPPORT && OMPT_OPTIONAL
2380  ,
2381  OMPT_LOAD_RETURN_ADDRESS(gtid)
2382 #endif
2383  );
2384 }
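/* A rough sketch of how a compiler might drive these entry points for
   `#pragma omp for schedule(dynamic, 4)` over i = 0..N-1 (body() is a
   placeholder for the loop body, not a runtime symbol):

     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 4);
     kmp_int32 lb, ub, st, last;
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }

   A return value of 0 from __kmpc_dispatch_next_4() means no iterations
   remain for this thread. */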
2385 
2389 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2390  kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2391  kmp_int32 *p_st) {
2392 #if OMPT_SUPPORT && OMPT_OPTIONAL
2393  OMPT_STORE_RETURN_ADDRESS(gtid);
2394 #endif
2395  return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2396 #if OMPT_SUPPORT && OMPT_OPTIONAL
2397  ,
2398  OMPT_LOAD_RETURN_ADDRESS(gtid)
2399 #endif
2400  );
2401 }
2402 
2406 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2407  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2408 #if OMPT_SUPPORT && OMPT_OPTIONAL
2409  OMPT_STORE_RETURN_ADDRESS(gtid);
2410 #endif
2411  return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2412 #if OMPT_SUPPORT && OMPT_OPTIONAL
2413  ,
2414  OMPT_LOAD_RETURN_ADDRESS(gtid)
2415 #endif
2416  );
2417 }
2418 
2422 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2423  kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2424  kmp_int64 *p_st) {
2425 #if OMPT_SUPPORT && OMPT_OPTIONAL
2426  OMPT_STORE_RETURN_ADDRESS(gtid);
2427 #endif
2428  return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2429 #if OMPT_SUPPORT && OMPT_OPTIONAL
2430  ,
2431  OMPT_LOAD_RETURN_ADDRESS(gtid)
2432 #endif
2433  );
2434 }
2435 
2442 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2443  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2444 }
2445 
2449 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2450  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2451 }
2452 
2456 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2457  __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2458 }
2459 
2463 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2464  __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2465 }
2468 //-----------------------------------------------------------------------------
2469 // Non-template routines from kmp_dispatch.cpp used in other sources
2470 
2471 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2472  return value == checker;
2473 }
2474 
2475 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2476  return value != checker;
2477 }
2478 
2479 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2480  return value < checker;
2481 }
2482 
2483 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2484  return value >= checker;
2485 }
2486 
2487 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2488  return value <= checker;
2489 }
2490 
2491 kmp_uint32
2492 __kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2493  kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2494  void *obj // Higher-level synchronization object, or NULL.
2495  ) {
2496  // note: we may not belong to a team at this point
2497  volatile kmp_uint32 *spin = spinner;
2498  kmp_uint32 check = checker;
2499  kmp_uint32 spins;
2500  kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2501  kmp_uint32 r;
2502 
2503  KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2504  KMP_INIT_YIELD(spins);
2505  // main wait spin loop
2506  while (!f(r = TCR_4(*spin), check)) {
2507  KMP_FSYNC_SPIN_PREPARE(obj);
2508  /* GEH - remove this since it was accidentally introduced when kmp_wait was
2509  split. It causes problems with infinite recursion because of exit lock */
2510  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2511  __kmp_abort_thread(); */
2512 
2513  /* if we have waited a bit, or are oversubscribed, yield */
2514  /* pause is in the following code */
2515  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2516  KMP_YIELD_SPIN(spins);
2517  }
2518  KMP_FSYNC_SPIN_ACQUIRED(obj);
2519  return r;
2520 }
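/* Illustrative use, reusing the predicate helpers above: assuming some
   volatile kmp_uint32 flag,
     __kmp_wait_yield_4(&flag, 1, __kmp_eq_4, NULL);
   spins (yielding when oversubscribed) until flag == 1 and returns the
   observed value. */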
2521 
2522 void __kmp_wait_yield_4_ptr(
2523  void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2524  void *obj // Higher-level synchronization object, or NULL.
2525  ) {
2526  // note: we may not belong to a team at this point
2527  void *spin = spinner;
2528  kmp_uint32 check = checker;
2529  kmp_uint32 spins;
2530  kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2531 
2532  KMP_FSYNC_SPIN_INIT(obj, spin);
2533  KMP_INIT_YIELD(spins);
2534  // main wait spin loop
2535  while (!f(spin, check)) {
2536  KMP_FSYNC_SPIN_PREPARE(obj);
2537  /* if we have waited a bit, or are oversubscribed, yield */
2538  /* pause is in the following code */
2539  KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2540  KMP_YIELD_SPIN(spins);
2541  }
2542  KMP_FSYNC_SPIN_ACQUIRED(obj);
2543 }
2544 
2545 } // extern "C"
2546 
2547 #ifdef KMP_GOMP_COMPAT
2548 
2549 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2550  enum sched_type schedule, kmp_int32 lb,
2551  kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2552  int push_ws) {
2553  __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2554  push_ws);
2555 }
2556 
2557 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2558  enum sched_type schedule, kmp_uint32 lb,
2559  kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2560  int push_ws) {
2561  __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2562  push_ws);
2563 }
2564 
2565 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2566  enum sched_type schedule, kmp_int64 lb,
2567  kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2568  int push_ws) {
2569  __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2570  push_ws);
2571 }
2572 
2573 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2574  enum sched_type schedule, kmp_uint64 lb,
2575  kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2576  int push_ws) {
2577  __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2578  push_ws);
2579 }
2580 
2581 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2582  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2583 }
2584 
2585 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2586  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2587 }
2588 
2589 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2590  __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2591 }
2592 
2593 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2594  __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2595 }
2596 
2597 #endif /* KMP_GOMP_COMPAT */
2598 
2599 /* ------------------------------------------------------------------------ */