LLVM OpenMP* Runtime Library
kmp_collapse.cpp
1 /*
2  * kmp_collapse.cpp -- loop collapse feature
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "kmp.h"
14 #include "kmp_error.h"
15 #include "kmp_i18n.h"
16 #include "kmp_itt.h"
17 #include "kmp_stats.h"
18 #include "kmp_str.h"
19 #include "kmp_collapse.h"
20 
21 #if OMPT_SUPPORT
22 #include "ompt-specific.h"
23 #endif
24 
25 // OMPTODO: different style of comments (see kmp_sched)
26 // OMPTODO: OMPT/OMPD
27 
28 // avoid inadvertently using a library-based abs
29 template <typename T> T __kmp_abs(const T val) {
30  return (val < 0) ? -val : val;
31 }
32 kmp_uint32 __kmp_abs(const kmp_uint32 val) { return val; }
33 kmp_uint64 __kmp_abs(const kmp_uint64 val) { return val; }
34 
35 //----------------------------------------------------------------------------
36 // Common functions for working with rectangular and non-rectangular loops
37 //----------------------------------------------------------------------------
38 
39 template <typename T> int __kmp_sign(T val) {
40  return (T(0) < val) - (val < T(0));
41 }
42 
43 template <typename T> class CollapseAllocator {
44  typedef T *pT;
45 
46 private:
47  static const size_t allocaSize = 32; // size limit for stack allocations
48  // (8 bytes x 4 nested loops)
49  char stackAlloc[allocaSize];
50  static constexpr size_t maxElemCount = allocaSize / sizeof(T);
51  pT pTAlloc;
52 
53 public:
54  CollapseAllocator(size_t n) : pTAlloc(reinterpret_cast<pT>(stackAlloc)) {
55  if (n > maxElemCount) {
56  pTAlloc = reinterpret_cast<pT>(__kmp_allocate(n * sizeof(T)));
57  }
58  }
59  ~CollapseAllocator() {
60  if (pTAlloc != reinterpret_cast<pT>(stackAlloc)) {
61  __kmp_free(pTAlloc);
62  }
63  }
64  T &operator[](int index) { return pTAlloc[index]; }
65  operator const pT() { return pTAlloc; }
66 };
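// Illustrative note (not part of the original source): CollapseAllocator is a
// small-buffer optimization. With T = kmp_loop_nest_iv_t (8 bytes) a nest of up
// to 4 loops fits in the 32-byte stack buffer, e.g.
//   CollapseAllocator<kmp_loop_nest_iv_t> iterations(n); // heap only if n > 4
// Both paths hand out raw memory (__kmp_allocate on the heap path), so elements
// are not value-initialized; callers fill in every slot they use.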
67 
68 //----------Loop canonicalization---------------------------------------------
69 
70 // For loop nest (any shape):
71 // convert != to < or >;
72 // switch from using < or > to <= or >=.
73 // "bounds" array has to be allocated per thread.
74 // All other internal functions will work only with canonicalized loops.
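// Worked example (illustrative, not from the original source): a loop like
//   for (int i = 0; i < 10; i += 2)
// described as { lb0 = 0, ub0 = 10, step = 2, comparison = comp_less } is
// canonicalized below to { ub0 = 9, comparison = comp_less_or_eq }; a != test
// with a positive step is first turned into comp_less and then handled the
// same way.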
75 template <typename T>
76 void kmp_canonicalize_one_loop_XX(
77  ident_t *loc,
78  /*in/out*/ bounds_infoXX_template<T> *bounds) {
79 
80  if (__kmp_env_consistency_check) {
81  if (bounds->step == 0) {
82  __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
83  loc);
84  }
85  }
86 
87  if (bounds->comparison == comparison_t::comp_not_eq) {
88  // We can convert this to < or >, depending on the sign of the step:
89  if (bounds->step > 0) {
90  bounds->comparison = comparison_t::comp_less;
91  } else {
92  bounds->comparison = comparison_t::comp_greater;
93  }
94  }
95 
96  if (bounds->comparison == comparison_t::comp_less) {
97  // Note: ub0 can be unsigned. It should be OK to hit overflow here,
98  // because ub0 + ub1*j should still be positive (otherwise the loop was not
99  // well formed).
100  bounds->ub0 -= 1;
101  bounds->comparison = comparison_t::comp_less_or_eq;
102  } else if (bounds->comparison == comparison_t::comp_greater) {
103  bounds->ub0 += 1;
104  bounds->comparison = comparison_t::comp_greater_or_eq;
105  }
106 }
107 
108 // Canonicalize loop nest. original_bounds_nest is an array of length n.
109 void kmp_canonicalize_loop_nest(ident_t *loc,
110  /*in/out*/ bounds_info_t *original_bounds_nest,
111  kmp_index_t n) {
112 
113  for (kmp_index_t ind = 0; ind < n; ++ind) {
114  auto bounds = &(original_bounds_nest[ind]);
115 
116  switch (bounds->loop_type) {
117  case loop_type_t::loop_type_int32:
118  kmp_canonicalize_one_loop_XX<kmp_int32>(
119  loc,
120  /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
121  break;
122  case loop_type_t::loop_type_uint32:
123  kmp_canonicalize_one_loop_XX<kmp_uint32>(
124  loc,
125  /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
126  break;
127  case loop_type_t::loop_type_int64:
128  kmp_canonicalize_one_loop_XX<kmp_int64>(
129  loc,
130  /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
131  break;
132  case loop_type_t::loop_type_uint64:
133  kmp_canonicalize_one_loop_XX<kmp_uint64>(
134  loc,
135  /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
136  break;
137  default:
138  KMP_ASSERT(false);
139  }
140  }
141 }
142 
143 //----------Calculating trip count on one level-------------------------------
144 
145 // Calculate trip count on this loop level.
146 // We do this either for a rectangular loop nest,
147 // or after an adjustment bringing the loops to a parallelepiped shape.
148 // This number should not depend on the value of the outer IV
149 // even if the formula has lb1 and ub1.
150 // Note: for non-rectangular loops don't use span for this, it's too big.
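// Worked example (illustrative): for a canonicalized level with lb0 = 0,
// ub0 = 9, step = 2 and comp_less_or_eq, the formula below gives
// trip_count = (9 - 0) / 2 + 1 = 5, i.e. the iterations 0, 2, 4, 6, 8.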
151 
152 template <typename T>
153 kmp_loop_nest_iv_t kmp_calculate_trip_count_XX(
154  /*in/out*/ bounds_infoXX_template<T> *bounds) {
155 
156  if (bounds->comparison == comparison_t::comp_less_or_eq) {
157  if (bounds->ub0 < bounds->lb0) {
158  // Note: after this we don't need to calculate inner loops,
159  // but that should be an edge case:
160  bounds->trip_count = 0;
161  } else {
162  // ub - lb may exceed signed type range; we need to cast to
163  // kmp_loop_nest_iv_t anyway
164  bounds->trip_count =
165  static_cast<kmp_loop_nest_iv_t>(bounds->ub0 - bounds->lb0) /
166  __kmp_abs(bounds->step) +
167  1;
168  }
169  } else if (bounds->comparison == comparison_t::comp_greater_or_eq) {
170  if (bounds->lb0 < bounds->ub0) {
171  // Note: after this we don't need to calculate inner loops,
172  // but that should be an edge case:
173  bounds->trip_count = 0;
174  } else {
175  // lb - ub may exceed signed type range; we need to cast to
176  // kmp_loop_nest_iv_t anyway
177  bounds->trip_count =
178  static_cast<kmp_loop_nest_iv_t>(bounds->lb0 - bounds->ub0) /
179  __kmp_abs(bounds->step) +
180  1;
181  }
182  } else {
183  KMP_ASSERT(false);
184  }
185  return bounds->trip_count;
186 }
187 
188 // Calculate trip count on this loop level.
189 kmp_loop_nest_iv_t kmp_calculate_trip_count(/*in/out*/ bounds_info_t *bounds) {
190 
191  kmp_loop_nest_iv_t trip_count = 0;
192 
193  switch (bounds->loop_type) {
194  case loop_type_t::loop_type_int32:
195  trip_count = kmp_calculate_trip_count_XX<kmp_int32>(
196  /*in/out*/ (bounds_infoXX_template<kmp_int32> *)(bounds));
197  break;
198  case loop_type_t::loop_type_uint32:
199  trip_count = kmp_calculate_trip_count_XX<kmp_uint32>(
200  /*in/out*/ (bounds_infoXX_template<kmp_uint32> *)(bounds));
201  break;
202  case loop_type_t::loop_type_int64:
203  trip_count = kmp_calculate_trip_count_XX<kmp_int64>(
204  /*in/out*/ (bounds_infoXX_template<kmp_int64> *)(bounds));
205  break;
206  case loop_type_t::loop_type_uint64:
207  trip_count = kmp_calculate_trip_count_XX<kmp_uint64>(
208  /*in/out*/ (bounds_infoXX_template<kmp_uint64> *)(bounds));
209  break;
210  default:
211  KMP_ASSERT(false);
212  }
213 
214  return trip_count;
215 }
216 
217 //----------Trim original iv according to its type----------------------------
218 
219 // Trim original iv according to its type.
220 // Return a kmp_uint64 value which can be easily used in all internal calculations
221 // and can be statically cast back to the original type in user code.
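// Example (illustrative): for loop_type_int8 and an original_iv holding the bit
// pattern of (kmp_int8)-1, the cast chain below sign-extends, so the returned
// kmp_uint64 is 0xFFFFFFFFFFFFFFFF; statically casting that back to kmp_int8 in
// user code recovers -1.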
222 kmp_uint64 kmp_fix_iv(loop_type_t loop_iv_type, kmp_uint64 original_iv) {
223  kmp_uint64 res = 0;
224 
225  switch (loop_iv_type) {
226  case loop_type_t::loop_type_int8:
227  res = static_cast<kmp_uint64>(static_cast<kmp_int8>(original_iv));
228  break;
229  case loop_type_t::loop_type_uint8:
230  res = static_cast<kmp_uint64>(static_cast<kmp_uint8>(original_iv));
231  break;
232  case loop_type_t::loop_type_int16:
233  res = static_cast<kmp_uint64>(static_cast<kmp_int16>(original_iv));
234  break;
235  case loop_type_t::loop_type_uint16:
236  res = static_cast<kmp_uint64>(static_cast<kmp_uint16>(original_iv));
237  break;
238  case loop_type_t::loop_type_int32:
239  res = static_cast<kmp_uint64>(static_cast<kmp_int32>(original_iv));
240  break;
241  case loop_type_t::loop_type_uint32:
242  res = static_cast<kmp_uint64>(static_cast<kmp_uint32>(original_iv));
243  break;
244  case loop_type_t::loop_type_int64:
245  res = static_cast<kmp_uint64>(static_cast<kmp_int64>(original_iv));
246  break;
247  case loop_type_t::loop_type_uint64:
248  res = static_cast<kmp_uint64>(original_iv);
249  break;
250  default:
251  KMP_ASSERT(false);
252  }
253 
254  return res;
255 }
256 
257 //----------Compare two IVs (remember they have a type)-----------------------
258 
259 bool kmp_ivs_eq(loop_type_t loop_iv_type, kmp_uint64 original_iv1,
260  kmp_uint64 original_iv2) {
261  bool res = false;
262 
263  switch (loop_iv_type) {
264  case loop_type_t::loop_type_int8:
265  res = static_cast<kmp_int8>(original_iv1) ==
266  static_cast<kmp_int8>(original_iv2);
267  break;
268  case loop_type_t::loop_type_uint8:
269  res = static_cast<kmp_uint8>(original_iv1) ==
270  static_cast<kmp_uint8>(original_iv2);
271  break;
272  case loop_type_t::loop_type_int16:
273  res = static_cast<kmp_int16>(original_iv1) ==
274  static_cast<kmp_int16>(original_iv2);
275  break;
276  case loop_type_t::loop_type_uint16:
277  res = static_cast<kmp_uint16>(original_iv1) ==
278  static_cast<kmp_uint16>(original_iv2);
279  break;
280  case loop_type_t::loop_type_int32:
281  res = static_cast<kmp_int32>(original_iv1) ==
282  static_cast<kmp_int32>(original_iv2);
283  break;
284  case loop_type_t::loop_type_uint32:
285  res = static_cast<kmp_uint32>(original_iv1) ==
286  static_cast<kmp_uint32>(original_iv2);
287  break;
288  case loop_type_t::loop_type_int64:
289  res = static_cast<kmp_int64>(original_iv1) ==
290  static_cast<kmp_int64>(original_iv2);
291  break;
292  case loop_type_t::loop_type_uint64:
293  res = static_cast<kmp_uint64>(original_iv1) ==
294  static_cast<kmp_uint64>(original_iv2);
295  break;
296  default:
297  KMP_ASSERT(false);
298  }
299 
300  return res;
301 }
302 
303 //----------Calculate original iv on one level--------------------------------
304 
305 // Return true if the point fits into upper bounds on this level,
306 // false otherwise
307 template <typename T>
308 bool kmp_iv_is_in_upper_bound_XX(const bounds_infoXX_template<T> *bounds,
309  const kmp_point_t original_ivs,
310  kmp_index_t ind) {
311 
312  T iv = static_cast<T>(original_ivs[ind]);
313  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
314 
315  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
316  (iv > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
317  ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
318  (iv < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
319  // The calculated point is outside of loop upper boundary:
320  return false;
321  }
322 
323  return true;
324 }
325 
326 // Calculate one iv corresponding to iteration on the level ind.
327 // Return true if it fits into lower-upper bounds on this level
328 // (if not, we need to re-calculate)
329 template <typename T>
330 bool kmp_calc_one_iv_XX(const bounds_infoXX_template<T> *bounds,
331  /*in/out*/ kmp_point_t original_ivs,
332  const kmp_iterations_t iterations, kmp_index_t ind,
333  bool start_with_lower_bound, bool checkBounds) {
334 
335  kmp_uint64 temp = 0;
336  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
337 
338  if (start_with_lower_bound) {
339 // we moved to the next iteration on one of the outer loops, so we should start
340  // with the lower bound here:
341  temp = bounds->lb0 + bounds->lb1 * outer_iv;
342  } else {
343  auto iteration = iterations[ind];
344  temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration * bounds->step;
345  }
346 
347  // Now trim original iv according to its type:
348  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
349 
350  if (checkBounds) {
351  return kmp_iv_is_in_upper_bound_XX(bounds, original_ivs, ind);
352  } else {
353  return true;
354  }
355 }
356 
357 bool kmp_calc_one_iv(const bounds_info_t *bounds,
358  /*in/out*/ kmp_point_t original_ivs,
359  const kmp_iterations_t iterations, kmp_index_t ind,
360  bool start_with_lower_bound, bool checkBounds) {
361 
362  switch (bounds->loop_type) {
363  case loop_type_t::loop_type_int32:
364  return kmp_calc_one_iv_XX<kmp_int32>(
365  (bounds_infoXX_template<kmp_int32> *)(bounds),
366  /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
367  checkBounds);
368  break;
369  case loop_type_t::loop_type_uint32:
370  return kmp_calc_one_iv_XX<kmp_uint32>(
371  (bounds_infoXX_template<kmp_uint32> *)(bounds),
372  /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
373  checkBounds);
374  break;
375  case loop_type_t::loop_type_int64:
376  return kmp_calc_one_iv_XX<kmp_int64>(
377  (bounds_infoXX_template<kmp_int64> *)(bounds),
378  /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
379  checkBounds);
380  break;
381  case loop_type_t::loop_type_uint64:
382  return kmp_calc_one_iv_XX<kmp_uint64>(
383  (bounds_infoXX_template<kmp_uint64> *)(bounds),
384  /*in/out*/ original_ivs, iterations, ind, start_with_lower_bound,
385  checkBounds);
386  break;
387  default:
388  KMP_ASSERT(false);
389  return false;
390  }
391 }
392 
393 //----------Calculate original iv on one level for rectangular loop nest------
394 
395 // Calculate one iv corresponding to iteration on the level ind.
396 // Unlike the general case above, this rectangular variant needs no bounds
397 // check, so nothing is returned.
398 template <typename T>
399 void kmp_calc_one_iv_rectang_XX(const bounds_infoXX_template<T> *bounds,
400  /*in/out*/ kmp_uint64 *original_ivs,
401  const kmp_iterations_t iterations,
402  kmp_index_t ind) {
403 
404  auto iteration = iterations[ind];
405 
406  kmp_uint64 temp =
407  bounds->lb0 +
408  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) +
409  iteration * bounds->step;
410 
411  // Now trim original iv according to its type:
412  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
413 }
414 
415 void kmp_calc_one_iv_rectang(const bounds_info_t *bounds,
416  /*in/out*/ kmp_uint64 *original_ivs,
417  const kmp_iterations_t iterations,
418  kmp_index_t ind) {
419 
420  switch (bounds->loop_type) {
421  case loop_type_t::loop_type_int32:
422  kmp_calc_one_iv_rectang_XX<kmp_int32>(
423  (bounds_infoXX_template<kmp_int32> *)(bounds),
424  /*in/out*/ original_ivs, iterations, ind);
425  break;
426  case loop_type_t::loop_type_uint32:
427  kmp_calc_one_iv_rectang_XX<kmp_uint32>(
428  (bounds_infoXX_template<kmp_uint32> *)(bounds),
429  /*in/out*/ original_ivs, iterations, ind);
430  break;
431  case loop_type_t::loop_type_int64:
432  kmp_calc_one_iv_rectang_XX<kmp_int64>(
433  (bounds_infoXX_template<kmp_int64> *)(bounds),
434  /*in/out*/ original_ivs, iterations, ind);
435  break;
436  case loop_type_t::loop_type_uint64:
437  kmp_calc_one_iv_rectang_XX<kmp_uint64>(
438  (bounds_infoXX_template<kmp_uint64> *)(bounds),
439  /*in/out*/ original_ivs, iterations, ind);
440  break;
441  default:
442  KMP_ASSERT(false);
443  }
444 }
445 
446 //----------------------------------------------------------------------------
447 // Rectangular loop nest
448 //----------------------------------------------------------------------------
449 
450 //----------Canonicalize loop nest and calculate trip count-------------------
451 
452 // Canonicalize loop nest and calculate overall trip count.
453 // "bounds_nest" has to be allocated per thread.
454 // API will modify original bounds_nest array to bring it to a canonical form
455 // (only <= and >=, no !=, <, >). If the original loop nest was already in a
456 // canonical form there will be no changes to bounds in bounds_nest array
457 // (only trip counts will be calculated).
458 // Returns trip count of overall space.
459 extern "C" kmp_loop_nest_iv_t
460 __kmpc_process_loop_nest_rectang(ident_t *loc, kmp_int32 gtid,
461  /*in/out*/ bounds_info_t *original_bounds_nest,
462  kmp_index_t n) {
463 
464  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
465 
466  kmp_loop_nest_iv_t total = 1;
467 
468  for (kmp_index_t ind = 0; ind < n; ++ind) {
469  auto bounds = &(original_bounds_nest[ind]);
470 
471  kmp_loop_nest_iv_t trip_count = kmp_calculate_trip_count(/*in/out*/ bounds);
472  total *= trip_count;
473  }
474 
475  return total;
476 }
477 
478 //----------Calculate old induction variables---------------------------------
479 
480 // Calculate old induction variables corresponding to overall new_iv.
481 // Note: original IV will be returned as if it had kmp_uint64 type,
482 // will have to be converted to original type in user code.
483 // Note: trip counts should be already calculated by
484 // __kmpc_process_loop_nest_rectang.
485 // OMPTODO: special case 2, 3 nested loops: either do different
486 // interface without array or possibly template this over n
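// Worked example (illustrative): with two loops whose trip counts are {3, 4},
// new_iv = 7 is decomposed innermost-first by the loop below as
//   iterations[1] = 7 % 4 = 3, new_iv = 7 / 4 = 1;
//   iterations[0] = 1 % 3 = 1, new_iv = 0;
// i.e. overall iteration 7 maps to iteration 1 of the outer loop and
// iteration 3 of the inner loop.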
487 extern "C" void
488 __kmpc_calc_original_ivs_rectang(ident_t *loc, kmp_loop_nest_iv_t new_iv,
489  const bounds_info_t *original_bounds_nest,
490  /*out*/ kmp_uint64 *original_ivs,
491  kmp_index_t n) {
492 
493  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
494 
495  // First, calc corresponding iteration in every original loop:
496  for (kmp_index_t ind = n; ind > 0;) {
497  --ind;
498  auto bounds = &(original_bounds_nest[ind]);
499 
500  // should be optimized to OPDIVREM:
501  auto temp = new_iv / bounds->trip_count;
502  auto iteration = new_iv % bounds->trip_count;
503  new_iv = temp;
504 
505  iterations[ind] = iteration;
506  }
507  KMP_ASSERT(new_iv == 0);
508 
509  for (kmp_index_t ind = 0; ind < n; ++ind) {
510  auto bounds = &(original_bounds_nest[ind]);
511 
512  kmp_calc_one_iv_rectang(bounds, /*in/out*/ original_ivs, iterations, ind);
513  }
514 }
515 
516 //----------------------------------------------------------------------------
517 // Non-rectangular loop nest
518 //----------------------------------------------------------------------------
519 
520 //----------Calculate maximum possible span of iv values on one level---------
521 
522 // Calculate span for IV on this loop level for "<=" case.
523 // Note: it's for <= on this loop nest level, so lower bound should be smallest
524 // value, upper bound should be the biggest value. If the loop won't execute,
525 // 'smallest' may be bigger than 'biggest', but we'd better not switch them
526 // around.
527 template <typename T>
528 void kmp_calc_span_lessoreq_XX(
529  /* in/out*/ bounds_info_internalXX_template<T> *bounds,
530  /* in/out*/ bounds_info_internal_t *bounds_nest) {
531 
532  typedef typename traits_t<T>::unsigned_t UT;
533  // typedef typename traits_t<T>::signed_t ST;
534 
535  // typedef typename big_span_t span_t;
536  typedef T span_t;
537 
538  auto &bbounds = bounds->b;
539 
540  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
541 // This dimension depends on one of the previous ones; it can't be the outermost
542  // one.
543  bounds_info_internalXX_template<T> *previous =
544  reinterpret_cast<bounds_info_internalXX_template<T> *>(
545  &(bounds_nest[bbounds.outer_iv]));
546 
547  // OMPTODO: assert that T is compatible with loop variable type on
548  // 'previous' loop
549 
550  {
551  span_t bound_candidate1 =
552  bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
553  span_t bound_candidate2 =
554  bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
555  if (bound_candidate1 < bound_candidate2) {
556  bounds->span_smallest = bound_candidate1;
557  } else {
558  bounds->span_smallest = bound_candidate2;
559  }
560  }
561 
562  {
563  // We can't adjust the upper bound with respect to step, because
564  // lower bound might be off after adjustments
565 
566  span_t bound_candidate1 =
567  bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
568  span_t bound_candidate2 =
569  bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
570  if (bound_candidate1 < bound_candidate2) {
571  bounds->span_biggest = bound_candidate2;
572  } else {
573  bounds->span_biggest = bound_candidate1;
574  }
575  }
576  } else {
577  // Rectangular:
578  bounds->span_smallest = bbounds.lb0;
579  bounds->span_biggest = bbounds.ub0;
580  }
581  if (!bounds->loop_bounds_adjusted) {
582 // Here it's safe to reduce the space to a multiple of the step.
583 // OMPTODO: check if the formula is correct.
584 // Also check if it would be safe to do this if we didn't adjust the left side.
585  bounds->span_biggest -=
586  (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
587  }
588 }
589 
590 // Calculate span for IV on this loop level for ">=" case.
591 template <typename T>
592 void kmp_calc_span_greateroreq_XX(
593  /* in/out*/ bounds_info_internalXX_template<T> *bounds,
594  /* in/out*/ bounds_info_internal_t *bounds_nest) {
595 
596  typedef typename traits_t<T>::unsigned_t UT;
597  // typedef typename traits_t<T>::signed_t ST;
598 
599  // typedef typename big_span_t span_t;
600  typedef T span_t;
601 
602  auto &bbounds = bounds->b;
603 
604  if ((bbounds.lb1 != 0) || (bbounds.ub1 != 0)) {
605 // This dimension depends on one of the previous ones; it can't be the outermost
606  // one.
607  bounds_info_internalXX_template<T> *previous =
608  reinterpret_cast<bounds_info_internalXX_template<T> *>(
609  &(bounds_nest[bbounds.outer_iv]));
610 
611  // OMPTODO: assert that T is compatible with loop variable type on
612  // 'previous' loop
613 
614  {
615  span_t bound_candidate1 =
616  bbounds.lb0 + bbounds.lb1 * previous->span_smallest;
617  span_t bound_candidate2 =
618  bbounds.lb0 + bbounds.lb1 * previous->span_biggest;
619  if (bound_candidate1 >= bound_candidate2) {
620  bounds->span_smallest = bound_candidate1;
621  } else {
622  bounds->span_smallest = bound_candidate2;
623  }
624  }
625 
626  {
627  // We can't adjust the upper bound with respect to step, because
628  // lower bound might be off after adjustments
629 
630  span_t bound_candidate1 =
631  bbounds.ub0 + bbounds.ub1 * previous->span_smallest;
632  span_t bound_candidate2 =
633  bbounds.ub0 + bbounds.ub1 * previous->span_biggest;
634  if (bound_candidate1 >= bound_candidate2) {
635  bounds->span_biggest = bound_candidate2;
636  } else {
637  bounds->span_biggest = bound_candidate1;
638  }
639  }
640 
641  } else {
642  // Rectangular:
643  bounds->span_biggest = bbounds.lb0;
644  bounds->span_smallest = bbounds.ub0;
645  }
646  if (!bounds->loop_bounds_adjusted) {
647 // Here it's safe to reduce the space to a multiple of the step.
648 // OMPTODO: check if the formula is correct.
649 // Also check if it would be safe to do this if we didn't adjust the left side.
650  bounds->span_biggest -=
651  (static_cast<UT>(bbounds.ub0 - bbounds.lb0)) % bbounds.step; // abs?
652  }
653 }
654 
655 // Calculate maximum possible span for IV on this loop level.
656 template <typename T>
657 void kmp_calc_span_XX(
658  /* in/out*/ bounds_info_internalXX_template<T> *bounds,
659  /* in/out*/ bounds_info_internal_t *bounds_nest) {
660 
661  if (bounds->b.comparison == comparison_t::comp_less_or_eq) {
662  kmp_calc_span_lessoreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
663  } else {
664  KMP_ASSERT(bounds->b.comparison == comparison_t::comp_greater_or_eq);
665  kmp_calc_span_greateroreq_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
666  }
667 }
668 
669 //----------All initial processing of the loop nest---------------------------
670 
671 // Calculate new bounds for this loop level.
672 // To be able to work with the nest we need to get it to a parallelepiped shape.
673 // We need to stay in the original range of values, so that there will be no
674 // overflow, for that we'll adjust both upper and lower bounds as needed.
675 template <typename T>
676 void kmp_calc_new_bounds_XX(
677  /* in/out*/ bounds_info_internalXX_template<T> *bounds,
678  /* in/out*/ bounds_info_internal_t *bounds_nest) {
679 
680  auto &bbounds = bounds->b;
681 
682  if (bbounds.lb1 == bbounds.ub1) {
683  // Already parallel, no need to adjust:
684  bounds->loop_bounds_adjusted = false;
685  } else {
686  bounds->loop_bounds_adjusted = true;
687 
688  T old_lb1 = bbounds.lb1;
689  T old_ub1 = bbounds.ub1;
690 
691  if (__kmp_sign(old_lb1) != __kmp_sign(old_ub1)) {
692  // With this shape we can adjust to a rectangle:
693  bbounds.lb1 = 0;
694  bbounds.ub1 = 0;
695  } else {
696  // get upper and lower bounds to be parallel
697  // with values in the old range.
698  // Note: abs didn't work here.
699  if (((old_lb1 < 0) && (old_lb1 < old_ub1)) ||
700  ((old_lb1 > 0) && (old_lb1 > old_ub1))) {
701  bbounds.lb1 = old_ub1;
702  } else {
703  bbounds.ub1 = old_lb1;
704  }
705  }
706 
707  // Now need to adjust lb0, ub0, otherwise in some cases space will shrink.
708 // The idea here is that for this IV we are now getting the same span
709  // irrespective of the previous IV value.
710  bounds_info_internalXX_template<T> *previous =
711  reinterpret_cast<bounds_info_internalXX_template<T> *>(
712  &bounds_nest[bbounds.outer_iv]);
713 
714  if (bbounds.comparison == comparison_t::comp_less_or_eq) {
715  if (old_lb1 < bbounds.lb1) {
716  KMP_ASSERT(old_lb1 < 0);
717 // The length is good at the biggest outer_iv value, so we
718 // can use it to find where to move the lower bound:
719 
720  T sub = (bbounds.lb1 - old_lb1) * previous->span_biggest;
721  bbounds.lb0 -= sub; // OMPTODO: what if it'll go out of unsigned space?
722  // e.g. it was 0?? (same below)
723  } else if (old_lb1 > bbounds.lb1) {
724  // still need to move lower bound:
725  T add = (old_lb1 - bbounds.lb1) * previous->span_smallest;
726  bbounds.lb0 += add;
727  }
728 
729  if (old_ub1 > bbounds.ub1) {
730  KMP_ASSERT(old_ub1 > 0);
731 // The length is good at the biggest outer_iv value, so we
732 // can use it to find where to move the upper bound:
733 
734  T add = (old_ub1 - bbounds.ub1) * previous->span_biggest;
735  bbounds.ub0 += add;
736  } else if (old_ub1 < bbounds.ub1) {
737  // still need to move upper bound:
738  T sub = (bbounds.ub1 - old_ub1) * previous->span_smallest;
739  bbounds.ub0 -= sub;
740  }
741  } else {
742  KMP_ASSERT(bbounds.comparison == comparison_t::comp_greater_or_eq);
743  if (old_lb1 < bbounds.lb1) {
744  KMP_ASSERT(old_lb1 < 0);
745  T sub = (bbounds.lb1 - old_lb1) * previous->span_smallest;
746  bbounds.lb0 -= sub;
747  } else if (old_lb1 > bbounds.lb1) {
748  T add = (old_lb1 - bbounds.lb1) * previous->span_biggest;
749  bbounds.lb0 += add;
750  }
751 
752  if (old_ub1 > bbounds.ub1) {
753  KMP_ASSERT(old_ub1 > 0);
754  T add = (old_ub1 - bbounds.ub1) * previous->span_smallest;
755  bbounds.ub0 += add;
756  } else if (old_ub1 < bbounds.ub1) {
757  T sub = (bbounds.ub1 - old_ub1) * previous->span_biggest;
758  bbounds.ub0 -= sub;
759  }
760  }
761  }
762 }
763 
764 // Do all processing for one canonicalized loop in the nest
765 // (assuming that outer loops already were processed):
766 template <typename T>
767 kmp_loop_nest_iv_t kmp_process_one_loop_XX(
768  /* in/out*/ bounds_info_internalXX_template<T> *bounds,
769  /*in/out*/ bounds_info_internal_t *bounds_nest) {
770 
771  kmp_calc_new_bounds_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
772  kmp_calc_span_XX(/* in/out*/ bounds, /* in/out*/ bounds_nest);
773  return kmp_calculate_trip_count_XX(/*in/out*/ &(bounds->b));
774 }
775 
776 // Non-rectangular loop nest, canonicalized to use <= or >=.
777 // Process loop nest to have a parallelepiped shape,
778 // calculate the biggest spans for IVs on all levels and calculate the overall trip
779 // count. "bounds_nest" has to be allocated per thread.
780 // Returns overall trip count (for adjusted space).
781 kmp_loop_nest_iv_t kmp_process_loop_nest(
782  /*in/out*/ bounds_info_internal_t *bounds_nest, kmp_index_t n) {
783 
784  kmp_loop_nest_iv_t total = 1;
785 
786  for (kmp_index_t ind = 0; ind < n; ++ind) {
787  auto bounds = &(bounds_nest[ind]);
788  kmp_loop_nest_iv_t trip_count = 0;
789 
790  switch (bounds->b.loop_type) {
791  case loop_type_t::loop_type_int32:
792  trip_count = kmp_process_one_loop_XX<kmp_int32>(
793  /*in/out*/ (bounds_info_internalXX_template<kmp_int32> *)(bounds),
794  /*in/out*/ bounds_nest);
795  break;
796  case loop_type_t::loop_type_uint32:
797  trip_count = kmp_process_one_loop_XX<kmp_uint32>(
798  /*in/out*/ (bounds_info_internalXX_template<kmp_uint32> *)(bounds),
799  /*in/out*/ bounds_nest);
800  break;
801  case loop_type_t::loop_type_int64:
802  trip_count = kmp_process_one_loop_XX<kmp_int64>(
803  /*in/out*/ (bounds_info_internalXX_template<kmp_int64> *)(bounds),
804  /*in/out*/ bounds_nest);
805  break;
806  case loop_type_t::loop_type_uint64:
807  trip_count = kmp_process_one_loop_XX<kmp_uint64>(
808  /*in/out*/ (bounds_info_internalXX_template<kmp_uint64> *)(bounds),
809  /*in/out*/ bounds_nest);
810  break;
811  default:
812  KMP_ASSERT(false);
813  }
814  total *= trip_count;
815  }
816 
817  return total;
818 }
819 
820 //----------Calculate iterations (in the original or updated space)-----------
821 
822 // Calculate number of iterations in original or updated space resulting in
823 // original_ivs[ind] (only on this level, non-negative)
824 // (not counting initial iteration)
825 template <typename T>
826 kmp_loop_nest_iv_t
827 kmp_calc_number_of_iterations_XX(const bounds_infoXX_template<T> *bounds,
828  const kmp_point_t original_ivs,
829  kmp_index_t ind) {
830 
831  kmp_loop_nest_iv_t iterations = 0;
832 
833  if (bounds->comparison == comparison_t::comp_less_or_eq) {
834  iterations =
835  (static_cast<T>(original_ivs[ind]) - bounds->lb0 -
836  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv])) /
837  __kmp_abs(bounds->step);
838  } else {
839  KMP_DEBUG_ASSERT(bounds->comparison == comparison_t::comp_greater_or_eq);
840  iterations = (bounds->lb0 +
841  bounds->lb1 * static_cast<T>(original_ivs[bounds->outer_iv]) -
842  static_cast<T>(original_ivs[ind])) /
843  __kmp_abs(bounds->step);
844  }
845 
846  return iterations;
847 }
848 
849 // Calculate number of iterations in the original or updated space resulting in
850 // original_ivs[ind] (only on this level, non-negative)
851 kmp_loop_nest_iv_t kmp_calc_number_of_iterations(const bounds_info_t *bounds,
852  const kmp_point_t original_ivs,
853  kmp_index_t ind) {
854 
855  switch (bounds->loop_type) {
856  case loop_type_t::loop_type_int32:
857  return kmp_calc_number_of_iterations_XX<kmp_int32>(
858  (bounds_infoXX_template<kmp_int32> *)(bounds), original_ivs, ind);
859  break;
860  case loop_type_t::loop_type_uint32:
861  return kmp_calc_number_of_iterations_XX<kmp_uint32>(
862  (bounds_infoXX_template<kmp_uint32> *)(bounds), original_ivs, ind);
863  break;
864  case loop_type_t::loop_type_int64:
865  return kmp_calc_number_of_iterations_XX<kmp_int64>(
866  (bounds_infoXX_template<kmp_int64> *)(bounds), original_ivs, ind);
867  break;
868  case loop_type_t::loop_type_uint64:
869  return kmp_calc_number_of_iterations_XX<kmp_uint64>(
870  (bounds_infoXX_template<kmp_uint64> *)(bounds), original_ivs, ind);
871  break;
872  default:
873  KMP_ASSERT(false);
874  return 0;
875  }
876 }
877 
878 //----------Calculate new iv corresponding to original ivs--------------------
879 
880 // We got a point in the original loop nest.
881 // Take updated bounds and calculate what new_iv will correspond to this point.
882 // When we are getting original IVs from new_iv, we have to adjust to fit into
883 // the original loop bounds. Getting new_iv for the adjusted original IVs will help
884 // with making more chunks non-empty.
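// Illustrative note: this is the inverse of the decomposition done in
// __kmpc_calc_original_ivs_rectang. For trip counts {3, 4} and per-level
// iterations {1, 3}, the loop below computes new_iv = (0 * 3 + 1) * 4 + 3 = 7.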
885 kmp_loop_nest_iv_t
886 kmp_calc_new_iv_from_original_ivs(const bounds_info_internal_t *bounds_nest,
887  const kmp_point_t original_ivs,
888  kmp_index_t n) {
889 
890  kmp_loop_nest_iv_t new_iv = 0;
891 
892  for (kmp_index_t ind = 0; ind < n; ++ind) {
893  auto bounds = &(bounds_nest[ind].b);
894 
895  new_iv = new_iv * bounds->trip_count +
896  kmp_calc_number_of_iterations(bounds, original_ivs, ind);
897  }
898 
899  return new_iv;
900 }
901 
902 //----------Calculate original ivs for provided iterations--------------------
903 
904 // Calculate original IVs for provided iterations, assuming iterations are
905 // calculated in the original space.
906 // Loop nest is in canonical form (with <= / >=).
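// Illustrative note: the loop below walks the nest like an odometer. If the
// value computed for level ind falls outside that level's upper bound, the
// iteration counter of the enclosing level is incremented, all deeper counters
// are reset to 0 (so those levels restart from their lower bounds), and the
// walk continues. Returning false means even the outermost level overflowed,
// i.e. the remaining space is empty.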
907 bool kmp_calc_original_ivs_from_iterations(
908  const bounds_info_t *original_bounds_nest, kmp_index_t n,
909  /*in/out*/ kmp_point_t original_ivs,
910  /*in/out*/ kmp_iterations_t iterations, kmp_index_t ind) {
911 
912  kmp_index_t lengthened_ind = n;
913 
914  for (; ind < n;) {
915  auto bounds = &(original_bounds_nest[ind]);
916  bool good = kmp_calc_one_iv(bounds, /*in/out*/ original_ivs, iterations,
917  ind, (lengthened_ind < ind), true);
918 
919  if (!good) {
920  // The calculated iv value is too big (or too small for >=):
921  if (ind == 0) {
922  // Space is empty:
923  return false;
924  } else {
925  // Go to next iteration on the outer loop:
926  --ind;
927  ++iterations[ind];
928  lengthened_ind = ind;
929  for (kmp_index_t i = ind + 1; i < n; ++i) {
930  iterations[i] = 0;
931  }
932  continue;
933  }
934  }
935  ++ind;
936  }
937 
938  return true;
939 }
940 
941 //----------Calculate original ivs for the beginning of the loop nest---------
942 
943 // Calculate IVs for the beginning of the loop nest.
944 // Note: simply taking the lower bounds of all loops may not work -
945 // on some iterations of the outer loops the inner loops may be empty.
946 // Loop nest is in canonical form (with <= / >=).
947 bool kmp_calc_original_ivs_for_start(const bounds_info_t *original_bounds_nest,
948  kmp_index_t n,
949  /*out*/ kmp_point_t original_ivs) {
950 
951  // Iterations in the original space, multiplied by step:
952  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
953  for (kmp_index_t ind = n; ind > 0;) {
954  --ind;
955  iterations[ind] = 0;
956  }
957 
958  // Now calculate the point:
959  bool b = kmp_calc_original_ivs_from_iterations(original_bounds_nest, n,
960  /*in/out*/ original_ivs,
961  /*in/out*/ iterations, 0);
962  return b;
963 }
964 
965 //----------Calculate next point in the original loop space-------------------
966 
967 // From current set of original IVs calculate next point.
968 // Return false if there is no next point in the loop bounds.
969 bool kmp_calc_next_original_ivs(const bounds_info_t *original_bounds_nest,
970  kmp_index_t n, const kmp_point_t original_ivs,
971  /*out*/ kmp_point_t next_original_ivs) {
972  // Iterations in the original space, multiplied by step (so can be negative):
973  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
974  // First, calc corresponding iteration in every original loop:
975  for (kmp_index_t ind = 0; ind < n; ++ind) {
976  auto bounds = &(original_bounds_nest[ind]);
977  iterations[ind] = kmp_calc_number_of_iterations(bounds, original_ivs, ind);
978  }
979 
980  for (kmp_index_t ind = 0; ind < n; ++ind) {
981  next_original_ivs[ind] = original_ivs[ind];
982  }
983 
984  // Next add one step to the iterations on the inner-most level, and see if we
985  // need to move up the nest:
986  kmp_index_t ind = n - 1;
987  ++iterations[ind];
988 
989  bool b = kmp_calc_original_ivs_from_iterations(
990  original_bounds_nest, n, /*in/out*/ next_original_ivs, iterations, ind);
991 
992  return b;
993 }
994 
995 //----------Calculate chunk end in the original loop space--------------------
996 
997 // For one level calculate old induction variable corresponding to overall
998 // new_iv for the chunk end.
999 // Return true if it fits into upper bound on this level
1000 // (if not, we need to re-calculate)
1001 template <typename T>
1002 bool kmp_calc_one_iv_for_chunk_end_XX(
1003  const bounds_infoXX_template<T> *bounds,
1004  const bounds_infoXX_template<T> *updated_bounds,
1005  /*in/out*/ kmp_point_t original_ivs, const kmp_iterations_t iterations,
1006  kmp_index_t ind, bool start_with_lower_bound, bool compare_with_start,
1007  const kmp_point_t original_ivs_start) {
1008 
1009  // typedef std::conditional<std::is_signed<T>::value, kmp_int64, kmp_uint64>
1010  // big_span_t;
1011 
1012  // OMPTODO: is it good enough, or do we need ST or do we need big_span_t?
1013  T temp = 0;
1014 
1015  T outer_iv = static_cast<T>(original_ivs[bounds->outer_iv]);
1016 
1017  if (start_with_lower_bound) {
1018 // we moved to the next iteration on one of the outer loops, so we may as well use
1019  // the lower bound here:
1020  temp = bounds->lb0 + bounds->lb1 * outer_iv;
1021  } else {
1022  // Start in expanded space, but:
1023  // - we need to hit original space lower bound, so need to account for
1024  // that
1025  // - we have to go into original space, even if that means adding more
1026  // iterations than was planned
1027  // - we have to go past (or equal to) previous point (which is the chunk
1028  // starting point)
1029 
1030  auto iteration = iterations[ind];
1031 
1032  auto step = bounds->step;
1033 
1034  // In case of >= it's negative:
1035  auto accountForStep =
1036  ((bounds->lb0 + bounds->lb1 * outer_iv) -
1037  (updated_bounds->lb0 + updated_bounds->lb1 * outer_iv)) %
1038  step;
1039 
1040  temp = updated_bounds->lb0 + updated_bounds->lb1 * outer_iv +
1041  accountForStep + iteration * step;
1042 
1043  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1044  (temp < (bounds->lb0 + bounds->lb1 * outer_iv))) ||
1045  ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1046  (temp > (bounds->lb0 + bounds->lb1 * outer_iv)))) {
1047  // Too small (or too big), didn't reach the original lower bound. Use
1048  // heuristic:
1049  temp = bounds->lb0 + bounds->lb1 * outer_iv + iteration / 2 * step;
1050  }
1051 
1052  if (compare_with_start) {
1053 
1054  T start = static_cast<T>(original_ivs_start[ind]);
1055 
1056  temp = kmp_fix_iv(bounds->loop_iv_type, temp);
1057 
1058 // On all previous levels the start of the chunk is the same as the end, so we need to
1059  // be really careful here:
1060  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1061  (temp < start)) ||
1062  ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1063  (temp > start))) {
1064 // The end of the chunk can't be smaller (for >=, bigger) than its start.
1065  // Use heuristic:
1066  temp = start + iteration / 4 * step;
1067  }
1068  }
1069  }
1070 
1071  original_ivs[ind] = temp = kmp_fix_iv(bounds->loop_iv_type, temp);
1072 
1073  if (((bounds->comparison == comparison_t::comp_less_or_eq) &&
1074  (temp > (bounds->ub0 + bounds->ub1 * outer_iv))) ||
1075  ((bounds->comparison == comparison_t::comp_greater_or_eq) &&
1076  (temp < (bounds->ub0 + bounds->ub1 * outer_iv)))) {
1077  // Too big (or too small for >=).
1078  return false;
1079  }
1080 
1081  return true;
1082 }
1083 
1084 // For one level calculate old induction variable corresponding to overall
1085 // new_iv for the chunk end.
1086 bool kmp_calc_one_iv_for_chunk_end(const bounds_info_t *bounds,
1087  const bounds_info_t *updated_bounds,
1088  /*in/out*/ kmp_point_t original_ivs,
1089  const kmp_iterations_t iterations,
1090  kmp_index_t ind, bool start_with_lower_bound,
1091  bool compare_with_start,
1092  const kmp_point_t original_ivs_start) {
1093 
1094  switch (bounds->loop_type) {
1095  case loop_type_t::loop_type_int32:
1096  return kmp_calc_one_iv_for_chunk_end_XX<kmp_int32>(
1097  (bounds_infoXX_template<kmp_int32> *)(bounds),
1098  (bounds_infoXX_template<kmp_int32> *)(updated_bounds),
1099  /*in/out*/
1100  original_ivs, iterations, ind, start_with_lower_bound,
1101  compare_with_start, original_ivs_start);
1102  break;
1103  case loop_type_t::loop_type_uint32:
1104  return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint32>(
1105  (bounds_infoXX_template<kmp_uint32> *)(bounds),
1106  (bounds_infoXX_template<kmp_uint32> *)(updated_bounds),
1107  /*in/out*/
1108  original_ivs, iterations, ind, start_with_lower_bound,
1109  compare_with_start, original_ivs_start);
1110  break;
1111  case loop_type_t::loop_type_int64:
1112  return kmp_calc_one_iv_for_chunk_end_XX<kmp_int64>(
1113  (bounds_infoXX_template<kmp_int64> *)(bounds),
1114  (bounds_infoXX_template<kmp_int64> *)(updated_bounds),
1115  /*in/out*/
1116  original_ivs, iterations, ind, start_with_lower_bound,
1117  compare_with_start, original_ivs_start);
1118  break;
1119  case loop_type_t::loop_type_uint64:
1120  return kmp_calc_one_iv_for_chunk_end_XX<kmp_uint64>(
1121  (bounds_infoXX_template<kmp_uint64> *)(bounds),
1122  (bounds_infoXX_template<kmp_uint64> *)(updated_bounds),
1123  /*in/out*/
1124  original_ivs, iterations, ind, start_with_lower_bound,
1125  compare_with_start, original_ivs_start);
1126  break;
1127  default:
1128  KMP_ASSERT(false);
1129  return false;
1130  }
1131 }
1132 
1133 // Calculate old induction variables corresponding to overall new_iv for the
1134 // chunk end. If due to space extension we are getting old IVs outside of the
1135 // boundaries, bring them into the boundaries. Need to do this in the runtime,
1136 // esp. on the lower bounds side. When producing the result we need to make sure
1137 // that the new chunk starts at the next position after the old chunk and does not
1138 // overlap with it (this is done elsewhere), and that the end of the chunk is past the
1139 // beginning of the chunk. We don't need an exact ending point here, just
1140 // something more-or-less close to the desired chunk length, bigger is fine
1141 // (smaller would be fine too, but we risk going into an infinite loop, so go smaller
1142 // only at the very end of the space). Result: false if the ending point could not
1143 // be found in the original loop space; in this case the caller can use the
1144 // original upper bounds as the end of the chunk. The chunk won't be empty, because
1145 // it'll have at least the starting point, which is by construction in the
1146 // original space.
1147 bool kmp_calc_original_ivs_for_chunk_end(
1148  const bounds_info_t *original_bounds_nest, kmp_index_t n,
1149  const bounds_info_internal_t *updated_bounds_nest,
1150  const kmp_point_t original_ivs_start, kmp_loop_nest_iv_t new_iv,
1151  /*out*/ kmp_point_t original_ivs) {
1152 
1153  // Iterations in the expanded space:
1154  CollapseAllocator<kmp_loop_nest_iv_t> iterations(n);
1155  // First, calc corresponding iteration in every modified loop:
1156  for (kmp_index_t ind = n; ind > 0;) {
1157  --ind;
1158  auto &updated_bounds = updated_bounds_nest[ind];
1159 
1160  // should be optimized to OPDIVREM:
1161  auto new_ind = new_iv / updated_bounds.b.trip_count;
1162  auto iteration = new_iv % updated_bounds.b.trip_count;
1163 
1164  new_iv = new_ind;
1165  iterations[ind] = iteration;
1166  }
1167  KMP_DEBUG_ASSERT(new_iv == 0);
1168 
1169  kmp_index_t lengthened_ind = n;
1170  kmp_index_t equal_ind = -1;
1171 
1172  // Next calculate the point, but in original loop nest.
1173  for (kmp_index_t ind = 0; ind < n;) {
1174  auto bounds = &(original_bounds_nest[ind]);
1175  auto updated_bounds = &(updated_bounds_nest[ind].b);
1176 
1177  bool good = kmp_calc_one_iv_for_chunk_end(
1178  bounds, updated_bounds,
1179  /*in/out*/ original_ivs, iterations, ind, (lengthened_ind < ind),
1180  (equal_ind >= ind - 1), original_ivs_start);
1181 
1182  if (!good) {
1183  // Too big (or too small for >=).
1184  if (ind == 0) {
1185  // Need to reduce to the end.
1186  return false;
1187  } else {
1188  // Go to next iteration on outer loop:
1189  --ind;
1190  ++(iterations[ind]);
1191  lengthened_ind = ind;
1192  if (equal_ind >= lengthened_ind) {
1193  // We've changed the number of iterations here,
1194  // can't be same anymore:
1195  equal_ind = lengthened_ind - 1;
1196  }
1197  for (kmp_index_t i = ind + 1; i < n; ++i) {
1198  iterations[i] = 0;
1199  }
1200  continue;
1201  }
1202  }
1203 
1204  if ((equal_ind == ind - 1) &&
1205  (kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
1206  original_ivs_start[ind]))) {
1207  equal_ind = ind;
1208  } else if ((equal_ind > ind - 1) &&
1209  !(kmp_ivs_eq(bounds->loop_iv_type, original_ivs[ind],
1210  original_ivs_start[ind]))) {
1211  equal_ind = ind - 1;
1212  }
1213  ++ind;
1214  }
1215 
1216  return true;
1217 }
1218 
1219 //----------Calculate upper bounds for the last chunk-------------------------
1220 
1221 // Calculate one upper bound for the end.
1222 template <typename T>
1223 void kmp_calc_one_iv_end_XX(const bounds_infoXX_template<T> *bounds,
1224  /*in/out*/ kmp_point_t original_ivs,
1225  kmp_index_t ind) {
1226 
1227  T temp = bounds->ub0 +
1228  bounds->ub1 * static_cast<T>(original_ivs[bounds->outer_iv]);
1229 
1230  original_ivs[ind] = kmp_fix_iv(bounds->loop_iv_type, temp);
1231 }
1232 
1233 void kmp_calc_one_iv_end(const bounds_info_t *bounds,
1234  /*in/out*/ kmp_point_t original_ivs, kmp_index_t ind) {
1235 
1236  switch (bounds->loop_type) {
1237  default:
1238  KMP_ASSERT(false);
1239  break;
1240  case loop_type_t::loop_type_int32:
1241  kmp_calc_one_iv_end_XX<kmp_int32>(
1242  (bounds_infoXX_template<kmp_int32> *)(bounds),
1243  /*in/out*/ original_ivs, ind);
1244  break;
1245  case loop_type_t::loop_type_uint32:
1246  kmp_calc_one_iv_end_XX<kmp_uint32>(
1247  (bounds_infoXX_template<kmp_uint32> *)(bounds),
1248  /*in/out*/ original_ivs, ind);
1249  break;
1250  case loop_type_t::loop_type_int64:
1251  kmp_calc_one_iv_end_XX<kmp_int64>(
1252  (bounds_infoXX_template<kmp_int64> *)(bounds),
1253  /*in/out*/ original_ivs, ind);
1254  break;
1255  case loop_type_t::loop_type_uint64:
1256  kmp_calc_one_iv_end_XX<kmp_uint64>(
1257  (bounds_infoXX_template<kmp_uint64> *)(bounds),
1258  /*in/out*/ original_ivs, ind);
1259  break;
1260  }
1261 }
1262 
1263 // Calculate upper bounds for the last loop iteration. Just use original upper
1264 // bounds (adjusted when canonicalized to use <= / >=). No need to check that
1265 // this point is in the original space (it's likely not)
1266 void kmp_calc_original_ivs_for_end(
1267  const bounds_info_t *const original_bounds_nest, kmp_index_t n,
1268  /*out*/ kmp_point_t original_ivs) {
1269  for (kmp_index_t ind = 0; ind < n; ++ind) {
1270  auto bounds = &(original_bounds_nest[ind]);
1271  kmp_calc_one_iv_end(bounds, /*in/out*/ original_ivs, ind);
1272  }
1273 }
1274 
1275 /**************************************************************************
1276  * Identify nested loop structure - loops come in the canonical form
1277  * Lower triangle matrix: i = 0; i <= N; i++ {0,0}:{N,0}
1278  * j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
1279  * Upper Triangle matrix
1280  * i = 0; i <= N; i++ {0,0}:{N,0}
1281  * j = 0+1*i; j <= N; j++ {0,1}:{N,0}
1282  * ************************************************************************/
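// Illustrative examples (not from the original source) of the two recognized
// shapes, written as source loops with the {lb0,lb1}:{ub0,ub1} encoding above:
//   for (i = 0; i <= N; i++)      // outer {0,0}:{N,0}
//     for (j = 0; j <= i; j++)    // lower triangle, inner {0,0}:{0,1}
//                                 // (j < i canonicalizes to {0,0}:{-1,1})
//   for (i = 0; i <= N; i++)      // outer {0,0}:{N,0}
//     for (j = i; j <= N; j++)    // upper triangle, inner {0,1}:{N,0}
// Anything else, including nests that are not exactly two loops deep, is
// reported as nested_loop_type_unkown.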
1283 nested_loop_type_t
1284 kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
1285  /*in*/ kmp_index_t n) {
1286  // only 2-level nested loops are supported
1287  if (n != 2) {
1288  return nested_loop_type_unkown;
1289  }
1290  // loops must be canonical
1291  KMP_ASSERT(
1292  (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
1293  (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
1294  // check outer loop bounds: for triangular need to be {0,0}:{N,0}
1295  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1296  original_bounds_nest[0].lb0_u64);
1297  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1298  original_bounds_nest[0].ub0_u64);
1299  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1300  original_bounds_nest[0].lb1_u64);
1301  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1302  original_bounds_nest[0].ub1_u64);
1303  if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
1304  return nested_loop_type_unkown;
1305  }
1306  // check inner bounds to determine triangle type
1307  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1308  original_bounds_nest[1].lb0_u64);
1309  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1310  original_bounds_nest[1].ub0_u64);
1311  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1312  original_bounds_nest[1].lb1_u64);
1313  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1314  original_bounds_nest[1].ub1_u64);
1315  // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
1316  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
1317  (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
1318  return nested_loop_type_lower_triangular_matrix;
1319  }
1320  // upper triangle loop inner bounds need to be {0,1}:{N,0}
1321  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
1322  inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
1323  return nested_loop_type_upper_triangular_matrix;
1324  }
1325  return nested_loop_type_unkown;
1326 }
1327 
1328 /**************************************************************************
1329  * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
1330  * Start point is x so the result is always > sqrt(x)
1331  * The method has uniform convergence, PRECISION is set to 0.1
1332  * ************************************************************************/
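// Illustrative trace (not part of the original source): the loop below iterates
//   sqrt_new = (sqrt_old + x / sqrt_old) / 2
// e.g. for x = 100: 100 -> 50.5 -> 26.2 -> 15.0 -> 10.8 -> 10.03 -> 10.00006,
// stopping once two consecutive estimates differ by at most 0.1. As noted
// above, starting from x keeps the estimate above sqrt(x).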
1333 #define level_of_precision 0.1
1334 double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
1335  double sqrt_old = 0.;
1336  double sqrt_new = (double)x;
1337  do {
1338  sqrt_old = sqrt_new;
1339  sqrt_new = (sqrt_old + x / sqrt_old) / 2;
1340  } while ((sqrt_old - sqrt_new) > level_of_precision);
1341  return sqrt_new;
1342 }
1343 
1344 /**************************************************************************
1345  * Handle lower triangle matrix in the canonical form
1346  * i = 0; i <= N; i++ {0,0}:{N,0}
1347  * j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
1348  * ************************************************************************/
1349 void kmp_handle_lower_triangle_matrix(
1350  /*in*/ kmp_uint32 nth,
1351  /*in*/ kmp_uint32 tid,
1352  /*in */ kmp_index_t n,
1353  /*in/out*/ bounds_info_t *original_bounds_nest,
1354  /*out*/ bounds_info_t *chunk_bounds_nest) {
1355 
1356  // transfer loop types from the original loop to the chunks
1357  for (kmp_index_t i = 0; i < n; ++i) {
1358  chunk_bounds_nest[i] = original_bounds_nest[i];
1359  }
1360  // cleanup iv variables
1361  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1362  original_bounds_nest[0].ub0_u64);
1363  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1364  original_bounds_nest[0].lb0_u64);
1365  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
1366  original_bounds_nest[1].ub0_u64);
1367  // calculate the chunk's lower and upper bounds
1368  // the total number of iterations in the loop is the sum of the arithmetic
1369  // progression from the outer lower to outer upper bound (inclusive since the
1370 // loop is canonical). Note that less_than inner loops (inner_ub0 = -1)
1371 // effectively make the progression 1-based, making N = (outer_ub0 - inner_lb0
1372 // + 1) -> N - 1.
1373  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
1374  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
1375  // the current thread's number of iterations:
1376  // each thread gets an equal number of iterations: total number of iterations
1377  // divided by the number of threads plus, if there's a remainder,
1378  // the first threads with the number up to the remainder get an additional
1379  // iteration each to cover it
1380  kmp_uint64 iter_current =
1381  iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
1382  // cumulative number of iterations executed by all the previous threads:
1383  // threads with the tid below the remainder will have (iter_total/nth+1)
1384 // elements, and so will all threads before them, so the cumulative number of
1385 // iterations executed by all the previous threads will be the current thread's
1386 // number of iterations multiplied by the number of previous threads, which is
1387 // equal to the current thread's tid; threads with the number equal to or above
1388 // the remainder will have (iter_total/nth) elements, so the cumulative number of
1389 // iterations previously executed is its number of iterations multiplied by the
1390 // number of previous threads, which is again equal to the current thread's tid,
1391  // PLUS all the remainder iterations that will have been executed by the
1392  // previous threads
1393  kmp_uint64 iter_before_current =
1394  tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
1395  // cumulative number of iterations executed with the current thread is
1396  // the cumulative number executed before it plus its own
1397  kmp_uint64 iter_with_current = iter_before_current + iter_current;
1398  // calculate the outer loop lower bound (lbo) which is the max outer iv value
1399 // that gives a number of iterations equal to or just below the total
1400 // number of iterations executed by the previous threads. For less_than
1401 // (1-based) inner loops (inner_ub0 == -1) it will be:
1402  // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
1403  // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
1404  // i.e. lbo*(lbo+1)/2<=iter_before_current =>
1405 // lbo^2+lbo-2*iter_before_current<=0. Both cases can be handled similarly
1406 // using a parameter to control the equation sign.
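// Worked example (illustrative): outer loop i = 0..3 with inner j <= i gives
// per-i iteration counts 1, 2, 3, 4, so iter_total = 10. With nth = 2, thread
// tid = 1 has iter_before_current = 5; with inner_adjustment = 1 the formula
// below yields lower_bound_outer = (kmp_uint64)(sqrt(41) + 1) / 2 - 1 = 2
// (since 2*3/2 = 3 <= 5 < 3*4/2 = 6) and lower_bound_inner = 5 - 3 = 2,
// i.e. this thread's chunk starts at the point {2, 2}.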
1407  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
1408  kmp_uint64 lower_bound_outer =
1409  (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
1410  8 * iter_before_current) +
1411  inner_adjustment) /
1412  2 -
1413  inner_adjustment;
1414  // calculate the inner loop lower bound which is the remaining number of
1415  // iterations required to hit the total number of iterations executed by the
1416  // previous threads giving the starting point of this thread
1417  kmp_uint64 lower_bound_inner =
1418  iter_before_current -
1419  ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
1420  // calculate the outer loop upper bound using the same approach as for the
1421 // lower bound, except using the total number of iterations executed with the
1422  // current thread
1423  kmp_uint64 upper_bound_outer =
1424  (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
1425  8 * iter_with_current) +
1426  inner_adjustment) /
1427  2 -
1428  inner_adjustment;
1429  // calculate the inner loop upper bound which is the remaining number of
1430  // iterations required to hit the total number of iterations executed after
1431  // the current thread giving the starting point of the next thread
1432  kmp_uint64 upper_bound_inner =
1433  iter_with_current -
1434  ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
1435  // adjust the upper bounds down by 1 element to point at the last iteration of
1436 // the current thread rather than the first iteration of the next thread
1437  if (upper_bound_inner == 0) {
1438  // {n,0} => {n-1,n-1}
1439  upper_bound_outer -= 1;
1440  upper_bound_inner = upper_bound_outer;
1441  } else {
1442  // {n,m} => {n,m-1} (m!=0)
1443  upper_bound_inner -= 1;
1444  }
1445 
1446  // assign the values, zeroing out lb1 and ub1 values since the iteration space
1447  // is now one-dimensional
1448  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
1449  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
1450  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
1451  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
1452  chunk_bounds_nest[0].lb1_u64 = 0;
1453  chunk_bounds_nest[0].ub1_u64 = 0;
1454  chunk_bounds_nest[1].lb1_u64 = 0;
1455  chunk_bounds_nest[1].ub1_u64 = 0;
1456 
1457 #if 0
1458  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
1459  tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
1460  chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
1461 #endif
1462 }
1463 
1464 /**************************************************************************
1465  * Handle upper triangle matrix in the canonical form
1466  * i = 0; i <= N; i++ {0,0}:{N,0}
1467  * j = 0+1*i; j <= N; j++ {0,1}:{N,0}
1468  * ************************************************************************/
1469 void kmp_handle_upper_triangle_matrix(
1470  /*in*/ kmp_uint32 nth,
1471  /*in*/ kmp_uint32 tid,
1472  /*in */ kmp_index_t n,
1473  /*in/out*/ bounds_info_t *original_bounds_nest,
1474  /*out*/ bounds_info_t *chunk_bounds_nest) {
1475 
1476  // transfer loop types from the original loop to the chunks
1477  for (kmp_index_t i = 0; i < n; ++i) {
1478  chunk_bounds_nest[i] = original_bounds_nest[i];
1479  }
1480  // cleanup iv variables
1481  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1482  original_bounds_nest[0].ub0_u64);
1483  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
1484  original_bounds_nest[0].lb0_u64);
1485  [[maybe_unused]] kmp_uint64 inner_ub0 = kmp_fix_iv(
1486  original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
1487  // calculate the chunk's lower and upper bounds
1488  // the total number of iterations in the loop is the sum of the arithmetic
1489  // progression from the outer lower to outer upper bound (inclusive since the
1490 // loop is canonical). Note that less_than inner loops (inner_ub0 = -1)
1491 // effectively make the progression 1-based, making N = (outer_ub0 - inner_lb0
1492 // + 1) -> N - 1.
1493  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
1494  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
1495  // the current thread's number of iterations:
1496  // each thread gets an equal number of iterations: total number of iterations
1497  // divided by the number of threads; if there is a remainder,
1498  // the first threads (those with tid below the remainder) get an additional
1499  // iteration each to cover it
1500  kmp_uint64 iter_current =
1501  iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
1502  // cumulative number of iterations executed by all the previous threads:
1503  // threads with the tid below the remainder will have (iter_total/nth+1)
1504  // elements, and so will all threads before them, so the cumulative number of
1505  // iterations executed by all the previous threads will be the current thread's
1506  // number of iterations multiplied by the number of previous threads, which is
1507  // equal to the current thread's tid; threads with tid equal to or above the
1508  // remainder will have (iter_total/nth) elements so the cumulative number of
1509  // iterations previously executed is its number of iterations multiplied by the
1510  // number of previous threads, which is again equal to the current thread's tid,
1511  // PLUS all the remainder iterations that will have been executed by the
1512  // previous threads
1513  kmp_uint64 iter_before_current =
1514  tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
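  // e.g. with iter_total = 10 and nth = 4 the remainder is 2, so threads
  // 0..3 get 3, 3, 2, 2 iterations and iter_before_current is 0, 3, 6 and 8
  // respectively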
1515  // cumulative number of iterations executed with the current thread is
1516  // the cumulative number executed before it plus its own
1517  kmp_uint64 iter_with_current = iter_before_current + iter_current;
1518  // calculate the outer loop lower bound (lbo) which is the max outer iv value
1519  // that gives the number of iterations that is equal or just below the total
1520  // number of iterations executed by the previous threads:
1521  // lbo*(lbo+1)/2<=iter_before_current =>
1522  // lbo^2+lbo-2*iter_before_current<=0
1523  kmp_uint64 lower_bound_outer =
1524  (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
1525  // calculate the inner loop lower bound which is the remaining number of
1526  // iterations required to hit the total number of iterations executed by the
1527  // previous threads, giving the starting point of this thread
1528  kmp_uint64 lower_bound_inner =
1529  iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
1530  // calculate the outer loop upper bound using the same approach as for the
1531  // outer lower bound, except using the total number of iterations executed
1532  // with the current thread
1533  kmp_uint64 upper_bound_outer =
1534  (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
1535  // calculate the inner loop upper bound which is the remaining number of
1536  // iterations required to reach the total number of iterations executed
1537  // through the current thread, giving the starting point of the next thread
1538  kmp_uint64 upper_bound_inner =
1539  iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
1540  // adjust the upper bounds down by 1 element so they point at the last iteration
1541  // of the current thread rather than the first iteration of the next thread
1542  if (upper_bound_inner == 0) {
1543  // {n,0} => {n-1,n-1}
1544  upper_bound_outer -= 1;
1545  upper_bound_inner = upper_bound_outer;
1546  } else {
1547  // {n,m} => {n,m-1} (m!=0)
1548  upper_bound_inner -= 1;
1549  }
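  // e.g. if iter_with_current equals iter_total for a nest with
  // outer_iters == 4 (iter_total == 10), the raw bounds come out as {4, 0}
  // and the adjustment turns them into {3, 3}, the last element of the
  // triangular count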
1550 
1551  // assign the values, zeroing out lb1 and ub1 values since the iteration space
1552  // is now one-dimensional
1553  chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
1554  chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
1555  chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
1556  chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
1557  chunk_bounds_nest[0].lb1_u64 = 0;
1558  chunk_bounds_nest[0].ub1_u64 = 0;
1559  chunk_bounds_nest[1].lb1_u64 = 0;
1560  chunk_bounds_nest[1].ub1_u64 = 0;
1561 
1562 #if 0
1563  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
1564  tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
1565  chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
1566 #endif
1567 }
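// Note on the reflection used above: traversed backwards, the upper-triangle
// space has the same row lengths as the lower-triangle one (1, 2, ..., N+1
// iterations per row), so the chunk is computed with the lower-triangle math
// and then mirrored through (outer_iters - 1), swapping the roles of the lower
// and upper bounds. For example, with outer_iters == 4 (iter_total == 10) and
// nth == 2, the lower-triangle math gives thread 0 the range (0,0)..(2,1),
// which mirrors to the chunk (1,2)..(3,3) of the upper triangle; thread 1 gets
// (2,2)..(3,3), which mirrors to (0,0)..(1,1). Lower thread ids therefore end
// up with chunks from the tail of the upper-triangle space, but the chunks
// still tile it exactly.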
1568 //----------Init API for non-rectangular loops--------------------------------
1569 
1570 // Init API for collapsed loops (static, no chunks defined).
1571 // "bounds_nest" has to be allocated per thread.
1572 // API will modify original bounds_nest array to bring it to a canonical form
1573 // (only <= and >=, no !=, <, >). If the original loop nest was already in a
1574 // canonical form there will be no changes to bounds in bounds_nest array
1575 // (only trip counts will be calculated). Internally API will expand the space
1576 // to parallelogram/parallelepiped, calculate total, calculate bounds for the
1577 // chunks in terms of the new IV, re-calc them in terms of old IVs (especially
1578 // important on the left side, to hit the lower bounds and not step over), and
1579 // pick the correct chunk for this thread (so it will calculate chunks up to the
1580 // needed one). It could be optimized to calculate just this chunk, potentially
1581 // a bit less well distributed among threads. It is designed to make sure that
1582  // threads will receive predictable chunks, deterministically (so that the next
1583  // nest of loops with similar characteristics will get exactly the same chunks on the same
1584 // threads).
1585 // Current contract: chunk_bounds_nest has only lb0 and ub0,
1586 // lb1 and ub1 are set to 0 and can be ignored. (This may change in the future).
1587 extern "C" kmp_int32
1588 __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
1589  /*in/out*/ bounds_info_t *original_bounds_nest,
1590  /*out*/ bounds_info_t *chunk_bounds_nest,
1591  kmp_index_t n, /*out*/ kmp_int32 *plastiter) {
1592 
1593  KMP_DEBUG_ASSERT(plastiter && original_bounds_nest);
1594  KE_TRACE(10, ("__kmpc_for_collapsed_init called (%d)\n", gtid));
1595 
1596  if (__kmp_env_consistency_check) {
1597  __kmp_push_workshare(gtid, ct_pdo, loc);
1598  }
1599 
1600  kmp_canonicalize_loop_nest(loc, /*in/out*/ original_bounds_nest, n);
1601 
1602  CollapseAllocator<bounds_info_internal_t> updated_bounds_nest(n);
1603 
1604  for (kmp_index_t i = 0; i < n; ++i) {
1605  updated_bounds_nest[i].b = original_bounds_nest[i];
1606  }
1607 
1608  kmp_loop_nest_iv_t total =
1609  kmp_process_loop_nest(/*in/out*/ updated_bounds_nest, n);
1610 
1611  if (plastiter != NULL) {
1612  *plastiter = FALSE;
1613  }
1614 
1615  if (total == 0) {
1616  // Loop won't execute:
1617  return FALSE;
1618  }
1619 
1620  // OMPTODO: DISTRIBUTE is not supported yet
1621  __kmp_assert_valid_gtid(gtid);
1622  kmp_uint32 tid = __kmp_tid_from_gtid(gtid);
1623 
1624  kmp_info_t *th = __kmp_threads[gtid];
1625  kmp_team_t *team = th->th.th_team;
1626  kmp_uint32 nth = team->t.t_nproc; // Number of threads
1627 
1628  KMP_DEBUG_ASSERT(tid < nth);
1629 
1630  // Handle special cases
1631  nested_loop_type_t loop_type =
1632  kmp_identify_nested_loop_structure(original_bounds_nest, n);
1633  if (loop_type == nested_loop_type_lower_triangular_matrix) {
1634  kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
1635  chunk_bounds_nest);
1636  return TRUE;
1637  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
1638  kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
1639  chunk_bounds_nest);
1640  return TRUE;
1641  }
1642 
1643  CollapseAllocator<kmp_uint64> original_ivs_start(n);
1644 
1645  if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
1646  /*out*/ original_ivs_start)) {
1647  // Loop won't execute:
1648  return FALSE;
1649  }
1650 
1651  // Not doing this optimization for one thread:
1652  // (1) more to test
1653  // (2) without it the current contract holds: chunk_bounds_nest has only lb0 and
1654  // ub0, lb1 and ub1 are set to 0 and can be ignored.
1655  // if (nth == 1) {
1656  // // One thread:
1657  // // Copy all info from original_bounds_nest, it'll be good enough.
1658 
1659  // for (kmp_index_t i = 0; i < n; ++i) {
1660  // chunk_bounds_nest[i] = original_bounds_nest[i];
1661  // }
1662 
1663  // if (plastiter != NULL) {
1664  // *plastiter = TRUE;
1665  // }
1666  // return TRUE;
1667  //}
1668 
1669  kmp_loop_nest_iv_t new_iv = kmp_calc_new_iv_from_original_ivs(
1670  updated_bounds_nest, original_ivs_start, n);
1671 
1672  bool last_iter = false;
1673 
1674  for (; nth > 0;) {
1675  // We could calculate the chunk size once, but this compensates for the fact that
1676  // the original space is not a parallelepiped and some threads can be left
1677  // without work:
1678  KMP_DEBUG_ASSERT(total >= new_iv);
1679 
1680  kmp_loop_nest_iv_t total_left = total - new_iv;
1681  kmp_loop_nest_iv_t chunk_size = total_left / nth;
1682  kmp_loop_nest_iv_t remainder = total_left % nth;
1683 
1684  kmp_loop_nest_iv_t curr_chunk_size = chunk_size;
1685 
1686  if (remainder > 0) {
1687  ++curr_chunk_size;
1688  --remainder;
1689  }
1690 
1691 #if defined(KMP_DEBUG)
1692  kmp_loop_nest_iv_t new_iv_for_start = new_iv;
1693 #endif
1694 
1695  if (curr_chunk_size > 1) {
1696  new_iv += curr_chunk_size - 1;
1697  }
1698 
1699  CollapseAllocator<kmp_uint64> original_ivs_end(n);
1700  if ((nth == 1) || (new_iv >= total - 1)) {
1701  // Do this one till the end - just in case we miscalculated
1702  // and either too much is left to process or new_iv is a bit too big:
1703  kmp_calc_original_ivs_for_end(original_bounds_nest, n,
1704  /*out*/ original_ivs_end);
1705 
1706  last_iter = true;
1707  } else {
1708  // Note: here we make sure it's past (or equal to) the previous point.
1709  if (!kmp_calc_original_ivs_for_chunk_end(original_bounds_nest, n,
1710  updated_bounds_nest,
1711  original_ivs_start, new_iv,
1712  /*out*/ original_ivs_end)) {
1713  // We could not find the ending point, use the original upper bounds:
1714  kmp_calc_original_ivs_for_end(original_bounds_nest, n,
1715  /*out*/ original_ivs_end);
1716 
1717  last_iter = true;
1718  }
1719  }
1720 
1721 #if defined(KMP_DEBUG)
1722  auto new_iv_for_end = kmp_calc_new_iv_from_original_ivs(
1723  updated_bounds_nest, original_ivs_end, n);
1724  KMP_DEBUG_ASSERT(new_iv_for_end >= new_iv_for_start);
1725 #endif
1726 
1727  if (last_iter && (tid != 0)) {
1728  // We are done, this was the last chunk, but no chunk for the current
1729  // thread was found:
1730  return FALSE;
1731  }
1732 
1733  if (tid == 0) {
1734  // We found the chunk for this thread, now we need to check if it's the
1735  // last chunk or not:
1736 
1737  CollapseAllocator<kmp_uint64> original_ivs_next_start(n);
1738  if (last_iter ||
1739  !kmp_calc_next_original_ivs(original_bounds_nest, n, original_ivs_end,
1740  /*out*/ original_ivs_next_start)) {
1741  // no more loop iterations left to process,
1742  // this means that the currently found chunk is the last chunk:
1743  if (plastiter != NULL) {
1744  *plastiter = TRUE;
1745  }
1746  }
1747 
1748  // Fill in chunk bounds:
1749  for (kmp_index_t i = 0; i < n; ++i) {
1750  chunk_bounds_nest[i] =
1751  original_bounds_nest[i]; // To fill in types, etc. - optional
1752  chunk_bounds_nest[i].lb0_u64 = original_ivs_start[i];
1753  chunk_bounds_nest[i].lb1_u64 = 0;
1754 
1755  chunk_bounds_nest[i].ub0_u64 = original_ivs_end[i];
1756  chunk_bounds_nest[i].ub1_u64 = 0;
1757  }
1758 
1759  return TRUE;
1760  }
1761 
1762  --tid;
1763  --nth;
1764 
1765  bool next_chunk = kmp_calc_next_original_ivs(
1766  original_bounds_nest, n, original_ivs_end, /*out*/ original_ivs_start);
1767  if (!next_chunk) {
1768  // no more loop iterations to process,
1769  // the previous chunk was the last chunk
1770  break;
1771  }
1772 
1773  // original_ivs_start now points just past the previous chunk's original_ivs_end;
1774  // we need to start the new chunk here, so chunks will follow one another
1775  // without any gap or overlap:
1776  new_iv = kmp_calc_new_iv_from_original_ivs(updated_bounds_nest,
1777  original_ivs_start, n);
1778  }
1779 
1780  return FALSE;
1781 }
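// A minimal standalone sketch of the chunk-walking scheme above, reduced to a
// single flattened iteration space. The helper name is hypothetical and not
// part of the runtime; it only illustrates how recomputing the chunk size from
// the iterations still left lets later threads absorb the unevenness of a
// non-parallelepiped space while keeping the assignment deterministic for a
// given (total, nth).
#if 0
#include <cstdint>
#include <utility>

// Returns the inclusive [first, last] flattened-IV range for `tid`, or the
// empty range {1, 0} if this thread gets no iterations.
static std::pair<uint64_t, uint64_t>
walk_chunks(uint64_t total, uint32_t nth, uint32_t tid) {
  uint64_t new_iv = 0; // first IV of the chunk currently being formed
  while (nth > 0) {
    uint64_t total_left = total - new_iv;
    // equal share of what is left, plus one extra iteration if it does not
    // divide evenly (mirrors chunk_size/remainder above)
    uint64_t chunk = total_left / nth + (total_left % nth ? 1 : 0);
    if (chunk == 0)
      return {1, 0};
    uint64_t last = new_iv + chunk - 1;
    if (tid == 0)
      return {new_iv, last}; // this is the chunk for the requested thread
    --tid; // otherwise walk on to the next thread's chunk
    --nth;
    new_iv = last + 1; // chunks follow one another without gaps or overlaps
    if (new_iv >= total)
      return {1, 0};
  }
  return {1, 0};
}
// e.g. walk_chunks(10, 3, tid) yields [0,3], [4,6] and [7,9] for tid 0, 1, 2.
#endif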